1 /*
2 Copyright (c) 2013-2020 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 /*
32 * The index is a gzipped tab-delimited text file with one line per slice.
33 * The columns are:
34 * 1: reference number (0 to N-1, as per BAM ref_id)
35 * 2: reference position of 1st read in slice (1..?)
36 * 3: number of reads in slice
37 * 4: offset of container start (relative to end of SAM header, so 1st
38 * container is offset 0).
39 * 5: slice number within container (ie which landmark).
40 *
41 * In memory, we hold this in a nested containment list. Each list element is
42 * a cram_index struct. Each element in turn can contain its own list of
43 * cram_index structs.
44 *
45 * Any start..end range which is entirely contained within another (and
46 * earlier as it is sorted) range will be held within it. This ensures that
47 * the outer list will never have containments and we can safely do a
48 * binary search to find the first range which overlaps any given coordinate.
49 */
50
51 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
52 #include <config.h>
53
54 #include <stdio.h>
55 #include <errno.h>
56 #include <assert.h>
57 #include <inttypes.h>
58 #include <stdlib.h>
59 #include <string.h>
60 #include <zlib.h>
61 #include <sys/types.h>
62 #include <sys/stat.h>
63 #include <math.h>
64
65 #include "../htslib/bgzf.h"
66 #include "../htslib/hfile.h"
67 #include "../hts_internal.h"
68 #include "cram.h"
69 #include "os.h"
70
71 #if 0
72 static void dump_index_(cram_index *e, int level) {
73 int i, n;
74 n = printf("%*s%d / %d .. %d, ", level*4, "", e->refid, e->start, e->end);
75 printf("%*soffset %"PRId64"\n", MAX(0,50-n), "", e->offset);
76 for (i = 0; i < e->nslice; i++) {
77 dump_index_(&e->e[i], level+1);
78 }
79 }
80
81 static void dump_index(cram_fd *fd) {
82 int i;
83 for (i = 0; i < fd->index_sz; i++) {
84 dump_index_(&fd->index[i], 0);
85 }
86 }
87 #endif
88
kget_int32(kstring_t * k,size_t * pos,int32_t * val_p)89 static int kget_int32(kstring_t *k, size_t *pos, int32_t *val_p) {
90 int sign = 1;
91 int32_t val = 0;
92 size_t p = *pos;
93
94 while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
95 p++;
96
97 if (p < k->l && k->s[p] == '-')
98 sign = -1, p++;
99
100 if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
101 return -1;
102
103 while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') {
104 int digit = k->s[p++]-'0';
105 val = val*10 + digit;
106 }
107
108 *pos = p;
109 *val_p = sign*val;
110
111 return 0;
112 }
113
kget_int64(kstring_t * k,size_t * pos,int64_t * val_p)114 static int kget_int64(kstring_t *k, size_t *pos, int64_t *val_p) {
115 int sign = 1;
116 int64_t val = 0;
117 size_t p = *pos;
118
119 while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
120 p++;
121
122 if (p < k->l && k->s[p] == '-')
123 sign = -1, p++;
124
125 if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
126 return -1;
127
128 while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') {
129 int digit = k->s[p++]-'0';
130 val = val*10 + digit;
131 }
132
133 *pos = p;
134 *val_p = sign*val;
135
136 return 0;
137 }
138
139 /*
140 * Loads a CRAM .crai index into memory.
141 *
142 * Returns 0 for success
143 * -1 for failure
144 */
cram_index_load(cram_fd * fd,const char * fn,const char * fn_idx)145 int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx) {
146
147 char *tfn_idx = NULL;
148 char buf[65536];
149 ssize_t len;
150 kstring_t kstr = {0};
151 hFILE *fp;
152 cram_index *idx;
153 cram_index **idx_stack = NULL, *ep, e;
154 int idx_stack_alloc = 0, idx_stack_ptr = 0;
155 size_t pos = 0;
156
157 /* Check if already loaded */
158 if (fd->index)
159 return 0;
160
161 fd->index = calloc((fd->index_sz = 1), sizeof(*fd->index));
162 if (!fd->index)
163 return -1;
164
165 idx = &fd->index[0];
166 idx->refid = -1;
167 idx->start = INT_MIN;
168 idx->end = INT_MAX;
169
170 idx_stack = calloc(++idx_stack_alloc, sizeof(*idx_stack));
171 if (!idx_stack)
172 goto fail;
173
174 idx_stack[idx_stack_ptr] = idx;
175
176 if (!fn_idx) {
177 if (hts_idx_check_local(fn, HTS_FMT_CRAI, &tfn_idx) == 0 && hisremote(fn))
178 tfn_idx = hts_idx_getfn(fn, ".crai");
179
180 if (!tfn_idx) {
181 hts_log_error("Could not retrieve index file for '%s'", fn);
182 goto fail;
183 }
184 fn_idx = tfn_idx;
185 }
186
187 if (!(fp = hopen(fn_idx, "r"))) {
188 hts_log_error("Could not open index file '%s'", fn_idx);
189 goto fail;
190 }
191
192 // Load the file into memory
193 while ((len = hread(fp, buf, sizeof(buf))) > 0) {
194 if (kputsn(buf, len, &kstr) < 0)
195 goto fail;
196 }
197
198 if (len < 0 || kstr.l < 2)
199 goto fail;
200
201 if (hclose(fp) < 0)
202 goto fail;
203
204 // Uncompress if required
205 if (kstr.s[0] == 31 && (uc)kstr.s[1] == 139) {
206 size_t l = 0;
207 char *s = zlib_mem_inflate(kstr.s, kstr.l, &l);
208 if (!s)
209 goto fail;
210
211 free(kstr.s);
212 kstr.s = s;
213 kstr.l = l;
214 kstr.m = l; // conservative estimate of the size allocated
215 if (kputsn("", 0, &kstr) < 0) // ensure kstr.s is NUL-terminated
216 goto fail;
217 }
218
219
220 // Parse it line at a time
221 while (pos < kstr.l) {
222 /* 1.1 layout */
223 if (kget_int32(&kstr, &pos, &e.refid) == -1)
224 goto fail;
225
226 if (kget_int32(&kstr, &pos, &e.start) == -1)
227 goto fail;
228
229 if (kget_int32(&kstr, &pos, &e.end) == -1)
230 goto fail;
231
232 if (kget_int64(&kstr, &pos, &e.offset) == -1)
233 goto fail;
234
235 if (kget_int32(&kstr, &pos, &e.slice) == -1)
236 goto fail;
237
238 if (kget_int32(&kstr, &pos, &e.len) == -1)
239 goto fail;
240
241 e.end += e.start-1;
242 //printf("%d/%d..%d-offset=%" PRIu64 ",len=%d,slice=%d\n", e.refid, e.start, e.end, e.offset, e.len, e.slice);
243
244 if (e.refid < -1) {
245 hts_log_error("Malformed index file, refid %d", e.refid);
246 goto fail;
247 }
248
249 if (e.refid != idx->refid) {
250 if (fd->index_sz < e.refid+2) {
251 cram_index *new_idx;
252 int new_sz = e.refid+2;
253 size_t index_end = fd->index_sz * sizeof(*fd->index);
254 new_idx = realloc(fd->index,
255 new_sz * sizeof(*fd->index));
256 if (!new_idx)
257 goto fail;
258
259 fd->index = new_idx;
260 fd->index_sz = new_sz;
261 memset(((char *)fd->index) + index_end, 0,
262 fd->index_sz * sizeof(*fd->index) - index_end);
263 }
264 idx = &fd->index[e.refid+1];
265 idx->refid = e.refid;
266 idx->start = INT_MIN;
267 idx->end = INT_MAX;
268 idx->nslice = idx->nalloc = 0;
269 idx->e = NULL;
270 idx_stack[(idx_stack_ptr = 0)] = idx;
271 }
272
273 while (!(e.start >= idx->start && e.end <= idx->end) || idx->end == 0) {
274 idx = idx_stack[--idx_stack_ptr];
275 }
276
277 // Now contains, so append
278 if (idx->nslice+1 >= idx->nalloc) {
279 cram_index *new_e;
280 idx->nalloc = idx->nalloc ? idx->nalloc*2 : 16;
281 new_e = realloc(idx->e, idx->nalloc * sizeof(*idx->e));
282 if (!new_e)
283 goto fail;
284
285 idx->e = new_e;
286 }
287
288 e.nalloc = e.nslice = 0; e.e = NULL;
289 *(ep = &idx->e[idx->nslice++]) = e;
290 idx = ep;
291
292 if (++idx_stack_ptr >= idx_stack_alloc) {
293 cram_index **new_stack;
294 idx_stack_alloc *= 2;
295 new_stack = realloc(idx_stack, idx_stack_alloc*sizeof(*idx_stack));
296 if (!new_stack)
297 goto fail;
298 idx_stack = new_stack;
299 }
300 idx_stack[idx_stack_ptr] = idx;
301
302 while (pos < kstr.l && kstr.s[pos] != '\n')
303 pos++;
304 pos++;
305 }
306
307 free(idx_stack);
308 free(kstr.s);
309 free(tfn_idx);
310
311 // dump_index(fd);
312
313 return 0;
314
315 fail:
316 free(kstr.s);
317 free(idx_stack);
318 free(tfn_idx);
319 cram_index_free(fd); // Also sets fd->index = NULL
320 return -1;
321 }
322
cram_index_free_recurse(cram_index * e)323 static void cram_index_free_recurse(cram_index *e) {
324 if (e->e) {
325 int i;
326 for (i = 0; i < e->nslice; i++) {
327 cram_index_free_recurse(&e->e[i]);
328 }
329 free(e->e);
330 }
331 }
332
cram_index_free(cram_fd * fd)333 void cram_index_free(cram_fd *fd) {
334 int i;
335
336 if (!fd->index)
337 return;
338
339 for (i = 0; i < fd->index_sz; i++) {
340 cram_index_free_recurse(&fd->index[i]);
341 }
342 free(fd->index);
343
344 fd->index = NULL;
345 }
346
347 /*
348 * Searches the index for the first slice overlapping a reference ID
349 * and position, or one immediately preceding it if none is found in
350 * the index to overlap this position. (Our index may have missing
351 * entries, but we require at least one per reference.)
352 *
353 * If the index finds multiple slices overlapping this position we
354 * return the first one only. Subsequent calls should specifying
355 * "from" as the last slice we checked to find the next one. Otherwise
356 * set "from" to be NULL to find the first one.
357 *
358 * Refid can also be any of the special HTS_IDX_ values.
359 * For backwards compatibility, refid -1 is equivalent to HTS_IDX_NOCOOR.
360 *
361 * Returns the cram_index pointer on success
362 * NULL on failure
363 */
cram_index_query(cram_fd * fd,int refid,hts_pos_t pos,cram_index * from)364 cram_index *cram_index_query(cram_fd *fd, int refid, hts_pos_t pos,
365 cram_index *from) {
366 int i, j, k;
367 cram_index *e;
368
369 switch(refid) {
370 case HTS_IDX_NONE:
371 case HTS_IDX_REST:
372 // fail, or already there, dealt with elsewhere.
373 return NULL;
374
375 case HTS_IDX_NOCOOR:
376 refid = -1;
377 pos = 0;
378 break;
379
380 case HTS_IDX_START: {
381 int64_t min_idx = INT64_MAX;
382 for (i = 0, j = -1; i < fd->index_sz; i++) {
383 if (fd->index[i].e && fd->index[i].e[0].offset < min_idx) {
384 min_idx = fd->index[i].e[0].offset;
385 j = i;
386 }
387 }
388 if (j < 0)
389 return NULL;
390 return fd->index[j].e;
391 }
392
393 default:
394 if (refid < HTS_IDX_NONE || refid+1 >= fd->index_sz)
395 return NULL;
396 }
397
398 if (!from)
399 from = &fd->index[refid+1];
400
401 // Ref with nothing aligned against it.
402 if (!from->e)
403 return NULL;
404
405 // This sequence is covered by the index, so binary search to find
406 // the optimal starting block.
407 i = 0, j = fd->index[refid+1].nslice-1;
408 for (k = j/2; k != i; k = (j-i)/2 + i) {
409 if (from->e[k].refid > refid) {
410 j = k;
411 continue;
412 }
413
414 if (from->e[k].refid < refid) {
415 i = k;
416 continue;
417 }
418
419 if (from->e[k].start >= pos) {
420 j = k;
421 continue;
422 }
423
424 if (from->e[k].start < pos) {
425 i = k;
426 continue;
427 }
428 }
429 // i==j or i==j-1. Check if j is better.
430 if (j >= 0 && from->e[j].start < pos && from->e[j].refid == refid)
431 i = j;
432
433 /* The above found *a* bin overlapping, but not necessarily the first */
434 while (i > 0 && from->e[i-1].end >= pos)
435 i--;
436
437 /* We may be one bin before the optimum, so check */
438 while (i+1 < from->nslice &&
439 (from->e[i].refid < refid ||
440 from->e[i].end < pos))
441 i++;
442
443 e = &from->e[i];
444
445 return e;
446 }
447
448 // Return the index entry for last slice on a specific reference.
cram_index_last(cram_fd * fd,int refid,cram_index * from)449 cram_index *cram_index_last(cram_fd *fd, int refid, cram_index *from) {
450 int slice;
451
452 if (refid+1 < 0 || refid+1 >= fd->index_sz)
453 return NULL;
454
455 if (!from)
456 from = &fd->index[refid+1];
457
458 // Ref with nothing aligned against it.
459 if (!from->e)
460 return NULL;
461
462 slice = fd->index[refid+1].nslice - 1;
463
464 return &from->e[slice];
465 }
466
cram_index_query_last(cram_fd * fd,int refid,hts_pos_t end)467 cram_index *cram_index_query_last(cram_fd *fd, int refid, hts_pos_t end) {
468 cram_index *first = cram_index_query(fd, refid, end, NULL);
469 cram_index *last = cram_index_last(fd, refid, NULL);
470 if (!first || !last)
471 return NULL;
472
473 while (first < last && (first+1)->start <= end)
474 first++;
475
476 while (first->e) {
477 int count = 0;
478 int nslices = first->nslice;
479 first = first->e;
480 while (++count < nslices && (first+1)->start <= end)
481 first++;
482 }
483
484 // Compute the start location of next container.
485 //
486 // This is useful for stitching containers together in the multi-region
487 // iterator. Sadly we can't compute this from the single index line.
488 //
489 // Note we can have neighbouring index entries at the same location
490 // for when we have multi-reference mode and/or multiple slices per
491 // container.
492 cram_index *next = first;
493 do {
494 if (next >= last) {
495 // Next non-empty reference
496 while (++refid+1 < fd->index_sz)
497 if (fd->index[refid+1].nslice)
498 break;
499 if (refid+1 >= fd->index_sz) {
500 next = NULL;
501 } else {
502 next = fd->index[refid+1].e;
503 last = fd->index[refid+1].e + fd->index[refid+1].nslice;
504 }
505 } else {
506 next++;
507 }
508 } while (next && next->offset == first->offset);
509
510 first->next = next ? next->offset : 0;
511
512 return first;
513 }
514
515 /*
516 * Skips to a container overlapping the start coordinate listed in
517 * cram_range.
518 *
519 * In theory we call cram_index_query multiple times, once per slice
520 * overlapping the range. However slices may be absent from the index
521 * which makes this problematic. Instead we find the left-most slice
522 * and then read from then on, skipping decoding of slices and/or
523 * whole containers when they don't overlap the specified cram_range.
524 *
525 * This function also updates the cram_fd range field.
526 *
527 * Returns 0 on success
528 * -1 on general failure
529 * -2 on no-data (empty chromosome)
530 */
cram_seek_to_refpos(cram_fd * fd,cram_range * r)531 int cram_seek_to_refpos(cram_fd *fd, cram_range *r) {
532 int ret = 0;
533 cram_index *e;
534
535 if (r->refid == HTS_IDX_NONE) {
536 ret = -2; goto err;
537 }
538
539 // Ideally use an index, so see if we have one.
540 if ((e = cram_index_query(fd, r->refid, r->start, NULL))) {
541 if (0 != cram_seek(fd, e->offset, SEEK_SET)) {
542 if (0 != cram_seek(fd, e->offset - fd->first_container, SEEK_CUR)) {
543 ret = -1; goto err;
544 }
545 }
546 } else {
547 // Absent from index, but this most likely means it simply has no data.
548 ret = -2; goto err;
549 }
550
551 pthread_mutex_lock(&fd->range_lock);
552 fd->range = *r;
553 if (r->refid == HTS_IDX_NOCOOR) {
554 fd->range.refid = -1;
555 fd->range.start = 0;
556 } else if (r->refid == HTS_IDX_START || r->refid == HTS_IDX_REST) {
557 fd->range.refid = -2; // special case in cram_next_slice
558 }
559 pthread_mutex_unlock(&fd->range_lock);
560
561 if (fd->ctr) {
562 cram_free_container(fd->ctr);
563 if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
564 cram_free_container(fd->ctr_mt);
565 fd->ctr = NULL;
566 fd->ctr_mt = NULL;
567 fd->ooc = 0;
568 fd->eof = 0;
569 }
570
571 return 0;
572
573 err:
574 // It's unlikely fd->range will be accessed after EOF or error,
575 // but this maintains identical behaviour to the previous code.
576 pthread_mutex_lock(&fd->range_lock);
577 fd->range = *r;
578 pthread_mutex_unlock(&fd->range_lock);
579 return ret;
580 }
581
582
583 /*
584 * A specialised form of cram_index_build (below) that deals with slices
585 * having multiple references in this (ref_id -2). In this scenario we
586 * decode the slice to look at the RI data series instead.
587 *
588 * Returns 0 on success
589 * -1 on read failure
590 * -2 on wrong sort order
591 * -4 on write failure
592 */
cram_index_build_multiref(cram_fd * fd,cram_container * c,cram_slice * s,BGZF * fp,off_t cpos,int32_t landmark,int sz)593 static int cram_index_build_multiref(cram_fd *fd,
594 cram_container *c,
595 cram_slice *s,
596 BGZF *fp,
597 off_t cpos,
598 int32_t landmark,
599 int sz) {
600 int i, ref = -2;
601 int64_t ref_start = 0, ref_end;
602 char buf[1024];
603
604 if (fd->mode != 'w') {
605 if (0 != cram_decode_slice(fd, c, s, fd->header))
606 return -1;
607 }
608
609 ref_end = INT_MIN;
610
611 int32_t last_ref = -9;
612 int32_t last_pos = -9;
613 for (i = 0; i < s->hdr->num_records; i++) {
614 if (s->crecs[i].ref_id == last_ref && s->crecs[i].apos < last_pos) {
615 hts_log_error("CRAM file is not sorted by chromosome / position");
616 return -2;
617 }
618 last_ref = s->crecs[i].ref_id;
619 last_pos = s->crecs[i].apos;
620
621 if (s->crecs[i].ref_id == ref) {
622 if (ref_end < s->crecs[i].aend)
623 ref_end = s->crecs[i].aend;
624 continue;
625 }
626
627 if (ref != -2) {
628 sprintf(buf, "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n",
629 ref, ref_start, ref_end - ref_start + 1,
630 (int64_t)cpos, landmark, sz);
631 if (bgzf_write(fp, buf, strlen(buf)) < 0)
632 return -4;
633 }
634
635 ref = s->crecs[i].ref_id;
636 ref_start = s->crecs[i].apos;
637 ref_end = s->crecs[i].aend;
638 }
639
640 if (ref != -2) {
641 sprintf(buf, "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n",
642 ref, ref_start, ref_end - ref_start + 1,
643 (int64_t)cpos, landmark, sz);
644 if (bgzf_write(fp, buf, strlen(buf)) < 0)
645 return -4;
646 }
647
648 return 0;
649 }
650
651 /*
652 * Adds a single slice to the index.
653 */
cram_index_slice(cram_fd * fd,cram_container * c,cram_slice * s,BGZF * fp,off_t cpos,off_t spos,off_t sz)654 int cram_index_slice(cram_fd *fd,
655 cram_container *c,
656 cram_slice *s,
657 BGZF *fp,
658 off_t cpos,
659 off_t spos, // relative to cpos
660 off_t sz) {
661 int ret;
662 char buf[1024];
663
664 if (sz > INT_MAX) {
665 hts_log_error("CRAM slice is too big (%"PRId64" bytes)",
666 (int64_t) sz);
667 return -1;
668 }
669
670 if (s->hdr->ref_seq_id == -2) {
671 ret = cram_index_build_multiref(fd, c, s, fp, cpos, spos, sz);
672 } else {
673 sprintf(buf, "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n",
674 s->hdr->ref_seq_id, s->hdr->ref_seq_start,
675 s->hdr->ref_seq_span, (int64_t)cpos, (int)spos, (int)sz);
676 ret = (bgzf_write(fp, buf, strlen(buf)) >= 0)? 0 : -4;
677 }
678
679 return ret;
680 }
681
682 /*
683 * Adds a single container to the index.
684 */
685 static
cram_index_container(cram_fd * fd,cram_container * c,BGZF * fp,off_t cpos)686 int cram_index_container(cram_fd *fd,
687 cram_container *c,
688 BGZF *fp,
689 off_t cpos) {
690 int j;
691 off_t spos;
692
693 // 2.0 format
694 for (j = 0; j < c->num_landmarks; j++) {
695 cram_slice *s;
696 off_t sz;
697 int ret;
698
699 spos = htell(fd->fp);
700 if (spos - cpos - c->offset != c->landmark[j]) {
701 hts_log_error("CRAM slice offset %"PRId64" does not match"
702 " landmark %d in container header (%d)",
703 spos - cpos - c->offset, j, c->landmark[j]);
704 return -1;
705 }
706
707 if (!(s = cram_read_slice(fd))) {
708 return -1;
709 }
710
711 sz = htell(fd->fp) - spos;
712 ret = cram_index_slice(fd, c, s, fp, cpos, c->landmark[j], sz);
713
714 cram_free_slice(s);
715
716 if (ret < 0) {
717 return ret;
718 }
719 }
720
721 return 0;
722 }
723
724
725 /*
726 * Builds an index file.
727 *
728 * fd is a newly opened cram file that we wish to index.
729 * fn_base is the filename of the associated CRAM file.
730 * fn_idx is the filename of the index file to be written;
731 * if NULL, we add ".crai" to fn_base to get the index filename.
732 *
733 * Returns 0 on success,
734 * negative on failure (-1 for read failure, -4 for write failure)
735 */
cram_index_build(cram_fd * fd,const char * fn_base,const char * fn_idx)736 int cram_index_build(cram_fd *fd, const char *fn_base, const char *fn_idx) {
737 cram_container *c;
738 off_t cpos, hpos;
739 BGZF *fp;
740 kstring_t fn_idx_str = {0};
741 int64_t last_ref = -9, last_start = -9;
742
743 // Useful for cram_index_build_multiref
744 cram_set_option(fd, CRAM_OPT_REQUIRED_FIELDS, SAM_RNAME | SAM_POS | SAM_CIGAR);
745
746 if (! fn_idx) {
747 kputs(fn_base, &fn_idx_str);
748 kputs(".crai", &fn_idx_str);
749 fn_idx = fn_idx_str.s;
750 }
751
752 if (!(fp = bgzf_open(fn_idx, "wg"))) {
753 perror(fn_idx);
754 free(fn_idx_str.s);
755 return -4;
756 }
757
758 free(fn_idx_str.s);
759
760 cpos = htell(fd->fp);
761 while ((c = cram_read_container(fd))) {
762 if (fd->err) {
763 perror("Cram container read");
764 return -1;
765 }
766
767 hpos = htell(fd->fp);
768
769 if (!(c->comp_hdr_block = cram_read_block(fd)))
770 return -1;
771 assert(c->comp_hdr_block->content_type == COMPRESSION_HEADER);
772
773 c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
774 if (!c->comp_hdr)
775 return -1;
776
777 if (c->ref_seq_id == last_ref && c->ref_seq_start < last_start) {
778 hts_log_error("CRAM file is not sorted by chromosome / position");
779 return -2;
780 }
781 last_ref = c->ref_seq_id;
782 last_start = c->ref_seq_start;
783
784 if (cram_index_container(fd, c, fp, cpos) < 0) {
785 bgzf_close(fp);
786 return -1;
787 }
788
789 cpos = htell(fd->fp);
790 assert(cpos == hpos + c->length);
791
792 cram_free_container(c);
793 }
794 if (fd->err) {
795 bgzf_close(fp);
796 return -1;
797 }
798
799 return (bgzf_close(fp) >= 0)? 0 : -4;
800 }
801