1 /*
2 Copyright (c) 2013-2020 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 
8    1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10 
11    2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14 
15    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18 
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30 
31 /*
32  * The index is a gzipped tab-delimited text file with one line per slice.
33  * The columns are:
34  * 1: reference number (0 to N-1, as per BAM ref_id)
35  * 2: reference position of 1st read in slice (1..?)
36  * 3: number of reads in slice
37  * 4: offset of container start (relative to end of SAM header, so 1st
38  *    container is offset 0).
39  * 5: slice number within container (ie which landmark).
40  *
41  * In memory, we hold this in a nested containment list. Each list element is
42  * a cram_index struct. Each element in turn can contain its own list of
43  * cram_index structs.
44  *
45  * Any start..end range which is entirely contained within another (and
46  * earlier as it is sorted) range will be held within it. This ensures that
47  * the outer list will never have containments and we can safely do a
48  * binary search to find the first range which overlaps any given coordinate.
49  */
50 
51 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
52 #include <config.h>
53 
54 #include <stdio.h>
55 #include <errno.h>
56 #include <assert.h>
57 #include <inttypes.h>
58 #include <stdlib.h>
59 #include <string.h>
60 #include <zlib.h>
61 #include <sys/types.h>
62 #include <sys/stat.h>
63 #include <math.h>
64 
65 #include "../htslib/bgzf.h"
66 #include "../htslib/hfile.h"
67 #include "../hts_internal.h"
68 #include "cram.h"
69 #include "os.h"
70 
71 #if 0
72 static void dump_index_(cram_index *e, int level) {
73     int i, n;
74     n = printf("%*s%d / %d .. %d, ", level*4, "", e->refid, e->start, e->end);
75     printf("%*soffset %"PRId64"\n", MAX(0,50-n), "", e->offset);
76     for (i = 0; i < e->nslice; i++) {
77         dump_index_(&e->e[i], level+1);
78     }
79 }
80 
81 static void dump_index(cram_fd *fd) {
82     int i;
83     for (i = 0; i < fd->index_sz; i++) {
84         dump_index_(&fd->index[i], 0);
85     }
86 }
87 #endif
88 
kget_int32(kstring_t * k,size_t * pos,int32_t * val_p)89 static int kget_int32(kstring_t *k, size_t *pos, int32_t *val_p) {
90     int sign = 1;
91     int32_t val = 0;
92     size_t p = *pos;
93 
94     while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
95         p++;
96 
97     if (p < k->l && k->s[p] == '-')
98         sign = -1, p++;
99 
100     if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
101         return -1;
102 
103     while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') {
104         int digit = k->s[p++]-'0';
105         val = val*10 + digit;
106     }
107 
108     *pos = p;
109     *val_p = sign*val;
110 
111     return 0;
112 }
113 
kget_int64(kstring_t * k,size_t * pos,int64_t * val_p)114 static int kget_int64(kstring_t *k, size_t *pos, int64_t *val_p) {
115     int sign = 1;
116     int64_t val = 0;
117     size_t p = *pos;
118 
119     while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
120         p++;
121 
122     if (p < k->l && k->s[p] == '-')
123         sign = -1, p++;
124 
125     if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
126         return -1;
127 
128     while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9') {
129         int digit = k->s[p++]-'0';
130         val = val*10 + digit;
131     }
132 
133     *pos = p;
134     *val_p = sign*val;
135 
136     return 0;
137 }
138 
139 /*
140  * Loads a CRAM .crai index into memory.
141  *
142  * Returns 0 for success
143  *        -1 for failure
144  */
cram_index_load(cram_fd * fd,const char * fn,const char * fn_idx)145 int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx) {
146 
147     char *tfn_idx = NULL;
148     char buf[65536];
149     ssize_t len;
150     kstring_t kstr = {0};
151     hFILE *fp;
152     cram_index *idx;
153     cram_index **idx_stack = NULL, *ep, e;
154     int idx_stack_alloc = 0, idx_stack_ptr = 0;
155     size_t pos = 0;
156 
157     /* Check if already loaded */
158     if (fd->index)
159         return 0;
160 
161     fd->index = calloc((fd->index_sz = 1), sizeof(*fd->index));
162     if (!fd->index)
163         return -1;
164 
165     idx = &fd->index[0];
166     idx->refid = -1;
167     idx->start = INT_MIN;
168     idx->end   = INT_MAX;
169 
170     idx_stack = calloc(++idx_stack_alloc, sizeof(*idx_stack));
171     if (!idx_stack)
172         goto fail;
173 
174     idx_stack[idx_stack_ptr] = idx;
175 
176     if (!fn_idx) {
177         if (hts_idx_check_local(fn, HTS_FMT_CRAI, &tfn_idx) == 0 && hisremote(fn))
178             tfn_idx = hts_idx_getfn(fn, ".crai");
179 
180         if (!tfn_idx) {
181             hts_log_error("Could not retrieve index file for '%s'", fn);
182             goto fail;
183         }
184         fn_idx = tfn_idx;
185     }
186 
187     if (!(fp = hopen(fn_idx, "r"))) {
188         hts_log_error("Could not open index file '%s'", fn_idx);
189         goto fail;
190     }
191 
192     // Load the file into memory
193     while ((len = hread(fp, buf, sizeof(buf))) > 0) {
194         if (kputsn(buf, len, &kstr) < 0)
195             goto fail;
196     }
197 
198     if (len < 0 || kstr.l < 2)
199         goto fail;
200 
201     if (hclose(fp) < 0)
202         goto fail;
203 
204     // Uncompress if required
205     if (kstr.s[0] == 31 && (uc)kstr.s[1] == 139) {
206         size_t l = 0;
207         char *s = zlib_mem_inflate(kstr.s, kstr.l, &l);
208         if (!s)
209             goto fail;
210 
211         free(kstr.s);
212         kstr.s = s;
213         kstr.l = l;
214         kstr.m = l; // conservative estimate of the size allocated
215         if (kputsn("", 0, &kstr) < 0) // ensure kstr.s is NUL-terminated
216             goto fail;
217     }
218 
219 
220     // Parse it line at a time
221     while (pos < kstr.l) {
222         /* 1.1 layout */
223         if (kget_int32(&kstr, &pos, &e.refid) == -1)
224             goto fail;
225 
226         if (kget_int32(&kstr, &pos, &e.start) == -1)
227             goto fail;
228 
229         if (kget_int32(&kstr, &pos, &e.end) == -1)
230             goto fail;
231 
232         if (kget_int64(&kstr, &pos, &e.offset) == -1)
233             goto fail;
234 
235         if (kget_int32(&kstr, &pos, &e.slice) == -1)
236             goto fail;
237 
238         if (kget_int32(&kstr, &pos, &e.len) == -1)
239             goto fail;
240 
241         e.end += e.start-1;
242         //printf("%d/%d..%d-offset=%" PRIu64 ",len=%d,slice=%d\n", e.refid, e.start, e.end, e.offset, e.len, e.slice);
243 
244         if (e.refid < -1) {
245             hts_log_error("Malformed index file, refid %d", e.refid);
246             goto fail;
247         }
248 
249         if (e.refid != idx->refid) {
250             if (fd->index_sz < e.refid+2) {
251                 cram_index *new_idx;
252                 int new_sz = e.refid+2;
253                 size_t index_end = fd->index_sz * sizeof(*fd->index);
254                 new_idx = realloc(fd->index,
255                                   new_sz * sizeof(*fd->index));
256                 if (!new_idx)
257                     goto fail;
258 
259                 fd->index = new_idx;
260                 fd->index_sz = new_sz;
261                 memset(((char *)fd->index) + index_end, 0,
262                        fd->index_sz * sizeof(*fd->index) - index_end);
263             }
264             idx = &fd->index[e.refid+1];
265             idx->refid = e.refid;
266             idx->start = INT_MIN;
267             idx->end   = INT_MAX;
268             idx->nslice = idx->nalloc = 0;
269             idx->e = NULL;
270             idx_stack[(idx_stack_ptr = 0)] = idx;
271         }
272 
273         while (!(e.start >= idx->start && e.end <= idx->end) || idx->end == 0) {
274             idx = idx_stack[--idx_stack_ptr];
275         }
276 
277         // Now contains, so append
278         if (idx->nslice+1 >= idx->nalloc) {
279             cram_index *new_e;
280             idx->nalloc = idx->nalloc ? idx->nalloc*2 : 16;
281             new_e = realloc(idx->e, idx->nalloc * sizeof(*idx->e));
282             if (!new_e)
283                 goto fail;
284 
285             idx->e = new_e;
286         }
287 
288         e.nalloc = e.nslice = 0; e.e = NULL;
289         *(ep = &idx->e[idx->nslice++]) = e;
290         idx = ep;
291 
292         if (++idx_stack_ptr >= idx_stack_alloc) {
293             cram_index **new_stack;
294             idx_stack_alloc *= 2;
295             new_stack = realloc(idx_stack, idx_stack_alloc*sizeof(*idx_stack));
296             if (!new_stack)
297                 goto fail;
298             idx_stack = new_stack;
299         }
300         idx_stack[idx_stack_ptr] = idx;
301 
302         while (pos < kstr.l && kstr.s[pos] != '\n')
303             pos++;
304         pos++;
305     }
306 
307     free(idx_stack);
308     free(kstr.s);
309     free(tfn_idx);
310 
311     // dump_index(fd);
312 
313     return 0;
314 
315  fail:
316     free(kstr.s);
317     free(idx_stack);
318     free(tfn_idx);
319     cram_index_free(fd); // Also sets fd->index = NULL
320     return -1;
321 }
322 
cram_index_free_recurse(cram_index * e)323 static void cram_index_free_recurse(cram_index *e) {
324     if (e->e) {
325         int i;
326         for (i = 0; i < e->nslice; i++) {
327             cram_index_free_recurse(&e->e[i]);
328         }
329         free(e->e);
330     }
331 }
332 
cram_index_free(cram_fd * fd)333 void cram_index_free(cram_fd *fd) {
334     int i;
335 
336     if (!fd->index)
337         return;
338 
339     for (i = 0; i < fd->index_sz; i++) {
340         cram_index_free_recurse(&fd->index[i]);
341     }
342     free(fd->index);
343 
344     fd->index = NULL;
345 }
346 
347 /*
348  * Searches the index for the first slice overlapping a reference ID
349  * and position, or one immediately preceding it if none is found in
350  * the index to overlap this position. (Our index may have missing
351  * entries, but we require at least one per reference.)
352  *
353  * If the index finds multiple slices overlapping this position we
354  * return the first one only. Subsequent calls should specifying
355  * "from" as the last slice we checked to find the next one. Otherwise
356  * set "from" to be NULL to find the first one.
357  *
358  * Refid can also be any of the special HTS_IDX_ values.
359  * For backwards compatibility, refid -1 is equivalent to HTS_IDX_NOCOOR.
360  *
361  * Returns the cram_index pointer on success
362  *         NULL on failure
363  */
cram_index_query(cram_fd * fd,int refid,hts_pos_t pos,cram_index * from)364 cram_index *cram_index_query(cram_fd *fd, int refid, hts_pos_t pos,
365                              cram_index *from) {
366     int i, j, k;
367     cram_index *e;
368 
369     switch(refid) {
370     case HTS_IDX_NONE:
371     case HTS_IDX_REST:
372         // fail, or already there, dealt with elsewhere.
373         return NULL;
374 
375     case HTS_IDX_NOCOOR:
376         refid = -1;
377         pos = 0;
378         break;
379 
380     case HTS_IDX_START: {
381         int64_t min_idx = INT64_MAX;
382         for (i = 0, j = -1; i < fd->index_sz; i++) {
383             if (fd->index[i].e && fd->index[i].e[0].offset < min_idx) {
384                 min_idx = fd->index[i].e[0].offset;
385                 j = i;
386             }
387         }
388         if (j < 0)
389             return NULL;
390         return fd->index[j].e;
391     }
392 
393     default:
394         if (refid < HTS_IDX_NONE || refid+1 >= fd->index_sz)
395             return NULL;
396     }
397 
398     if (!from)
399         from = &fd->index[refid+1];
400 
401     // Ref with nothing aligned against it.
402     if (!from->e)
403         return NULL;
404 
405     // This sequence is covered by the index, so binary search to find
406     // the optimal starting block.
407     i = 0, j = fd->index[refid+1].nslice-1;
408     for (k = j/2; k != i; k = (j-i)/2 + i) {
409         if (from->e[k].refid > refid) {
410             j = k;
411             continue;
412         }
413 
414         if (from->e[k].refid < refid) {
415             i = k;
416             continue;
417         }
418 
419         if (from->e[k].start >= pos) {
420             j = k;
421             continue;
422         }
423 
424         if (from->e[k].start < pos) {
425             i = k;
426             continue;
427         }
428     }
429     // i==j or i==j-1. Check if j is better.
430     if (j >= 0 && from->e[j].start < pos && from->e[j].refid == refid)
431         i = j;
432 
433     /* The above found *a* bin overlapping, but not necessarily the first */
434     while (i > 0 && from->e[i-1].end >= pos)
435         i--;
436 
437     /* We may be one bin before the optimum, so check */
438     while (i+1 < from->nslice &&
439            (from->e[i].refid < refid ||
440             from->e[i].end < pos))
441         i++;
442 
443     e = &from->e[i];
444 
445     return e;
446 }
447 
448 // Return the index entry for last slice on a specific reference.
cram_index_last(cram_fd * fd,int refid,cram_index * from)449 cram_index *cram_index_last(cram_fd *fd, int refid, cram_index *from) {
450     int slice;
451 
452     if (refid+1 < 0 || refid+1 >= fd->index_sz)
453         return NULL;
454 
455     if (!from)
456         from = &fd->index[refid+1];
457 
458     // Ref with nothing aligned against it.
459     if (!from->e)
460         return NULL;
461 
462     slice = fd->index[refid+1].nslice - 1;
463 
464     return &from->e[slice];
465 }
466 
cram_index_query_last(cram_fd * fd,int refid,hts_pos_t end)467 cram_index *cram_index_query_last(cram_fd *fd, int refid, hts_pos_t end) {
468     cram_index *first = cram_index_query(fd, refid, end, NULL);
469     cram_index *last =  cram_index_last(fd, refid, NULL);
470     if (!first || !last)
471         return NULL;
472 
473     while (first < last && (first+1)->start <= end)
474         first++;
475 
476     while (first->e) {
477         int count = 0;
478         int nslices = first->nslice;
479         first = first->e;
480         while (++count < nslices && (first+1)->start <= end)
481             first++;
482     }
483 
484     // Compute the start location of next container.
485     //
486     // This is useful for stitching containers together in the multi-region
487     // iterator.  Sadly we can't compute this from the single index line.
488     //
489     // Note we can have neighbouring index entries at the same location
490     // for when we have multi-reference mode and/or multiple slices per
491     // container.
492     cram_index *next = first;
493     do {
494         if (next >= last) {
495             // Next non-empty reference
496             while (++refid+1 < fd->index_sz)
497                 if (fd->index[refid+1].nslice)
498                     break;
499             if (refid+1 >= fd->index_sz) {
500                 next = NULL;
501             } else {
502                 next = fd->index[refid+1].e;
503                 last = fd->index[refid+1].e + fd->index[refid+1].nslice;
504             }
505         } else {
506             next++;
507         }
508     } while (next && next->offset == first->offset);
509 
510     first->next = next ? next->offset : 0;
511 
512     return first;
513 }
514 
515 /*
516  * Skips to a container overlapping the start coordinate listed in
517  * cram_range.
518  *
519  * In theory we call cram_index_query multiple times, once per slice
520  * overlapping the range. However slices may be absent from the index
521  * which makes this problematic. Instead we find the left-most slice
522  * and then read from then on, skipping decoding of slices and/or
523  * whole containers when they don't overlap the specified cram_range.
524  *
525  * This function also updates the cram_fd range field.
526  *
527  * Returns 0 on success
528  *        -1 on general failure
529  *        -2 on no-data (empty chromosome)
530  */
cram_seek_to_refpos(cram_fd * fd,cram_range * r)531 int cram_seek_to_refpos(cram_fd *fd, cram_range *r) {
532     int ret = 0;
533     cram_index *e;
534 
535     if (r->refid == HTS_IDX_NONE) {
536         ret = -2; goto err;
537     }
538 
539     // Ideally use an index, so see if we have one.
540     if ((e = cram_index_query(fd, r->refid, r->start, NULL))) {
541         if (0 != cram_seek(fd, e->offset, SEEK_SET)) {
542             if (0 != cram_seek(fd, e->offset - fd->first_container, SEEK_CUR)) {
543                 ret = -1; goto err;
544             }
545         }
546     } else {
547         // Absent from index, but this most likely means it simply has no data.
548         ret = -2; goto err;
549     }
550 
551     pthread_mutex_lock(&fd->range_lock);
552     fd->range = *r;
553     if (r->refid == HTS_IDX_NOCOOR) {
554         fd->range.refid = -1;
555         fd->range.start = 0;
556     } else if (r->refid == HTS_IDX_START || r->refid == HTS_IDX_REST) {
557         fd->range.refid = -2; // special case in cram_next_slice
558     }
559     pthread_mutex_unlock(&fd->range_lock);
560 
561     if (fd->ctr) {
562         cram_free_container(fd->ctr);
563         if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
564             cram_free_container(fd->ctr_mt);
565         fd->ctr = NULL;
566         fd->ctr_mt = NULL;
567         fd->ooc = 0;
568         fd->eof = 0;
569     }
570 
571     return 0;
572 
573  err:
574     // It's unlikely fd->range will be accessed after EOF or error,
575     // but this maintains identical behaviour to the previous code.
576     pthread_mutex_lock(&fd->range_lock);
577     fd->range = *r;
578     pthread_mutex_unlock(&fd->range_lock);
579     return ret;
580 }
581 
582 
583 /*
584  * A specialised form of cram_index_build (below) that deals with slices
585  * having multiple references in this (ref_id -2). In this scenario we
586  * decode the slice to look at the RI data series instead.
587  *
588  * Returns 0 on success
589  *        -1 on read failure
590  *        -2 on wrong sort order
591  *        -4 on write failure
592  */
cram_index_build_multiref(cram_fd * fd,cram_container * c,cram_slice * s,BGZF * fp,off_t cpos,int32_t landmark,int sz)593 static int cram_index_build_multiref(cram_fd *fd,
594                                      cram_container *c,
595                                      cram_slice *s,
596                                      BGZF *fp,
597                                      off_t cpos,
598                                      int32_t landmark,
599                                      int sz) {
600     int i, ref = -2;
601     int64_t ref_start = 0, ref_end;
602     char buf[1024];
603 
604     if (fd->mode != 'w') {
605         if (0 != cram_decode_slice(fd, c, s, fd->header))
606             return -1;
607     }
608 
609     ref_end = INT_MIN;
610 
611     int32_t last_ref = -9;
612     int32_t last_pos = -9;
613     for (i = 0; i < s->hdr->num_records; i++) {
614         if (s->crecs[i].ref_id == last_ref && s->crecs[i].apos < last_pos) {
615             hts_log_error("CRAM file is not sorted by chromosome / position");
616             return -2;
617         }
618         last_ref = s->crecs[i].ref_id;
619         last_pos = s->crecs[i].apos;
620 
621         if (s->crecs[i].ref_id == ref) {
622             if (ref_end < s->crecs[i].aend)
623                 ref_end = s->crecs[i].aend;
624             continue;
625         }
626 
627         if (ref != -2) {
628             sprintf(buf, "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n",
629                     ref, ref_start, ref_end - ref_start + 1,
630                     (int64_t)cpos, landmark, sz);
631             if (bgzf_write(fp, buf, strlen(buf)) < 0)
632                 return -4;
633         }
634 
635         ref = s->crecs[i].ref_id;
636         ref_start = s->crecs[i].apos;
637         ref_end   = s->crecs[i].aend;
638     }
639 
640     if (ref != -2) {
641         sprintf(buf, "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n",
642                 ref, ref_start, ref_end - ref_start + 1,
643                 (int64_t)cpos, landmark, sz);
644         if (bgzf_write(fp, buf, strlen(buf)) < 0)
645             return -4;
646     }
647 
648     return 0;
649 }
650 
651 /*
652  * Adds a single slice to the index.
653  */
cram_index_slice(cram_fd * fd,cram_container * c,cram_slice * s,BGZF * fp,off_t cpos,off_t spos,off_t sz)654 int cram_index_slice(cram_fd *fd,
655                      cram_container *c,
656                      cram_slice *s,
657                      BGZF *fp,
658                      off_t cpos,
659                      off_t spos, // relative to cpos
660                      off_t sz) {
661     int ret;
662     char buf[1024];
663 
664     if (sz > INT_MAX) {
665         hts_log_error("CRAM slice is too big (%"PRId64" bytes)",
666                       (int64_t) sz);
667         return -1;
668     }
669 
670     if (s->hdr->ref_seq_id == -2) {
671         ret = cram_index_build_multiref(fd, c, s, fp, cpos, spos, sz);
672     } else {
673         sprintf(buf, "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n",
674                 s->hdr->ref_seq_id, s->hdr->ref_seq_start,
675                 s->hdr->ref_seq_span, (int64_t)cpos, (int)spos, (int)sz);
676         ret = (bgzf_write(fp, buf, strlen(buf)) >= 0)? 0 : -4;
677     }
678 
679     return ret;
680 }
681 
682 /*
683  * Adds a single container to the index.
684  */
685 static
cram_index_container(cram_fd * fd,cram_container * c,BGZF * fp,off_t cpos)686 int cram_index_container(cram_fd *fd,
687                          cram_container *c,
688                          BGZF *fp,
689                          off_t cpos) {
690     int j;
691     off_t spos;
692 
693     // 2.0 format
694     for (j = 0; j < c->num_landmarks; j++) {
695         cram_slice *s;
696         off_t sz;
697         int ret;
698 
699         spos = htell(fd->fp);
700         if (spos - cpos - c->offset != c->landmark[j]) {
701             hts_log_error("CRAM slice offset %"PRId64" does not match"
702                           " landmark %d in container header (%d)",
703                           spos - cpos - c->offset, j, c->landmark[j]);
704             return -1;
705         }
706 
707         if (!(s = cram_read_slice(fd))) {
708             return -1;
709         }
710 
711         sz = htell(fd->fp) - spos;
712         ret = cram_index_slice(fd, c, s, fp, cpos, c->landmark[j], sz);
713 
714         cram_free_slice(s);
715 
716         if (ret < 0) {
717             return ret;
718         }
719     }
720 
721     return 0;
722 }
723 
724 
725 /*
726  * Builds an index file.
727  *
728  * fd is a newly opened cram file that we wish to index.
729  * fn_base is the filename of the associated CRAM file.
730  * fn_idx is the filename of the index file to be written;
731  * if NULL, we add ".crai" to fn_base to get the index filename.
732  *
733  * Returns 0 on success,
734  *         negative on failure (-1 for read failure, -4 for write failure)
735  */
cram_index_build(cram_fd * fd,const char * fn_base,const char * fn_idx)736 int cram_index_build(cram_fd *fd, const char *fn_base, const char *fn_idx) {
737     cram_container *c;
738     off_t cpos, hpos;
739     BGZF *fp;
740     kstring_t fn_idx_str = {0};
741     int64_t last_ref = -9, last_start = -9;
742 
743     // Useful for cram_index_build_multiref
744     cram_set_option(fd, CRAM_OPT_REQUIRED_FIELDS, SAM_RNAME | SAM_POS | SAM_CIGAR);
745 
746     if (! fn_idx) {
747         kputs(fn_base, &fn_idx_str);
748         kputs(".crai", &fn_idx_str);
749         fn_idx = fn_idx_str.s;
750     }
751 
752     if (!(fp = bgzf_open(fn_idx, "wg"))) {
753         perror(fn_idx);
754         free(fn_idx_str.s);
755         return -4;
756     }
757 
758     free(fn_idx_str.s);
759 
760     cpos = htell(fd->fp);
761     while ((c = cram_read_container(fd))) {
762         if (fd->err) {
763             perror("Cram container read");
764             return -1;
765         }
766 
767         hpos = htell(fd->fp);
768 
769         if (!(c->comp_hdr_block = cram_read_block(fd)))
770             return -1;
771         assert(c->comp_hdr_block->content_type == COMPRESSION_HEADER);
772 
773         c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
774         if (!c->comp_hdr)
775             return -1;
776 
777         if (c->ref_seq_id == last_ref && c->ref_seq_start < last_start) {
778             hts_log_error("CRAM file is not sorted by chromosome / position");
779             return -2;
780         }
781         last_ref = c->ref_seq_id;
782         last_start = c->ref_seq_start;
783 
784         if (cram_index_container(fd, c, fp, cpos) < 0) {
785             bgzf_close(fp);
786             return -1;
787         }
788 
789         cpos = htell(fd->fp);
790         assert(cpos == hpos + c->length);
791 
792         cram_free_container(c);
793     }
794     if (fd->err) {
795         bgzf_close(fp);
796         return -1;
797     }
798 
799     return (bgzf_close(fp) >= 0)? 0 : -4;
800 }
801