1 /*
2 Copyright (c) 2013-2014 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 
8    1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10 
11    2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14 
15    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18 
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30 
31 /*
32  * The index is a gzipped tab-delimited text file with one line per slice.
33  * The columns are:
34  * 1: reference number (0 to N-1, as per BAM ref_id)
35  * 2: reference position of 1st read in slice (1..?)
36  * 3: number of reads in slice
37  * 4: offset of container start (relative to end of SAM header, so 1st
38  *    container is offset 0).
39  * 5: slice number within container (ie which landmark).
40  *
41  * In memory, we hold this in a nested containment list. Each list element is
42  * a cram_index struct. Each element in turn can contain its own list of
43  * cram_index structs.
44  *
45  * Any start..end range which is entirely contained within another (and
46  * earlier as it is sorted) range will be held within it. This ensures that
47  * the outer list will never have containments and we can safely do a
48  * binary search to find the first range which overlaps any given coordinate.
49  */
50 
51 #include <config.h>
52 
53 #include <stdio.h>
54 #include <errno.h>
55 #include <assert.h>
56 #include <stdlib.h>
57 #include <string.h>
58 #include <zlib.h>
59 #include <sys/types.h>
60 #include <sys/stat.h>
61 #include <math.h>
62 
63 #include "htslib/hfile.h"
64 #include "hts_internal.h"
65 #include "cram/cram.h"
66 #include "cram/os.h"
67 #include "cram/zfio.h"
68 
69 #if 0
70 static void dump_index_(cram_index *e, int level) {
71     int i, n;
72     n = printf("%*s%d / %d .. %d, ", level*4, "", e->refid, e->start, e->end);
73     printf("%*soffset %"PRId64"\n", MAX(0,50-n), "", e->offset);
74     for (i = 0; i < e->nslice; i++) {
75 	dump_index_(&e->e[i], level+1);
76     }
77 }
78 
79 static void dump_index(cram_fd *fd) {
80     int i;
81     for (i = 0; i < fd->index_sz; i++) {
82 	dump_index_(&fd->index[i], 0);
83     }
84 }
85 #endif
86 
kget_int32(kstring_t * k,size_t * pos,int32_t * val_p)87 static int kget_int32(kstring_t *k, size_t *pos, int32_t *val_p) {
88     int sign = 1;
89     int32_t val = 0;
90     size_t p = *pos;
91 
92     while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
93 	   p++;
94 
95     if (p < k->l && k->s[p] == '-')
96 	sign = -1, p++;
97 
98     if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
99 	return -1;
100 
101     while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9')
102 	val = val*10 + k->s[p++]-'0';
103 
104     *pos = p;
105     *val_p = sign*val;
106 
107     return 0;
108 }
109 
kget_int64(kstring_t * k,size_t * pos,int64_t * val_p)110 static int kget_int64(kstring_t *k, size_t *pos, int64_t *val_p) {
111     int sign = 1;
112     int64_t val = 0;
113     size_t p = *pos;
114 
115     while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
116 	   p++;
117 
118     if (p < k->l && k->s[p] == '-')
119 	sign = -1, p++;
120 
121     if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
122 	return -1;
123 
124     while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9')
125 	val = val*10 + k->s[p++]-'0';
126 
127     *pos = p;
128     *val_p = sign*val;
129 
130     return 0;
131 }
132 
133 /*
134  * Loads a CRAM .crai index into memory.
135  *
136  * Returns 0 for success
137  *        -1 for failure
138  */
cram_index_load(cram_fd * fd,const char * fn,const char * fn_idx)139 int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx) {
140     char *fn2 = NULL;
141     char buf[65536];
142     ssize_t len;
143     kstring_t kstr = {0};
144     hFILE *fp;
145     cram_index *idx;
146     cram_index **idx_stack = NULL, *ep, e;
147     int idx_stack_alloc = 0, idx_stack_ptr = 0;
148     size_t pos = 0;
149 
150     /* Check if already loaded */
151     if (fd->index)
152 	return 0;
153 
154     fd->index = calloc((fd->index_sz = 1), sizeof(*fd->index));
155     if (!fd->index)
156 	return -1;
157 
158     idx = &fd->index[0];
159     idx->refid = -1;
160     idx->start = INT_MIN;
161     idx->end   = INT_MAX;
162 
163     idx_stack = calloc(++idx_stack_alloc, sizeof(*idx_stack));
164     if (!idx_stack)
165         goto fail;
166 
167     idx_stack[idx_stack_ptr] = idx;
168 
169     if (!fn_idx) {
170 	fn2 = hts_idx_getfn(fn, ".crai");
171 	if (!fn2)
172             goto fail;
173 
174 	fn_idx = fn2;
175     }
176 
177     if (!(fp = hopen(fn_idx, "r"))) {
178 	perror(fn_idx);
179         goto fail;
180     }
181 
182     // Load the file into memory
183     while ((len = hread(fp, buf, sizeof(buf))) > 0) {
184 	if (kputsn(buf, len, &kstr) < 0)
185             goto fail;
186     }
187 
188     if (len < 0 || kstr.l < 2)
189         goto fail;
190 
191     if (hclose(fp) < 0)
192         goto fail;
193 
194     // Uncompress if required
195     if (kstr.s[0] == 31 && (uc)kstr.s[1] == 139) {
196 	size_t l;
197 	char *s = zlib_mem_inflate(kstr.s, kstr.l, &l);
198 	if (!s)
199             goto fail;
200 
201 	free(kstr.s);
202 	kstr.s = s;
203 	kstr.l = l;
204 	kstr.m = l; // conservative estimate of the size allocated
205 	if (kputsn("", 0, &kstr) < 0) // ensure kstr.s is NUL-terminated
206             goto fail;
207     }
208 
209 
210     // Parse it line at a time
211     do {
212 	/* 1.1 layout */
213 	if (kget_int32(&kstr, &pos, &e.refid) == -1)
214             goto fail;
215 
216 	if (kget_int32(&kstr, &pos, &e.start) == -1)
217             goto fail;
218 
219 	if (kget_int32(&kstr, &pos, &e.end) == -1)
220             goto fail;
221 
222 	if (kget_int64(&kstr, &pos, &e.offset) == -1)
223             goto fail;
224 
225 	if (kget_int32(&kstr, &pos, &e.slice) == -1)
226             goto fail;
227 
228 	if (kget_int32(&kstr, &pos, &e.len) == -1)
229             goto fail;
230 
231 	e.end += e.start-1;
232 	//printf("%d/%d..%d\n", e.refid, e.start, e.end);
233 
234 	if (e.refid < -1) {
235 	    fprintf(stderr, "Malformed index file, refid %d\n", e.refid);
236             goto fail;
237 	}
238 
239 	if (e.refid != idx->refid) {
240 	    if (fd->index_sz < e.refid+2) {
241                 cram_index *new_idx;
242                 int new_sz = e.refid+2;
243 		size_t index_end = fd->index_sz * sizeof(*fd->index);
244                 new_idx = realloc(fd->index,
245                                   new_sz * sizeof(*fd->index));
246                 if (!new_idx)
247                     goto fail;
248 
249                 fd->index = new_idx;
250                 fd->index_sz = new_sz;
251 		memset(((char *)fd->index) + index_end, 0,
252 		       fd->index_sz * sizeof(*fd->index) - index_end);
253 	    }
254 	    idx = &fd->index[e.refid+1];
255 	    idx->refid = e.refid;
256 	    idx->start = INT_MIN;
257 	    idx->end   = INT_MAX;
258 	    idx->nslice = idx->nalloc = 0;
259 	    idx->e = NULL;
260 	    idx_stack[(idx_stack_ptr = 0)] = idx;
261 	}
262 
263 	while (!(e.start >= idx->start && e.end <= idx->end) || idx->end == 0) {
264 	    idx = idx_stack[--idx_stack_ptr];
265 	}
266 
267 	// Now contains, so append
268 	if (idx->nslice+1 >= idx->nalloc) {
269             cram_index *new_e;
270 	    idx->nalloc = idx->nalloc ? idx->nalloc*2 : 16;
271 	    new_e = realloc(idx->e, idx->nalloc * sizeof(*idx->e));
272             if (!new_e)
273                 goto fail;
274 
275             idx->e = new_e;
276 	}
277 
278 	e.nalloc = e.nslice = 0; e.e = NULL;
279 	*(ep = &idx->e[idx->nslice++]) = e;
280 	idx = ep;
281 
282 	if (++idx_stack_ptr >= idx_stack_alloc) {
283             cram_index **new_stack;
284 	    idx_stack_alloc *= 2;
285 	    new_stack = realloc(idx_stack, idx_stack_alloc*sizeof(*idx_stack));
286             if (!new_stack)
287                 goto fail;
288             idx_stack = new_stack;
289 	}
290 	idx_stack[idx_stack_ptr] = idx;
291 
292 	while (pos < kstr.l && kstr.s[pos] != '\n')
293 	    pos++;
294 	pos++;
295     } while (pos < kstr.l);
296 
297     free(idx_stack);
298     free(kstr.s);
299     free(fn2);
300 
301     // dump_index(fd);
302 
303     return 0;
304 
305  fail:
306     free(kstr.s);
307     free(idx_stack);
308     free(fn2);
309     cram_index_free(fd); // Also sets fd->index = NULL
310     return -1;
311 }
312 
cram_index_free_recurse(cram_index * e)313 static void cram_index_free_recurse(cram_index *e) {
314     if (e->e) {
315 	int i;
316 	for (i = 0; i < e->nslice; i++) {
317 	    cram_index_free_recurse(&e->e[i]);
318 	}
319 	free(e->e);
320     }
321 }
322 
cram_index_free(cram_fd * fd)323 void cram_index_free(cram_fd *fd) {
324     int i;
325 
326     if (!fd->index)
327 	return;
328 
329     for (i = 0; i < fd->index_sz; i++) {
330 	cram_index_free_recurse(&fd->index[i]);
331     }
332     free(fd->index);
333 
334     fd->index = NULL;
335 }
336 
337 /*
338  * Searches the index for the first slice overlapping a reference ID
339  * and position, or one immediately preceding it if none is found in
340  * the index to overlap this position. (Our index may have missing
341  * entries, but we require at least one per reference.)
342  *
343  * If the index finds multiple slices overlapping this position we
344  * return the first one only. Subsequent calls should specifying
345  * "from" as the last slice we checked to find the next one. Otherwise
346  * set "from" to be NULL to find the first one.
347  *
348  * Returns the cram_index pointer on sucess
349  *         NULL on failure
350  */
cram_index_query(cram_fd * fd,int refid,int pos,cram_index * from)351 cram_index *cram_index_query(cram_fd *fd, int refid, int pos,
352 			     cram_index *from) {
353     int i, j, k;
354     cram_index *e;
355 
356     if (refid+1 < 0 || refid+1 >= fd->index_sz)
357 	return NULL;
358 
359     if (!from)
360 	from = &fd->index[refid+1];
361 
362     // Ref with nothing aligned against it.
363     if (!from->e)
364 	return NULL;
365 
366     // This sequence is covered by the index, so binary search to find
367     // the optimal starting block.
368     i = 0, j = fd->index[refid+1].nslice-1;
369     for (k = j/2; k != i; k = (j-i)/2 + i) {
370 	if (from->e[k].refid > refid) {
371 	    j = k;
372 	    continue;
373 	}
374 
375 	if (from->e[k].refid < refid) {
376 	    i = k;
377 	    continue;
378 	}
379 
380 	if (from->e[k].start >= pos) {
381 	    j = k;
382 	    continue;
383 	}
384 
385 	if (from->e[k].start < pos) {
386 	    i = k;
387 	    continue;
388 	}
389     }
390     // i==j or i==j-1. Check if j is better.
391     if (j >= 0 && from->e[j].start < pos && from->e[j].refid == refid)
392 	i = j;
393 
394     /* The above found *a* bin overlapping, but not necessarily the first */
395     while (i > 0 && from->e[i-1].end >= pos)
396 	i--;
397 
398     /* We may be one bin before the optimum, so check */
399     while (i+1 < from->nslice &&
400 	   (from->e[i].refid < refid ||
401 	    from->e[i].end < pos))
402 	i++;
403 
404     e = &from->e[i];
405 
406     return e;
407 }
408 
409 
410 /*
411  * Skips to a container overlapping the start coordinate listed in
412  * cram_range.
413  *
414  * In theory we call cram_index_query multiple times, once per slice
415  * overlapping the range. However slices may be absent from the index
416  * which makes this problematic. Instead we find the left-most slice
417  * and then read from then on, skipping decoding of slices and/or
418  * whole containers when they don't overlap the specified cram_range.
419  *
420  * Returns 0 on success
421  *        -1 on general failure
422  *        -2 on no-data (empty chromosome)
423  */
cram_seek_to_refpos(cram_fd * fd,cram_range * r)424 int cram_seek_to_refpos(cram_fd *fd, cram_range *r) {
425     cram_index *e;
426 
427     // Ideally use an index, so see if we have one.
428     if ((e = cram_index_query(fd, r->refid, r->start, NULL))) {
429 	if (0 != cram_seek(fd, e->offset, SEEK_SET))
430 	    if (0 != cram_seek(fd, e->offset - fd->first_container, SEEK_CUR))
431 		return -1;
432     } else {
433 	// Absent from index, but this most likely means it simply has no data.
434 	return -2;
435     }
436 
437     if (fd->ctr) {
438 	cram_free_container(fd->ctr);
439 	fd->ctr = NULL;
440 	fd->ooc = 0;
441     }
442 
443     return 0;
444 }
445 
446 
447 /*
448  * A specialised form of cram_index_build (below) that deals with slices
449  * having multiple references in this (ref_id -2). In this scenario we
450  * decode the slice to look at the RI data series instead.
451  *
452  * Returns 0 on success
453  *        -1 on failure
454  */
cram_index_build_multiref(cram_fd * fd,cram_container * c,cram_slice * s,zfp * fp,off_t cpos,int32_t landmark,int sz)455 static int cram_index_build_multiref(cram_fd *fd,
456 				     cram_container *c,
457 				     cram_slice *s,
458 				     zfp *fp,
459 				     off_t cpos,
460 				     int32_t landmark,
461 				     int sz) {
462     int i, ref = -2, ref_start = 0, ref_end;
463     char buf[1024];
464 
465     if (0 != cram_decode_slice(fd, c, s, fd->header))
466 	return -1;
467 
468     ref_end = INT_MIN;
469     for (i = 0; i < s->hdr->num_records; i++) {
470 	if (s->crecs[i].ref_id == ref) {
471 	    if (ref_end < s->crecs[i].aend)
472 		ref_end = s->crecs[i].aend;
473 	    continue;
474 	}
475 
476 	if (ref != -2) {
477 	    sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
478 		    ref, ref_start, ref_end - ref_start + 1,
479 		    (int64_t)cpos, landmark, sz);
480 	    zfputs(buf, fp);
481 	}
482 
483 	ref = s->crecs[i].ref_id;
484 	ref_start = s->crecs[i].apos;
485 	ref_end = INT_MIN;
486     }
487 
488     if (ref != -2) {
489 	sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
490 		ref, ref_start, ref_end - ref_start + 1,
491 		(int64_t)cpos, landmark, sz);
492 	zfputs(buf, fp);
493     }
494 
495     return 0;
496 }
497 
498 /*
499  * Builds an index file.
500  *
501  * fd is a newly opened cram file that we wish to index.
502  * fn_base is the filename of the associated CRAM file.
503  * fn_idx is the filename of the index file to be written;
504  * if NULL, we add ".crai" to fn_base to get the index filename.
505  *
506  * Returns 0 on success,
507  *         negative on failure (-1 for read failure, -4 for write failure)
508  */
cram_index_build(cram_fd * fd,const char * fn_base,const char * fn_idx)509 int cram_index_build(cram_fd *fd, const char *fn_base, const char *fn_idx) {
510     cram_container *c;
511     off_t cpos, spos, hpos;
512     zfp *fp;
513     kstring_t fn_idx_str = {0};
514 
515     if (! fn_idx) {
516         kputs(fn_base, &fn_idx_str);
517         kputs(".crai", &fn_idx_str);
518         fn_idx = fn_idx_str.s;
519     }
520 
521     if (!(fp = zfopen(fn_idx, "wz"))) {
522         perror(fn_idx);
523         free(fn_idx_str.s);
524         return -4;
525     }
526 
527     free(fn_idx_str.s);
528 
529     cpos = htell(fd->fp);
530     while ((c = cram_read_container(fd))) {
531         int j;
532 
533         if (fd->err) {
534             perror("Cram container read");
535             return -1;
536         }
537 
538         hpos = htell(fd->fp);
539 
540         if (!(c->comp_hdr_block = cram_read_block(fd)))
541             return -1;
542         assert(c->comp_hdr_block->content_type == COMPRESSION_HEADER);
543 
544         c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
545         if (!c->comp_hdr)
546             return -1;
547 
548         // 2.0 format
549         for (j = 0; j < c->num_landmarks; j++) {
550             char buf[1024];
551             cram_slice *s;
552             int sz;
553 
554             spos = htell(fd->fp);
555             assert(spos - cpos - c->offset == c->landmark[j]);
556 
557             if (!(s = cram_read_slice(fd))) {
558 		zfclose(fp);
559 		return -1;
560 	    }
561 
562             sz = (int)(htell(fd->fp) - spos);
563 
564 	    if (s->hdr->ref_seq_id == -2) {
565 		cram_index_build_multiref(fd, c, s, fp,
566 					  cpos, c->landmark[j], sz);
567 	    } else {
568 		sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
569 			s->hdr->ref_seq_id, s->hdr->ref_seq_start,
570 			s->hdr->ref_seq_span, (int64_t)cpos,
571 			c->landmark[j], sz);
572 		zfputs(buf, fp);
573 	    }
574 
575             cram_free_slice(s);
576         }
577 
578         cpos = htell(fd->fp);
579         assert(cpos == hpos + c->length);
580 
581         cram_free_container(c);
582     }
583     if (fd->err) {
584 	zfclose(fp);
585 	return -1;
586     }
587 
588 
589     return (zfclose(fp) >= 0)? 0 : -4;
590 }
591