1 /*
2 Copyright (c) 2013-2014 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 /*
32 * The index is a gzipped tab-delimited text file with one line per slice.
33 * The columns are:
34 * 1: reference number (0 to N-1, as per BAM ref_id)
35 * 2: reference position of 1st read in slice (1..?)
36 * 3: number of reads in slice
37 * 4: offset of container start (relative to end of SAM header, so 1st
38 * container is offset 0).
39 * 5: slice number within container (ie which landmark).
40 *
41 * In memory, we hold this in a nested containment list. Each list element is
42 * a cram_index struct. Each element in turn can contain its own list of
43 * cram_index structs.
44 *
45 * Any start..end range which is entirely contained within another (and
46 * earlier as it is sorted) range will be held within it. This ensures that
47 * the outer list will never have containments and we can safely do a
48 * binary search to find the first range which overlaps any given coordinate.
49 */
50
51 #include <config.h>
52
53 #include <stdio.h>
54 #include <errno.h>
55 #include <assert.h>
56 #include <stdlib.h>
57 #include <string.h>
58 #include <zlib.h>
59 #include <sys/types.h>
60 #include <sys/stat.h>
61 #include <math.h>
62
63 #include "htslib/hfile.h"
64 #include "hts_internal.h"
65 #include "cram/cram.h"
66 #include "cram/os.h"
67 #include "cram/zfio.h"
68
69 #if 0
70 static void dump_index_(cram_index *e, int level) {
71 int i, n;
72 n = printf("%*s%d / %d .. %d, ", level*4, "", e->refid, e->start, e->end);
73 printf("%*soffset %"PRId64"\n", MAX(0,50-n), "", e->offset);
74 for (i = 0; i < e->nslice; i++) {
75 dump_index_(&e->e[i], level+1);
76 }
77 }
78
79 static void dump_index(cram_fd *fd) {
80 int i;
81 for (i = 0; i < fd->index_sz; i++) {
82 dump_index_(&fd->index[i], 0);
83 }
84 }
85 #endif
86
kget_int32(kstring_t * k,size_t * pos,int32_t * val_p)87 static int kget_int32(kstring_t *k, size_t *pos, int32_t *val_p) {
88 int sign = 1;
89 int32_t val = 0;
90 size_t p = *pos;
91
92 while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
93 p++;
94
95 if (p < k->l && k->s[p] == '-')
96 sign = -1, p++;
97
98 if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
99 return -1;
100
101 while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9')
102 val = val*10 + k->s[p++]-'0';
103
104 *pos = p;
105 *val_p = sign*val;
106
107 return 0;
108 }
109
kget_int64(kstring_t * k,size_t * pos,int64_t * val_p)110 static int kget_int64(kstring_t *k, size_t *pos, int64_t *val_p) {
111 int sign = 1;
112 int64_t val = 0;
113 size_t p = *pos;
114
115 while (p < k->l && (k->s[p] == ' ' || k->s[p] == '\t'))
116 p++;
117
118 if (p < k->l && k->s[p] == '-')
119 sign = -1, p++;
120
121 if (p >= k->l || !(k->s[p] >= '0' && k->s[p] <= '9'))
122 return -1;
123
124 while (p < k->l && k->s[p] >= '0' && k->s[p] <= '9')
125 val = val*10 + k->s[p++]-'0';
126
127 *pos = p;
128 *val_p = sign*val;
129
130 return 0;
131 }
132
133 /*
134 * Loads a CRAM .crai index into memory.
135 *
136 * Returns 0 for success
137 * -1 for failure
138 */
cram_index_load(cram_fd * fd,const char * fn,const char * fn_idx)139 int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx) {
140 char *fn2 = NULL;
141 char buf[65536];
142 ssize_t len;
143 kstring_t kstr = {0};
144 hFILE *fp;
145 cram_index *idx;
146 cram_index **idx_stack = NULL, *ep, e;
147 int idx_stack_alloc = 0, idx_stack_ptr = 0;
148 size_t pos = 0;
149
150 /* Check if already loaded */
151 if (fd->index)
152 return 0;
153
154 fd->index = calloc((fd->index_sz = 1), sizeof(*fd->index));
155 if (!fd->index)
156 return -1;
157
158 idx = &fd->index[0];
159 idx->refid = -1;
160 idx->start = INT_MIN;
161 idx->end = INT_MAX;
162
163 idx_stack = calloc(++idx_stack_alloc, sizeof(*idx_stack));
164 if (!idx_stack)
165 goto fail;
166
167 idx_stack[idx_stack_ptr] = idx;
168
169 if (!fn_idx) {
170 fn2 = hts_idx_getfn(fn, ".crai");
171 if (!fn2)
172 goto fail;
173
174 fn_idx = fn2;
175 }
176
177 if (!(fp = hopen(fn_idx, "r"))) {
178 perror(fn_idx);
179 goto fail;
180 }
181
182 // Load the file into memory
183 while ((len = hread(fp, buf, sizeof(buf))) > 0) {
184 if (kputsn(buf, len, &kstr) < 0)
185 goto fail;
186 }
187
188 if (len < 0 || kstr.l < 2)
189 goto fail;
190
191 if (hclose(fp) < 0)
192 goto fail;
193
194 // Uncompress if required
195 if (kstr.s[0] == 31 && (uc)kstr.s[1] == 139) {
196 size_t l;
197 char *s = zlib_mem_inflate(kstr.s, kstr.l, &l);
198 if (!s)
199 goto fail;
200
201 free(kstr.s);
202 kstr.s = s;
203 kstr.l = l;
204 kstr.m = l; // conservative estimate of the size allocated
205 if (kputsn("", 0, &kstr) < 0) // ensure kstr.s is NUL-terminated
206 goto fail;
207 }
208
209
210 // Parse it line at a time
211 do {
212 /* 1.1 layout */
213 if (kget_int32(&kstr, &pos, &e.refid) == -1)
214 goto fail;
215
216 if (kget_int32(&kstr, &pos, &e.start) == -1)
217 goto fail;
218
219 if (kget_int32(&kstr, &pos, &e.end) == -1)
220 goto fail;
221
222 if (kget_int64(&kstr, &pos, &e.offset) == -1)
223 goto fail;
224
225 if (kget_int32(&kstr, &pos, &e.slice) == -1)
226 goto fail;
227
228 if (kget_int32(&kstr, &pos, &e.len) == -1)
229 goto fail;
230
231 e.end += e.start-1;
232 //printf("%d/%d..%d\n", e.refid, e.start, e.end);
233
234 if (e.refid < -1) {
235 fprintf(stderr, "Malformed index file, refid %d\n", e.refid);
236 goto fail;
237 }
238
239 if (e.refid != idx->refid) {
240 if (fd->index_sz < e.refid+2) {
241 cram_index *new_idx;
242 int new_sz = e.refid+2;
243 size_t index_end = fd->index_sz * sizeof(*fd->index);
244 new_idx = realloc(fd->index,
245 new_sz * sizeof(*fd->index));
246 if (!new_idx)
247 goto fail;
248
249 fd->index = new_idx;
250 fd->index_sz = new_sz;
251 memset(((char *)fd->index) + index_end, 0,
252 fd->index_sz * sizeof(*fd->index) - index_end);
253 }
254 idx = &fd->index[e.refid+1];
255 idx->refid = e.refid;
256 idx->start = INT_MIN;
257 idx->end = INT_MAX;
258 idx->nslice = idx->nalloc = 0;
259 idx->e = NULL;
260 idx_stack[(idx_stack_ptr = 0)] = idx;
261 }
262
263 while (!(e.start >= idx->start && e.end <= idx->end) || idx->end == 0) {
264 idx = idx_stack[--idx_stack_ptr];
265 }
266
267 // Now contains, so append
268 if (idx->nslice+1 >= idx->nalloc) {
269 cram_index *new_e;
270 idx->nalloc = idx->nalloc ? idx->nalloc*2 : 16;
271 new_e = realloc(idx->e, idx->nalloc * sizeof(*idx->e));
272 if (!new_e)
273 goto fail;
274
275 idx->e = new_e;
276 }
277
278 e.nalloc = e.nslice = 0; e.e = NULL;
279 *(ep = &idx->e[idx->nslice++]) = e;
280 idx = ep;
281
282 if (++idx_stack_ptr >= idx_stack_alloc) {
283 cram_index **new_stack;
284 idx_stack_alloc *= 2;
285 new_stack = realloc(idx_stack, idx_stack_alloc*sizeof(*idx_stack));
286 if (!new_stack)
287 goto fail;
288 idx_stack = new_stack;
289 }
290 idx_stack[idx_stack_ptr] = idx;
291
292 while (pos < kstr.l && kstr.s[pos] != '\n')
293 pos++;
294 pos++;
295 } while (pos < kstr.l);
296
297 free(idx_stack);
298 free(kstr.s);
299 free(fn2);
300
301 // dump_index(fd);
302
303 return 0;
304
305 fail:
306 free(kstr.s);
307 free(idx_stack);
308 free(fn2);
309 cram_index_free(fd); // Also sets fd->index = NULL
310 return -1;
311 }
312
cram_index_free_recurse(cram_index * e)313 static void cram_index_free_recurse(cram_index *e) {
314 if (e->e) {
315 int i;
316 for (i = 0; i < e->nslice; i++) {
317 cram_index_free_recurse(&e->e[i]);
318 }
319 free(e->e);
320 }
321 }
322
cram_index_free(cram_fd * fd)323 void cram_index_free(cram_fd *fd) {
324 int i;
325
326 if (!fd->index)
327 return;
328
329 for (i = 0; i < fd->index_sz; i++) {
330 cram_index_free_recurse(&fd->index[i]);
331 }
332 free(fd->index);
333
334 fd->index = NULL;
335 }
336
337 /*
338 * Searches the index for the first slice overlapping a reference ID
339 * and position, or one immediately preceding it if none is found in
340 * the index to overlap this position. (Our index may have missing
341 * entries, but we require at least one per reference.)
342 *
343 * If the index finds multiple slices overlapping this position we
344 * return the first one only. Subsequent calls should specifying
345 * "from" as the last slice we checked to find the next one. Otherwise
346 * set "from" to be NULL to find the first one.
347 *
348 * Returns the cram_index pointer on sucess
349 * NULL on failure
350 */
cram_index_query(cram_fd * fd,int refid,int pos,cram_index * from)351 cram_index *cram_index_query(cram_fd *fd, int refid, int pos,
352 cram_index *from) {
353 int i, j, k;
354 cram_index *e;
355
356 if (refid+1 < 0 || refid+1 >= fd->index_sz)
357 return NULL;
358
359 if (!from)
360 from = &fd->index[refid+1];
361
362 // Ref with nothing aligned against it.
363 if (!from->e)
364 return NULL;
365
366 // This sequence is covered by the index, so binary search to find
367 // the optimal starting block.
368 i = 0, j = fd->index[refid+1].nslice-1;
369 for (k = j/2; k != i; k = (j-i)/2 + i) {
370 if (from->e[k].refid > refid) {
371 j = k;
372 continue;
373 }
374
375 if (from->e[k].refid < refid) {
376 i = k;
377 continue;
378 }
379
380 if (from->e[k].start >= pos) {
381 j = k;
382 continue;
383 }
384
385 if (from->e[k].start < pos) {
386 i = k;
387 continue;
388 }
389 }
390 // i==j or i==j-1. Check if j is better.
391 if (j >= 0 && from->e[j].start < pos && from->e[j].refid == refid)
392 i = j;
393
394 /* The above found *a* bin overlapping, but not necessarily the first */
395 while (i > 0 && from->e[i-1].end >= pos)
396 i--;
397
398 /* We may be one bin before the optimum, so check */
399 while (i+1 < from->nslice &&
400 (from->e[i].refid < refid ||
401 from->e[i].end < pos))
402 i++;
403
404 e = &from->e[i];
405
406 return e;
407 }
408
409
410 /*
411 * Skips to a container overlapping the start coordinate listed in
412 * cram_range.
413 *
414 * In theory we call cram_index_query multiple times, once per slice
415 * overlapping the range. However slices may be absent from the index
416 * which makes this problematic. Instead we find the left-most slice
417 * and then read from then on, skipping decoding of slices and/or
418 * whole containers when they don't overlap the specified cram_range.
419 *
420 * Returns 0 on success
421 * -1 on general failure
422 * -2 on no-data (empty chromosome)
423 */
cram_seek_to_refpos(cram_fd * fd,cram_range * r)424 int cram_seek_to_refpos(cram_fd *fd, cram_range *r) {
425 cram_index *e;
426
427 // Ideally use an index, so see if we have one.
428 if ((e = cram_index_query(fd, r->refid, r->start, NULL))) {
429 if (0 != cram_seek(fd, e->offset, SEEK_SET))
430 if (0 != cram_seek(fd, e->offset - fd->first_container, SEEK_CUR))
431 return -1;
432 } else {
433 // Absent from index, but this most likely means it simply has no data.
434 return -2;
435 }
436
437 if (fd->ctr) {
438 cram_free_container(fd->ctr);
439 fd->ctr = NULL;
440 fd->ooc = 0;
441 }
442
443 return 0;
444 }
445
446
447 /*
448 * A specialised form of cram_index_build (below) that deals with slices
449 * having multiple references in this (ref_id -2). In this scenario we
450 * decode the slice to look at the RI data series instead.
451 *
452 * Returns 0 on success
453 * -1 on failure
454 */
cram_index_build_multiref(cram_fd * fd,cram_container * c,cram_slice * s,zfp * fp,off_t cpos,int32_t landmark,int sz)455 static int cram_index_build_multiref(cram_fd *fd,
456 cram_container *c,
457 cram_slice *s,
458 zfp *fp,
459 off_t cpos,
460 int32_t landmark,
461 int sz) {
462 int i, ref = -2, ref_start = 0, ref_end;
463 char buf[1024];
464
465 if (0 != cram_decode_slice(fd, c, s, fd->header))
466 return -1;
467
468 ref_end = INT_MIN;
469 for (i = 0; i < s->hdr->num_records; i++) {
470 if (s->crecs[i].ref_id == ref) {
471 if (ref_end < s->crecs[i].aend)
472 ref_end = s->crecs[i].aend;
473 continue;
474 }
475
476 if (ref != -2) {
477 sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
478 ref, ref_start, ref_end - ref_start + 1,
479 (int64_t)cpos, landmark, sz);
480 zfputs(buf, fp);
481 }
482
483 ref = s->crecs[i].ref_id;
484 ref_start = s->crecs[i].apos;
485 ref_end = INT_MIN;
486 }
487
488 if (ref != -2) {
489 sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
490 ref, ref_start, ref_end - ref_start + 1,
491 (int64_t)cpos, landmark, sz);
492 zfputs(buf, fp);
493 }
494
495 return 0;
496 }
497
498 /*
499 * Builds an index file.
500 *
501 * fd is a newly opened cram file that we wish to index.
502 * fn_base is the filename of the associated CRAM file.
503 * fn_idx is the filename of the index file to be written;
504 * if NULL, we add ".crai" to fn_base to get the index filename.
505 *
506 * Returns 0 on success,
507 * negative on failure (-1 for read failure, -4 for write failure)
508 */
cram_index_build(cram_fd * fd,const char * fn_base,const char * fn_idx)509 int cram_index_build(cram_fd *fd, const char *fn_base, const char *fn_idx) {
510 cram_container *c;
511 off_t cpos, spos, hpos;
512 zfp *fp;
513 kstring_t fn_idx_str = {0};
514
515 if (! fn_idx) {
516 kputs(fn_base, &fn_idx_str);
517 kputs(".crai", &fn_idx_str);
518 fn_idx = fn_idx_str.s;
519 }
520
521 if (!(fp = zfopen(fn_idx, "wz"))) {
522 perror(fn_idx);
523 free(fn_idx_str.s);
524 return -4;
525 }
526
527 free(fn_idx_str.s);
528
529 cpos = htell(fd->fp);
530 while ((c = cram_read_container(fd))) {
531 int j;
532
533 if (fd->err) {
534 perror("Cram container read");
535 return -1;
536 }
537
538 hpos = htell(fd->fp);
539
540 if (!(c->comp_hdr_block = cram_read_block(fd)))
541 return -1;
542 assert(c->comp_hdr_block->content_type == COMPRESSION_HEADER);
543
544 c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
545 if (!c->comp_hdr)
546 return -1;
547
548 // 2.0 format
549 for (j = 0; j < c->num_landmarks; j++) {
550 char buf[1024];
551 cram_slice *s;
552 int sz;
553
554 spos = htell(fd->fp);
555 assert(spos - cpos - c->offset == c->landmark[j]);
556
557 if (!(s = cram_read_slice(fd))) {
558 zfclose(fp);
559 return -1;
560 }
561
562 sz = (int)(htell(fd->fp) - spos);
563
564 if (s->hdr->ref_seq_id == -2) {
565 cram_index_build_multiref(fd, c, s, fp,
566 cpos, c->landmark[j], sz);
567 } else {
568 sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
569 s->hdr->ref_seq_id, s->hdr->ref_seq_start,
570 s->hdr->ref_seq_span, (int64_t)cpos,
571 c->landmark[j], sz);
572 zfputs(buf, fp);
573 }
574
575 cram_free_slice(s);
576 }
577
578 cpos = htell(fd->fp);
579 assert(cpos == hpos + c->length);
580
581 cram_free_container(c);
582 }
583 if (fd->err) {
584 zfclose(fp);
585 return -1;
586 }
587
588
589 return (zfclose(fp) >= 0)? 0 : -4;
590 }
591