1 /*
2 Copyright (c) 2012-2013 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 #include <config.h>
32
33 #include <stdio.h>
34 #include <errno.h>
35 #include <assert.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <zlib.h>
39 #include <sys/types.h>
40 #include <sys/stat.h>
41 #include <math.h>
42
43 #include "cram/cram.h"
44 #include "cram/os.h"
45 #include "htslib/hts.h"
46 #include "htslib/hts_endian.h"
47
48 KHASH_MAP_INIT_STR(m_s2u64, uint64_t)
49
50 #define Z_CRAM_STRAT Z_FILTERED
51 //#define Z_CRAM_STRAT Z_RLE
52 //#define Z_CRAM_STRAT Z_HUFFMAN_ONLY
53 //#define Z_CRAM_STRAT Z_DEFAULT_STRATEGY
54
55 static int process_one_read(cram_fd *fd, cram_container *c,
56 cram_slice *s, cram_record *cr,
57 bam_seq_t *b, int rnum);
58
59 /*
60 * Returns index of val into key.
61 * Basically strchr(key, val)-key;
62 */
sub_idx(char * key,char val)63 static int sub_idx(char *key, char val) {
64 int i;
65
66 for (i = 0; *key && *key++ != val; i++);
67 return i;
68 }
69
70 /*
71 * Encodes a compression header block into a generic cram_block structure.
72 *
73 * Returns cram_block ptr on success
74 * NULL on failure
75 */
cram_encode_compression_header(cram_fd * fd,cram_container * c,cram_block_compression_hdr * h)76 cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
77 cram_block_compression_hdr *h) {
78 cram_block *cb = cram_new_block(COMPRESSION_HEADER, 0);
79 cram_block *map = cram_new_block(COMPRESSION_HEADER, 0);
80 int i, mc;
81
82 if (!cb || !map)
83 return NULL;
84
85 /*
86 * This is a concatenation of several blocks of data:
87 * header + landmarks, preservation map, read encoding map, and the tag
88 * encoding map.
89 * All 4 are variable sized and we need to know how large these are
90 * before creating the compression header itself as this starts with
91 * the total size (stored as a variable length string).
92 */
93
94 // Duplicated from container itself, and removed in 1.1
95 if (CRAM_MAJOR_VERS(fd->version) == 1) {
96 itf8_put_blk(cb, h->ref_seq_id);
97 itf8_put_blk(cb, h->ref_seq_start);
98 itf8_put_blk(cb, h->ref_seq_span);
99 itf8_put_blk(cb, h->num_records);
100 itf8_put_blk(cb, h->num_landmarks);
101 for (i = 0; i < h->num_landmarks; i++) {
102 itf8_put_blk(cb, h->landmark[i]);
103 }
104 }
105
106 if (h->preservation_map)
107 kh_destroy(map, h->preservation_map);
108
109 /* Create in-memory preservation map */
110 /* FIXME: should create this when we create the container */
111 {
112 khint_t k;
113 int r;
114
115 if (!(h->preservation_map = kh_init(map)))
116 return NULL;
117
118 k = kh_put(map, h->preservation_map, "RN", &r);
119 if (-1 == r) return NULL;
120 kh_val(h->preservation_map, k).i = !fd->lossy_read_names;
121
122 if (CRAM_MAJOR_VERS(fd->version) == 1) {
123 k = kh_put(map, h->preservation_map, "PI", &r);
124 if (-1 == r) return NULL;
125 kh_val(h->preservation_map, k).i = 0;
126
127 k = kh_put(map, h->preservation_map, "UI", &r);
128 if (-1 == r) return NULL;
129 kh_val(h->preservation_map, k).i = 1;
130
131 k = kh_put(map, h->preservation_map, "MI", &r);
132 if (-1 == r) return NULL;
133 kh_val(h->preservation_map, k).i = 1;
134
135 } else {
136 // Technically SM was in 1.0, but wasn't in Java impl.
137 k = kh_put(map, h->preservation_map, "SM", &r);
138 if (-1 == r) return NULL;
139 kh_val(h->preservation_map, k).i = 0;
140
141 k = kh_put(map, h->preservation_map, "TD", &r);
142 if (-1 == r) return NULL;
143 kh_val(h->preservation_map, k).i = 0;
144
145 k = kh_put(map, h->preservation_map, "AP", &r);
146 if (-1 == r) return NULL;
147 kh_val(h->preservation_map, k).i = h->AP_delta;
148
149 if (fd->no_ref || fd->embed_ref) {
150 // Reference Required == No
151 k = kh_put(map, h->preservation_map, "RR", &r);
152 if (-1 == r) return NULL;
153 kh_val(h->preservation_map, k).i = 0;
154 }
155 }
156 }
157
158 /* Encode preservation map; could collapse this and above into one */
159 mc = 0;
160 BLOCK_SIZE(map) = 0;
161 if (h->preservation_map) {
162 khint_t k;
163
164 for (k = kh_begin(h->preservation_map);
165 k != kh_end(h->preservation_map);
166 k++) {
167 const char *key;
168 khash_t(map) *pmap = h->preservation_map;
169
170
171 if (!kh_exist(pmap, k))
172 continue;
173
174 key = kh_key(pmap, k);
175 BLOCK_APPEND(map, key, 2);
176
177 switch(CRAM_KEY(key[0], key[1])) {
178 case CRAM_KEY('M','I'):
179 BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
180 break;
181
182 case CRAM_KEY('U','I'):
183 BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
184 break;
185
186 case CRAM_KEY('P','I'):
187 BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
188 break;
189
190 case CRAM_KEY('A','P'):
191 BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
192 break;
193
194 case CRAM_KEY('R','N'):
195 BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
196 break;
197
198 case CRAM_KEY('R','R'):
199 BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
200 break;
201
202 case CRAM_KEY('S','M'): {
203 char smat[5], *mp = smat;
204 *mp++ =
205 (sub_idx("CGTN", h->substitution_matrix[0][0]) << 6) |
206 (sub_idx("CGTN", h->substitution_matrix[0][1]) << 4) |
207 (sub_idx("CGTN", h->substitution_matrix[0][2]) << 2) |
208 (sub_idx("CGTN", h->substitution_matrix[0][3]) << 0);
209 *mp++ =
210 (sub_idx("AGTN", h->substitution_matrix[1][0]) << 6) |
211 (sub_idx("AGTN", h->substitution_matrix[1][1]) << 4) |
212 (sub_idx("AGTN", h->substitution_matrix[1][2]) << 2) |
213 (sub_idx("AGTN", h->substitution_matrix[1][3]) << 0);
214 *mp++ =
215 (sub_idx("ACTN", h->substitution_matrix[2][0]) << 6) |
216 (sub_idx("ACTN", h->substitution_matrix[2][1]) << 4) |
217 (sub_idx("ACTN", h->substitution_matrix[2][2]) << 2) |
218 (sub_idx("ACTN", h->substitution_matrix[2][3]) << 0);
219 *mp++ =
220 (sub_idx("ACGN", h->substitution_matrix[3][0]) << 6) |
221 (sub_idx("ACGN", h->substitution_matrix[3][1]) << 4) |
222 (sub_idx("ACGN", h->substitution_matrix[3][2]) << 2) |
223 (sub_idx("ACGN", h->substitution_matrix[3][3]) << 0);
224 *mp++ =
225 (sub_idx("ACGT", h->substitution_matrix[4][0]) << 6) |
226 (sub_idx("ACGT", h->substitution_matrix[4][1]) << 4) |
227 (sub_idx("ACGT", h->substitution_matrix[4][2]) << 2) |
228 (sub_idx("ACGT", h->substitution_matrix[4][3]) << 0);
229 BLOCK_APPEND(map, smat, 5);
230 break;
231 }
232
233 case CRAM_KEY('T','D'): {
234 itf8_put_blk(map, BLOCK_SIZE(h->TD_blk));
235 BLOCK_APPEND(map,
236 BLOCK_DATA(h->TD_blk),
237 BLOCK_SIZE(h->TD_blk));
238 break;
239 }
240
241 default:
242 fprintf(stderr, "Unknown preservation key '%.2s'\n", key);
243 break;
244 }
245
246 mc++;
247 }
248 }
249 itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc));
250 itf8_put_blk(cb, mc);
251 BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
252
253 /* rec encoding map */
254 mc = 0;
255 BLOCK_SIZE(map) = 0;
256 if (h->codecs[DS_BF]) {
257 if (-1 == h->codecs[DS_BF]->store(h->codecs[DS_BF], map, "BF",
258 fd->version))
259 return NULL;
260 mc++;
261 }
262 if (h->codecs[DS_CF]) {
263 if (-1 == h->codecs[DS_CF]->store(h->codecs[DS_CF], map, "CF",
264 fd->version))
265 return NULL;
266 mc++;
267 }
268 if (h->codecs[DS_RL]) {
269 if (-1 == h->codecs[DS_RL]->store(h->codecs[DS_RL], map, "RL",
270 fd->version))
271 return NULL;
272 mc++;
273 }
274 if (h->codecs[DS_AP]) {
275 if (-1 == h->codecs[DS_AP]->store(h->codecs[DS_AP], map, "AP",
276 fd->version))
277 return NULL;
278 mc++;
279 }
280 if (h->codecs[DS_RG]) {
281 if (-1 == h->codecs[DS_RG]->store(h->codecs[DS_RG], map, "RG",
282 fd->version))
283 return NULL;
284 mc++;
285 }
286 if (h->codecs[DS_MF]) {
287 if (-1 == h->codecs[DS_MF]->store(h->codecs[DS_MF], map, "MF",
288 fd->version))
289 return NULL;
290 mc++;
291 }
292 if (h->codecs[DS_NS]) {
293 if (-1 == h->codecs[DS_NS]->store(h->codecs[DS_NS], map, "NS",
294 fd->version))
295 return NULL;
296 mc++;
297 }
298 if (h->codecs[DS_NP]) {
299 if (-1 == h->codecs[DS_NP]->store(h->codecs[DS_NP], map, "NP",
300 fd->version))
301 return NULL;
302 mc++;
303 }
304 if (h->codecs[DS_TS]) {
305 if (-1 == h->codecs[DS_TS]->store(h->codecs[DS_TS], map, "TS",
306 fd->version))
307 return NULL;
308 mc++;
309 }
310 if (h->codecs[DS_NF]) {
311 if (-1 == h->codecs[DS_NF]->store(h->codecs[DS_NF], map, "NF",
312 fd->version))
313 return NULL;
314 mc++;
315 }
316 if (h->codecs[DS_TC]) {
317 if (-1 == h->codecs[DS_TC]->store(h->codecs[DS_TC], map, "TC",
318 fd->version))
319 return NULL;
320 mc++;
321 }
322 if (h->codecs[DS_TN]) {
323 if (-1 == h->codecs[DS_TN]->store(h->codecs[DS_TN], map, "TN",
324 fd->version))
325 return NULL;
326 mc++;
327 }
328 if (h->codecs[DS_TL]) {
329 if (-1 == h->codecs[DS_TL]->store(h->codecs[DS_TL], map, "TL",
330 fd->version))
331 return NULL;
332 mc++;
333 }
334 if (h->codecs[DS_FN]) {
335 if (-1 == h->codecs[DS_FN]->store(h->codecs[DS_FN], map, "FN",
336 fd->version))
337 return NULL;
338 mc++;
339 }
340 if (h->codecs[DS_FC]) {
341 if (-1 == h->codecs[DS_FC]->store(h->codecs[DS_FC], map, "FC",
342 fd->version))
343 return NULL;
344 mc++;
345 }
346 if (h->codecs[DS_FP]) {
347 if (-1 == h->codecs[DS_FP]->store(h->codecs[DS_FP], map, "FP",
348 fd->version))
349 return NULL;
350 mc++;
351 }
352 if (h->codecs[DS_BS]) {
353 if (-1 == h->codecs[DS_BS]->store(h->codecs[DS_BS], map, "BS",
354 fd->version))
355 return NULL;
356 mc++;
357 }
358 if (h->codecs[DS_IN]) {
359 if (-1 == h->codecs[DS_IN]->store(h->codecs[DS_IN], map, "IN",
360 fd->version))
361 return NULL;
362 mc++;
363 }
364 if (h->codecs[DS_DL]) {
365 if (-1 == h->codecs[DS_DL]->store(h->codecs[DS_DL], map, "DL",
366 fd->version))
367 return NULL;
368 mc++;
369 }
370 if (h->codecs[DS_BA]) {
371 if (-1 == h->codecs[DS_BA]->store(h->codecs[DS_BA], map, "BA",
372 fd->version))
373 return NULL;
374 mc++;
375 }
376 if (h->codecs[DS_BB]) {
377 if (-1 == h->codecs[DS_BB]->store(h->codecs[DS_BB], map, "BB",
378 fd->version))
379 return NULL;
380 mc++;
381 }
382 if (h->codecs[DS_MQ]) {
383 if (-1 == h->codecs[DS_MQ]->store(h->codecs[DS_MQ], map, "MQ",
384 fd->version))
385 return NULL;
386 mc++;
387 }
388 if (h->codecs[DS_RN]) {
389 if (-1 == h->codecs[DS_RN]->store(h->codecs[DS_RN], map, "RN",
390 fd->version))
391 return NULL;
392 mc++;
393 }
394 if (h->codecs[DS_QS]) {
395 if (-1 == h->codecs[DS_QS]->store(h->codecs[DS_QS], map, "QS",
396 fd->version))
397 return NULL;
398 mc++;
399 }
400 if (h->codecs[DS_QQ]) {
401 if (-1 == h->codecs[DS_QQ]->store(h->codecs[DS_QQ], map, "QQ",
402 fd->version))
403 return NULL;
404 mc++;
405 }
406 if (h->codecs[DS_RI]) {
407 if (-1 == h->codecs[DS_RI]->store(h->codecs[DS_RI], map, "RI",
408 fd->version))
409 return NULL;
410 mc++;
411 }
412 if (CRAM_MAJOR_VERS(fd->version) != 1) {
413 if (h->codecs[DS_SC]) {
414 if (-1 == h->codecs[DS_SC]->store(h->codecs[DS_SC], map, "SC",
415 fd->version))
416 return NULL;
417 mc++;
418 }
419 if (h->codecs[DS_RS]) {
420 if (-1 == h->codecs[DS_RS]->store(h->codecs[DS_RS], map, "RS",
421 fd->version))
422 return NULL;
423 mc++;
424 }
425 if (h->codecs[DS_PD]) {
426 if (-1 == h->codecs[DS_PD]->store(h->codecs[DS_PD], map, "PD",
427 fd->version))
428 return NULL;
429 mc++;
430 }
431 if (h->codecs[DS_HC]) {
432 if (-1 == h->codecs[DS_HC]->store(h->codecs[DS_HC], map, "HC",
433 fd->version))
434 return NULL;
435 mc++;
436 }
437 }
438 if (h->codecs[DS_TM]) {
439 if (-1 == h->codecs[DS_TM]->store(h->codecs[DS_TM], map, "TM",
440 fd->version))
441 return NULL;
442 mc++;
443 }
444 if (h->codecs[DS_TV]) {
445 if (-1 == h->codecs[DS_TV]->store(h->codecs[DS_TV], map, "TV",
446 fd->version))
447 return NULL;
448 mc++;
449 }
450 itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc));
451 itf8_put_blk(cb, mc);
452 BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
453
454 /* tag encoding map */
455 #if 0
456 mp = map; mc = 0;
457 if (h->tag_encoding_map) {
458 HashItem *hi;
459 HashIter *iter = HashTableIterCreate();
460 if (!iter)
461 return NULL;
462
463 while ((hi = HashTableIterNext(h->tag_encoding_map, iter))) {
464 cram_map *m = hi->data.p;
465 int sz;
466
467 mp += itf8_put(mp, (hi->key[0]<<16)|(hi->key[1]<<8)|hi->key[2]);
468 if (-1 == (sz = m->codec->store(m->codec, mp, NULL, fd->version)))
469 return NULL;
470 mp += sz;
471 mc++;
472 }
473
474 HashTableIterDestroy(iter);
475 }
476 #else
477 mc = 0;
478 BLOCK_SIZE(map) = 0;
479 if (c->tags_used) {
480 khint_t k;
481
482 for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
483 int key;
484 if (!kh_exist(c->tags_used, k))
485 continue;
486
487 key = kh_key(c->tags_used, k);
488 cram_codec *cd = kh_val(c->tags_used, k)->codec;
489
490 itf8_put_blk(map, key);
491 if (-1 == cd->store(cd, map, NULL, fd->version))
492 return NULL;
493
494 mc++;
495 }
496 }
497 #endif
498 itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc));
499 itf8_put_blk(cb, mc);
500 BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
501
502 if (fd->verbose)
503 fprintf(stderr, "Wrote compression block header in %d bytes\n",
504 (int)BLOCK_SIZE(cb));
505
506 BLOCK_UPLEN(cb);
507
508 cram_free_block(map);
509
510 return cb;
511 }
512
513
514 /*
515 * Encodes a slice compression header.
516 *
517 * Returns cram_block on success
518 * NULL on failure
519 */
cram_encode_slice_header(cram_fd * fd,cram_slice * s)520 cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) {
521 char *buf;
522 char *cp;
523 cram_block *b = cram_new_block(MAPPED_SLICE, 0);
524 int j;
525
526 if (!b)
527 return NULL;
528
529 if (NULL == (cp = buf = malloc(16+5*(8+s->hdr->num_blocks)))) {
530 cram_free_block(b);
531 return NULL;
532 }
533
534 cp += itf8_put(cp, s->hdr->ref_seq_id);
535 cp += itf8_put(cp, s->hdr->ref_seq_start);
536 cp += itf8_put(cp, s->hdr->ref_seq_span);
537 cp += itf8_put(cp, s->hdr->num_records);
538 if (CRAM_MAJOR_VERS(fd->version) == 2)
539 cp += itf8_put(cp, s->hdr->record_counter);
540 else if (CRAM_MAJOR_VERS(fd->version) >= 3)
541 cp += ltf8_put(cp, s->hdr->record_counter);
542 cp += itf8_put(cp, s->hdr->num_blocks);
543 cp += itf8_put(cp, s->hdr->num_content_ids);
544 for (j = 0; j < s->hdr->num_content_ids; j++) {
545 cp += itf8_put(cp, s->hdr->block_content_ids[j]);
546 }
547 if (s->hdr->content_type == MAPPED_SLICE)
548 cp += itf8_put(cp, s->hdr->ref_base_id);
549
550 if (CRAM_MAJOR_VERS(fd->version) != 1) {
551 memcpy(cp, s->hdr->md5, 16); cp += 16;
552 }
553
554 assert(cp-buf <= 16+5*(8+s->hdr->num_blocks));
555
556 b->data = (unsigned char *)buf;
557 b->comp_size = b->uncomp_size = cp-buf;
558
559 return b;
560 }
561
562
563 /*
564 * Encodes a single read.
565 *
566 * Returns 0 on success
567 * -1 on failure
568 */
cram_encode_slice_read(cram_fd * fd,cram_container * c,cram_block_compression_hdr * h,cram_slice * s,cram_record * cr,int * last_pos)569 static int cram_encode_slice_read(cram_fd *fd,
570 cram_container *c,
571 cram_block_compression_hdr *h,
572 cram_slice *s,
573 cram_record *cr,
574 int *last_pos) {
575 int r = 0;
576 int32_t i32;
577 unsigned char uc;
578
579 //fprintf(stderr, "Encode seq %d, %d/%d FN=%d, %s\n", rec, core->byte, core->bit, cr->nfeature, s->name_ds->str + cr->name);
580
581 //printf("BF=0x%x\n", cr->flags);
582 // bf = cram_flag_swap[cr->flags];
583 i32 = fd->cram_flag_swap[cr->flags & 0xfff];
584 r |= h->codecs[DS_BF]->encode(s, h->codecs[DS_BF], (char *)&i32, 1);
585
586 i32 = cr->cram_flags & CRAM_FLAG_MASK;
587 r |= h->codecs[DS_CF]->encode(s, h->codecs[DS_CF], (char *)&i32, 1);
588
589 if (CRAM_MAJOR_VERS(fd->version) != 1 && s->hdr->ref_seq_id == -2)
590 r |= h->codecs[DS_RI]->encode(s, h->codecs[DS_RI], (char *)&cr->ref_id, 1);
591
592 r |= h->codecs[DS_RL]->encode(s, h->codecs[DS_RL], (char *)&cr->len, 1);
593
594 if (c->pos_sorted) {
595 i32 = cr->apos - *last_pos;
596 r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1);
597 *last_pos = cr->apos;
598 } else {
599 i32 = cr->apos;
600 r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1);
601 }
602
603 r |= h->codecs[DS_RG]->encode(s, h->codecs[DS_RG], (char *)&cr->rg, 1);
604
605 if (cr->cram_flags & CRAM_FLAG_DETACHED) {
606 i32 = cr->mate_flags;
607 r |= h->codecs[DS_MF]->encode(s, h->codecs[DS_MF], (char *)&i32, 1);
608
609 r |= h->codecs[DS_NS]->encode(s, h->codecs[DS_NS],
610 (char *)&cr->mate_ref_id, 1);
611
612 r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP],
613 (char *)&cr->mate_pos, 1);
614
615 r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
616 (char *)&cr->tlen, 1);
617 } else if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) {
618 r |= h->codecs[DS_NF]->encode(s, h->codecs[DS_NF],
619 (char *)&cr->mate_line, 1);
620 }
621
622 /* Aux tags */
623 if (CRAM_MAJOR_VERS(fd->version) == 1) {
624 int j;
625 uc = cr->ntags;
626 r |= h->codecs[DS_TC]->encode(s, h->codecs[DS_TC], (char *)&uc, 1);
627
628 for (j = 0; j < cr->ntags; j++) {
629 uint32_t i32 = s->TN[cr->TN_idx + j]; // id
630 r |= h->codecs[DS_TN]->encode(s, h->codecs[DS_TN], (char *)&i32, 1);
631 }
632 } else {
633 r |= h->codecs[DS_TL]->encode(s, h->codecs[DS_TL], (char *)&cr->TL, 1);
634 }
635
636 // qual
637 // QS codec : Already stored in block[2].
638
639 // features (diffs)
640 if (!(cr->flags & BAM_FUNMAP)) {
641 int prev_pos = 0, j;
642
643 r |= h->codecs[DS_FN]->encode(s, h->codecs[DS_FN],
644 (char *)&cr->nfeature, 1);
645 for (j = 0; j < cr->nfeature; j++) {
646 cram_feature *f = &s->features[cr->feature + j];
647
648 uc = f->X.code;
649 r |= h->codecs[DS_FC]->encode(s, h->codecs[DS_FC], (char *)&uc, 1);
650 i32 = f->X.pos - prev_pos;
651 r |= h->codecs[DS_FP]->encode(s, h->codecs[DS_FP], (char *)&i32, 1);
652 prev_pos = f->X.pos;
653
654 switch(f->X.code) {
655 //char *seq;
656
657 case 'X':
658 //fprintf(stderr, " FC=%c FP=%d base=%d\n", f->X.code, i32, f->X.base);
659
660 uc = f->X.base;
661 r |= h->codecs[DS_BS]->encode(s, h->codecs[DS_BS],
662 (char *)&uc, 1);
663 break;
664 case 'S':
665 // Already done
666 // r |= h->codecs[DS_SC]->encode(s, h->codecs[DS_SC],
667 // BLOCK_DATA(s->soft_blk) + f->S.seq_idx,
668 // f->S.len);
669
670 // if (IS_CRAM_3_VERS(fd)) {
671 // r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
672 // BLOCK_DATA(s->seqs_blk) + f->S.seq_idx,
673 // f->S.len);
674 // }
675 break;
676 case 'I':
677 //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
678 //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN],
679 // seq, f->S.len);
680 // if (IS_CRAM_3_VERS(fd)) {
681 // r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
682 // BLOCK_DATA(s->seqs_blk) + f->I.seq_idx,
683 // f->I.len);
684 // }
685 break;
686 case 'i':
687 uc = f->i.base;
688 r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA],
689 (char *)&uc, 1);
690 //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
691 //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN],
692 // seq, 1);
693 break;
694 case 'D':
695 i32 = f->D.len;
696 r |= h->codecs[DS_DL]->encode(s, h->codecs[DS_DL],
697 (char *)&i32, 1);
698 break;
699
700 case 'B':
701 // // Used when we try to store a non ACGTN base or an N
702 // // that aligns against a non ACGTN reference
703
704 uc = f->B.base;
705 r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA],
706 (char *)&uc, 1);
707
708 // Already added
709 // uc = f->B.qual;
710 // r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS],
711 // (char *)&uc, 1);
712 break;
713
714 case 'b':
715 // string of bases
716 r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
717 (char *)BLOCK_DATA(s->seqs_blk)
718 + f->b.seq_idx,
719 f->b.len);
720 break;
721
722 case 'Q':
723 // Already added
724 // uc = f->B.qual;
725 // r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS],
726 // (char *)&uc, 1);
727 break;
728
729 case 'N':
730 i32 = f->N.len;
731 r |= h->codecs[DS_RS]->encode(s, h->codecs[DS_RS],
732 (char *)&i32, 1);
733 break;
734
735 case 'P':
736 i32 = f->P.len;
737 r |= h->codecs[DS_PD]->encode(s, h->codecs[DS_PD],
738 (char *)&i32, 1);
739 break;
740
741 case 'H':
742 i32 = f->H.len;
743 r |= h->codecs[DS_HC]->encode(s, h->codecs[DS_HC],
744 (char *)&i32, 1);
745 break;
746
747
748 default:
749 fprintf(stderr, "unhandled feature code %c\n",
750 f->X.code);
751 return -1;
752 }
753 }
754
755 r |= h->codecs[DS_MQ]->encode(s, h->codecs[DS_MQ],
756 (char *)&cr->mqual, 1);
757 } else {
758 char *seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
759 if (cr->len)
760 r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA], seq, cr->len);
761 }
762
763 return r ? -1 : 0;
764 }
765
766
767 /*
768 * Applies various compression methods to specific blocks, depending on
769 * known observations of how data series compress.
770 *
771 * Returns 0 on success
772 * -1 on failure
773 */
cram_compress_slice(cram_fd * fd,cram_container * c,cram_slice * s)774 static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) {
775 int level = fd->level, i;
776 int method = 1<<GZIP | 1<<GZIP_RLE, methodF = method;
777
778 /* Compress the CORE Block too, with minimal zlib level */
779 if (level > 5 && s->block[0]->uncomp_size > 500)
780 cram_compress_block(fd, s->block[0], NULL, 1<<GZIP, 1);
781
782 if (fd->use_bz2)
783 method |= 1<<BZIP2;
784
785 if (fd->use_rans)
786 method |= (1<<RANS0) | (1<<RANS1);
787
788 if (fd->use_lzma)
789 method |= (1<<LZMA);
790
791 /* Faster method for data series we only need entropy encoding on */
792 methodF = method & ~(1<<GZIP | 1<<BZIP2 | 1<<LZMA);
793 if (level >= 6)
794 methodF = method;
795
796
797 /* Specific compression methods for certain block types */
798 if (cram_compress_block(fd, s->block[DS_IN], fd->m[DS_IN], //IN (seq)
799 method, level))
800 return -1;
801
802 if (fd->level == 0) {
803 /* Do nothing */
804 } else if (fd->level == 1) {
805 if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS],
806 methodF, 1))
807 return -1;
808 for (i = DS_aux; i <= DS_aux_oz; i++) {
809 if (s->block[i])
810 if (cram_compress_block(fd, s->block[i], fd->m[i],
811 method, 1))
812 return -1;
813 }
814 } else if (fd->level < 3) {
815 if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS],
816 method, 1))
817 return -1;
818 if (cram_compress_block(fd, s->block[DS_BA], fd->m[DS_BA],
819 method, 1))
820 return -1;
821 if (s->block[DS_BB])
822 if (cram_compress_block(fd, s->block[DS_BB], fd->m[DS_BB],
823 method, 1))
824 return -1;
825 for (i = DS_aux; i <= DS_aux_oz; i++) {
826 if (s->block[i])
827 if (cram_compress_block(fd, s->block[i], fd->m[i],
828 method, level))
829 return -1;
830 }
831 } else {
832 if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS],
833 method, level))
834 return -1;
835 if (cram_compress_block(fd, s->block[DS_BA], fd->m[DS_BA],
836 method, level))
837 return -1;
838 if (s->block[DS_BB])
839 if (cram_compress_block(fd, s->block[DS_BB], fd->m[DS_BB],
840 method, level))
841 return -1;
842 for (i = DS_aux; i <= DS_aux_oz; i++) {
843 if (s->block[i])
844 if (cram_compress_block(fd, s->block[i], fd->m[i],
845 method, level))
846 return -1;
847 }
848 }
849
850 // NAME: best is generally xz, bzip2, zlib then rans1
851 // It benefits well from a little bit extra compression level.
852 if (cram_compress_block(fd, s->block[DS_RN], fd->m[DS_RN],
853 method & ~(1<<RANS0 | 1<<GZIP_RLE),
854 MIN(9,level)))
855 return -1;
856
857 // NS shows strong local correlation as rearrangements are localised
858 if (s->block[DS_NS] != s->block[0])
859 if (cram_compress_block(fd, s->block[DS_NS], fd->m[DS_NS],
860 method, level))
861 return -1;
862
863
864 /*
865 * Compress any auxiliary tags with their own per-tag metrics
866 */
867 {
868 int i;
869 for (i = 0; i < s->naux_block; i++) {
870 if (!s->aux_block[i] || s->aux_block[i] == s->block[0])
871 continue;
872
873 if (s->aux_block[i]->method != RAW)
874 continue;
875
876 if (cram_compress_block(fd, s->aux_block[i], s->aux_block[i]->m,
877 method, level))
878 return -1;
879 }
880 }
881
882 /*
883 * Minimal compression of any block still uncompressed, bar CORE
884 */
885 {
886 int i;
887 for (i = 1; i < s->hdr->num_blocks && i < DS_END; i++) {
888 if (!s->block[i] || s->block[i] == s->block[0])
889 continue;
890
891 if (s->block[i]->method != RAW)
892 continue;
893
894 if (cram_compress_block(fd, s->block[i], fd->m[i],
895 methodF, level))
896 return -1;
897 }
898 }
899
900 return 0;
901 }
902
903 /*
904 * Encodes a single slice from a container
905 *
906 * Returns 0 on success
907 * -1 on failure
908 */
cram_encode_slice(cram_fd * fd,cram_container * c,cram_block_compression_hdr * h,cram_slice * s)909 static int cram_encode_slice(cram_fd *fd, cram_container *c,
910 cram_block_compression_hdr *h, cram_slice *s) {
911 int rec, r = 0, last_pos;
912 int embed_ref;
913 enum cram_DS_ID id;
914
915 embed_ref = fd->embed_ref && s->hdr->ref_seq_id != -1 ? 1 : 0;
916
917 /*
918 * Slice external blocks:
919 * ID 0 => base calls (insertions, soft-clip)
920 * ID 1 => qualities
921 * ID 2 => names
922 * ID 3 => TS (insert size), NP (next frag)
923 * ID 4 => tag values
924 * ID 6 => tag IDs (TN), if CRAM_V1.0
925 * ID 7 => TD tag dictionary, if !CRAM_V1.0
926 */
927
928 /* Create cram slice header */
929 s->hdr->ref_base_id = embed_ref ? DS_ref : -1;
930 s->hdr->record_counter = c->num_records + c->record_counter;
931 c->num_records += s->hdr->num_records;
932
933 int ntags = c->tags_used ? c->tags_used->n_occupied : 0;
934 s->block = calloc(DS_END + ntags, sizeof(s->block[0]));
935 s->hdr->block_content_ids = malloc(DS_END * sizeof(int32_t));
936 if (!s->block || !s->hdr->block_content_ids)
937 return -1;
938
939 // Create first fixed blocks, always external.
940 // CORE
941 if (!(s->block[0] = cram_new_block(CORE, 0)))
942 return -1;
943
944 // TN block for CRAM v1
945 if (CRAM_MAJOR_VERS(fd->version) == 1) {
946 if (h->codecs[DS_TN]->codec == E_EXTERNAL) {
947 if (!(s->block[DS_TN] = cram_new_block(EXTERNAL,DS_TN))) return -1;
948 h->codecs[DS_TN]->external.content_id = DS_TN;
949 } else {
950 s->block[DS_TN] = s->block[0];
951 }
952 s->block[DS_TN] = s->block[DS_TN];
953 }
954
955 // Embedded reference
956 if (embed_ref) {
957 if (!(s->block[DS_ref] = cram_new_block(EXTERNAL, DS_ref)))
958 return -1;
959 s->ref_id = DS_ref; // needed?
960 BLOCK_APPEND(s->block[DS_ref],
961 c->ref + c->first_base - c->ref_start,
962 c->last_base - c->first_base + 1);
963 }
964
965 /*
966 * All the data-series blocks if appropriate.
967 */
968 for (id = DS_BF; id < DS_TN; id++) {
969 if (h->codecs[id] && (h->codecs[id]->codec == E_EXTERNAL ||
970 h->codecs[id]->codec == E_BYTE_ARRAY_STOP ||
971 h->codecs[id]->codec == E_BYTE_ARRAY_LEN)) {
972 switch (h->codecs[id]->codec) {
973 case E_EXTERNAL:
974 if (!(s->block[id] = cram_new_block(EXTERNAL, id)))
975 return -1;
976 h->codecs[id]->external.content_id = id;
977 break;
978
979 case E_BYTE_ARRAY_STOP:
980 if (!(s->block[id] = cram_new_block(EXTERNAL, id)))
981 return -1;
982 h->codecs[id]->byte_array_stop.content_id = id;
983 break;
984
985 case E_BYTE_ARRAY_LEN: {
986 cram_codec *cc;
987
988 cc = h->codecs[id]->e_byte_array_len.len_codec;
989 if (cc->codec == E_EXTERNAL) {
990 int eid = cc->external.content_id;
991 if (!(s->block[eid] = cram_new_block(EXTERNAL, eid)))
992 return -1;
993 cc->external.content_id = eid;
994 cc->out = s->block[eid];
995 }
996
997 cc = h->codecs[id]->e_byte_array_len.val_codec;
998 if (cc->codec == E_EXTERNAL) {
999 int eid = cc->external.content_id;
1000 if (!s->block[eid])
1001 if (!(s->block[eid] = cram_new_block(EXTERNAL, eid)))
1002 return -1;
1003 cc->external.content_id = eid;
1004 cc->out = s->block[eid];
1005 }
1006 break;
1007 }
1008 default:
1009 break;
1010 }
1011 } else {
1012 if (!(id == DS_BB && !h->codecs[DS_BB]))
1013 s->block[id] = s->block[0];
1014 }
1015 if (h->codecs[id])
1016 h->codecs[id]->out = s->block[id];
1017 }
1018
1019 /*
1020 * Add in the external tag blocks too.
1021 */
1022 if (c->tags_used) {
1023 int n;
1024 s->hdr->num_blocks = DS_END;
1025 for (n = 0; n < s->naux_block; n++)
1026 s->block[s->hdr->num_blocks++] = s->aux_block[n];
1027 }
1028
1029 /* Encode reads */
1030 last_pos = s->hdr->ref_seq_start;
1031 for (rec = 0; rec < s->hdr->num_records; rec++) {
1032 cram_record *cr = &s->crecs[rec];
1033 if (cram_encode_slice_read(fd, c, h, s, cr, &last_pos) == -1)
1034 return -1;
1035 }
1036
1037 s->block[0]->uncomp_size = s->block[0]->byte + (s->block[0]->bit < 7);
1038 s->block[0]->comp_size = s->block[0]->uncomp_size;
1039
1040 // Make sure the fixed blocks point to the correct sources
1041 s->block[DS_IN] = s->base_blk; s->base_blk = NULL;
1042 s->block[DS_QS] = s->qual_blk; s->qual_blk = NULL;
1043 s->block[DS_RN] = s->name_blk; s->name_blk = NULL;
1044 s->block[DS_SC] = s->soft_blk; s->soft_blk = NULL;
1045
1046 // Ensure block sizes are up to date.
1047 for (id = 1; id < s->hdr->num_blocks; id++) {
1048 if (!s->block[id] || s->block[id] == s->block[0])
1049 continue;
1050
1051 if (s->block[id]->uncomp_size == 0)
1052 BLOCK_UPLEN(s->block[id]);
1053 }
1054
1055 // Compress it all
1056 if (cram_compress_slice(fd, c, s) == -1)
1057 return -1;
1058
1059 // Collapse empty blocks and create hdr_block
1060 {
1061 int i, j;
1062
1063 s->hdr->block_content_ids = realloc(s->hdr->block_content_ids,
1064 s->hdr->num_blocks * sizeof(int32_t));
1065 if (!s->hdr->block_content_ids)
1066 return -1;
1067
1068 for (i = j = 1; i < s->hdr->num_blocks; i++) {
1069 if (!s->block[i] || s->block[i] == s->block[0])
1070 continue;
1071 if (s->block[i]->uncomp_size == 0) {
1072 cram_free_block(s->block[i]);
1073 s->block[i] = NULL;
1074 continue;
1075 }
1076 s->block[j] = s->block[i];
1077 s->hdr->block_content_ids[j-1] = s->block[i]->content_id;
1078 j++;
1079 }
1080 s->hdr->num_content_ids = j-1;
1081 s->hdr->num_blocks = j;
1082
1083 if (!(s->hdr_block = cram_encode_slice_header(fd, s)))
1084 return -1;
1085 }
1086
1087 return r ? -1 : 0;
1088 }
1089
1090 /*
1091 * Returns the number of expected read names for this record.
1092 */
expected_template_count(bam_seq_t * b)1093 static int expected_template_count(bam_seq_t *b) {
1094 int expected = bam_flag(b) & BAM_FPAIRED ? 2 : 1;
1095
1096 uint8_t *TC = (uint8_t *)bam_aux_get(b, "TC");
1097 if (TC) {
1098 int n = bam_aux2i(TC);
1099 if (expected < n)
1100 expected = n;
1101 }
1102
1103 if (!TC && bam_aux_get(b, "SA")) {
1104 // We could count the semicolons, but we'd have to do this for
1105 // read1, read2 and read(not-1-or-2) combining the results
1106 // together. This is a cheap and safe alternative for now.
1107 expected = INT_MAX;
1108 }
1109
1110 return expected;
1111 }
1112
1113 /*
1114 * Lossily reject read names.
1115 *
1116 * The rule here is that if all reads for this template reside in the
1117 * same slice then we can lose the name. Otherwise we keep them as we
1118 * do not know when (or if) the other reads will turn up.
1119 *
1120 * Note there may be only 1 read (non-paired library) or more than 2
1121 * reads (paired library with supplementary reads), or other weird
1122 * setups. We need to know how many are expected. Ways to guess:
1123 *
1124 * - Flags (0x1 - has > 1 read)
1125 * - TC aux field (not mandatory)
1126 * - SA tags (count semicolons, NB per fragment so sum - hard)
1127 * - RNEXT/PNEXT uniqueness count. (not implemented, tricky)
1128 *
1129 * Returns 0 on success
1130 * -1 on failure
1131 */
lossy_read_names(cram_fd * fd,cram_container * c,cram_slice * s,int bam_start)1132 static int lossy_read_names(cram_fd *fd, cram_container *c, cram_slice *s,
1133 int bam_start) {
1134 int r1, r2, ret = -1;
1135
1136 // Initialise cram_flags
1137 for (r2 = 0; r2 < s->hdr->num_records; r2++)
1138 s->crecs[r2].cram_flags = 0;
1139
1140 if (!fd->lossy_read_names)
1141 return 0;
1142
1143 khash_t(m_s2u64) *names = kh_init(m_s2u64);
1144 if (!names)
1145 goto fail;
1146
1147 // 1: Iterate through names to count frequency
1148 for (r1 = bam_start, r2 = 0; r2 < s->hdr->num_records; r1++, r2++) {
1149 //cram_record *cr = &s->crecs[r2];
1150 bam_seq_t *b = c->bams[r1];
1151 khint_t k;
1152 int n;
1153 uint64_t e;
1154 union {
1155 uint64_t i64;
1156 struct {
1157 int32_t e,c; // expected & observed counts.
1158 };
1159 } u;
1160
1161 e = expected_template_count(b);
1162 u.e = e; u.c = 1;
1163
1164 k = kh_put(m_s2u64, names, bam_name(b), &n);
1165 if (n == -1)
1166 goto fail;
1167
1168 if (n == 0) {
1169 // not a new name
1170 u.i64 = kh_val(names, k);
1171 if (u.e != e) {
1172 // different expectation or already hit the max
1173 //fprintf(stderr, "Err computing no. %s recs\n", bam_name(b));
1174 kh_val(names, k) = 0;
1175 } else {
1176 u.c++;
1177 if (u.e == u.c) {
1178 // Reached expected count.
1179 kh_val(names, k) = -1;
1180 } else {
1181 kh_val(names, k) = u.i64;
1182 }
1183 }
1184 } else {
1185 // new name
1186 kh_val(names, k) = u.i64;
1187 }
1188 }
1189
1190 // 2: Remove names if all present (hd.i == -1)
1191 for (r1 = bam_start, r2 = 0; r2 < s->hdr->num_records; r1++, r2++) {
1192 cram_record *cr = &s->crecs[r2];
1193 bam_seq_t *b = c->bams[r1];
1194 khint_t k;
1195
1196 k = kh_get(m_s2u64, names, bam_name(b));
1197
1198 if (k == kh_end(names))
1199 goto fail;
1200
1201 if (kh_val(names, k) == -1)
1202 cr->cram_flags = CRAM_FLAG_DISCARD_NAME;
1203 }
1204
1205 ret = 0;
1206 fail: // ret==-1
1207
1208 if (names)
1209 kh_destroy(m_s2u64, names);
1210
1211 return ret;
1212 }
1213
1214 /*
1215 * Adds the reading names. We do this here as a separate pass rather
1216 * than per record in the process_one_read calls as that function can
1217 * go back and change the CRAM_FLAG_DETACHED status of a previously
1218 * processed read if it subsequently determines the TLEN field is
1219 * incorrect. Given DETACHED reads always try to decode read names,
1220 * we need to know their status before generating the read-name block.
1221 *
1222 * Output is an update s->name_blk, and cr->name / cr->name_len
1223 * fields.
1224 */
add_read_names(cram_fd * fd,cram_container * c,cram_slice * s,int bam_start)1225 static void add_read_names(cram_fd *fd, cram_container *c, cram_slice *s,
1226 int bam_start) {
1227 int r1, r2;
1228 int keep_names = !fd->lossy_read_names;
1229
1230 for (r1 = bam_start, r2 = 0;
1231 r1 < c->curr_c_rec && r2 < s->hdr->num_records;
1232 r1++, r2++) {
1233 cram_record *cr = &s->crecs[r2];
1234 bam_seq_t *b = c->bams[r1];
1235
1236 cr->name = BLOCK_SIZE(s->name_blk);
1237 if ((cr->cram_flags & CRAM_FLAG_DETACHED) || keep_names) {
1238 BLOCK_APPEND(s->name_blk, bam_name(b), bam_name_len(b));
1239 cr->name_len = bam_name_len(b);
1240 } else {
1241 // Can only discard duplicate names if not detached
1242 cr->name_len = 0;
1243 }
1244 cram_stats_add(c->stats[DS_RN], cr->name_len);
1245 }
1246 }
1247
1248 /*
1249 * Encodes all slices in a container into blocks.
1250 * Returns 0 on success
1251 * -1 on failure
1252 */
cram_encode_container(cram_fd * fd,cram_container * c)1253 int cram_encode_container(cram_fd *fd, cram_container *c) {
1254 int i, j, slice_offset;
1255 cram_block_compression_hdr *h = c->comp_hdr;
1256 cram_block *c_hdr;
1257 int multi_ref = 0;
1258 int r1, r2, sn, nref;
1259 spare_bams *spares;
1260
1261 /* Cache references up-front if we have unsorted access patterns */
1262 pthread_mutex_lock(&fd->ref_lock);
1263 nref = fd->refs->nref;
1264 pthread_mutex_unlock(&fd->ref_lock);
1265
1266 if (!fd->no_ref && c->refs_used) {
1267 for (i = 0; i < nref; i++) {
1268 if (c->refs_used[i])
1269 cram_get_ref(fd, i, 1, 0);
1270 }
1271 }
1272
1273 /* To create M5 strings */
1274 /* Fetch reference sequence */
1275 if (!fd->no_ref) {
1276 bam_seq_t *b = c->bams[0];
1277 char *ref;
1278
1279 ref = cram_get_ref(fd, bam_ref(b), 1, 0);
1280 if (!ref && bam_ref(b) >= 0) {
1281 fprintf(stderr, "Failed to load reference #%d\n", bam_ref(b));
1282 return -1;
1283 }
1284 if ((c->ref_id = bam_ref(b)) >= 0) {
1285 c->ref_seq_id = c->ref_id;
1286 c->ref = fd->refs->ref_id[c->ref_seq_id]->seq;
1287 c->ref_start = 1;
1288 c->ref_end = fd->refs->ref_id[c->ref_seq_id]->length;
1289 } else {
1290 c->ref_seq_id = c->ref_id; // FIXME remove one var!
1291 }
1292 } else {
1293 c->ref_id = bam_ref(c->bams[0]);
1294 cram_ref_incr(fd->refs, c->ref_id);
1295 c->ref_seq_id = c->ref_id;
1296 }
1297
1298 /* Turn bams into cram_records and gather basic stats */
1299 for (r1 = sn = 0; r1 < c->curr_c_rec; sn++) {
1300 cram_slice *s = c->slices[sn];
1301 int first_base = INT_MAX, last_base = INT_MIN;
1302
1303 int r1_start = r1;
1304
1305 assert(sn < c->curr_slice);
1306
1307 // Discover which read names *may* be safely removed.
1308 // Ie which ones have all their records in this slice.
1309 if (lossy_read_names(fd, c, s, r1_start) != 0)
1310 return -1;
1311
1312 // Iterate through records creating the cram blocks for some
1313 // fields and just gathering stats for others.
1314 for (r2 = 0; r1 < c->curr_c_rec && r2 < s->hdr->num_records; r1++, r2++) {
1315 cram_record *cr = &s->crecs[r2];
1316 bam_seq_t *b = c->bams[r1];
1317
1318 /* If multi-ref we need to cope with changing reference per seq */
1319 if (c->multi_seq && !fd->no_ref) {
1320 if (bam_ref(b) != c->ref_seq_id && bam_ref(b) >= 0) {
1321 if (c->ref_seq_id >= 0)
1322 cram_ref_decr(fd->refs, c->ref_seq_id);
1323
1324 if (!cram_get_ref(fd, bam_ref(b), 1, 0)) {
1325 fprintf(stderr, "Failed to load reference #%d\n",
1326 bam_ref(b));
1327 return -1;
1328 }
1329
1330 c->ref_seq_id = bam_ref(b); // overwritten later by -2
1331 if (!fd->refs->ref_id[c->ref_seq_id]->seq)
1332 return -1;
1333 c->ref = fd->refs->ref_id[c->ref_seq_id]->seq;
1334 c->ref_start = 1;
1335 c->ref_end = fd->refs->ref_id[c->ref_seq_id]->length;
1336 }
1337 }
1338
1339 if (process_one_read(fd, c, s, cr, b, r2) != 0)
1340 return -1;
1341
1342 if (first_base > cr->apos)
1343 first_base = cr->apos;
1344
1345 if (last_base < cr->aend)
1346 last_base = cr->aend;
1347 }
1348
1349 // Process_one_read doesn't add read names as it can change
1350 // its mind during the loop on the CRAM_FLAG_DETACHED setting
1351 // of earlier records (if it detects the auto-generation of
1352 // TLEN is incorrect). This affects which read-names can be
1353 // lossily compressed, so we do these in another pass.
1354 add_read_names(fd, c, s, r1_start);
1355
1356 if (c->multi_seq) {
1357 s->hdr->ref_seq_id = -2;
1358 s->hdr->ref_seq_start = 0;
1359 s->hdr->ref_seq_span = 0;
1360 } else {
1361 s->hdr->ref_seq_id = c->ref_id;
1362 s->hdr->ref_seq_start = first_base;
1363 s->hdr->ref_seq_span = MAX(0, last_base - first_base + 1);
1364 }
1365 s->hdr->num_records = r2;
1366
1367 // Processed a slice, now stash the aux blocks so the next
1368 // slice can start aggregating them from the start again.
1369 if (c->tags_used->n_occupied) {
1370 int ntags = c->tags_used->n_occupied;
1371 s->aux_block = calloc(ntags, sizeof(*s->aux_block));
1372 if (!s->aux_block)
1373 return -1;
1374
1375 khint_t k;
1376
1377 s->naux_block = 0;
1378 for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
1379 if (!kh_exist(c->tags_used, k))
1380 continue;
1381
1382 cram_tag_map *tm = kh_val(c->tags_used, k);
1383 if (!tm->blk) continue;
1384 s->aux_block[s->naux_block++] = tm->blk;
1385 tm->blk = NULL;
1386 }
1387 assert(s->naux_block <= c->tags_used->n_occupied);
1388 }
1389 }
1390
1391 if (c->multi_seq && !fd->no_ref) {
1392 if (c->ref_seq_id >= 0)
1393 cram_ref_decr(fd->refs, c->ref_seq_id);
1394 }
1395
1396 /* Link our bams[] array onto the spare bam list for reuse */
1397 spares = malloc(sizeof(*spares));
1398 pthread_mutex_lock(&fd->bam_list_lock);
1399 spares->bams = c->bams;
1400 spares->next = fd->bl;
1401 fd->bl = spares;
1402 pthread_mutex_unlock(&fd->bam_list_lock);
1403 c->bams = NULL;
1404
1405 /* Detect if a multi-seq container */
1406 cram_stats_encoding(fd, c->stats[DS_RI]);
1407 multi_ref = c->stats[DS_RI]->nvals > 1;
1408
1409 if (multi_ref) {
1410 if (fd->verbose)
1411 fprintf(stderr, "Multi-ref container\n");
1412 c->ref_seq_id = -2;
1413 c->ref_seq_start = 0;
1414 c->ref_seq_span = 0;
1415 }
1416
1417
1418 /* Compute MD5s */
1419 for (i = 0; i < c->curr_slice; i++) {
1420 cram_slice *s = c->slices[i];
1421
1422 if (CRAM_MAJOR_VERS(fd->version) != 1) {
1423 if (s->hdr->ref_seq_id >= 0 && c->multi_seq == 0 && !fd->no_ref) {
1424 hts_md5_context *md5 = hts_md5_init();
1425 if (!md5)
1426 return -1;
1427 hts_md5_update(md5,
1428 c->ref + s->hdr->ref_seq_start - c->ref_start,
1429 s->hdr->ref_seq_span);
1430 hts_md5_final(s->hdr->md5, md5);
1431 hts_md5_destroy(md5);
1432 } else {
1433 memset(s->hdr->md5, 0, 16);
1434 }
1435 }
1436 }
1437
1438 c->num_records = 0;
1439 c->num_blocks = 1; // cram_block_compression_hdr
1440 c->length = 0;
1441
1442 //fprintf(stderr, "=== BF ===\n");
1443 h->codecs[DS_BF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BF]),
1444 c->stats[DS_BF], E_INT, NULL,
1445 fd->version);
1446
1447 //fprintf(stderr, "=== CF ===\n");
1448 h->codecs[DS_CF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_CF]),
1449 c->stats[DS_CF], E_INT, NULL,
1450 fd->version);
1451 // fprintf(stderr, "=== RN ===\n");
1452 // h->codecs[DS_RN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RN]),
1453 // c->stats[DS_RN], E_BYTE_ARRAY, NULL,
1454 // fd->version);
1455
1456 //fprintf(stderr, "=== AP ===\n");
1457 if (c->pos_sorted) {
1458 h->codecs[DS_AP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_AP]),
1459 c->stats[DS_AP], E_INT, NULL,
1460 fd->version);
1461 } else {
1462 int p[2] = {0, c->max_apos};
1463 h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL, E_INT, p,
1464 fd->version);
1465 }
1466
1467 //fprintf(stderr, "=== RG ===\n");
1468 h->codecs[DS_RG] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RG]),
1469 c->stats[DS_RG], E_INT, NULL,
1470 fd->version);
1471
1472 //fprintf(stderr, "=== MQ ===\n");
1473 h->codecs[DS_MQ] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MQ]),
1474 c->stats[DS_MQ], E_INT, NULL,
1475 fd->version);
1476
1477 //fprintf(stderr, "=== NS ===\n");
1478 h->codecs[DS_NS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NS]),
1479 c->stats[DS_NS], E_INT, NULL,
1480 fd->version);
1481
1482 //fprintf(stderr, "=== MF ===\n");
1483 h->codecs[DS_MF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MF]),
1484 c->stats[DS_MF], E_INT, NULL,
1485 fd->version);
1486
1487 //fprintf(stderr, "=== TS ===\n");
1488 h->codecs[DS_TS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TS]),
1489 c->stats[DS_TS], E_INT, NULL,
1490 fd->version);
1491 //fprintf(stderr, "=== NP ===\n");
1492 h->codecs[DS_NP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NP]),
1493 c->stats[DS_NP], E_INT, NULL,
1494 fd->version);
1495 //fprintf(stderr, "=== NF ===\n");
1496 h->codecs[DS_NF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NF]),
1497 c->stats[DS_NF], E_INT, NULL,
1498 fd->version);
1499
1500 //fprintf(stderr, "=== RL ===\n");
1501 h->codecs[DS_RL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RL]),
1502 c->stats[DS_RL], E_INT, NULL,
1503 fd->version);
1504
1505 //fprintf(stderr, "=== FN ===\n");
1506 h->codecs[DS_FN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FN]),
1507 c->stats[DS_FN], E_INT, NULL,
1508 fd->version);
1509
1510 //fprintf(stderr, "=== FC ===\n");
1511 h->codecs[DS_FC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FC]),
1512 c->stats[DS_FC], E_BYTE, NULL,
1513 fd->version);
1514
1515 //fprintf(stderr, "=== FP ===\n");
1516 h->codecs[DS_FP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FP]),
1517 c->stats[DS_FP], E_INT, NULL,
1518 fd->version);
1519
1520 //fprintf(stderr, "=== DL ===\n");
1521 h->codecs[DS_DL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_DL]),
1522 c->stats[DS_DL], E_INT, NULL,
1523 fd->version);
1524
1525 //fprintf(stderr, "=== BA ===\n");
1526 h->codecs[DS_BA] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BA]),
1527 c->stats[DS_BA], E_BYTE, NULL,
1528 fd->version);
1529
1530 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1531 cram_byte_array_len_encoder e;
1532
1533 e.len_encoding = E_EXTERNAL;
1534 e.len_dat = (void *)DS_BB_len;
1535 //e.len_dat = (void *)DS_BB;
1536
1537 e.val_encoding = E_EXTERNAL;
1538 e.val_dat = (void *)DS_BB;
1539
1540 h->codecs[DS_BB] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
1541 E_BYTE_ARRAY, (void *)&e,
1542 fd->version);
1543 } else {
1544 h->codecs[DS_BB] = NULL;
1545 }
1546
1547 //fprintf(stderr, "=== BS ===\n");
1548 h->codecs[DS_BS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BS]),
1549 c->stats[DS_BS], E_BYTE, NULL,
1550 fd->version);
1551
1552 if (CRAM_MAJOR_VERS(fd->version) == 1) {
1553 h->codecs[DS_TL] = NULL;
1554 h->codecs[DS_RI] = NULL;
1555 h->codecs[DS_RS] = NULL;
1556 h->codecs[DS_PD] = NULL;
1557 h->codecs[DS_HC] = NULL;
1558 h->codecs[DS_SC] = NULL;
1559
1560 //fprintf(stderr, "=== TC ===\n");
1561 h->codecs[DS_TC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TC]),
1562 c->stats[DS_TC], E_BYTE, NULL,
1563 fd->version);
1564
1565 //fprintf(stderr, "=== TN ===\n");
1566 h->codecs[DS_TN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TN]),
1567 c->stats[DS_TN], E_INT, NULL,
1568 fd->version);
1569 } else {
1570 h->codecs[DS_TC] = NULL;
1571 h->codecs[DS_TN] = NULL;
1572
1573 //fprintf(stderr, "=== TL ===\n");
1574 h->codecs[DS_TL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TL]),
1575 c->stats[DS_TL], E_INT, NULL,
1576 fd->version);
1577
1578
1579 //fprintf(stderr, "=== RI ===\n");
1580 h->codecs[DS_RI] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RI]),
1581 c->stats[DS_RI], E_INT, NULL,
1582 fd->version);
1583
1584 //fprintf(stderr, "=== RS ===\n");
1585 h->codecs[DS_RS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RS]),
1586 c->stats[DS_RS], E_INT, NULL,
1587 fd->version);
1588
1589 //fprintf(stderr, "=== PD ===\n");
1590 h->codecs[DS_PD] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_PD]),
1591 c->stats[DS_PD], E_INT, NULL,
1592 fd->version);
1593
1594 //fprintf(stderr, "=== HC ===\n");
1595 h->codecs[DS_HC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_HC]),
1596 c->stats[DS_HC], E_INT, NULL,
1597 fd->version);
1598
1599 //fprintf(stderr, "=== SC ===\n");
1600 if (1) {
1601 int i2[2] = {0, DS_SC};
1602
1603 h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
1604 E_BYTE_ARRAY, (void *)i2,
1605 fd->version);
1606 } else {
1607 // Appears to be no practical benefit to using this method,
1608 // but it may work better if we start mixing SC, IN and BB
1609 // elements into the same external block.
1610 cram_byte_array_len_encoder e;
1611
1612 e.len_encoding = E_EXTERNAL;
1613 e.len_dat = (void *)DS_SC_len;
1614
1615 e.val_encoding = E_EXTERNAL;
1616 e.val_dat = (void *)DS_SC;
1617
1618 h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
1619 E_BYTE_ARRAY, (void *)&e,
1620 fd->version);
1621 }
1622 }
1623
1624 //fprintf(stderr, "=== IN ===\n");
1625 {
1626 int i2[2] = {0, DS_IN};
1627 h->codecs[DS_IN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
1628 E_BYTE_ARRAY, (void *)i2,
1629 fd->version);
1630 }
1631
1632 h->codecs[DS_QS] = cram_encoder_init(E_EXTERNAL, NULL, E_BYTE,
1633 (void *)DS_QS,
1634 fd->version);
1635 {
1636 int i2[2] = {0, DS_RN};
1637 h->codecs[DS_RN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
1638 E_BYTE_ARRAY, (void *)i2,
1639 fd->version);
1640 }
1641
1642
1643 /* Encode slices */
1644 for (i = 0; i < c->curr_slice; i++) {
1645 if (fd->verbose)
1646 fprintf(stderr, "Encode slice %d\n", i);
1647
1648 if (cram_encode_slice(fd, c, h, c->slices[i]) != 0)
1649 return -1;
1650 }
1651
1652 /* Create compression header */
1653 {
1654 h->ref_seq_id = c->ref_seq_id;
1655 h->ref_seq_start = c->ref_seq_start;
1656 h->ref_seq_span = c->ref_seq_span;
1657 h->num_records = c->num_records;
1658
1659 h->mapped_qs_included = 0; // fixme
1660 h->unmapped_qs_included = 0; // fixme
1661 h->AP_delta = c->pos_sorted;
1662 // h->... fixme
1663 memcpy(h->substitution_matrix, CRAM_SUBST_MATRIX, 20);
1664
1665 if (!(c_hdr = cram_encode_compression_header(fd, c, h)))
1666 return -1;
1667 }
1668
1669 /* Compute landmarks */
1670 /* Fill out slice landmarks */
1671 c->num_landmarks = c->curr_slice;
1672 c->landmark = malloc(c->num_landmarks * sizeof(*c->landmark));
1673 if (!c->landmark)
1674 return -1;
1675
1676 /*
1677 * Slice offset starts after the first block, so we need to simulate
1678 * writing it to work out the correct offset
1679 */
1680 {
1681 slice_offset = c_hdr->method == RAW
1682 ? c_hdr->uncomp_size
1683 : c_hdr->comp_size;
1684 slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
1685 itf8_size(c_hdr->content_id) +
1686 itf8_size(c_hdr->comp_size) +
1687 itf8_size(c_hdr->uncomp_size);
1688 }
1689
1690 c->ref_seq_id = c->slices[0]->hdr->ref_seq_id;
1691 c->ref_seq_start = c->slices[0]->hdr->ref_seq_start;
1692 c->ref_seq_span = c->slices[0]->hdr->ref_seq_span;
1693 for (i = 0; i < c->curr_slice; i++) {
1694 cram_slice *s = c->slices[i];
1695
1696 c->num_blocks += s->hdr->num_blocks + 1; // slice header
1697 c->landmark[i] = slice_offset;
1698
1699 if (s->hdr->ref_seq_start + s->hdr->ref_seq_span >
1700 c->ref_seq_start + c->ref_seq_span) {
1701 c->ref_seq_span = s->hdr->ref_seq_start + s->hdr->ref_seq_span
1702 - c->ref_seq_start;
1703 }
1704
1705 slice_offset += s->hdr_block->method == RAW
1706 ? s->hdr_block->uncomp_size
1707 : s->hdr_block->comp_size;
1708
1709 slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
1710 itf8_size(s->hdr_block->content_id) +
1711 itf8_size(s->hdr_block->comp_size) +
1712 itf8_size(s->hdr_block->uncomp_size);
1713
1714 for (j = 0; j < s->hdr->num_blocks; j++) {
1715 slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
1716 itf8_size(s->block[j]->content_id) +
1717 itf8_size(s->block[j]->comp_size) +
1718 itf8_size(s->block[j]->uncomp_size);
1719
1720 slice_offset += s->block[j]->method == RAW
1721 ? s->block[j]->uncomp_size
1722 : s->block[j]->comp_size;
1723 }
1724 }
1725 c->length += slice_offset; // just past the final slice
1726
1727 c->comp_hdr_block = c_hdr;
1728
1729 if (c->ref_seq_id >= 0) {
1730 cram_ref_decr(fd->refs, c->ref_seq_id);
1731 }
1732
1733 /* Cache references up-front if we have unsorted access patterns */
1734 if (!fd->no_ref && c->refs_used) {
1735 for (i = 0; i < fd->refs->nref; i++) {
1736 if (c->refs_used[i])
1737 cram_ref_decr(fd->refs, i);
1738 }
1739 }
1740
1741 return 0;
1742 }
1743
1744
1745 /*
1746 * Adds a feature code to a read within a slice. For purposes of minimising
1747 * memory allocations and fragmentation we have one array of features for all
1748 * reads within the slice. We return the index into this array for this new
1749 * feature.
1750 *
1751 * Returns feature index on success
1752 * -1 on failure.
1753 */
cram_add_feature(cram_container * c,cram_slice * s,cram_record * r,cram_feature * f)1754 static int cram_add_feature(cram_container *c, cram_slice *s,
1755 cram_record *r, cram_feature *f) {
1756 if (s->nfeatures >= s->afeatures) {
1757 s->afeatures = s->afeatures ? s->afeatures*2 : 1024;
1758 s->features = realloc(s->features, s->afeatures*sizeof(*s->features));
1759 if (!s->features)
1760 return -1;
1761 }
1762
1763 if (!r->nfeature++) {
1764 r->feature = s->nfeatures;
1765 cram_stats_add(c->stats[DS_FP], f->X.pos);
1766 } else {
1767 cram_stats_add(c->stats[DS_FP],
1768 f->X.pos - s->features[r->feature + r->nfeature-2].X.pos);
1769 }
1770 cram_stats_add(c->stats[DS_FC], f->X.code);
1771
1772 s->features[s->nfeatures++] = *f;
1773
1774 return 0;
1775 }
1776
cram_add_substitution(cram_fd * fd,cram_container * c,cram_slice * s,cram_record * r,int pos,char base,char qual,char ref)1777 static int cram_add_substitution(cram_fd *fd, cram_container *c,
1778 cram_slice *s, cram_record *r,
1779 int pos, char base, char qual, char ref) {
1780 cram_feature f;
1781
1782 // seq=ACGTN vs ref=ACGT or seq=ACGT vs ref=ACGTN
1783 if (fd->L2[(uc)base]<4 || (fd->L2[(uc)base]<5 && fd->L2[(uc)ref]<4)) {
1784 f.X.pos = pos+1;
1785 f.X.code = 'X';
1786 f.X.base = fd->cram_sub_matrix[ref&0x1f][base&0x1f];
1787 cram_stats_add(c->stats[DS_BS], f.X.base);
1788 } else {
1789 f.B.pos = pos+1;
1790 f.B.code = 'B';
1791 f.B.base = base;
1792 f.B.qual = qual;
1793 cram_stats_add(c->stats[DS_BA], f.B.base);
1794 cram_stats_add(c->stats[DS_QS], f.B.qual);
1795 BLOCK_APPEND_CHAR(s->qual_blk, qual);
1796 }
1797 return cram_add_feature(c, s, r, &f);
1798 }
1799
cram_add_bases(cram_fd * fd,cram_container * c,cram_slice * s,cram_record * r,int pos,int len,char * base)1800 static int cram_add_bases(cram_fd *fd, cram_container *c,
1801 cram_slice *s, cram_record *r,
1802 int pos, int len, char *base) {
1803 cram_feature f;
1804
1805 f.b.pos = pos+1;
1806 f.b.code = 'b';
1807 f.b.seq_idx = base - (char *)BLOCK_DATA(s->seqs_blk);
1808 f.b.len = len;
1809
1810 return cram_add_feature(c, s, r, &f);
1811 }
1812
cram_add_base(cram_fd * fd,cram_container * c,cram_slice * s,cram_record * r,int pos,char base,char qual)1813 static int cram_add_base(cram_fd *fd, cram_container *c,
1814 cram_slice *s, cram_record *r,
1815 int pos, char base, char qual) {
1816 cram_feature f;
1817 f.B.pos = pos+1;
1818 f.B.code = 'B';
1819 f.B.base = base;
1820 f.B.qual = qual;
1821 cram_stats_add(c->stats[DS_BA], base);
1822 cram_stats_add(c->stats[DS_QS], qual);
1823 BLOCK_APPEND_CHAR(s->qual_blk, qual);
1824 return cram_add_feature(c, s, r, &f);
1825 }
1826
cram_add_quality(cram_fd * fd,cram_container * c,cram_slice * s,cram_record * r,int pos,char qual)1827 static int cram_add_quality(cram_fd *fd, cram_container *c,
1828 cram_slice *s, cram_record *r,
1829 int pos, char qual) {
1830 cram_feature f;
1831 f.Q.pos = pos+1;
1832 f.Q.code = 'Q';
1833 f.Q.qual = qual;
1834 cram_stats_add(c->stats[DS_QS], qual);
1835 BLOCK_APPEND_CHAR(s->qual_blk, qual);
1836 return cram_add_feature(c, s, r, &f);
1837 }
1838
cram_add_deletion(cram_container * c,cram_slice * s,cram_record * r,int pos,int len,char * base)1839 static int cram_add_deletion(cram_container *c, cram_slice *s, cram_record *r,
1840 int pos, int len, char *base) {
1841 cram_feature f;
1842 f.D.pos = pos+1;
1843 f.D.code = 'D';
1844 f.D.len = len;
1845 cram_stats_add(c->stats[DS_DL], len);
1846 return cram_add_feature(c, s, r, &f);
1847 }
1848
cram_add_softclip(cram_container * c,cram_slice * s,cram_record * r,int pos,int len,char * base,int version)1849 static int cram_add_softclip(cram_container *c, cram_slice *s, cram_record *r,
1850 int pos, int len, char *base, int version) {
1851 cram_feature f;
1852 f.S.pos = pos+1;
1853 f.S.code = 'S';
1854 f.S.len = len;
1855 switch (CRAM_MAJOR_VERS(version)) {
1856 case 1:
1857 f.S.seq_idx = BLOCK_SIZE(s->base_blk);
1858 BLOCK_APPEND(s->base_blk, base, len);
1859 BLOCK_APPEND_CHAR(s->base_blk, '\0');
1860 break;
1861
1862 case 2:
1863 default:
1864 f.S.seq_idx = BLOCK_SIZE(s->soft_blk);
1865 if (base) {
1866 BLOCK_APPEND(s->soft_blk, base, len);
1867 } else {
1868 int i;
1869 for (i = 0; i < len; i++)
1870 BLOCK_APPEND_CHAR(s->soft_blk, 'N');
1871 }
1872 BLOCK_APPEND_CHAR(s->soft_blk, '\0');
1873 break;
1874
1875 // default:
1876 // // v3.0 onwards uses BB data-series
1877 // f.S.seq_idx = BLOCK_SIZE(s->soft_blk);
1878 }
1879 return cram_add_feature(c, s, r, &f);
1880 }
1881
cram_add_hardclip(cram_container * c,cram_slice * s,cram_record * r,int pos,int len,char * base)1882 static int cram_add_hardclip(cram_container *c, cram_slice *s, cram_record *r,
1883 int pos, int len, char *base) {
1884 cram_feature f;
1885 f.S.pos = pos+1;
1886 f.S.code = 'H';
1887 f.S.len = len;
1888 cram_stats_add(c->stats[DS_HC], len);
1889 return cram_add_feature(c, s, r, &f);
1890 }
1891
cram_add_skip(cram_container * c,cram_slice * s,cram_record * r,int pos,int len,char * base)1892 static int cram_add_skip(cram_container *c, cram_slice *s, cram_record *r,
1893 int pos, int len, char *base) {
1894 cram_feature f;
1895 f.S.pos = pos+1;
1896 f.S.code = 'N';
1897 f.S.len = len;
1898 cram_stats_add(c->stats[DS_RS], len);
1899 return cram_add_feature(c, s, r, &f);
1900 }
1901
cram_add_pad(cram_container * c,cram_slice * s,cram_record * r,int pos,int len,char * base)1902 static int cram_add_pad(cram_container *c, cram_slice *s, cram_record *r,
1903 int pos, int len, char *base) {
1904 cram_feature f;
1905 f.S.pos = pos+1;
1906 f.S.code = 'P';
1907 f.S.len = len;
1908 cram_stats_add(c->stats[DS_PD], len);
1909 return cram_add_feature(c, s, r, &f);
1910 }
1911
cram_add_insertion(cram_container * c,cram_slice * s,cram_record * r,int pos,int len,char * base)1912 static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r,
1913 int pos, int len, char *base) {
1914 cram_feature f;
1915 f.I.pos = pos+1;
1916 if (len == 1) {
1917 char b = base ? *base : 'N';
1918 f.i.code = 'i';
1919 f.i.base = b;
1920 cram_stats_add(c->stats[DS_BA], b);
1921 } else {
1922 f.I.code = 'I';
1923 f.I.len = len;
1924 f.S.seq_idx = BLOCK_SIZE(s->base_blk);
1925 if (base) {
1926 BLOCK_APPEND(s->base_blk, base, len);
1927 } else {
1928 int i;
1929 for (i = 0; i < len; i++)
1930 BLOCK_APPEND_CHAR(s->base_blk, 'N');
1931 }
1932 BLOCK_APPEND_CHAR(s->base_blk, '\0');
1933 }
1934 return cram_add_feature(c, s, r, &f);
1935 }
1936
1937 /*
1938 * Encodes auxiliary data.
1939 * Returns the read-group parsed out of the BAM aux fields on success
1940 * NULL on failure or no rg present (FIXME)
1941 */
cram_encode_aux_1_0(cram_fd * fd,bam_seq_t * b,cram_container * c,cram_slice * s,cram_record * cr)1942 static char *cram_encode_aux_1_0(cram_fd *fd, bam_seq_t *b, cram_container *c,
1943 cram_slice *s, cram_record *cr) {
1944 char *aux, *tmp, *rg = NULL;
1945 int aux_size = bam_blk_size(b) -
1946 ((char *)bam_aux(b) - (char *)&bam_ref(b));
1947
1948 /* Worst case is 1 nul char on every ??:Z: string, so +33% */
1949 BLOCK_GROW(s->aux_blk, aux_size*1.34+1);
1950 tmp = (char *)BLOCK_END(s->aux_blk);
1951
1952 aux = (char *)bam_aux(b);
1953 cr->TN_idx = s->nTN;
1954
1955 while (aux[0] != 0) {
1956 int32_t i32;
1957 int r;
1958
1959 if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') {
1960 rg = &aux[3];
1961 while (*aux++);
1962 continue;
1963 }
1964 if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') {
1965 while (*aux++);
1966 continue;
1967 }
1968 if (aux[0] == 'N' && aux[1] == 'M') {
1969 switch(aux[2]) {
1970 case 'A': case 'C': case 'c': aux+=4; break;
1971 case 'I': case 'i': case 'f': aux+=7; break;
1972 default:
1973 fprintf(stderr, "Unhandled type code for NM tag\n");
1974 return NULL;
1975 }
1976 continue;
1977 }
1978
1979 cr->ntags++;
1980
1981 i32 = (aux[0]<<16) | (aux[1]<<8) | aux[2];
1982 kh_put(m_tagmap, c->tags_used, i32, &r);
1983 if (-1 == r)
1984 return NULL;
1985
1986 if (s->nTN >= s->aTN) {
1987 s->aTN = s->aTN ? s->aTN*2 : 1024;
1988 if (!(s->TN = realloc(s->TN, s->aTN * sizeof(*s->TN))))
1989 return NULL;
1990 }
1991 s->TN[s->nTN++] = i32;
1992 cram_stats_add(c->stats[DS_TN], i32);
1993
1994 switch(aux[2]) {
1995 case 'A': case 'C': case 'c':
1996 aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
1997 *tmp++=*aux++;
1998 break;
1999
2000 case 'S': case 's':
2001 aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
2002 *tmp++=*aux++; *tmp++=*aux++;
2003 break;
2004
2005 case 'I': case 'i': case 'f':
2006 aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
2007 *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
2008 break;
2009
2010 case 'd':
2011 aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
2012 *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
2013 *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
2014 break;
2015
2016 case 'Z': case 'H':
2017 aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
2018 while ((*tmp++=*aux++));
2019 *tmp++ = '\t'; // stop byte
2020 break;
2021
2022 case 'B': {
2023 int type = aux[3], blen;
2024 uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) +
2025 (((unsigned char *)aux)[5]<< 8) +
2026 (((unsigned char *)aux)[6]<<16) +
2027 (((unsigned char *)aux)[7]<<24));
2028 // skip TN field
2029 aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
2030
2031 // We use BYTE_ARRAY_LEN with external length, so store that first
2032 switch (type) {
2033 case 'c': case 'C':
2034 blen = count;
2035 break;
2036 case 's': case 'S':
2037 blen = 2*count;
2038 break;
2039 case 'i': case 'I': case 'f':
2040 blen = 4*count;
2041 break;
2042 default:
2043 fprintf(stderr, "Unknown sub-type '%c' for aux type 'B'\n",
2044 type);
2045 return NULL;
2046
2047 }
2048
2049 tmp += itf8_put(tmp, blen+5);
2050
2051 *tmp++=*aux++; // sub-type & length
2052 *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
2053
2054 // The tag data itself
2055 memcpy(tmp, aux, blen); tmp += blen; aux += blen;
2056
2057 //cram_stats_add(c->aux_B_stats, blen);
2058 break;
2059 }
2060 default:
2061 fprintf(stderr, "Unknown aux type '%c'\n", aux[2]);
2062 return NULL;
2063 }
2064 }
2065 cram_stats_add(c->stats[DS_TC], cr->ntags);
2066
2067 cr->aux = BLOCK_SIZE(s->aux_blk);
2068 cr->aux_size = (uc *)tmp - (BLOCK_DATA(s->aux_blk) + cr->aux);
2069 BLOCK_SIZE(s->aux_blk) = (uc *)tmp - BLOCK_DATA(s->aux_blk);
2070 assert(s->aux_blk->byte <= s->aux_blk->alloc);
2071
2072 return rg;
2073 }
2074
2075 /*
2076 * Encodes auxiliary data. Largely duplicated from above, but done so to
2077 * keep it simple and avoid a myriad of version ifs.
2078 *
2079 * Returns the read-group parsed out of the BAM aux fields on success
2080 * NULL on failure or no rg present (FIXME)
2081 */
cram_encode_aux(cram_fd * fd,bam_seq_t * b,cram_container * c,cram_slice * s,cram_record * cr)2082 static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c,
2083 cram_slice *s, cram_record *cr) {
2084 char *aux, *orig, *rg = NULL;
2085 int aux_size = bam_get_l_aux(b);
2086 cram_block *td_b = c->comp_hdr->TD_blk;
2087 int TD_blk_size = BLOCK_SIZE(td_b), new;
2088 char *key;
2089 khint_t k;
2090
2091 orig = aux = (char *)bam_aux(b);
2092
2093 // Copy aux keys to td_b and aux values to slice aux blocks
2094 while (aux - orig < aux_size && aux[0] != 0) {
2095 int r;
2096
2097 // RG:Z
2098 if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') {
2099 rg = &aux[3];
2100 while (*aux++);
2101 continue;
2102 }
2103
2104 // MD:Z
2105 if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') {
2106 if (cr->len && !fd->no_ref && !(cr->flags & BAM_FUNMAP)) {
2107 while (*aux++);
2108 continue;
2109 }
2110 }
2111
2112 // NM:i
2113 if (aux[0] == 'N' && aux[1] == 'M') {
2114 if (cr->len && !fd->no_ref && !(cr->flags & BAM_FUNMAP)) {
2115 switch(aux[2]) {
2116 case 'A': case 'C': case 'c': aux+=4; break;
2117 case 'S': case 's': aux+=5; break;
2118 case 'I': case 'i': case 'f': aux+=7; break;
2119 default:
2120 fprintf(stderr, "Unhandled type code for NM tag\n");
2121 return NULL;
2122 }
2123 continue;
2124 }
2125 }
2126
2127 BLOCK_APPEND(td_b, aux, 3);
2128
2129 // Container level tags_used, for TD series
2130 // Maps integer key ('X0i') to cram_tag_map struct.
2131 int key = (aux[0]<<16)|(aux[1]<<8)|aux[2];
2132 k = kh_put(m_tagmap, c->tags_used, key, &r);
2133 if (-1 == r)
2134 return NULL;
2135
2136 if (r == 1) {
2137 khint_t k_global;
2138
2139 // Global tags_used for cram_metrics support
2140 pthread_mutex_lock(&fd->metrics_lock);
2141 k_global = kh_put(m_metrics, fd->tags_used, key, &r);
2142 if (-1 == r)
2143 return NULL;
2144 if (r == 1)
2145 kh_val(fd->tags_used, k_global) = cram_new_metrics();
2146
2147 pthread_mutex_unlock(&fd->metrics_lock);
2148
2149 int i2[2] = {'\t',key};
2150 size_t sk = key;
2151 cram_tag_map *m = calloc(1, sizeof(*m));
2152 kh_val(c->tags_used, k) = m;
2153
2154 cram_codec *c;
2155
2156 // Use a block content id based on the tag id.
2157 // Codec type depends on tag data type.
2158 switch(aux[2]) {
2159 case 'Z': case 'H':
2160 // string as byte_array_stop
2161 c = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
2162 E_BYTE_ARRAY, (void *)i2,
2163 fd->version);
2164 break;
2165
2166 case 'A': case 'c': case 'C': {
2167 // byte array len, 1 byte
2168 cram_byte_array_len_encoder e;
2169 cram_stats st;
2170
2171 e.len_encoding = E_HUFFMAN;
2172 e.len_dat = NULL;
2173 memset(&st, 0, sizeof(st));
2174 cram_stats_add(&st, 1);
2175 cram_stats_encoding(fd, &st);
2176
2177 e.val_encoding = E_EXTERNAL;
2178 e.val_dat = (void *)sk;
2179
2180 c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
2181 E_BYTE_ARRAY, (void *)&e,
2182 fd->version);
2183 break;
2184 }
2185
2186 case 's': case 'S': {
2187 // byte array len, 2 byte
2188 cram_byte_array_len_encoder e;
2189 cram_stats st;
2190
2191 e.len_encoding = E_HUFFMAN;
2192 e.len_dat = NULL;
2193 memset(&st, 0, sizeof(st));
2194 cram_stats_add(&st, 2);
2195 cram_stats_encoding(fd, &st);
2196
2197 e.val_encoding = E_EXTERNAL;
2198 e.val_dat = (void *)sk;
2199
2200 c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
2201 E_BYTE_ARRAY, (void *)&e,
2202 fd->version);
2203 break;
2204 }
2205 case 'i': case 'I': case 'f': {
2206 // byte array len, 4 byte
2207 cram_byte_array_len_encoder e;
2208 cram_stats st;
2209
2210 e.len_encoding = E_HUFFMAN;
2211 e.len_dat = NULL;
2212 memset(&st, 0, sizeof(st));
2213 cram_stats_add(&st, 4);
2214 cram_stats_encoding(fd, &st);
2215
2216 e.val_encoding = E_EXTERNAL;
2217 e.val_dat = (void *)sk;
2218
2219 c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
2220 E_BYTE_ARRAY, (void *)&e,
2221 fd->version);
2222 break;
2223 }
2224
2225 case 'B': {
2226 // Byte array of variable size, but we generate our tag
2227 // byte stream at the wrong stage (during reading and not
2228 // after slice header construction). So we use
2229 // BYTE_ARRAY_LEN with the length codec being external
2230 // too.
2231 cram_byte_array_len_encoder e;
2232
2233 e.len_encoding = E_EXTERNAL;
2234 e.len_dat = (void *)sk; // or key+128 for len?
2235
2236 e.val_encoding = E_EXTERNAL;
2237 e.val_dat = (void *)sk;
2238
2239 c = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
2240 E_BYTE_ARRAY, (void *)&e,
2241 fd->version);
2242 break;
2243 }
2244
2245 default:
2246 fprintf(stderr, "Unsupported SAM aux type '%c'\n",
2247 aux[2]);
2248 c = NULL;
2249 }
2250
2251 m->codec = c;
2252
2253 // Link to fd-global tag metrics
2254 m->m = k_global ? (cram_metrics *)kh_val(fd->tags_used, k_global) : NULL;
2255 }
2256
2257 cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k);
2258 cram_codec *codec = tm->codec;
2259
2260 switch(aux[2]) {
2261 case 'A': case 'C': case 'c':
2262 if (!tm->blk) {
2263 if (!(tm->blk = cram_new_block(EXTERNAL, key)))
2264 return NULL;
2265 codec->e_byte_array_len.val_codec->out = tm->blk;
2266 }
2267
2268 aux+=3;
2269 //codec->encode(s, codec, aux, 1);
2270 // Functionally equivalent, but less code.
2271 BLOCK_APPEND_CHAR(tm->blk, *aux);
2272 aux++;
2273 break;
2274
2275 case 'S': case 's':
2276 if (!tm->blk) {
2277 if (!(tm->blk = cram_new_block(EXTERNAL, key)))
2278 return NULL;
2279 codec->e_byte_array_len.val_codec->out = tm->blk;
2280 }
2281
2282 aux+=3;
2283 //codec->encode(s, codec, aux, 2);
2284 BLOCK_APPEND(tm->blk, aux, 2);
2285 aux+=2;
2286 break;
2287
2288 case 'I': case 'i': case 'f':
2289 if (!tm->blk) {
2290 if (!(tm->blk = cram_new_block(EXTERNAL, key)))
2291 return NULL;
2292 codec->e_byte_array_len.val_codec->out = tm->blk;
2293 }
2294
2295 aux+=3;
2296 //codec->encode(s, codec, aux, 4);
2297 BLOCK_APPEND(tm->blk, aux, 4);
2298 aux+=4;
2299 break;
2300
2301 case 'd':
2302 if (!tm->blk) {
2303 if (!(tm->blk = cram_new_block(EXTERNAL, key)))
2304 return NULL;
2305 codec->e_byte_array_len.val_codec->out = tm->blk;
2306 }
2307
2308 aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
2309 //codec->encode(s, codec, aux, 8);
2310 BLOCK_APPEND(tm->blk, aux, 8);
2311 aux+=8;
2312 break;
2313
2314 case 'Z': case 'H':
2315 {
2316 if (!tm->blk) {
2317 if (!(tm->blk = cram_new_block(EXTERNAL, key)))
2318 return NULL;
2319 codec->out = tm->blk;
2320 }
2321
2322 char *aux_s;
2323 aux += 3;
2324 aux_s = aux;
2325 while (*aux++);
2326 codec->encode(s, codec, aux_s, aux - aux_s);
2327 }
2328 break;
2329
2330 case 'B': {
2331 int type = aux[3], blen;
2332 uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) +
2333 (((unsigned char *)aux)[5]<< 8) +
2334 (((unsigned char *)aux)[6]<<16) +
2335 (((unsigned char *)aux)[7]<<24));
2336 if (!tm->blk) {
2337 if (!(tm->blk = cram_new_block(EXTERNAL, key)))
2338 return NULL;
2339 codec->e_byte_array_len.len_codec->out = tm->blk;
2340 codec->e_byte_array_len.val_codec->out = tm->blk;
2341 }
2342
2343 // skip TN field
2344 aux+=3;
2345
2346 // We use BYTE_ARRAY_LEN with external length, so store that first
2347 switch (type) {
2348 case 'c': case 'C':
2349 blen = count;
2350 break;
2351 case 's': case 'S':
2352 blen = 2*count;
2353 break;
2354 case 'i': case 'I': case 'f':
2355 blen = 4*count;
2356 break;
2357 default:
2358 fprintf(stderr, "Unknown sub-type '%c' for aux type 'B'\n",
2359 type);
2360 return NULL;
2361
2362 }
2363
2364 blen += 5; // sub-type & length
2365
2366 codec->encode(s, codec, aux, blen);
2367 aux += blen;
2368 break;
2369 }
2370 default:
2371 fprintf(stderr, "Unknown aux type '%c'\n", aux[2]);
2372 return NULL;
2373 }
2374 tm->blk->m = tm->m;
2375 }
2376
2377 // FIXME: sort BLOCK_DATA(td_b) by char[3] triples
2378
2379 // And and increment TD hash entry
2380 BLOCK_APPEND_CHAR(td_b, 0);
2381
2382 // Duplicate key as BLOCK_DATA() can be realloced to a new pointer.
2383 key = string_ndup(c->comp_hdr->TD_keys,
2384 (char *)BLOCK_DATA(td_b) + TD_blk_size,
2385 BLOCK_SIZE(td_b) - TD_blk_size);
2386 k = kh_put(m_s2i, c->comp_hdr->TD_hash, key, &new);
2387 if (new < 0) {
2388 return NULL;
2389 } else if (new == 0) {
2390 BLOCK_SIZE(td_b) = TD_blk_size;
2391 } else {
2392 kh_val(c->comp_hdr->TD_hash, k) = c->comp_hdr->nTL;
2393 c->comp_hdr->nTL++;
2394 }
2395
2396 cr->TL = kh_val(c->comp_hdr->TD_hash, k);
2397 cram_stats_add(c->stats[DS_TL], cr->TL);
2398
2399 return rg;
2400 }
2401
2402 /*
2403 * During cram_next_container or before the final flush at end of
2404 * file, we update the current slice headers and increment the slice
2405 * number to the next slice.
2406 *
2407 * See cram_next_container() and cram_close().
2408 */
cram_update_curr_slice(cram_container * c)2409 void cram_update_curr_slice(cram_container *c) {
2410 cram_slice *s = c->slice;
2411 if (c->multi_seq) {
2412 s->hdr->ref_seq_id = -2;
2413 s->hdr->ref_seq_start = 0;
2414 s->hdr->ref_seq_span = 0;
2415 } else {
2416 s->hdr->ref_seq_id = c->curr_ref;
2417 s->hdr->ref_seq_start = c->first_base;
2418 s->hdr->ref_seq_span = MAX(0, c->last_base - c->first_base + 1);
2419 }
2420 s->hdr->num_records = c->curr_rec;
2421
2422 if (c->curr_slice == 0) {
2423 if (c->ref_seq_id != s->hdr->ref_seq_id)
2424 c->ref_seq_id = s->hdr->ref_seq_id;
2425 c->ref_seq_start = c->first_base;
2426 }
2427
2428 c->curr_slice++;
2429 }
2430
2431 /*
2432 * Handles creation of a new container or new slice, flushing any
2433 * existing containers when appropriate.
2434 *
2435 * Really this is next slice, which may or may not lead to a new container.
2436 *
2437 * Returns cram_container pointer on success
2438 * NULL on failure.
2439 */
cram_next_container(cram_fd * fd,bam_seq_t * b)2440 static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) {
2441 cram_container *c = fd->ctr;
2442 int i;
2443
2444 /* First occurence */
2445 if (c->curr_ref == -2)
2446 c->curr_ref = bam_ref(b);
2447
2448 if (c->slice)
2449 cram_update_curr_slice(c);
2450
2451 /* Flush container */
2452 if (c->curr_slice == c->max_slice ||
2453 (bam_ref(b) != c->curr_ref && !c->multi_seq)) {
2454 c->ref_seq_span = fd->last_base - c->ref_seq_start + 1;
2455 if (fd->verbose)
2456 fprintf(stderr, "Flush container %d/%d..%d\n",
2457 c->ref_seq_id, c->ref_seq_start,
2458 c->ref_seq_start + c->ref_seq_span -1);
2459
2460 /* Encode slices */
2461 if (fd->pool) {
2462 if (-1 == cram_flush_container_mt(fd, c))
2463 return NULL;
2464 } else {
2465 if (-1 == cram_flush_container(fd, c))
2466 return NULL;
2467
2468 // Move to sep func, as we need cram_flush_container for
2469 // the closing phase to flush the partial container.
2470 for (i = 0; i < c->max_slice; i++) {
2471 cram_free_slice(c->slices[i]);
2472 c->slices[i] = NULL;
2473 }
2474
2475 c->slice = NULL;
2476 c->curr_slice = 0;
2477
2478 /* Easy approach for purposes of freeing stats */
2479 cram_free_container(c);
2480 }
2481
2482 c = fd->ctr = cram_new_container(fd->seqs_per_slice,
2483 fd->slices_per_container);
2484 if (!c)
2485 return NULL;
2486 c->record_counter = fd->record_counter;
2487 c->curr_ref = bam_ref(b);
2488 }
2489
2490 c->last_pos = c->first_base = c->last_base = bam_pos(b)+1;
2491
2492 /* New slice */
2493 c->slice = c->slices[c->curr_slice] =
2494 cram_new_slice(MAPPED_SLICE, c->max_rec);
2495 if (!c->slice)
2496 return NULL;
2497
2498 if (c->multi_seq) {
2499 c->slice->hdr->ref_seq_id = -2;
2500 c->slice->hdr->ref_seq_start = 0;
2501 c->slice->last_apos = 1;
2502 } else {
2503 c->slice->hdr->ref_seq_id = bam_ref(b);
2504 // wrong for unsorted data, will fix during encoding.
2505 c->slice->hdr->ref_seq_start = bam_pos(b)+1;
2506 c->slice->last_apos = bam_pos(b)+1;
2507 }
2508
2509 c->curr_rec = 0;
2510 c->s_num_bases = 0;
2511
2512 return c;
2513 }
2514
2515 /*
2516 * Converts a single bam record into a cram record.
2517 * Possibly used within a thread.
2518 *
2519 * Returns 0 on success;
2520 * -1 on failure
2521 */
process_one_read(cram_fd * fd,cram_container * c,cram_slice * s,cram_record * cr,bam_seq_t * b,int rnum)2522 static int process_one_read(cram_fd *fd, cram_container *c,
2523 cram_slice *s, cram_record *cr,
2524 bam_seq_t *b, int rnum) {
2525 int i, fake_qual = -1;
2526 char *cp, *rg;
2527 char *ref, *seq, *qual;
2528
2529 // FIXME: multi-ref containers
2530
2531 ref = c->ref;
2532 cr->flags = bam_flag(b);
2533 cr->len = bam_seq_len(b);
2534
2535 //fprintf(stderr, "%s => %d\n", rg ? rg : "\"\"", cr->rg);
2536
2537 // Fields to resolve later
2538 //cr->mate_line; // index to another cram_record
2539 //cr->mate_flags; // MF
2540 //cr->ntags; // TC
2541 cr->ntags = 0; //cram_stats_add(c->stats[DS_TC], cr->ntags);
2542 if (CRAM_MAJOR_VERS(fd->version) == 1)
2543 rg = cram_encode_aux_1_0(fd, b, c, s, cr);
2544 else
2545 rg = cram_encode_aux(fd, b, c, s, cr);
2546
2547 //cr->aux_size = b->blk_size - ((char *)bam_aux(b) - (char *)&bam_ref(b));
2548 //cr->aux = DSTRING_LEN(s->aux_ds);
2549 //dstring_nappend(s->aux_ds, bam_aux(b), cr->aux_size);
2550
2551 /* Read group, identified earlier */
2552 if (rg) {
2553 SAM_RG *brg = sam_hdr_find_rg(fd->header, rg);
2554 cr->rg = brg ? brg->id : -1;
2555 } else if (CRAM_MAJOR_VERS(fd->version) == 1) {
2556 SAM_RG *brg = sam_hdr_find_rg(fd->header, "UNKNOWN");
2557 assert(brg);
2558 } else {
2559 cr->rg = -1;
2560 }
2561 cram_stats_add(c->stats[DS_RG], cr->rg);
2562
2563
2564 cr->ref_id = bam_ref(b); cram_stats_add(c->stats[DS_RI], cr->ref_id);
2565 cram_stats_add(c->stats[DS_BF], fd->cram_flag_swap[cr->flags & 0xfff]);
2566
2567 // Non reference based encoding means storing the bases verbatim as features, which in
2568 // turn means every base also has a quality already stored.
2569 if (!fd->no_ref || CRAM_MAJOR_VERS(fd->version) >= 3)
2570 cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES;
2571
2572 if (cr->len <= 0 && CRAM_MAJOR_VERS(fd->version) >= 3)
2573 cr->cram_flags |= CRAM_FLAG_NO_SEQ;
2574 //cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK);
2575
2576 c->num_bases += cr->len;
2577 cr->apos = bam_pos(b)+1;
2578 if (c->pos_sorted) {
2579 if (cr->apos < s->last_apos) {
2580 c->pos_sorted = 0;
2581 } else {
2582 cram_stats_add(c->stats[DS_AP], cr->apos - s->last_apos);
2583 s->last_apos = cr->apos;
2584 }
2585 } else {
2586 //cram_stats_add(c->stats[DS_AP], cr->apos);
2587 }
2588 c->max_apos += (cr->apos > c->max_apos) * (cr->apos - c->max_apos);
2589
2590 /*
2591 * This seqs_ds is largely pointless and it could reuse the same memory
2592 * over and over.
2593 * s->base_blk is what we need for encoding.
2594 */
2595 cr->seq = BLOCK_SIZE(s->seqs_blk);
2596 cr->qual = BLOCK_SIZE(s->qual_blk);
2597 BLOCK_GROW(s->seqs_blk, cr->len+1);
2598 BLOCK_GROW(s->qual_blk, cr->len);
2599 seq = cp = (char *)BLOCK_END(s->seqs_blk);
2600
2601 *seq = 0;
2602 #if HTS_ALLOW_UNALIGNED != 0
2603 {
2604 // Convert seq 2 bases at a time for speed.
2605 static const uint16_t code2base[256] = {
2606 15677, 16701, 17213, 19773, 18237, 21053, 21309, 22077,
2607 21565, 22333, 22845, 18493, 19261, 17469, 16957, 20029,
2608 15681, 16705, 17217, 19777, 18241, 21057, 21313, 22081,
2609 21569, 22337, 22849, 18497, 19265, 17473, 16961, 20033,
2610 15683, 16707, 17219, 19779, 18243, 21059, 21315, 22083,
2611 21571, 22339, 22851, 18499, 19267, 17475, 16963, 20035,
2612 15693, 16717, 17229, 19789, 18253, 21069, 21325, 22093,
2613 21581, 22349, 22861, 18509, 19277, 17485, 16973, 20045,
2614 15687, 16711, 17223, 19783, 18247, 21063, 21319, 22087,
2615 21575, 22343, 22855, 18503, 19271, 17479, 16967, 20039,
2616 15698, 16722, 17234, 19794, 18258, 21074, 21330, 22098,
2617 21586, 22354, 22866, 18514, 19282, 17490, 16978, 20050,
2618 15699, 16723, 17235, 19795, 18259, 21075, 21331, 22099,
2619 21587, 22355, 22867, 18515, 19283, 17491, 16979, 20051,
2620 15702, 16726, 17238, 19798, 18262, 21078, 21334, 22102,
2621 21590, 22358, 22870, 18518, 19286, 17494, 16982, 20054,
2622 15700, 16724, 17236, 19796, 18260, 21076, 21332, 22100,
2623 21588, 22356, 22868, 18516, 19284, 17492, 16980, 20052,
2624 15703, 16727, 17239, 19799, 18263, 21079, 21335, 22103,
2625 21591, 22359, 22871, 18519, 19287, 17495, 16983, 20055,
2626 15705, 16729, 17241, 19801, 18265, 21081, 21337, 22105,
2627 21593, 22361, 22873, 18521, 19289, 17497, 16985, 20057,
2628 15688, 16712, 17224, 19784, 18248, 21064, 21320, 22088,
2629 21576, 22344, 22856, 18504, 19272, 17480, 16968, 20040,
2630 15691, 16715, 17227, 19787, 18251, 21067, 21323, 22091,
2631 21579, 22347, 22859, 18507, 19275, 17483, 16971, 20043,
2632 15684, 16708, 17220, 19780, 18244, 21060, 21316, 22084,
2633 21572, 22340, 22852, 18500, 19268, 17476, 16964, 20036,
2634 15682, 16706, 17218, 19778, 18242, 21058, 21314, 22082,
2635 21570, 22338, 22850, 18498, 19266, 17474, 16962, 20034,
2636 15694, 16718, 17230, 19790, 18254, 21070, 21326, 22094,
2637 21582, 22350, 22862, 18510, 19278, 17486, 16974, 20046
2638 };
2639
2640 int l2 = cr->len / 2;
2641 unsigned char *from = (unsigned char *)bam_seq(b);
2642 uint16_t *cpi = (uint16_t *)cp;
2643 cp[0] = 0;
2644 for (i = 0; i < l2; i++)
2645 cpi[i] = le_int2(code2base[from[i]]);
2646 if ((i *= 2) < cr->len)
2647 cp[i] = seq_nt16_str[bam_seqi(bam_seq(b), i)];
2648 }
2649 #else
2650 for (i = 0; i < cr->len; i++)
2651 cp[i] = seq_nt16_str[bam_seqi(bam_seq(b), i)];
2652 #endif
2653 BLOCK_SIZE(s->seqs_blk) += cr->len;
2654
2655 qual = cp = (char *)bam_qual(b);
2656
2657 /* Copy and parse */
2658 if (!(cr->flags & BAM_FUNMAP)) {
2659 uint32_t *cig_to, *cig_from;
2660 int apos = cr->apos-1, spos = 0;
2661
2662 cr->cigar = s->ncigar;
2663 cr->ncigar = bam_cigar_len(b);
2664 while (cr->cigar + cr->ncigar >= s->cigar_alloc) {
2665 s->cigar_alloc = s->cigar_alloc ? s->cigar_alloc*2 : 1024;
2666 s->cigar = realloc(s->cigar, s->cigar_alloc * sizeof(*s->cigar));
2667 if (!s->cigar)
2668 return -1;
2669 }
2670
2671 cig_to = (uint32_t *)s->cigar;
2672 cig_from = (uint32_t *)bam_cigar(b);
2673
2674 cr->feature = 0;
2675 cr->nfeature = 0;
2676 for (i = 0; i < cr->ncigar; i++) {
2677 enum cigar_op cig_op = cig_from[i] & BAM_CIGAR_MASK;
2678 uint32_t cig_len = cig_from[i] >> BAM_CIGAR_SHIFT;
2679 cig_to[i] = cig_from[i];
2680
2681 /* Can also generate events from here for CRAM diffs */
2682
2683 switch (cig_op) {
2684 int l;
2685
2686 // Don't trust = and X ops to be correct.
2687 case BAM_CMATCH:
2688 case BAM_CBASE_MATCH:
2689 case BAM_CBASE_MISMATCH:
2690 //fprintf(stderr, "\nBAM_CMATCH\nR: %.*s\nS: %.*s\n",
2691 // cig_len, &ref[apos], cig_len, &seq[spos]);
2692 l = 0;
2693 if (!fd->no_ref && cr->len) {
2694 int end = cig_len+apos < c->ref_end
2695 ? cig_len : c->ref_end - apos;
2696 char *sp = &seq[spos];
2697 char *rp = &ref[apos];
2698 char *qp = &qual[spos];
2699 if (end > cr->len) {
2700 fprintf(stderr, "CIGAR and query sequence are of "
2701 "different length\n");
2702 return -1;
2703 }
2704 for (l = 0; l < end; l++) {
2705 if (rp[l] != sp[l]) {
2706 if (!sp[l])
2707 break;
2708 if (0 && CRAM_MAJOR_VERS(fd->version) >= 3) {
2709 // Disabled for the time being as it doesn't
2710 // seem to gain us much.
2711 int ol=l;
2712 while (l<end && rp[l] != sp[l])
2713 l++;
2714 if (l-ol > 1) {
2715 if (cram_add_bases(fd, c, s, cr, spos+ol,
2716 l-ol, &seq[spos+ol]))
2717 return -1;
2718 l--;
2719 } else {
2720 l = ol;
2721 if (cram_add_substitution(fd, c, s, cr,
2722 spos+l, sp[l],
2723 qp[l], rp[l]))
2724 return -1;
2725 }
2726 } else {
2727 if (cram_add_substitution(fd, c, s, cr, spos+l,
2728 sp[l], qp[l], rp[l]))
2729 return -1;
2730 }
2731 }
2732 }
2733 spos += l;
2734 apos += l;
2735 }
2736
2737 if (l < cig_len && cr->len) {
2738 if (fd->no_ref) {
2739 if (CRAM_MAJOR_VERS(fd->version) == 3) {
2740 if (cram_add_bases(fd, c, s, cr, spos,
2741 cig_len-l, &seq[spos]))
2742 return -1;
2743 spos += cig_len-l;
2744 } else {
2745 for (; l < cig_len && seq[spos]; l++, spos++) {
2746 if (cram_add_base(fd, c, s, cr, spos,
2747 seq[spos], qual[spos]))
2748 return -1;
2749 }
2750 }
2751 } else {
2752 /* off end of sequence or non-ref based output */
2753 for (; l < cig_len && seq[spos]; l++, spos++) {
2754 if (cram_add_base(fd, c, s, cr, spos,
2755 seq[spos], qual[spos]))
2756 return -1;
2757 }
2758 }
2759 apos += cig_len;
2760 } else if (!cr->len) {
2761 /* Seq "*" */
2762 apos += cig_len;
2763 spos += cig_len;
2764 }
2765 break;
2766
2767 case BAM_CDEL:
2768 if (cram_add_deletion(c, s, cr, spos, cig_len, &seq[spos]))
2769 return -1;
2770 apos += cig_len;
2771 break;
2772
2773 case BAM_CREF_SKIP:
2774 if (cram_add_skip(c, s, cr, spos, cig_len, &seq[spos]))
2775 return -1;
2776 apos += cig_len;
2777 break;
2778
2779 case BAM_CINS:
2780 if (cram_add_insertion(c, s, cr, spos, cig_len,
2781 cr->len ? &seq[spos] : NULL))
2782 return -1;
2783 if (fd->no_ref && cr->len) {
2784 for (l = 0; l < cig_len; l++, spos++) {
2785 cram_add_quality(fd, c, s, cr, spos, qual[spos]);
2786 }
2787 } else {
2788 spos += cig_len;
2789 }
2790 break;
2791
2792 case BAM_CSOFT_CLIP:
2793 if (cram_add_softclip(c, s, cr, spos, cig_len,
2794 cr->len ? &seq[spos] : NULL,
2795 fd->version))
2796 return -1;
2797
2798 if (fd->no_ref &&
2799 !(cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
2800 if (cr->len) {
2801 for (l = 0; l < cig_len; l++, spos++) {
2802 cram_add_quality(fd, c, s, cr, spos, qual[spos]);
2803 }
2804 } else {
2805 for (l = 0; l < cig_len; l++, spos++) {
2806 cram_add_quality(fd, c, s, cr, spos, -1);
2807 }
2808 }
2809 } else {
2810 spos += cig_len;
2811 }
2812 break;
2813
2814 case BAM_CHARD_CLIP:
2815 if (cram_add_hardclip(c, s, cr, spos, cig_len, &seq[spos]))
2816 return -1;
2817 break;
2818
2819 case BAM_CPAD:
2820 if (cram_add_pad(c, s, cr, spos, cig_len, &seq[spos]))
2821 return -1;
2822 break;
2823
2824 default:
2825 fprintf(stderr, "Unknown CIGAR op code %d\n", cig_op);
2826 return -1;
2827 }
2828 }
2829 if (cr->len && spos != cr->len) {
2830 fprintf(stderr, "CIGAR and query sequence are of different "
2831 "length\n");
2832 return -1;
2833 }
2834 fake_qual = spos;
2835 cr->aend = fd->no_ref ? apos : MIN(apos, c->ref_end);
2836 cram_stats_add(c->stats[DS_FN], cr->nfeature);
2837 } else {
2838 // Unmapped
2839 cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES;
2840 cr->cigar = 0;
2841 cr->ncigar = 0;
2842 cr->nfeature = 0;
2843 cr->aend = cr->apos;
2844 for (i = 0; i < cr->len; i++)
2845 cram_stats_add(c->stats[DS_BA], seq[i]);
2846 fake_qual = 0;
2847 }
2848
2849 /*
2850 * Append to the qual block now. We do this here as
2851 * cram_add_substitution() can generate BA/QS events which need to
2852 * be in the qual block before we append the rest of the data.
2853 */
2854 if (cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES) {
2855 /* Special case of seq "*" */
2856 if (cr->len == 0) {
2857 cr->len = fake_qual;
2858 BLOCK_GROW(s->qual_blk, cr->len);
2859 cp = (char *)BLOCK_END(s->qual_blk);
2860 memset(cp, 255, cr->len);
2861 } else {
2862 BLOCK_GROW(s->qual_blk, cr->len);
2863 cp = (char *)BLOCK_END(s->qual_blk);
2864 char *from = (char *)&bam_qual(b)[0];
2865 char *to = &cp[0];
2866 memcpy(to, from, cr->len);
2867 //for (i = 0; i < cr->len; i++) cp[i] = from[i];
2868 }
2869 BLOCK_SIZE(s->qual_blk) += cr->len;
2870 } else {
2871 if (cr->len == 0)
2872 cr->len = fake_qual >= 0 ? fake_qual : cr->aend - cr->apos + 1;
2873 }
2874
2875 cram_stats_add(c->stats[DS_RL], cr->len);
2876
2877 /* Now we know apos and aend both, update mate-pair information */
2878 {
2879 int new;
2880 khint_t k;
2881 int sec = (cr->flags & BAM_FSECONDARY) ? 1 : 0;
2882
2883 //fprintf(stderr, "Checking %"PRId64"/%.*s\t", rnum,
2884 // cr->name_len, DSTRING_STR(s->name_ds)+cr->name);
2885 if (cr->flags & BAM_FPAIRED) {
2886 char *key = string_ndup(s->pair_keys, bam_name(b), bam_name_len(b));
2887 if (!key)
2888 return -1;
2889
2890 k = kh_put(m_s2i, s->pair[sec], key, &new);
2891 if (-1 == new)
2892 return -1;
2893 else if (new > 0)
2894 kh_val(s->pair[sec], k) = rnum;
2895 } else {
2896 new = 1;
2897 }
2898
2899 if (new == 0) {
2900 cram_record *p = &s->crecs[kh_val(s->pair[sec], k)];
2901 int aleft, aright, sign;
2902
2903 aleft = MIN(cr->apos, p->apos);
2904 aright = MAX(cr->aend, p->aend);
2905 if (cr->apos < p->apos) {
2906 sign = 1;
2907 } else if (cr->apos > p->apos) {
2908 sign = -1;
2909 } else if (cr->flags & BAM_FREAD1) {
2910 sign = 1;
2911 } else {
2912 sign = -1;
2913 }
2914
2915 // This vs p: tlen, matepos, flags. Permit TLEN 0 and/or TLEN +/-
2916 // a small amount, if appropriate options set.
2917 if ((bam_ins_size(b) &&
2918 abs(bam_ins_size(b) - sign*(aright-aleft+1)) > fd->tlen_approx) ||
2919 (!bam_ins_size(b) && !fd->tlen_zero))
2920 goto detached;
2921
2922 if ((!fd->tlen_zero && MAX(bam_mate_pos(b)+1, 0) != p->apos) &&
2923 !(fd->tlen_zero && bam_mate_pos(b) == 0))
2924 goto detached;
2925
2926 if (((bam_flag(b) & BAM_FMUNMAP) != 0) !=
2927 ((p->flags & BAM_FUNMAP) != 0))
2928 goto detached;
2929
2930 if (((bam_flag(b) & BAM_FMREVERSE) != 0) !=
2931 ((p->flags & BAM_FREVERSE) != 0))
2932 goto detached;
2933
2934
2935 // p vs this: tlen, matepos, flags
2936 if (p->ref_id != cr->ref_id &&
2937 !(fd->tlen_zero && p->ref_id == -1))
2938 goto detached;
2939
2940 if ((p->tlen && abs(p->tlen - -sign*(aright-aleft+1)) > fd->tlen_approx) ||
2941 (!p->tlen && !fd->tlen_zero))
2942 goto detached;
2943
2944 if (p->mate_pos != cr->apos &&
2945 !(fd->tlen_zero && p->mate_pos == 0))
2946 goto detached;
2947
2948 if (((p->flags & BAM_FMUNMAP) != 0) !=
2949 ((p->mate_flags & CRAM_M_UNMAP) != 0))
2950 goto detached;
2951
2952 if (((p->flags & BAM_FMREVERSE) != 0) !=
2953 ((p->mate_flags & CRAM_M_REVERSE) != 0))
2954 goto detached;
2955
2956 // Supplementary reads are just too ill defined
2957 if ((cr->flags & BAM_FSUPPLEMENTARY) ||
2958 (p->flags & BAM_FSUPPLEMENTARY))
2959 goto detached;
2960
2961 // When in lossy name mode, if a read isn't detached we
2962 // cannot store the name. The corollary is that when we
2963 // must store the name, it must be detached (inefficient).
2964 if (fd->lossy_read_names &&
2965 (!(cr->cram_flags & CRAM_FLAG_DISCARD_NAME) ||
2966 !((p->cram_flags & CRAM_FLAG_DISCARD_NAME))))
2967 goto detached;
2968
2969 /*
2970 * The fields below are unused when encoding this read as it is
2971 * no longer detached. In theory they may get referred to when
2972 * processing a 3rd or 4th read in this template?, so we set them
2973 * here just to be sure.
2974 *
2975 * They do not need cram_stats_add() calls those as they are
2976 * not emitted.
2977 */
2978 cr->mate_pos = p->apos;
2979 cr->tlen = sign*(aright-aleft+1);
2980 cr->mate_flags =
2981 ((p->flags & BAM_FMUNMAP) == BAM_FMUNMAP) * CRAM_M_UNMAP +
2982 ((p->flags & BAM_FMREVERSE) == BAM_FMREVERSE) * CRAM_M_REVERSE;
2983
2984 // Decrement statistics aggregated earlier
2985 if (p->cram_flags & CRAM_FLAG_STATS_ADDED) {
2986 cram_stats_del(c->stats[DS_NP], p->mate_pos);
2987 cram_stats_del(c->stats[DS_MF], p->mate_flags);
2988 cram_stats_del(c->stats[DS_TS], p->tlen);
2989 cram_stats_del(c->stats[DS_NS], p->mate_ref_id);
2990 }
2991
2992 /* Similarly we could correct the p-> values too, but these will no
2993 * longer have any code that refers back to them as the new 'p'
2994 * for this template is our current 'cr'.
2995 */
2996 //p->mate_pos = cr->apos;
2997 //p->mate_flags =
2998 // ((cr->flags & BAM_FMUNMAP) == BAM_FMUNMAP) * CRAM_M_UNMAP +
2999 // ((cr->flags & BAM_FMREVERSE) == BAM_FMREVERSE)* CRAM_M_REVERSE;
3000 //p->tlen = p->apos - cr->aend;
3001
3002 // Clear detached from cr flags
3003 cr->cram_flags &= ~CRAM_FLAG_DETACHED;
3004 cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK);
3005
3006 // Clear detached from p flags and set downstream
3007 if (p->cram_flags & CRAM_FLAG_STATS_ADDED) {
3008 cram_stats_del(c->stats[DS_CF], p->cram_flags & CRAM_FLAG_MASK);
3009 p->cram_flags &= ~CRAM_FLAG_STATS_ADDED;
3010 }
3011
3012 p->cram_flags &= ~CRAM_FLAG_DETACHED;
3013 p->cram_flags |= CRAM_FLAG_MATE_DOWNSTREAM;
3014 cram_stats_add(c->stats[DS_CF], p->cram_flags & CRAM_FLAG_MASK);
3015
3016 p->mate_line = rnum - (kh_val(s->pair[sec], k) + 1);
3017 cram_stats_add(c->stats[DS_NF], p->mate_line);
3018
3019 kh_val(s->pair[sec], k) = rnum;
3020 } else {
3021 detached:
3022 //fprintf(stderr, "unpaired\n");
3023
3024 /* Derive mate flags from this flag */
3025 cr->mate_flags = 0;
3026 if (bam_flag(b) & BAM_FMUNMAP)
3027 cr->mate_flags |= CRAM_M_UNMAP;
3028 if (bam_flag(b) & BAM_FMREVERSE)
3029 cr->mate_flags |= CRAM_M_REVERSE;
3030
3031 cram_stats_add(c->stats[DS_MF], cr->mate_flags);
3032
3033 cr->mate_pos = MAX(bam_mate_pos(b)+1, 0);
3034 cram_stats_add(c->stats[DS_NP], cr->mate_pos);
3035
3036 cr->tlen = bam_ins_size(b);
3037 cram_stats_add(c->stats[DS_TS], cr->tlen);
3038
3039 cr->cram_flags |= CRAM_FLAG_DETACHED;
3040 cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK);
3041 cram_stats_add(c->stats[DS_NS], bam_mate_ref(b));
3042
3043 cr->cram_flags |= CRAM_FLAG_STATS_ADDED;
3044 }
3045 }
3046
3047 cr->mqual = bam_map_qual(b);
3048 cram_stats_add(c->stats[DS_MQ], cr->mqual);
3049
3050 cr->mate_ref_id = bam_mate_ref(b);
3051
3052 if (!(bam_flag(b) & BAM_FUNMAP)) {
3053 if (c->first_base > cr->apos)
3054 c->first_base = cr->apos;
3055
3056 if (c->last_base < cr->aend)
3057 c->last_base = cr->aend;
3058 }
3059
3060 return 0;
3061 }
3062
3063 /*
3064 * Write iterator: put BAM format sequences into a CRAM file.
3065 * We buffer up a containers worth of data at a time.
3066 *
3067 * Returns 0 on success
3068 * -1 on failure
3069 */
cram_put_bam_seq(cram_fd * fd,bam_seq_t * b)3070 int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) {
3071 cram_container *c;
3072
3073 if (!fd->ctr) {
3074 fd->ctr = cram_new_container(fd->seqs_per_slice,
3075 fd->slices_per_container);
3076 if (!fd->ctr)
3077 return -1;
3078 fd->ctr->record_counter = fd->record_counter;
3079 }
3080 c = fd->ctr;
3081
3082 if (!c->slice || c->curr_rec == c->max_rec ||
3083 (bam_ref(b) != c->curr_ref && c->curr_ref >= -1) ||
3084 (c->s_num_bases >= fd->bases_per_slice)) {
3085 int slice_rec, curr_rec, multi_seq = fd->multi_seq == 1;
3086 int curr_ref = c->slice ? c->curr_ref : bam_ref(b);
3087
3088 /*
3089 * Start packing slices when we routinely have under 1/4tr full.
3090 *
3091 * This option isn't available if we choose to embed references
3092 * since we can only have one per slice.
3093 */
3094 if (fd->multi_seq == -1 && c->curr_rec < c->max_rec/4+10 &&
3095 fd->last_slice && fd->last_slice < c->max_rec/4+10 &&
3096 !fd->embed_ref) {
3097 if (fd->verbose && !c->multi_seq)
3098 fprintf(stderr, "Multi-ref enabled for this container\n");
3099 multi_seq = 1;
3100 }
3101
3102 slice_rec = c->slice_rec;
3103 curr_rec = c->curr_rec;
3104
3105 if (CRAM_MAJOR_VERS(fd->version) == 1 ||
3106 c->curr_rec == c->max_rec || fd->multi_seq != 1 || !c->slice ||
3107 c->s_num_bases >= fd->bases_per_slice) {
3108 if (NULL == (c = cram_next_container(fd, b))) {
3109 if (fd->ctr) {
3110 // prevent cram_close attempting to flush
3111 cram_free_container(fd->ctr);
3112 fd->ctr = NULL;
3113 }
3114 return -1;
3115 }
3116 }
3117
3118 /*
3119 * Due to our processing order, some things we've already done we
3120 * cannot easily undo. So when we first notice we should be packing
3121 * multiple sequences per container we emit the small partial
3122 * container as-is and then start a fresh one in a different mode.
3123 */
3124 if (multi_seq) {
3125 fd->multi_seq = 1;
3126 c->multi_seq = 1;
3127 c->pos_sorted = 0; // required atm for multi_seq slices
3128
3129 if (!c->refs_used) {
3130 pthread_mutex_lock(&fd->ref_lock);
3131 c->refs_used = calloc(fd->refs->nref, sizeof(int));
3132 pthread_mutex_unlock(&fd->ref_lock);
3133 if (!c->refs_used)
3134 return -1;
3135 }
3136 }
3137
3138 fd->last_slice = curr_rec - slice_rec;
3139 c->slice_rec = c->curr_rec;
3140
3141 // Have we seen this reference before?
3142 if (bam_ref(b) >= 0 && curr_ref >= 0 && bam_ref(b) != curr_ref && !fd->embed_ref &&
3143 !fd->unsorted && multi_seq) {
3144
3145 if (!c->refs_used) {
3146 pthread_mutex_lock(&fd->ref_lock);
3147 c->refs_used = calloc(fd->refs->nref, sizeof(int));
3148 pthread_mutex_unlock(&fd->ref_lock);
3149 if (!c->refs_used)
3150 return -1;
3151 } else if (c->refs_used && c->refs_used[bam_ref(b)]) {
3152 pthread_mutex_lock(&fd->ref_lock);
3153 fd->unsorted = 1;
3154 pthread_mutex_unlock(&fd->ref_lock);
3155 fd->multi_seq = 1;
3156 }
3157 }
3158
3159 c->curr_ref = bam_ref(b);
3160 if (c->refs_used && c->curr_ref >= 0) c->refs_used[c->curr_ref]++;
3161 }
3162
3163 if (!c->bams) {
3164 /* First time through, allocate a set of bam pointers */
3165 pthread_mutex_lock(&fd->bam_list_lock);
3166 if (fd->bl) {
3167 spare_bams *spare = fd->bl;
3168 c->bams = spare->bams;
3169 fd->bl = spare->next;
3170 free(spare);
3171 } else {
3172 c->bams = calloc(c->max_c_rec, sizeof(bam_seq_t *));
3173 if (!c->bams)
3174 return -1;
3175 }
3176 pthread_mutex_unlock(&fd->bam_list_lock);
3177 }
3178
3179 /* Copy or alloc+copy the bam record, for later encoding */
3180 if (c->bams[c->curr_c_rec])
3181 bam_copy1(c->bams[c->curr_c_rec], b);
3182 else
3183 c->bams[c->curr_c_rec] = bam_dup(b);
3184
3185 c->curr_rec++;
3186 c->curr_c_rec++;
3187 c->s_num_bases += bam_seq_len(b);
3188 fd->record_counter++;
3189
3190 return 0;
3191 }
3192