1 /*
2 Copyright (c) 2012-2020 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
32 #include <config.h>
33
34 #include <stdio.h>
35 #include <errno.h>
36 #include <assert.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <strings.h>
40 #include <zlib.h>
41 #include <sys/types.h>
42 #include <sys/stat.h>
43 #include <math.h>
44 #include <inttypes.h>
45
46 #include "cram.h"
47 #include "os.h"
48 #include "../sam_internal.h" // for nibble2base
49 #include "../htslib/hts.h"
50 #include "../htslib/hts_endian.h"
51
52 KHASH_MAP_INIT_STR(m_s2u64, uint64_t)
53
54 #define Z_CRAM_STRAT Z_FILTERED
55 //#define Z_CRAM_STRAT Z_RLE
56 //#define Z_CRAM_STRAT Z_HUFFMAN_ONLY
57 //#define Z_CRAM_STRAT Z_DEFAULT_STRATEGY
58
59 static int process_one_read(cram_fd *fd, cram_container *c,
60 cram_slice *s, cram_record *cr,
61 bam_seq_t *b, int rnum, kstring_t *MD);
62
63 /*
64 * Returns index of val into key.
65 * Basically strchr(key, val)-key;
66 */
sub_idx(char * key,char val)67 static int sub_idx(char *key, char val) {
68 int i;
69
70 for (i = 0; *key && *key++ != val; i++);
71 return i;
72 }
73
74 /*
75 * Encodes a compression header block into a generic cram_block structure.
76 *
77 * Returns cram_block ptr on success
78 * NULL on failure
79 */
cram_encode_compression_header(cram_fd * fd,cram_container * c,cram_block_compression_hdr * h)80 cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
81 cram_block_compression_hdr *h) {
82 cram_block *cb = cram_new_block(COMPRESSION_HEADER, 0);
83 cram_block *map = cram_new_block(COMPRESSION_HEADER, 0);
84 int i, mc, r = 0;
85
86 if (!cb || !map)
87 return NULL;
88
89 /*
90 * This is a concatenation of several blocks of data:
91 * header + landmarks, preservation map, read encoding map, and the tag
92 * encoding map.
93 * All 4 are variable sized and we need to know how large these are
94 * before creating the compression header itself as this starts with
95 * the total size (stored as a variable length string).
96 */
97
98 // Duplicated from container itself, and removed in 1.1
99 if (CRAM_MAJOR_VERS(fd->version) == 1) {
100 r |= itf8_put_blk(cb, h->ref_seq_id);
101 r |= itf8_put_blk(cb, h->ref_seq_start);
102 r |= itf8_put_blk(cb, h->ref_seq_span);
103 r |= itf8_put_blk(cb, h->num_records);
104 r |= itf8_put_blk(cb, h->num_landmarks);
105 for (i = 0; i < h->num_landmarks; i++) {
106 r |= itf8_put_blk(cb, h->landmark[i]);
107 }
108 }
109
110 if (h->preservation_map) {
111 kh_destroy(map, h->preservation_map);
112 h->preservation_map = NULL;
113 }
114
115 /* Create in-memory preservation map */
116 /* FIXME: should create this when we create the container */
117 if (c->num_records > 0) {
118 khint_t k;
119 int r;
120
121 if (!(h->preservation_map = kh_init(map)))
122 return NULL;
123
124 k = kh_put(map, h->preservation_map, "RN", &r);
125 if (-1 == r) return NULL;
126 kh_val(h->preservation_map, k).i = !fd->lossy_read_names;
127
128 if (CRAM_MAJOR_VERS(fd->version) == 1) {
129 k = kh_put(map, h->preservation_map, "PI", &r);
130 if (-1 == r) return NULL;
131 kh_val(h->preservation_map, k).i = 0;
132
133 k = kh_put(map, h->preservation_map, "UI", &r);
134 if (-1 == r) return NULL;
135 kh_val(h->preservation_map, k).i = 1;
136
137 k = kh_put(map, h->preservation_map, "MI", &r);
138 if (-1 == r) return NULL;
139 kh_val(h->preservation_map, k).i = 1;
140
141 } else {
142 // Technically SM was in 1.0, but wasn't in Java impl.
143 k = kh_put(map, h->preservation_map, "SM", &r);
144 if (-1 == r) return NULL;
145 kh_val(h->preservation_map, k).i = 0;
146
147 k = kh_put(map, h->preservation_map, "TD", &r);
148 if (-1 == r) return NULL;
149 kh_val(h->preservation_map, k).i = 0;
150
151 k = kh_put(map, h->preservation_map, "AP", &r);
152 if (-1 == r) return NULL;
153 kh_val(h->preservation_map, k).i = h->AP_delta;
154
155 if (CRAM_MAJOR_VERS(fd->version) >= 4) {
156 k = kh_put(map, h->preservation_map, "QO", &r);
157 if (-1 == r) return NULL;
158 kh_val(h->preservation_map, k).i = h->qs_seq_orient;
159 }
160
161 if (fd->no_ref || fd->embed_ref) {
162 // Reference Required == No
163 k = kh_put(map, h->preservation_map, "RR", &r);
164 if (-1 == r) return NULL;
165 kh_val(h->preservation_map, k).i = 0;
166 }
167 }
168 }
169
170 /* Encode preservation map; could collapse this and above into one */
171 mc = 0;
172 BLOCK_SIZE(map) = 0;
173 if (h->preservation_map) {
174 khint_t k;
175
176 for (k = kh_begin(h->preservation_map);
177 k != kh_end(h->preservation_map);
178 k++) {
179 const char *key;
180 khash_t(map) *pmap = h->preservation_map;
181
182
183 if (!kh_exist(pmap, k))
184 continue;
185
186 key = kh_key(pmap, k);
187 BLOCK_APPEND(map, key, 2);
188
189 switch(CRAM_KEY(key[0], key[1])) {
190 case CRAM_KEY('M','I'):
191 case CRAM_KEY('U','I'):
192 case CRAM_KEY('P','I'):
193 case CRAM_KEY('A','P'):
194 case CRAM_KEY('R','N'):
195 case CRAM_KEY('R','R'):
196 case CRAM_KEY('Q','O'):
197 BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
198 break;
199
200 case CRAM_KEY('S','M'): {
201 char smat[5], *mp = smat;
202 *mp++ =
203 (sub_idx("CGTN", h->substitution_matrix[0][0]) << 6) |
204 (sub_idx("CGTN", h->substitution_matrix[0][1]) << 4) |
205 (sub_idx("CGTN", h->substitution_matrix[0][2]) << 2) |
206 (sub_idx("CGTN", h->substitution_matrix[0][3]) << 0);
207 *mp++ =
208 (sub_idx("AGTN", h->substitution_matrix[1][0]) << 6) |
209 (sub_idx("AGTN", h->substitution_matrix[1][1]) << 4) |
210 (sub_idx("AGTN", h->substitution_matrix[1][2]) << 2) |
211 (sub_idx("AGTN", h->substitution_matrix[1][3]) << 0);
212 *mp++ =
213 (sub_idx("ACTN", h->substitution_matrix[2][0]) << 6) |
214 (sub_idx("ACTN", h->substitution_matrix[2][1]) << 4) |
215 (sub_idx("ACTN", h->substitution_matrix[2][2]) << 2) |
216 (sub_idx("ACTN", h->substitution_matrix[2][3]) << 0);
217 *mp++ =
218 (sub_idx("ACGN", h->substitution_matrix[3][0]) << 6) |
219 (sub_idx("ACGN", h->substitution_matrix[3][1]) << 4) |
220 (sub_idx("ACGN", h->substitution_matrix[3][2]) << 2) |
221 (sub_idx("ACGN", h->substitution_matrix[3][3]) << 0);
222 *mp++ =
223 (sub_idx("ACGT", h->substitution_matrix[4][0]) << 6) |
224 (sub_idx("ACGT", h->substitution_matrix[4][1]) << 4) |
225 (sub_idx("ACGT", h->substitution_matrix[4][2]) << 2) |
226 (sub_idx("ACGT", h->substitution_matrix[4][3]) << 0);
227 BLOCK_APPEND(map, smat, 5);
228 break;
229 }
230
231 case CRAM_KEY('T','D'): {
232 r |= (fd->vv.varint_put32_blk(map, BLOCK_SIZE(h->TD_blk)) <= 0);
233 BLOCK_APPEND(map,
234 BLOCK_DATA(h->TD_blk),
235 BLOCK_SIZE(h->TD_blk));
236 break;
237 }
238
239 default:
240 hts_log_warning("Unknown preservation key '%.2s'", key);
241 break;
242 }
243
244 mc++;
245 }
246 }
247 r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0);
248 r |= (fd->vv.varint_put32_blk(cb, mc) <= 0);
249 BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
250
251 /* rec encoding map */
252 mc = 0;
253 BLOCK_SIZE(map) = 0;
254 if (h->codecs[DS_BF]) {
255 if (-1 == h->codecs[DS_BF]->store(h->codecs[DS_BF], map, "BF",
256 fd->version))
257 return NULL;
258 mc++;
259 }
260 if (h->codecs[DS_CF]) {
261 if (-1 == h->codecs[DS_CF]->store(h->codecs[DS_CF], map, "CF",
262 fd->version))
263 return NULL;
264 mc++;
265 }
266 if (h->codecs[DS_RL]) {
267 if (-1 == h->codecs[DS_RL]->store(h->codecs[DS_RL], map, "RL",
268 fd->version))
269 return NULL;
270 mc++;
271 }
272 if (h->codecs[DS_AP]) {
273 if (-1 == h->codecs[DS_AP]->store(h->codecs[DS_AP], map, "AP",
274 fd->version))
275 return NULL;
276 mc++;
277 }
278 if (h->codecs[DS_RG]) {
279 if (-1 == h->codecs[DS_RG]->store(h->codecs[DS_RG], map, "RG",
280 fd->version))
281 return NULL;
282 mc++;
283 }
284 if (h->codecs[DS_MF]) {
285 if (-1 == h->codecs[DS_MF]->store(h->codecs[DS_MF], map, "MF",
286 fd->version))
287 return NULL;
288 mc++;
289 }
290 if (h->codecs[DS_NS]) {
291 if (-1 == h->codecs[DS_NS]->store(h->codecs[DS_NS], map, "NS",
292 fd->version))
293 return NULL;
294 mc++;
295 }
296 if (h->codecs[DS_NP]) {
297 if (-1 == h->codecs[DS_NP]->store(h->codecs[DS_NP], map, "NP",
298 fd->version))
299 return NULL;
300 mc++;
301 }
302 if (h->codecs[DS_TS]) {
303 if (-1 == h->codecs[DS_TS]->store(h->codecs[DS_TS], map, "TS",
304 fd->version))
305 return NULL;
306 mc++;
307 }
308 if (h->codecs[DS_NF]) {
309 if (-1 == h->codecs[DS_NF]->store(h->codecs[DS_NF], map, "NF",
310 fd->version))
311 return NULL;
312 mc++;
313 }
314 if (h->codecs[DS_TC]) {
315 if (-1 == h->codecs[DS_TC]->store(h->codecs[DS_TC], map, "TC",
316 fd->version))
317 return NULL;
318 mc++;
319 }
320 if (h->codecs[DS_TN]) {
321 if (-1 == h->codecs[DS_TN]->store(h->codecs[DS_TN], map, "TN",
322 fd->version))
323 return NULL;
324 mc++;
325 }
326 if (h->codecs[DS_TL]) {
327 if (-1 == h->codecs[DS_TL]->store(h->codecs[DS_TL], map, "TL",
328 fd->version))
329 return NULL;
330 mc++;
331 }
332 if (h->codecs[DS_FN]) {
333 if (-1 == h->codecs[DS_FN]->store(h->codecs[DS_FN], map, "FN",
334 fd->version))
335 return NULL;
336 mc++;
337 }
338 if (h->codecs[DS_FC]) {
339 if (-1 == h->codecs[DS_FC]->store(h->codecs[DS_FC], map, "FC",
340 fd->version))
341 return NULL;
342 mc++;
343 }
344 if (h->codecs[DS_FP]) {
345 if (-1 == h->codecs[DS_FP]->store(h->codecs[DS_FP], map, "FP",
346 fd->version))
347 return NULL;
348 mc++;
349 }
350 if (h->codecs[DS_BS]) {
351 if (-1 == h->codecs[DS_BS]->store(h->codecs[DS_BS], map, "BS",
352 fd->version))
353 return NULL;
354 mc++;
355 }
356 if (h->codecs[DS_IN]) {
357 if (-1 == h->codecs[DS_IN]->store(h->codecs[DS_IN], map, "IN",
358 fd->version))
359 return NULL;
360 mc++;
361 }
362 if (h->codecs[DS_DL]) {
363 if (-1 == h->codecs[DS_DL]->store(h->codecs[DS_DL], map, "DL",
364 fd->version))
365 return NULL;
366 mc++;
367 }
368 if (h->codecs[DS_BA]) {
369 if (-1 == h->codecs[DS_BA]->store(h->codecs[DS_BA], map, "BA",
370 fd->version))
371 return NULL;
372 mc++;
373 }
374 if (h->codecs[DS_BB]) {
375 if (-1 == h->codecs[DS_BB]->store(h->codecs[DS_BB], map, "BB",
376 fd->version))
377 return NULL;
378 mc++;
379 }
380 if (h->codecs[DS_MQ]) {
381 if (-1 == h->codecs[DS_MQ]->store(h->codecs[DS_MQ], map, "MQ",
382 fd->version))
383 return NULL;
384 mc++;
385 }
386 if (h->codecs[DS_RN]) {
387 if (-1 == h->codecs[DS_RN]->store(h->codecs[DS_RN], map, "RN",
388 fd->version))
389 return NULL;
390 mc++;
391 }
392 if (h->codecs[DS_QS]) {
393 if (-1 == h->codecs[DS_QS]->store(h->codecs[DS_QS], map, "QS",
394 fd->version))
395 return NULL;
396 mc++;
397 }
398 if (h->codecs[DS_QQ]) {
399 if (-1 == h->codecs[DS_QQ]->store(h->codecs[DS_QQ], map, "QQ",
400 fd->version))
401 return NULL;
402 mc++;
403 }
404 if (h->codecs[DS_RI]) {
405 if (-1 == h->codecs[DS_RI]->store(h->codecs[DS_RI], map, "RI",
406 fd->version))
407 return NULL;
408 mc++;
409 }
410 if (CRAM_MAJOR_VERS(fd->version) != 1) {
411 if (h->codecs[DS_SC]) {
412 if (-1 == h->codecs[DS_SC]->store(h->codecs[DS_SC], map, "SC",
413 fd->version))
414 return NULL;
415 mc++;
416 }
417 if (h->codecs[DS_RS]) {
418 if (-1 == h->codecs[DS_RS]->store(h->codecs[DS_RS], map, "RS",
419 fd->version))
420 return NULL;
421 mc++;
422 }
423 if (h->codecs[DS_PD]) {
424 if (-1 == h->codecs[DS_PD]->store(h->codecs[DS_PD], map, "PD",
425 fd->version))
426 return NULL;
427 mc++;
428 }
429 if (h->codecs[DS_HC]) {
430 if (-1 == h->codecs[DS_HC]->store(h->codecs[DS_HC], map, "HC",
431 fd->version))
432 return NULL;
433 mc++;
434 }
435 }
436 if (h->codecs[DS_TM]) {
437 if (-1 == h->codecs[DS_TM]->store(h->codecs[DS_TM], map, "TM",
438 fd->version))
439 return NULL;
440 mc++;
441 }
442 if (h->codecs[DS_TV]) {
443 if (-1 == h->codecs[DS_TV]->store(h->codecs[DS_TV], map, "TV",
444 fd->version))
445 return NULL;
446 mc++;
447 }
448 r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0);
449 r |= (fd->vv.varint_put32_blk(cb, mc) <= 0);
450 BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
451
452 /* tag encoding map */
453 mc = 0;
454 BLOCK_SIZE(map) = 0;
455 if (c->tags_used) {
456 khint_t k;
457
458 for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
459 int key;
460 if (!kh_exist(c->tags_used, k))
461 continue;
462
463 key = kh_key(c->tags_used, k);
464 cram_codec *cd = kh_val(c->tags_used, k)->codec;
465
466 r |= (fd->vv.varint_put32_blk(map, key) <= 0);
467 if (-1 == cd->store(cd, map, NULL, fd->version))
468 return NULL;
469
470 mc++;
471 }
472 }
473
474 r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0);
475 r |= (fd->vv.varint_put32_blk(cb, mc) <= 0);
476 BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
477
478 hts_log_info("Wrote compression block header in %d bytes", (int)BLOCK_SIZE(cb));
479
480 BLOCK_UPLEN(cb);
481
482 cram_free_block(map);
483
484 if (r >= 0)
485 return cb;
486
487 block_err:
488 return NULL;
489 }
490
491
492 /*
493 * Encodes a slice compression header.
494 *
495 * Returns cram_block on success
496 * NULL on failure
497 */
cram_encode_slice_header(cram_fd * fd,cram_slice * s)498 cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) {
499 char *buf;
500 char *cp;
501 cram_block *b = cram_new_block(MAPPED_SLICE, 0);
502 int j;
503
504 if (!b)
505 return NULL;
506
507 cp = buf = malloc(22+16+5*(8+s->hdr->num_blocks));
508 if (NULL == buf) {
509 cram_free_block(b);
510 return NULL;
511 }
512
513 cp += fd->vv.varint_put32s(cp, NULL, s->hdr->ref_seq_id);
514 if (CRAM_MAJOR_VERS(fd->version) >= 4) {
515 cp += fd->vv.varint_put64(cp, NULL, s->hdr->ref_seq_start);
516 cp += fd->vv.varint_put64(cp, NULL, s->hdr->ref_seq_span);
517 } else {
518 cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_seq_start);
519 cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_seq_span);
520 }
521 cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_records);
522 if (CRAM_MAJOR_VERS(fd->version) == 2)
523 cp += fd->vv.varint_put32(cp, NULL, s->hdr->record_counter);
524 else if (CRAM_MAJOR_VERS(fd->version) >= 3)
525 cp += fd->vv.varint_put64(cp, NULL, s->hdr->record_counter);
526 cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_blocks);
527 cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_content_ids);
528 for (j = 0; j < s->hdr->num_content_ids; j++) {
529 cp += fd->vv.varint_put32(cp, NULL, s->hdr->block_content_ids[j]);
530 }
531 if (s->hdr->content_type == MAPPED_SLICE)
532 cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_base_id);
533
534 if (CRAM_MAJOR_VERS(fd->version) != 1) {
535 memcpy(cp, s->hdr->md5, 16); cp += 16;
536 }
537
538 assert(cp-buf <= 22+16+5*(8+s->hdr->num_blocks));
539
540 b->data = (unsigned char *)buf;
541 b->comp_size = b->uncomp_size = cp-buf;
542
543 return b;
544 }
545
546
547 /*
548 * Encodes a single read.
549 *
550 * Returns 0 on success
551 * -1 on failure
552 */
cram_encode_slice_read(cram_fd * fd,cram_container * c,cram_block_compression_hdr * h,cram_slice * s,cram_record * cr,int64_t * last_pos)553 static int cram_encode_slice_read(cram_fd *fd,
554 cram_container *c,
555 cram_block_compression_hdr *h,
556 cram_slice *s,
557 cram_record *cr,
558 int64_t *last_pos) {
559 int r = 0;
560 int32_t i32;
561 int64_t i64;
562 unsigned char uc;
563
564 //fprintf(stderr, "Encode seq %d, %d/%d FN=%d, %s\n", rec, core->byte, core->bit, cr->nfeature, s->name_ds->str + cr->name);
565
566 //printf("BF=0x%x\n", cr->flags);
567 // bf = cram_flag_swap[cr->flags];
568 i32 = fd->cram_flag_swap[cr->flags & 0xfff];
569 r |= h->codecs[DS_BF]->encode(s, h->codecs[DS_BF], (char *)&i32, 1);
570
571 i32 = cr->cram_flags & CRAM_FLAG_MASK;
572 r |= h->codecs[DS_CF]->encode(s, h->codecs[DS_CF], (char *)&i32, 1);
573
574 if (CRAM_MAJOR_VERS(fd->version) != 1 && s->hdr->ref_seq_id == -2)
575 r |= h->codecs[DS_RI]->encode(s, h->codecs[DS_RI], (char *)&cr->ref_id, 1);
576
577 r |= h->codecs[DS_RL]->encode(s, h->codecs[DS_RL], (char *)&cr->len, 1);
578
579 if (c->pos_sorted) {
580 if (CRAM_MAJOR_VERS(fd->version) >= 4) {
581 i64 = cr->apos - *last_pos;
582 r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1);
583 } else {
584 i32 = cr->apos - *last_pos;
585 r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1);
586 }
587 *last_pos = cr->apos;
588 } else {
589 if (CRAM_MAJOR_VERS(fd->version) >= 4) {
590 i64 = cr->apos;
591 r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1);
592 } else {
593 i32 = cr->apos;
594 r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1);
595 }
596 }
597
598 r |= h->codecs[DS_RG]->encode(s, h->codecs[DS_RG], (char *)&cr->rg, 1);
599
600 if (cr->cram_flags & CRAM_FLAG_DETACHED) {
601 i32 = cr->mate_flags;
602 r |= h->codecs[DS_MF]->encode(s, h->codecs[DS_MF], (char *)&i32, 1);
603
604 r |= h->codecs[DS_NS]->encode(s, h->codecs[DS_NS],
605 (char *)&cr->mate_ref_id, 1);
606
607 if (CRAM_MAJOR_VERS(fd->version) >= 4) {
608 r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP],
609 (char *)&cr->mate_pos, 1);
610 r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
611 (char *)&cr->tlen, 1);
612 } else {
613 i32 = cr->mate_pos;
614 r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP],
615 (char *)&i32, 1);
616 i32 = cr->tlen;
617 r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
618 (char *)&i32, 1);
619 }
620 } else {
621 if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) {
622 r |= h->codecs[DS_NF]->encode(s, h->codecs[DS_NF],
623 (char *)&cr->mate_line, 1);
624 }
625 if (cr->cram_flags & CRAM_FLAG_EXPLICIT_TLEN) {
626 if (CRAM_MAJOR_VERS(fd->version) >= 4) {
627 r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
628 (char *)&cr->tlen, 1);
629 }
630 }
631 }
632
633 /* Aux tags */
634 if (CRAM_MAJOR_VERS(fd->version) == 1) {
635 int j;
636 uc = cr->ntags;
637 r |= h->codecs[DS_TC]->encode(s, h->codecs[DS_TC], (char *)&uc, 1);
638
639 for (j = 0; j < cr->ntags; j++) {
640 uint32_t i32 = s->TN[cr->TN_idx + j]; // id
641 r |= h->codecs[DS_TN]->encode(s, h->codecs[DS_TN], (char *)&i32, 1);
642 }
643 } else {
644 r |= h->codecs[DS_TL]->encode(s, h->codecs[DS_TL], (char *)&cr->TL, 1);
645 }
646
647 // qual
648 // QS codec : Already stored in block[2].
649
650 // features (diffs)
651 if (!(cr->flags & BAM_FUNMAP)) {
652 int prev_pos = 0, j;
653
654 r |= h->codecs[DS_FN]->encode(s, h->codecs[DS_FN],
655 (char *)&cr->nfeature, 1);
656 for (j = 0; j < cr->nfeature; j++) {
657 cram_feature *f = &s->features[cr->feature + j];
658
659 uc = f->X.code;
660 r |= h->codecs[DS_FC]->encode(s, h->codecs[DS_FC], (char *)&uc, 1);
661 i32 = f->X.pos - prev_pos;
662 r |= h->codecs[DS_FP]->encode(s, h->codecs[DS_FP], (char *)&i32, 1);
663 prev_pos = f->X.pos;
664
665 switch(f->X.code) {
666 //char *seq;
667
668 case 'X':
669 //fprintf(stderr, " FC=%c FP=%d base=%d\n", f->X.code, i32, f->X.base);
670
671 uc = f->X.base;
672 r |= h->codecs[DS_BS]->encode(s, h->codecs[DS_BS],
673 (char *)&uc, 1);
674 break;
675 case 'S':
676 // Already done
677 //r |= h->codecs[DS_SC]->encode(s, h->codecs[DS_SC],
678 // BLOCK_DATA(s->soft_blk) + f->S.seq_idx,
679 // f->S.len);
680
681 //if (CRAM_MAJOR_VERS(fd->version) >= 3) {
682 // r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
683 // BLOCK_DATA(s->seqs_blk) + f->S.seq_idx,
684 // f->S.len);
685 //}
686 break;
687 case 'I':
688 //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
689 //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN],
690 // seq, f->S.len);
691 //if (CRAM_MAJOR_VERS(fd->version) >= 3) {
692 // r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
693 // BLOCK_DATA(s->seqs_blk) + f->I.seq_idx,
694 // f->I.len);
695 //}
696 break;
697 case 'i':
698 uc = f->i.base;
699 r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA],
700 (char *)&uc, 1);
701 //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
702 //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN],
703 // seq, 1);
704 break;
705 case 'D':
706 i32 = f->D.len;
707 r |= h->codecs[DS_DL]->encode(s, h->codecs[DS_DL],
708 (char *)&i32, 1);
709 break;
710
711 case 'B':
712 // // Used when we try to store a non ACGTN base or an N
713 // // that aligns against a non ACGTN reference
714
715 uc = f->B.base;
716 r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA],
717 (char *)&uc, 1);
718
719 // Already added
720 // uc = f->B.qual;
721 // r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS],
722 // (char *)&uc, 1);
723 break;
724
725 case 'b':
726 // string of bases
727 r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
728 (char *)BLOCK_DATA(s->seqs_blk)
729 + f->b.seq_idx,
730 f->b.len);
731 break;
732
733 case 'Q':
734 // Already added
735 // uc = f->B.qual;
736 // r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS],
737 // (char *)&uc, 1);
738 break;
739
740 case 'N':
741 i32 = f->N.len;
742 r |= h->codecs[DS_RS]->encode(s, h->codecs[DS_RS],
743 (char *)&i32, 1);
744 break;
745
746 case 'P':
747 i32 = f->P.len;
748 r |= h->codecs[DS_PD]->encode(s, h->codecs[DS_PD],
749 (char *)&i32, 1);
750 break;
751
752 case 'H':
753 i32 = f->H.len;
754 r |= h->codecs[DS_HC]->encode(s, h->codecs[DS_HC],
755 (char *)&i32, 1);
756 break;
757
758
759 default:
760 hts_log_error("Unhandled feature code %c", f->X.code);
761 return -1;
762 }
763 }
764
765 r |= h->codecs[DS_MQ]->encode(s, h->codecs[DS_MQ],
766 (char *)&cr->mqual, 1);
767 } else {
768 char *seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
769 if (cr->len)
770 r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA], seq, cr->len);
771 }
772
773 return r ? -1 : 0;
774 }
775
776
777 /*
778 * Applies various compression methods to specific blocks, depending on
779 * known observations of how data series compress.
780 *
781 * Returns 0 on success
782 * -1 on failure
783 */
cram_compress_slice(cram_fd * fd,cram_container * c,cram_slice * s)784 static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) {
785 int level = fd->level, i;
786 int method = 1<<GZIP | 1<<GZIP_RLE, methodF = method;
787 int v31_or_above = (fd->version >= (3<<8)+1);
788
789 /* Compress the CORE Block too, with minimal zlib level */
790 if (level > 5 && s->block[0]->uncomp_size > 500)
791 cram_compress_block2(fd, s, s->block[0], NULL, 1<<GZIP, 1);
792
793 if (fd->use_bz2)
794 method |= 1<<BZIP2;
795
796 int method_rans = (1<<RANS0) | (1<<RANS1);
797 int method_ranspr = method_rans;
798
799 if (fd->use_rans) {
800 method_ranspr = (1<<RANS_PR0) | (1<<RANS_PR1);
801 if (level > 1)
802 method_ranspr |=
803 (1<<RANS_PR64) | (1<<RANS_PR9)
804 | (1<<RANS_PR128) | (1<<RANS_PR193);
805 if (level > 5)
806 method_ranspr |= (1<<RANS_PR129) | (1<<RANS_PR192);
807 }
808
809 if (fd->use_rans) {
810 methodF |= v31_or_above ? method_ranspr : method_rans;
811 method |= v31_or_above ? method_ranspr : method_rans;
812 }
813
814 int method_arith = 0;
815 if (fd->use_arith) {
816 method_arith = (1<<ARITH_PR0) | (1<<ARITH_PR1);
817 if (level > 1)
818 method_arith |=
819 (1<<ARITH_PR64) | (1<<ARITH_PR9)
820 | (1<<ARITH_PR128) | (1<<ARITH_PR129)
821 | (1<<ARITH_PR192) | (1u<<ARITH_PR193);
822 }
823 if (fd->use_arith && v31_or_above) {
824 methodF |= method_arith;
825 method |= method_arith;
826 }
827
828 if (fd->use_lzma)
829 method |= (1<<LZMA);
830
831 /* Faster method for data series we only need entropy encoding on */
832 methodF = method & ~(1<<GZIP | 1<<BZIP2 | 1<<LZMA);
833 if (level >= 5) {
834 method |= 1<<GZIP_1;
835 methodF = method;
836 }
837 if (level == 1) {
838 method &= ~(1<<GZIP);
839 method |= 1<<GZIP_1;
840 methodF = method;
841 }
842
843 int qmethod = method;
844 int qmethodF = method;
845 if (v31_or_above && fd->use_fqz) {
846 qmethod |= 1<<FQZ;
847 qmethodF |= 1<<FQZ;
848 if (fd->level > 4) {
849 qmethod |= 1<<FQZ_b;
850 qmethodF |= 1<<FQZ_b;
851 }
852 if (fd->level > 6) {
853 qmethod |= (1<<FQZ_c) | (1<<FQZ_d);
854 qmethodF |= (1<<FQZ_c) | (1<<FQZ_d);
855 }
856 }
857
858 pthread_mutex_lock(&fd->metrics_lock);
859 for (i = 0; i < DS_END; i++)
860 if (c->stats[i] && c->stats[i]->nvals > 16)
861 fd->m[i]->unpackable = 1;
862 pthread_mutex_unlock(&fd->metrics_lock);
863
864 /* Specific compression methods for certain block types */
865 if (cram_compress_block2(fd, s, s->block[DS_IN], fd->m[DS_IN], //IN (seq)
866 method, level))
867 return -1;
868
869 if (fd->level == 0) {
870 /* Do nothing */
871 } else if (fd->level == 1) {
872 if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS],
873 qmethodF, 1))
874 return -1;
875 for (i = DS_aux; i <= DS_aux_oz; i++) {
876 if (s->block[i])
877 if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
878 method, 1))
879 return -1;
880 }
881 } else if (fd->level < 3) {
882 if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS],
883 qmethod, 1))
884 return -1;
885 if (cram_compress_block2(fd, s, s->block[DS_BA], fd->m[DS_BA],
886 method, 1))
887 return -1;
888 if (s->block[DS_BB])
889 if (cram_compress_block2(fd, s, s->block[DS_BB], fd->m[DS_BB],
890 method, 1))
891 return -1;
892 for (i = DS_aux; i <= DS_aux_oz; i++) {
893 if (s->block[i])
894 if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
895 method, level))
896 return -1;
897 }
898 } else {
899 if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS],
900 qmethod, level))
901 return -1;
902 if (cram_compress_block2(fd, s, s->block[DS_BA], fd->m[DS_BA],
903 method, level))
904 return -1;
905 if (s->block[DS_BB])
906 if (cram_compress_block2(fd, s, s->block[DS_BB], fd->m[DS_BB],
907 method, level))
908 return -1;
909 for (i = DS_aux; i <= DS_aux_oz; i++) {
910 if (s->block[i])
911 if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
912 method, level))
913 return -1;
914 }
915 }
916
917 // NAME: best is generally xz, bzip2, zlib then rans1
918 int method_rn = method & ~(method_rans | method_ranspr | 1<<GZIP_RLE);
919 if (fd->version >= (3<<8)+1 && fd->use_tok)
920 method_rn |= fd->use_arith ? (1<<TOKA) : (1<<TOK3);
921 if (cram_compress_block2(fd, s, s->block[DS_RN], fd->m[DS_RN],
922 method_rn, level))
923 return -1;
924
925 // NS shows strong local correlation as rearrangements are localised
926 if (s->block[DS_NS] && s->block[DS_NS] != s->block[0])
927 if (cram_compress_block2(fd, s, s->block[DS_NS], fd->m[DS_NS],
928 method, level))
929 return -1;
930
931
932 /*
933 * Compress any auxiliary tags with their own per-tag metrics
934 */
935 {
936 int i;
937 for (i = 0; i < s->naux_block; i++) {
938 if (!s->aux_block[i] || s->aux_block[i] == s->block[0])
939 continue;
940
941 if (s->aux_block[i]->method != RAW)
942 continue;
943
944 if (cram_compress_block2(fd, s, s->aux_block[i], s->aux_block[i]->m,
945 method, level))
946 return -1;
947 }
948 }
949
950 /*
951 * Minimal compression of any block still uncompressed, bar CORE
952 */
953 {
954 int i;
955 for (i = 1; i < s->hdr->num_blocks && i < DS_END; i++) {
956 if (!s->block[i] || s->block[i] == s->block[0])
957 continue;
958
959 if (s->block[i]->method != RAW)
960 continue;
961
962 if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
963 methodF, level))
964 return -1;
965 }
966 }
967
968 return 0;
969 }
970
971 /*
972 * Allocates a block associated with the cram codec associated with
973 * data series ds_id or the internal codec_id (depending on codec
974 * type).
975 *
976 * The ds_ids are what end up written to disk as an external block.
977 * The c_ids are internal and used when daisy-chaining transforms
978 * such as MAP and RLE. These blocks are also allocated, but
979 * are ephemeral in nature. (The codecs themselves cannot allocate
980 * these as the same codec pointer may be operating on multiple slices
981 * if we're using a multi-slice container.)
982 *
983 * Returns 0 on success
984 * -1 on failure
985 */
cram_allocate_block(cram_codec * codec,cram_slice * s,int ds_id)986 static int cram_allocate_block(cram_codec *codec, cram_slice *s, int ds_id) {
987 if (!codec)
988 return 0;
989
990 switch(codec->codec) {
991 // Codecs which are hard-coded to use the CORE block
992 case E_GOLOMB:
993 case E_HUFFMAN:
994 case E_BETA:
995 case E_SUBEXP:
996 case E_GOLOMB_RICE:
997 case E_GAMMA:
998 codec->out = s->block[0];
999 break;
1000
1001 // Codecs which don't use external blocks
1002 case E_CONST_BYTE:
1003 case E_CONST_INT:
1004 codec->out = NULL;
1005 break;
1006
1007 // Codecs that emit directly to external blocks
1008 case E_EXTERNAL:
1009 case E_VARINT_UNSIGNED:
1010 case E_VARINT_SIGNED:
1011 if (!(s->block[ds_id] = cram_new_block(EXTERNAL, ds_id)))
1012 return -1;
1013 codec->u.external.content_id = ds_id;
1014 codec->out = s->block[ds_id];
1015 break;
1016
1017 case E_BYTE_ARRAY_STOP: // Why no sub-codec?
1018 if (!(s->block[ds_id] = cram_new_block(EXTERNAL, ds_id)))
1019 return -1;
1020 codec->u.byte_array_stop.content_id = ds_id;
1021 codec->out = s->block[ds_id];
1022 break;
1023
1024
1025 // Codecs that contain sub-codecs which may in turn emit to external blocks
1026 case E_BYTE_ARRAY_LEN: {
1027 cram_codec *bal = codec->u.e_byte_array_len.len_codec;
1028 if (cram_allocate_block(bal, s, bal->u.external.content_id))
1029 return -1;
1030 bal = codec->u.e_byte_array_len.val_codec;
1031 if (cram_allocate_block(bal, s, bal->u.external.content_id))
1032 return -1;
1033
1034 break;
1035 }
1036
1037 case E_XRLE:
1038 if (cram_allocate_block(codec->u.e_xrle.len_codec, s, ds_id))
1039 //ds_id == DS_QS ? DS_QS_len : ds_id))
1040 return -1;
1041 if (cram_allocate_block(codec->u.e_xrle.lit_codec, s, ds_id))
1042 return -1;
1043
1044 break;
1045
1046 case E_XPACK:
1047 if (cram_allocate_block(codec->u.e_xpack.sub_codec, s, ds_id))
1048 return -1;
1049 codec->out = cram_new_block(0, 0); // ephemeral
1050 if (!codec->out)
1051 return -1;
1052
1053 break;
1054
1055 case E_XDELTA:
1056 if (cram_allocate_block(codec->u.e_xdelta.sub_codec, s, ds_id))
1057 return -1;
1058 codec->out = cram_new_block(0, 0); // ephemeral
1059 if (!codec->out)
1060 return -1;
1061
1062 break;
1063
1064 default:
1065 break;
1066 }
1067
1068 return 0;
1069 }
1070
1071 /*
1072 * Encodes a single slice from a container
1073 *
1074 * Returns 0 on success
1075 * -1 on failure
1076 */
cram_encode_slice(cram_fd * fd,cram_container * c,cram_block_compression_hdr * h,cram_slice * s)1077 static int cram_encode_slice(cram_fd *fd, cram_container *c,
1078 cram_block_compression_hdr *h, cram_slice *s) {
1079 int rec, r = 0;
1080 int64_t last_pos;
1081 int embed_ref;
1082 enum cram_DS_ID id;
1083
1084 embed_ref = fd->embed_ref && s->hdr->ref_seq_id != -1 ? 1 : 0;
1085
1086 /*
1087 * Slice external blocks:
1088 * ID 0 => base calls (insertions, soft-clip)
1089 * ID 1 => qualities
1090 * ID 2 => names
1091 * ID 3 => TS (insert size), NP (next frag)
1092 * ID 4 => tag values
1093 * ID 6 => tag IDs (TN), if CRAM_V1.0
1094 * ID 7 => TD tag dictionary, if !CRAM_V1.0
1095 */
1096
1097 /* Create cram slice header */
1098 s->hdr->ref_base_id = embed_ref && s->hdr->ref_seq_span > 0
1099 ? DS_ref
1100 : (CRAM_MAJOR_VERS(fd->version) >= 4 ? 0 : -1);
1101 s->hdr->record_counter = c->num_records + c->record_counter;
1102 c->num_records += s->hdr->num_records;
1103
1104 int ntags = c->tags_used ? c->tags_used->n_occupied : 0;
1105 s->block = calloc(DS_END + ntags*2, sizeof(s->block[0]));
1106 s->hdr->block_content_ids = malloc(DS_END * sizeof(int32_t));
1107 if (!s->block || !s->hdr->block_content_ids)
1108 return -1;
1109
1110 // Create first fixed blocks, always external.
1111 // CORE
1112 if (!(s->block[0] = cram_new_block(CORE, 0)))
1113 return -1;
1114
1115 // TN block for CRAM v1
1116 if (CRAM_MAJOR_VERS(fd->version) == 1) {
1117 if (h->codecs[DS_TN]->codec == E_EXTERNAL) {
1118 if (!(s->block[DS_TN] = cram_new_block(EXTERNAL,DS_TN))) return -1;
1119 h->codecs[DS_TN]->u.external.content_id = DS_TN;
1120 } else {
1121 s->block[DS_TN] = s->block[0];
1122 }
1123 }
1124
1125 // Embedded reference
1126 if (embed_ref) {
1127 if (!(s->block[DS_ref] = cram_new_block(EXTERNAL, DS_ref)))
1128 return -1;
1129 s->ref_id = DS_ref; // needed?
1130 BLOCK_APPEND(s->block[DS_ref],
1131 c->ref + s->hdr->ref_seq_start - c->ref_start,
1132 s->hdr->ref_seq_span);
1133 }
1134
1135 /*
1136 * All the data-series blocks if appropriate.
1137 */
1138 for (id = DS_QS; id < DS_TN; id++) {
1139 if (cram_allocate_block(h->codecs[id], s, id) < 0)
1140 return -1;
1141 }
1142
1143 /*
1144 * Add in the external tag blocks too.
1145 */
1146 if (c->tags_used) {
1147 int n;
1148 s->hdr->num_blocks = DS_END;
1149 for (n = 0; n < s->naux_block; n++)
1150 s->block[s->hdr->num_blocks++] = s->aux_block[n];
1151 }
1152
1153 /* Encode reads */
1154 last_pos = s->hdr->ref_seq_start;
1155 for (rec = 0; rec < s->hdr->num_records; rec++) {
1156 cram_record *cr = &s->crecs[rec];
1157 if (cram_encode_slice_read(fd, c, h, s, cr, &last_pos) == -1)
1158 return -1;
1159 }
1160
1161 s->block[0]->uncomp_size = s->block[0]->byte + (s->block[0]->bit < 7);
1162 s->block[0]->comp_size = s->block[0]->uncomp_size;
1163
1164 // Make sure the fixed blocks point to the correct sources
1165 if (s->block[DS_IN]) cram_free_block(s->block[DS_IN]);
1166 s->block[DS_IN] = s->base_blk; s->base_blk = NULL;
1167 if (s->block[DS_QS]) cram_free_block(s->block[DS_QS]);
1168 s->block[DS_QS] = s->qual_blk; s->qual_blk = NULL;
1169 if (s->block[DS_RN]) cram_free_block(s->block[DS_RN]);
1170 s->block[DS_RN] = s->name_blk; s->name_blk = NULL;
1171 if (s->block[DS_SC]) cram_free_block(s->block[DS_SC]);
1172 s->block[DS_SC] = s->soft_blk; s->soft_blk = NULL;
1173
1174 // Finalise any data transforms.
1175 for (id = DS_QS; id < DS_TN; id++) {
1176 if (h->codecs[id] && h->codecs[id]->flush)
1177 h->codecs[id]->flush(h->codecs[id]);
1178 }
1179
1180 // Ensure block sizes are up to date.
1181 for (id = 1; id < s->hdr->num_blocks; id++) {
1182 if (!s->block[id] || s->block[id] == s->block[0])
1183 continue;
1184
1185 if (s->block[id]->uncomp_size == 0)
1186 BLOCK_UPLEN(s->block[id]);
1187 }
1188
1189 // Compress it all
1190 if (cram_compress_slice(fd, c, s) == -1)
1191 return -1;
1192
1193 // Collapse empty blocks and create hdr_block
1194 {
1195 int i, j;
1196
1197 s->hdr->block_content_ids = realloc(s->hdr->block_content_ids,
1198 s->hdr->num_blocks * sizeof(int32_t));
1199 if (!s->hdr->block_content_ids)
1200 return -1;
1201
1202 for (i = j = 1; i < s->hdr->num_blocks; i++) {
1203 if (!s->block[i] || s->block[i] == s->block[0])
1204 continue;
1205 if (s->block[i]->uncomp_size == 0) {
1206 cram_free_block(s->block[i]);
1207 s->block[i] = NULL;
1208 continue;
1209 }
1210 s->block[j] = s->block[i];
1211 s->hdr->block_content_ids[j-1] = s->block[i]->content_id;
1212 j++;
1213 }
1214 s->hdr->num_content_ids = j-1;
1215 s->hdr->num_blocks = j;
1216
1217 if (!(s->hdr_block = cram_encode_slice_header(fd, s)))
1218 return -1;
1219 }
1220
1221 return r ? -1 : 0;
1222
1223 block_err:
1224 return -1;
1225 }
1226
1227 /*
1228 * Returns the number of expected read names for this record.
1229 */
expected_template_count(bam_seq_t * b)1230 static int expected_template_count(bam_seq_t *b) {
1231 int expected = bam_flag(b) & BAM_FPAIRED ? 2 : 1;
1232
1233 uint8_t *TC = (uint8_t *)bam_aux_get(b, "TC");
1234 if (TC) {
1235 int n = bam_aux2i(TC);
1236 if (expected < n)
1237 expected = n;
1238 }
1239
1240 if (!TC && bam_aux_get(b, "SA")) {
1241 // We could count the semicolons, but we'd have to do this for
1242 // read1, read2 and read(not-1-or-2) combining the results
1243 // together. This is a cheap and safe alternative for now.
1244 expected = INT_MAX;
1245 }
1246
1247 return expected;
1248 }
1249
1250 /*
1251 * Lossily reject read names.
1252 *
1253 * The rule here is that if all reads for this template reside in the
1254 * same slice then we can lose the name. Otherwise we keep them as we
1255 * do not know when (or if) the other reads will turn up.
1256 *
1257 * Note there may be only 1 read (non-paired library) or more than 2
1258 * reads (paired library with supplementary reads), or other weird
1259 * setups. We need to know how many are expected. Ways to guess:
1260 *
1261 * - Flags (0x1 - has > 1 read)
1262 * - TC aux field (not mandatory)
1263 * - SA tags (count semicolons, NB per fragment so sum - hard)
1264 * - RNEXT/PNEXT uniqueness count. (not implemented, tricky)
1265 *
1266 * Returns 0 on success
1267 * -1 on failure
1268 */
lossy_read_names(cram_fd * fd,cram_container * c,cram_slice * s,int bam_start)1269 static int lossy_read_names(cram_fd *fd, cram_container *c, cram_slice *s,
1270 int bam_start) {
1271 int r1, r2, ret = -1;
1272
1273 // Initialise cram_flags
1274 for (r2 = 0; r2 < s->hdr->num_records; r2++)
1275 s->crecs[r2].cram_flags = 0;
1276
1277 if (!fd->lossy_read_names)
1278 return 0;
1279
1280 khash_t(m_s2u64) *names = kh_init(m_s2u64);
1281 if (!names)
1282 goto fail;
1283
1284 // 1: Iterate through names to count frequency
1285 for (r1 = bam_start, r2 = 0; r2 < s->hdr->num_records; r1++, r2++) {
1286 //cram_record *cr = &s->crecs[r2];
1287 bam_seq_t *b = c->bams[r1];
1288 khint_t k;
1289 int n;
1290 uint64_t e;
1291 union {
1292 uint64_t i64;
1293 struct {
1294 int32_t e,c; // expected & observed counts.
1295 } counts;
1296 } u;
1297
1298 e = expected_template_count(b);
1299 u.counts.e = e; u.counts.c = 1;
1300
1301 k = kh_put(m_s2u64, names, bam_name(b), &n);
1302 if (n == -1)
1303 goto fail;
1304
1305 if (n == 0) {
1306 // not a new name
1307 u.i64 = kh_val(names, k);
1308 if (u.counts.e != e) {
1309 // different expectation or already hit the max
1310 //fprintf(stderr, "Err computing no. %s recs\n", bam_name(b));
1311 kh_val(names, k) = 0;
1312 } else {
1313 u.counts.c++;
1314 if (u.counts.e == u.counts.c) {
1315 // Reached expected count.
1316 kh_val(names, k) = -1;
1317 } else {
1318 kh_val(names, k) = u.i64;
1319 }
1320 }
1321 } else {
1322 // new name
1323 kh_val(names, k) = u.i64;
1324 }
1325 }
1326
1327 // 2: Remove names if all present (hd.i == -1)
1328 for (r1 = bam_start, r2 = 0; r2 < s->hdr->num_records; r1++, r2++) {
1329 cram_record *cr = &s->crecs[r2];
1330 bam_seq_t *b = c->bams[r1];
1331 khint_t k;
1332
1333 k = kh_get(m_s2u64, names, bam_name(b));
1334
1335 if (k == kh_end(names))
1336 goto fail;
1337
1338 if (kh_val(names, k) == -1)
1339 cr->cram_flags = CRAM_FLAG_DISCARD_NAME;
1340 }
1341
1342 ret = 0;
1343 fail: // ret==-1
1344
1345 if (names)
1346 kh_destroy(m_s2u64, names);
1347
1348 return ret;
1349 }
1350
1351 /*
1352 * Adds the reading names. We do this here as a separate pass rather
1353 * than per record in the process_one_read calls as that function can
1354 * go back and change the CRAM_FLAG_DETACHED status of a previously
1355 * processed read if it subsequently determines the TLEN field is
1356 * incorrect. Given DETACHED reads always try to decode read names,
1357 * we need to know their status before generating the read-name block.
1358 *
1359 * Output is an update s->name_blk, and cr->name / cr->name_len
1360 * fields.
1361 */
add_read_names(cram_fd * fd,cram_container * c,cram_slice * s,int bam_start)1362 static int add_read_names(cram_fd *fd, cram_container *c, cram_slice *s,
1363 int bam_start) {
1364 int r1, r2;
1365 int keep_names = !fd->lossy_read_names;
1366
1367 for (r1 = bam_start, r2 = 0;
1368 r1 < c->curr_c_rec && r2 < s->hdr->num_records;
1369 r1++, r2++) {
1370 cram_record *cr = &s->crecs[r2];
1371 bam_seq_t *b = c->bams[r1];
1372
1373 cr->name = BLOCK_SIZE(s->name_blk);
1374 if ((cr->cram_flags & CRAM_FLAG_DETACHED) || keep_names) {
1375 if (CRAM_MAJOR_VERS(fd->version) >= 4
1376 && (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM)
1377 && cr->mate_line) {
1378 // Dedup read names in V4
1379 BLOCK_APPEND(s->name_blk, "\0", 1);
1380 cr->name_len = 1;
1381 } else {
1382 BLOCK_APPEND(s->name_blk, bam_name(b), bam_name_len(b));
1383 cr->name_len = bam_name_len(b);
1384 }
1385 } else {
1386 // Can only discard duplicate names if not detached
1387 cr->name_len = 0;
1388 }
1389
1390 if (cram_stats_add(c->stats[DS_RN], cr->name_len) < 0)
1391 goto block_err;
1392 }
1393
1394 return 0;
1395
1396 block_err:
1397 return -1;
1398 }
1399
1400 /*
1401 * Encodes all slices in a container into blocks.
1402 * Returns 0 on success
1403 * -1 on failure
1404 */
cram_encode_container(cram_fd * fd,cram_container * c)1405 int cram_encode_container(cram_fd *fd, cram_container *c) {
1406 int i, j, slice_offset;
1407 cram_block_compression_hdr *h = c->comp_hdr;
1408 cram_block *c_hdr;
1409 int multi_ref = 0;
1410 int r1, r2, sn, nref;
1411 spare_bams *spares;
1412
1413 if (CRAM_MAJOR_VERS(fd->version) == 1)
1414 goto err;
1415
1416 //#define goto_err {fprintf(stderr, "ERR at %s:%d\n", __FILE__, __LINE__);goto err;}
1417 #define goto_err goto err
1418
1419 /* Cache references up-front if we have unsorted access patterns */
1420 pthread_mutex_lock(&fd->ref_lock);
1421 nref = fd->refs->nref;
1422 pthread_mutex_unlock(&fd->ref_lock);
1423
1424 if (!fd->no_ref && c->refs_used) {
1425 for (i = 0; i < nref; i++) {
1426 if (c->refs_used[i])
1427 cram_get_ref(fd, i, 1, 0);
1428 }
1429 }
1430
1431 /* To create M5 strings */
1432 /* Fetch reference sequence */
1433 if (!fd->no_ref) {
1434 if (!c->bams || !c->bams[0])
1435 goto_err;
1436 bam_seq_t *b = c->bams[0];
1437
1438 char *ref = cram_get_ref(fd, bam_ref(b), 1, 0);
1439 if (!ref && bam_ref(b) >= 0) {
1440 hts_log_error("Failed to load reference #%d", bam_ref(b));
1441 return -1;
1442 }
1443 if ((c->ref_id = bam_ref(b)) >= 0) {
1444 c->ref_seq_id = c->ref_id;
1445 c->ref = fd->refs->ref_id[c->ref_seq_id]->seq;
1446 c->ref_start = 1;
1447 c->ref_end = fd->refs->ref_id[c->ref_seq_id]->length;
1448 } else {
1449 c->ref_seq_id = c->ref_id; // FIXME remove one var!
1450 }
1451 } else {
1452 c->ref_id = bam_ref(c->bams[0]);
1453 cram_ref_incr(fd->refs, c->ref_id);
1454 c->ref_seq_id = c->ref_id;
1455 }
1456
1457 /* Turn bams into cram_records and gather basic stats */
1458 for (r1 = sn = 0; r1 < c->curr_c_rec; sn++) {
1459 cram_slice *s = c->slices[sn];
1460 int64_t first_base = INT64_MAX, last_base = INT64_MIN;
1461
1462 int r1_start = r1;
1463
1464 assert(sn < c->curr_slice);
1465
1466 // Discover which read names *may* be safely removed.
1467 // Ie which ones have all their records in this slice.
1468 if (lossy_read_names(fd, c, s, r1_start) != 0)
1469 return -1;
1470
1471 // Tracking of MD tags so we can spot when the auto-generated values
1472 // will differ from the current stored ones. The kstring here is
1473 // simply to avoid excessive malloc and free calls. All initialisation
1474 // is done within process_one_read().
1475 kstring_t MD = {0};
1476
1477 // Iterate through records creating the cram blocks for some
1478 // fields and just gathering stats for others.
1479 for (r2 = 0; r1 < c->curr_c_rec && r2 < s->hdr->num_records; r1++, r2++) {
1480 cram_record *cr = &s->crecs[r2];
1481 bam_seq_t *b = c->bams[r1];
1482
1483 /* If multi-ref we need to cope with changing reference per seq */
1484 if (c->multi_seq && !fd->no_ref) {
1485 if (bam_ref(b) != c->ref_seq_id && bam_ref(b) >= 0) {
1486 if (c->ref_seq_id >= 0)
1487 cram_ref_decr(fd->refs, c->ref_seq_id);
1488
1489 if (!cram_get_ref(fd, bam_ref(b), 1, 0)) {
1490 hts_log_error("Failed to load reference #%d", bam_ref(b));
1491 free(MD.s);
1492 return -1;
1493 }
1494
1495 c->ref_seq_id = bam_ref(b); // overwritten later by -2
1496 if (!fd->refs->ref_id[c->ref_seq_id]->seq)
1497 return -1;
1498 c->ref = fd->refs->ref_id[c->ref_seq_id]->seq;
1499 c->ref_start = 1;
1500 c->ref_end = fd->refs->ref_id[c->ref_seq_id]->length;
1501 }
1502 }
1503
1504 if (process_one_read(fd, c, s, cr, b, r2, &MD) != 0) {
1505 free(MD.s);
1506 return -1;
1507 }
1508
1509 if (first_base > cr->apos)
1510 first_base = cr->apos;
1511
1512 if (last_base < cr->aend)
1513 last_base = cr->aend;
1514 }
1515 free(MD.s);
1516
1517 // Process_one_read doesn't add read names as it can change
1518 // its mind during the loop on the CRAM_FLAG_DETACHED setting
1519 // of earlier records (if it detects the auto-generation of
1520 // TLEN is incorrect). This affects which read-names can be
1521 // lossily compressed, so we do these in another pass.
1522 if (add_read_names(fd, c, s, r1_start) < 0)
1523 return -1;
1524
1525 if (c->multi_seq) {
1526 s->hdr->ref_seq_id = -2;
1527 s->hdr->ref_seq_start = 0;
1528 s->hdr->ref_seq_span = 0;
1529 } else {
1530 s->hdr->ref_seq_id = c->ref_id;
1531 s->hdr->ref_seq_start = first_base;
1532 s->hdr->ref_seq_span = MAX(0, last_base - first_base + 1);
1533 }
1534 s->hdr->num_records = r2;
1535
1536 // Processed a slice, now stash the aux blocks so the next
1537 // slice can start aggregating them from the start again.
1538 if (c->tags_used->n_occupied) {
1539 int ntags = c->tags_used->n_occupied;
1540 s->aux_block = calloc(ntags*2, sizeof(*s->aux_block));
1541 if (!s->aux_block)
1542 return -1;
1543
1544 khint_t k;
1545
1546 s->naux_block = 0;
1547 for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
1548 if (!kh_exist(c->tags_used, k))
1549 continue;
1550
1551 cram_tag_map *tm = kh_val(c->tags_used, k);
1552 if (!tm) goto_err;
1553 if (!tm->blk) continue;
1554 s->aux_block[s->naux_block++] = tm->blk;
1555 tm->blk = NULL;
1556 if (!tm->blk2) continue;
1557 s->aux_block[s->naux_block++] = tm->blk2;
1558 tm->blk2 = NULL;
1559 }
1560 assert(s->naux_block <= 2*c->tags_used->n_occupied);
1561 }
1562 }
1563
1564 if (c->multi_seq && !fd->no_ref) {
1565 if (c->ref_seq_id >= 0)
1566 cram_ref_decr(fd->refs, c->ref_seq_id);
1567 }
1568
1569 /* Link our bams[] array onto the spare bam list for reuse */
1570 spares = malloc(sizeof(*spares));
1571 if (!spares) goto_err;
1572 pthread_mutex_lock(&fd->bam_list_lock);
1573 spares->bams = c->bams;
1574 spares->next = fd->bl;
1575 fd->bl = spares;
1576 pthread_mutex_unlock(&fd->bam_list_lock);
1577 c->bams = NULL;
1578
1579 /* Detect if a multi-seq container */
1580 cram_stats_encoding(fd, c->stats[DS_RI]);
1581 multi_ref = c->stats[DS_RI]->nvals > 1;
1582 pthread_mutex_lock(&fd->metrics_lock);
1583 fd->last_RI_count = c->stats[DS_RI]->nvals;
1584 pthread_mutex_unlock(&fd->metrics_lock);
1585
1586
1587 if (multi_ref) {
1588 hts_log_info("Multi-ref container");
1589 c->ref_seq_id = -2;
1590 c->ref_seq_start = 0;
1591 c->ref_seq_span = 0;
1592 }
1593
1594
1595 /* Compute MD5s */
1596 int is_v4 = CRAM_MAJOR_VERS(fd->version) >= 4 ? 1 : 0;
1597 for (i = 0; i < c->curr_slice; i++) {
1598 cram_slice *s = c->slices[i];
1599
1600 if (CRAM_MAJOR_VERS(fd->version) != 1) {
1601 if (s->hdr->ref_seq_id >= 0 && c->multi_seq == 0 && !fd->no_ref) {
1602 hts_md5_context *md5 = hts_md5_init();
1603 if (!md5)
1604 return -1;
1605 hts_md5_update(md5,
1606 c->ref + s->hdr->ref_seq_start - c->ref_start,
1607 s->hdr->ref_seq_span);
1608 hts_md5_final(s->hdr->md5, md5);
1609 hts_md5_destroy(md5);
1610 } else {
1611 memset(s->hdr->md5, 0, 16);
1612 }
1613 }
1614 }
1615
1616 c->num_records = 0;
1617 c->num_blocks = 1; // cram_block_compression_hdr
1618 c->length = 0;
1619
1620 //fprintf(stderr, "=== BF ===\n");
1621 h->codecs[DS_BF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BF]),
1622 c->stats[DS_BF], E_INT, NULL,
1623 fd->version, &fd->vv);
1624 if (c->stats[DS_BF]->nvals && !h->codecs[DS_BF]) goto_err;
1625
1626 //fprintf(stderr, "=== CF ===\n");
1627 h->codecs[DS_CF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_CF]),
1628 c->stats[DS_CF], E_INT, NULL,
1629 fd->version, &fd->vv);
1630 if (c->stats[DS_CF]->nvals && !h->codecs[DS_CF]) goto_err;
1631
1632 //fprintf(stderr, "=== RN ===\n");
1633 //h->codecs[DS_RN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RN]),
1634 // c->stats[DS_RN], E_BYTE_ARRAY, NULL,
1635 // fd->version);
1636
1637 //fprintf(stderr, "=== AP ===\n");
1638 if (c->pos_sorted || CRAM_MAJOR_VERS(fd->version) >= 4) {
1639 if (c->pos_sorted)
1640 h->codecs[DS_AP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_AP]),
1641 c->stats[DS_AP],
1642 is_v4 ? E_LONG : E_INT,
1643 NULL, fd->version, &fd->vv);
1644 else
1645 // Unsorted data has no stats, but hard-code VARINT_SIGNED / EXT.
1646 h->codecs[DS_AP] = cram_encoder_init(is_v4 ? E_VARINT_SIGNED
1647 : E_EXTERNAL,
1648 NULL,
1649 is_v4 ? E_LONG : E_INT,
1650 NULL, fd->version, &fd->vv);
1651 } else {
1652 // Removed BETA in v4.0.
1653 // Should we consider dropping use of it for 3.0 too?
1654 int p[2] = {0, c->max_apos};
1655 h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL,
1656 is_v4 ? E_LONG : E_INT,
1657 p, fd->version, &fd->vv);
1658 // cram_xdelta_encoder e;
1659 // e.word_size = is_v4 ? 8 : 4;
1660 // e.sub_encoding = E_EXTERNAL;
1661 // e.sub_codec_dat = (void *)DS_AP;
1662 //
1663 // h->codecs[DS_AP] = cram_encoder_init(E_XDELTA, NULL,
1664 // is_v4 ? E_LONG : E_INT,
1665 // &e, fd->version, &fd->vv);
1666 }
1667 if (!h->codecs[DS_AP]) goto_err;
1668
1669 //fprintf(stderr, "=== RG ===\n");
1670 h->codecs[DS_RG] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RG]),
1671 c->stats[DS_RG],
1672 E_INT,
1673 NULL,
1674 fd->version, &fd->vv);
1675 if (c->stats[DS_RG]->nvals && !h->codecs[DS_RG]) goto_err;
1676
1677 //fprintf(stderr, "=== MQ ===\n");
1678 h->codecs[DS_MQ] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MQ]),
1679 c->stats[DS_MQ], E_INT, NULL,
1680 fd->version, &fd->vv);
1681 if (c->stats[DS_MQ]->nvals && !h->codecs[DS_MQ]) goto_err;
1682
1683 //fprintf(stderr, "=== NS ===\n");
1684 h->codecs[DS_NS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NS]),
1685 c->stats[DS_NS], E_INT, NULL,
1686 fd->version, &fd->vv);
1687 if (c->stats[DS_NS]->nvals && !h->codecs[DS_NS]) goto_err;
1688
1689 //fprintf(stderr, "=== MF ===\n");
1690 h->codecs[DS_MF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MF]),
1691 c->stats[DS_MF], E_INT, NULL,
1692 fd->version, &fd->vv);
1693 if (c->stats[DS_MF]->nvals && !h->codecs[DS_MF]) goto_err;
1694
1695 //fprintf(stderr, "=== TS ===\n");
1696 h->codecs[DS_TS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TS]),
1697 c->stats[DS_TS],
1698 is_v4 ? E_LONG : E_INT,
1699 NULL, fd->version, &fd->vv);
1700 if (c->stats[DS_TS]->nvals && !h->codecs[DS_TS]) goto_err;
1701
1702 //fprintf(stderr, "=== NP ===\n");
1703 h->codecs[DS_NP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NP]),
1704 c->stats[DS_NP],
1705 is_v4 ? E_LONG : E_INT,
1706 NULL, fd->version, &fd->vv);
1707 if (c->stats[DS_NP]->nvals && !h->codecs[DS_NP]) goto_err;
1708
1709 //fprintf(stderr, "=== NF ===\n");
1710 h->codecs[DS_NF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NF]),
1711 c->stats[DS_NF], E_INT, NULL,
1712 fd->version, &fd->vv);
1713 if (c->stats[DS_NF]->nvals && !h->codecs[DS_NF]) goto_err;
1714
1715 //fprintf(stderr, "=== RL ===\n");
1716 h->codecs[DS_RL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RL]),
1717 c->stats[DS_RL], E_INT, NULL,
1718 fd->version, &fd->vv);
1719 if (c->stats[DS_RL]->nvals && !h->codecs[DS_RL]) goto_err;
1720
1721 //fprintf(stderr, "=== FN ===\n");
1722 h->codecs[DS_FN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FN]),
1723 c->stats[DS_FN], E_INT, NULL,
1724 fd->version, &fd->vv);
1725 if (c->stats[DS_FN]->nvals && !h->codecs[DS_FN]) goto_err;
1726
1727 //fprintf(stderr, "=== FC ===\n");
1728 h->codecs[DS_FC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FC]),
1729 c->stats[DS_FC], E_BYTE, NULL,
1730 fd->version, &fd->vv);
1731 if (c->stats[DS_FC]->nvals && !h->codecs[DS_FC]) goto_err;
1732
1733 //fprintf(stderr, "=== FP ===\n");
1734 h->codecs[DS_FP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FP]),
1735 c->stats[DS_FP], E_INT, NULL,
1736 fd->version, &fd->vv);
1737 if (c->stats[DS_FP]->nvals && !h->codecs[DS_FP]) goto_err;
1738
1739 //fprintf(stderr, "=== DL ===\n");
1740 h->codecs[DS_DL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_DL]),
1741 c->stats[DS_DL], E_INT, NULL,
1742 fd->version, &fd->vv);
1743 if (c->stats[DS_DL]->nvals && !h->codecs[DS_DL]) goto_err;
1744
1745 //fprintf(stderr, "=== BA ===\n");
1746 h->codecs[DS_BA] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BA]),
1747 c->stats[DS_BA], E_BYTE, NULL,
1748 fd->version, &fd->vv);
1749 if (c->stats[DS_BA]->nvals && !h->codecs[DS_BA]) goto_err;
1750
1751 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1752 cram_byte_array_len_encoder e;
1753
1754 e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4
1755 ? E_VARINT_UNSIGNED
1756 : E_EXTERNAL;
1757 e.len_dat = (void *)DS_BB_len;
1758 //e.len_dat = (void *)DS_BB;
1759
1760 e.val_encoding = E_EXTERNAL;
1761 e.val_dat = (void *)DS_BB;
1762
1763 h->codecs[DS_BB] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
1764 E_BYTE_ARRAY, (void *)&e,
1765 fd->version, &fd->vv);
1766 if (!h->codecs[DS_BB]) goto_err;
1767 } else {
1768 h->codecs[DS_BB] = NULL;
1769 }
1770
1771 //fprintf(stderr, "=== BS ===\n");
1772 h->codecs[DS_BS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BS]),
1773 c->stats[DS_BS], E_BYTE, NULL,
1774 fd->version, &fd->vv);
1775 if (c->stats[DS_BS]->nvals && !h->codecs[DS_BS]) goto_err;
1776
1777 if (CRAM_MAJOR_VERS(fd->version) == 1) {
1778 h->codecs[DS_TL] = NULL;
1779 h->codecs[DS_RI] = NULL;
1780 h->codecs[DS_RS] = NULL;
1781 h->codecs[DS_PD] = NULL;
1782 h->codecs[DS_HC] = NULL;
1783 h->codecs[DS_SC] = NULL;
1784
1785 //fprintf(stderr, "=== TC ===\n");
1786 h->codecs[DS_TC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TC]),
1787 c->stats[DS_TC], E_BYTE, NULL,
1788 fd->version, &fd->vv);
1789 if (c->stats[DS_TC]->nvals && !h->codecs[DS_TC]) goto_err;
1790
1791 //fprintf(stderr, "=== TN ===\n");
1792 h->codecs[DS_TN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TN]),
1793 c->stats[DS_TN], E_INT, NULL,
1794 fd->version, &fd->vv);
1795 if (c->stats[DS_TN]->nvals && !h->codecs[DS_TN]) goto_err;
1796 } else {
1797 h->codecs[DS_TC] = NULL;
1798 h->codecs[DS_TN] = NULL;
1799
1800 //fprintf(stderr, "=== TL ===\n");
1801 h->codecs[DS_TL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TL]),
1802 c->stats[DS_TL], E_INT, NULL,
1803 fd->version, &fd->vv);
1804 if (c->stats[DS_TL]->nvals && !h->codecs[DS_TL]) goto_err;
1805
1806
1807 //fprintf(stderr, "=== RI ===\n");
1808 h->codecs[DS_RI] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RI]),
1809 c->stats[DS_RI], E_INT, NULL,
1810 fd->version, &fd->vv);
1811 if (c->stats[DS_RI]->nvals && !h->codecs[DS_RI]) goto_err;
1812
1813 //fprintf(stderr, "=== RS ===\n");
1814 h->codecs[DS_RS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RS]),
1815 c->stats[DS_RS], E_INT, NULL,
1816 fd->version, &fd->vv);
1817 if (c->stats[DS_RS]->nvals && !h->codecs[DS_RS]) goto_err;
1818
1819 //fprintf(stderr, "=== PD ===\n");
1820 h->codecs[DS_PD] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_PD]),
1821 c->stats[DS_PD], E_INT, NULL,
1822 fd->version, &fd->vv);
1823 if (c->stats[DS_PD]->nvals && !h->codecs[DS_PD]) goto_err;
1824
1825 //fprintf(stderr, "=== HC ===\n");
1826 h->codecs[DS_HC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_HC]),
1827 c->stats[DS_HC], E_INT, NULL,
1828 fd->version, &fd->vv);
1829 if (c->stats[DS_HC]->nvals && !h->codecs[DS_HC]) goto_err;
1830
1831 //fprintf(stderr, "=== SC ===\n");
1832 if (1) {
1833 int i2[2] = {0, DS_SC};
1834
1835 h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
1836 E_BYTE_ARRAY, (void *)i2,
1837 fd->version, &fd->vv);
1838 } else {
1839 // Appears to be no practical benefit to using this method,
1840 // but it may work better if we start mixing SC, IN and BB
1841 // elements into the same external block.
1842 cram_byte_array_len_encoder e;
1843
1844 e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4
1845 ? E_VARINT_UNSIGNED
1846 : E_EXTERNAL;
1847 e.len_dat = (void *)DS_SC_len;
1848
1849 e.val_encoding = E_EXTERNAL;
1850 e.val_dat = (void *)DS_SC;
1851
1852 h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
1853 E_BYTE_ARRAY, (void *)&e,
1854 fd->version, &fd->vv);
1855 }
1856 if (!h->codecs[DS_SC]) goto_err;
1857 }
1858
1859 //fprintf(stderr, "=== IN ===\n");
1860 {
1861 int i2[2] = {0, DS_IN};
1862 h->codecs[DS_IN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
1863 E_BYTE_ARRAY, (void *)i2,
1864 fd->version, &fd->vv);
1865 if (!h->codecs[DS_IN]) goto_err;
1866 }
1867
1868 h->codecs[DS_QS] = cram_encoder_init(E_EXTERNAL, NULL, E_BYTE,
1869 (void *)DS_QS,
1870 fd->version, &fd->vv);
1871 if (!h->codecs[DS_QS]) goto_err;
1872 {
1873 int i2[2] = {0, DS_RN};
1874 h->codecs[DS_RN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
1875 E_BYTE_ARRAY, (void *)i2,
1876 fd->version, &fd->vv);
1877 if (!h->codecs[DS_RN]) goto_err;
1878 }
1879
1880
1881 /* Encode slices */
1882 for (i = 0; i < c->curr_slice; i++) {
1883 hts_log_info("Encode slice %d", i);
1884
1885 if (cram_encode_slice(fd, c, h, c->slices[i]) != 0)
1886 return -1;
1887 }
1888
1889 /* Create compression header */
1890 {
1891 h->ref_seq_id = c->ref_seq_id;
1892 h->ref_seq_start = c->ref_seq_start;
1893 h->ref_seq_span = c->ref_seq_span;
1894 h->num_records = c->num_records;
1895 h->qs_seq_orient = c->qs_seq_orient;
1896 // slight misnomer - sorted or treat as-if sorted (ap_delta force to 1)
1897 h->AP_delta = c->pos_sorted;
1898 memcpy(h->substitution_matrix, CRAM_SUBST_MATRIX, 20);
1899
1900 if (!(c_hdr = cram_encode_compression_header(fd, c, h)))
1901 return -1;
1902 }
1903
1904 /* Compute landmarks */
1905 /* Fill out slice landmarks */
1906 c->num_landmarks = c->curr_slice;
1907 c->landmark = malloc(c->num_landmarks * sizeof(*c->landmark));
1908 if (!c->landmark)
1909 return -1;
1910
1911 /*
1912 * Slice offset starts after the first block, so we need to simulate
1913 * writing it to work out the correct offset
1914 */
1915 {
1916 slice_offset = c_hdr->method == RAW
1917 ? c_hdr->uncomp_size
1918 : c_hdr->comp_size;
1919 slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
1920 fd->vv.varint_size(c_hdr->content_id) +
1921 fd->vv.varint_size(c_hdr->comp_size) +
1922 fd->vv.varint_size(c_hdr->uncomp_size);
1923 }
1924
1925 c->ref_seq_id = c->slices[0]->hdr->ref_seq_id;
1926 c->ref_seq_start = c->slices[0]->hdr->ref_seq_start;
1927 c->ref_seq_span = c->slices[0]->hdr->ref_seq_span;
1928 for (i = 0; i < c->curr_slice; i++) {
1929 cram_slice *s = c->slices[i];
1930
1931 c->num_blocks += s->hdr->num_blocks + 1; // slice header
1932 c->landmark[i] = slice_offset;
1933
1934 if (s->hdr->ref_seq_start + s->hdr->ref_seq_span >
1935 c->ref_seq_start + c->ref_seq_span) {
1936 c->ref_seq_span = s->hdr->ref_seq_start + s->hdr->ref_seq_span
1937 - c->ref_seq_start;
1938 }
1939
1940 slice_offset += s->hdr_block->method == RAW
1941 ? s->hdr_block->uncomp_size
1942 : s->hdr_block->comp_size;
1943
1944 slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
1945 fd->vv.varint_size(s->hdr_block->content_id) +
1946 fd->vv.varint_size(s->hdr_block->comp_size) +
1947 fd->vv.varint_size(s->hdr_block->uncomp_size);
1948
1949 for (j = 0; j < s->hdr->num_blocks; j++) {
1950 slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
1951 fd->vv.varint_size(s->block[j]->content_id) +
1952 fd->vv.varint_size(s->block[j]->comp_size) +
1953 fd->vv.varint_size(s->block[j]->uncomp_size);
1954
1955 slice_offset += s->block[j]->method == RAW
1956 ? s->block[j]->uncomp_size
1957 : s->block[j]->comp_size;
1958 }
1959 }
1960 c->length += slice_offset; // just past the final slice
1961
1962 c->comp_hdr_block = c_hdr;
1963
1964 if (c->ref_seq_id >= 0) {
1965 cram_ref_decr(fd->refs, c->ref_seq_id);
1966 }
1967
1968 /* Cache references up-front if we have unsorted access patterns */
1969 if (!fd->no_ref && c->refs_used) {
1970 for (i = 0; i < fd->refs->nref; i++) {
1971 if (c->refs_used[i])
1972 cram_ref_decr(fd->refs, i);
1973 }
1974 }
1975
1976 return 0;
1977
1978 err:
1979 return -1;
1980 }
1981
1982
1983 /*
1984 * Adds a feature code to a read within a slice. For purposes of minimising
1985 * memory allocations and fragmentation we have one array of features for all
1986 * reads within the slice. We return the index into this array for this new
1987 * feature.
1988 *
1989 * Returns feature index on success
1990 * -1 on failure.
1991 */
cram_add_feature(cram_container * c,cram_slice * s,cram_record * r,cram_feature * f)1992 static int cram_add_feature(cram_container *c, cram_slice *s,
1993 cram_record *r, cram_feature *f) {
1994 if (s->nfeatures >= s->afeatures) {
1995 s->afeatures = s->afeatures ? s->afeatures*2 : 1024;
1996 s->features = realloc(s->features, s->afeatures*sizeof(*s->features));
1997 if (!s->features)
1998 return -1;
1999 }
2000
2001 if (!r->nfeature++) {
2002 r->feature = s->nfeatures;
2003 if (cram_stats_add(c->stats[DS_FP], f->X.pos) < 0)
2004 return -1;
2005 } else {
2006 if (cram_stats_add(c->stats[DS_FP],
2007 f->X.pos - s->features[r->feature + r->nfeature-2].X.pos) < 0)
2008 return -1;
2009
2010 }
2011 if (cram_stats_add(c->stats[DS_FC], f->X.code) < 0)
2012 return -1;
2013
2014 s->features[s->nfeatures++] = *f;
2015
2016 return 0;
2017 }
2018
cram_add_substitution(cram_fd * fd,cram_container * c,cram_slice * s,cram_record * r,int pos,char base,char qual,char ref)2019 static int cram_add_substitution(cram_fd *fd, cram_container *c,
2020 cram_slice *s, cram_record *r,
2021 int pos, char base, char qual, char ref) {
2022 cram_feature f;
2023
2024 // seq=ACGTN vs ref=ACGT or seq=ACGT vs ref=ACGTN
2025 if (fd->L2[(uc)base]<4 || (fd->L2[(uc)base]<5 && fd->L2[(uc)ref]<4)) {
2026 f.X.pos = pos+1;
2027 f.X.code = 'X';
2028 f.X.base = fd->cram_sub_matrix[ref&0x1f][base&0x1f];
2029 if (cram_stats_add(c->stats[DS_BS], f.X.base) < 0)
2030 return -1;
2031 } else {
2032 f.B.pos = pos+1;
2033 f.B.code = 'B';
2034 f.B.base = base;
2035 f.B.qual = qual;
2036 if (cram_stats_add(c->stats[DS_BA], f.B.base) < 0) return -1;
2037 if (cram_stats_add(c->stats[DS_QS], f.B.qual) < 0) return -1;
2038 BLOCK_APPEND_CHAR(s->qual_blk, qual);
2039 }
2040 return cram_add_feature(c, s, r, &f);
2041
2042 block_err:
2043 return -1;
2044 }
2045
cram_add_bases(cram_fd * fd,cram_container * c,cram_slice * s,cram_record * r,int pos,int len,char * base)2046 static int cram_add_bases(cram_fd *fd, cram_container *c,
2047 cram_slice *s, cram_record *r,
2048 int pos, int len, char *base) {
2049 cram_feature f;
2050
2051 f.b.pos = pos+1;
2052 f.b.code = 'b';
2053 f.b.seq_idx = base - (char *)BLOCK_DATA(s->seqs_blk);
2054 f.b.len = len;
2055
2056 return cram_add_feature(c, s, r, &f);
2057 }
2058
cram_add_base(cram_fd * fd,cram_container * c,cram_slice * s,cram_record * r,int pos,char base,char qual)2059 static int cram_add_base(cram_fd *fd, cram_container *c,
2060 cram_slice *s, cram_record *r,
2061 int pos, char base, char qual) {
2062 cram_feature f;
2063 f.B.pos = pos+1;
2064 f.B.code = 'B';
2065 f.B.base = base;
2066 f.B.qual = qual;
2067 if (cram_stats_add(c->stats[DS_BA], base) < 0) return -1;
2068 if (cram_stats_add(c->stats[DS_QS], qual) < 0) return -1;
2069 BLOCK_APPEND_CHAR(s->qual_blk, qual);
2070 return cram_add_feature(c, s, r, &f);
2071
2072 block_err:
2073 return -1;
2074 }
2075
cram_add_quality(cram_fd * fd,cram_container * c,cram_slice * s,cram_record * r,int pos,char qual)2076 static int cram_add_quality(cram_fd *fd, cram_container *c,
2077 cram_slice *s, cram_record *r,
2078 int pos, char qual) {
2079 cram_feature f;
2080 f.Q.pos = pos+1;
2081 f.Q.code = 'Q';
2082 f.Q.qual = qual;
2083 if (cram_stats_add(c->stats[DS_QS], qual) < 0) return -1;
2084 BLOCK_APPEND_CHAR(s->qual_blk, qual);
2085 return cram_add_feature(c, s, r, &f);
2086
2087 block_err:
2088 return -1;
2089 }
2090
cram_add_deletion(cram_container * c,cram_slice * s,cram_record * r,int pos,int len,char * base)2091 static int cram_add_deletion(cram_container *c, cram_slice *s, cram_record *r,
2092 int pos, int len, char *base) {
2093 cram_feature f;
2094 f.D.pos = pos+1;
2095 f.D.code = 'D';
2096 f.D.len = len;
2097 if (cram_stats_add(c->stats[DS_DL], len) < 0) return -1;
2098 return cram_add_feature(c, s, r, &f);
2099 }
2100
cram_add_softclip(cram_container * c,cram_slice * s,cram_record * r,int pos,int len,char * base,int version)2101 static int cram_add_softclip(cram_container *c, cram_slice *s, cram_record *r,
2102 int pos, int len, char *base, int version) {
2103 cram_feature f;
2104 f.S.pos = pos+1;
2105 f.S.code = 'S';
2106 f.S.len = len;
2107 switch (CRAM_MAJOR_VERS(version)) {
2108 case 1:
2109 f.S.seq_idx = BLOCK_SIZE(s->base_blk);
2110 BLOCK_APPEND(s->base_blk, base, len);
2111 BLOCK_APPEND_CHAR(s->base_blk, '\0');
2112 break;
2113
2114 case 2:
2115 default:
2116 f.S.seq_idx = BLOCK_SIZE(s->soft_blk);
2117 if (base) {
2118 BLOCK_APPEND(s->soft_blk, base, len);
2119 } else {
2120 int i;
2121 for (i = 0; i < len; i++)
2122 BLOCK_APPEND_CHAR(s->soft_blk, 'N');
2123 }
2124 BLOCK_APPEND_CHAR(s->soft_blk, '\0');
2125 break;
2126
2127 //default:
2128 // // v3.0 onwards uses BB data-series
2129 // f.S.seq_idx = BLOCK_SIZE(s->soft_blk);
2130 }
2131 return cram_add_feature(c, s, r, &f);
2132
2133 block_err:
2134 return -1;
2135 }
2136
cram_add_hardclip(cram_container * c,cram_slice * s,cram_record * r,int pos,int len,char * base)2137 static int cram_add_hardclip(cram_container *c, cram_slice *s, cram_record *r,
2138 int pos, int len, char *base) {
2139 cram_feature f;
2140 f.S.pos = pos+1;
2141 f.S.code = 'H';
2142 f.S.len = len;
2143 if (cram_stats_add(c->stats[DS_HC], len) < 0) return -1;
2144 return cram_add_feature(c, s, r, &f);
2145 }
2146
cram_add_skip(cram_container * c,cram_slice * s,cram_record * r,int pos,int len,char * base)2147 static int cram_add_skip(cram_container *c, cram_slice *s, cram_record *r,
2148 int pos, int len, char *base) {
2149 cram_feature f;
2150 f.S.pos = pos+1;
2151 f.S.code = 'N';
2152 f.S.len = len;
2153 if (cram_stats_add(c->stats[DS_RS], len) < 0) return -1;
2154 return cram_add_feature(c, s, r, &f);
2155 }
2156
cram_add_pad(cram_container * c,cram_slice * s,cram_record * r,int pos,int len,char * base)2157 static int cram_add_pad(cram_container *c, cram_slice *s, cram_record *r,
2158 int pos, int len, char *base) {
2159 cram_feature f;
2160 f.S.pos = pos+1;
2161 f.S.code = 'P';
2162 f.S.len = len;
2163 if (cram_stats_add(c->stats[DS_PD], len) < 0) return -1;
2164 return cram_add_feature(c, s, r, &f);
2165 }
2166
cram_add_insertion(cram_container * c,cram_slice * s,cram_record * r,int pos,int len,char * base)2167 static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r,
2168 int pos, int len, char *base) {
2169 cram_feature f;
2170 f.I.pos = pos+1;
2171 if (len == 1) {
2172 char b = base ? *base : 'N';
2173 f.i.code = 'i';
2174 f.i.base = b;
2175 if (cram_stats_add(c->stats[DS_BA], b) < 0) return -1;
2176 } else {
2177 f.I.code = 'I';
2178 f.I.len = len;
2179 f.S.seq_idx = BLOCK_SIZE(s->base_blk);
2180 if (base) {
2181 BLOCK_APPEND(s->base_blk, base, len);
2182 } else {
2183 int i;
2184 for (i = 0; i < len; i++)
2185 BLOCK_APPEND_CHAR(s->base_blk, 'N');
2186 }
2187 BLOCK_APPEND_CHAR(s->base_blk, '\0');
2188 }
2189 return cram_add_feature(c, s, r, &f);
2190
2191 block_err:
2192 return -1;
2193 }
2194
2195 /*
2196 * Encodes auxiliary data. Largely duplicated from above, but done so to
2197 * keep it simple and avoid a myriad of version ifs.
2198 *
2199 * Returns the read-group parsed out of the BAM aux fields on success
2200 * NULL on failure or no rg present, also sets "*err" to non-zero
2201 */
cram_encode_aux(cram_fd * fd,bam_seq_t * b,cram_container * c,cram_slice * s,cram_record * cr,int verbatim_NM,int verbatim_MD,int NM,kstring_t * MD,int * err)2202 static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c,
2203 cram_slice *s, cram_record *cr,
2204 int verbatim_NM, int verbatim_MD,
2205 int NM, kstring_t *MD,
2206 int *err) {
2207 char *aux, *orig, *rg = NULL;
2208 int aux_size = bam_get_l_aux(b);
2209 cram_block *td_b = c->comp_hdr->TD_blk;
2210 int TD_blk_size = BLOCK_SIZE(td_b), new;
2211 char *key;
2212 khint_t k;
2213
2214 if (err) *err = 1;
2215
2216 orig = aux = (char *)bam_aux(b);
2217
2218 // Copy aux keys to td_b and aux values to slice aux blocks
2219 while (aux - orig < aux_size && aux[0] != 0) {
2220 int r;
2221
2222 // RG:Z
2223 if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') {
2224 rg = &aux[3];
2225 while (*aux++);
2226 if (CRAM_MAJOR_VERS(fd->version) >= 4)
2227 BLOCK_APPEND(td_b, "RG*", 3);
2228 continue;
2229 }
2230
2231 // MD:Z
2232 if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') {
2233 if (cr->len && !fd->no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_MD) {
2234 if (MD && MD->s && strncasecmp(MD->s, aux+3, orig + aux_size - (aux+3)) == 0) {
2235 while (*aux++);
2236 if (CRAM_MAJOR_VERS(fd->version) >= 4)
2237 BLOCK_APPEND(td_b, "MD*", 3);
2238 continue;
2239 }
2240 }
2241 }
2242
2243 // NM:i
2244 if (aux[0] == 'N' && aux[1] == 'M') {
2245 if (cr->len && !fd->no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_NM) {
2246 int NM_ = bam_aux2i((uint8_t *)aux+2);
2247 if (NM_ == NM) {
2248 switch(aux[2]) {
2249 case 'A': case 'C': case 'c': aux+=4; break;
2250 case 'S': case 's': aux+=5; break;
2251 case 'I': case 'i': case 'f': aux+=7; break;
2252 default:
2253 hts_log_error("Unhandled type code for NM tag");
2254 return NULL;
2255 }
2256 if (CRAM_MAJOR_VERS(fd->version) >= 4)
2257 BLOCK_APPEND(td_b, "NM*", 3);
2258 continue;
2259 }
2260 }
2261 }
2262
2263 BLOCK_APPEND(td_b, aux, 3);
2264
2265 // Container level tags_used, for TD series
2266 // Maps integer key ('X0i') to cram_tag_map struct.
2267 int key = (aux[0]<<16)|(aux[1]<<8)|aux[2];
2268 k = kh_put(m_tagmap, c->tags_used, key, &r);
2269 if (-1 == r)
2270 return NULL;
2271 else if (r != 0)
2272 kh_val(c->tags_used, k) = NULL;
2273
2274 if (r == 1) {
2275 khint_t k_global;
2276
2277 // Global tags_used for cram_metrics support
2278 pthread_mutex_lock(&fd->metrics_lock);
2279 k_global = kh_put(m_metrics, fd->tags_used, key, &r);
2280 if (-1 == r) {
2281 pthread_mutex_unlock(&fd->metrics_lock);
2282 return NULL;
2283 }
2284 if (r >= 1) {
2285 kh_val(fd->tags_used, k_global) = cram_new_metrics();
2286 if (!kh_val(fd->tags_used, k_global)) {
2287 kh_del(m_metrics, fd->tags_used, k_global);
2288 pthread_mutex_unlock(&fd->metrics_lock);
2289 goto err;
2290 }
2291 }
2292
2293 pthread_mutex_unlock(&fd->metrics_lock);
2294
2295 int i2[2] = {'\t',key};
2296 size_t sk = key;
2297 cram_tag_map *m = calloc(1, sizeof(*m));
2298 if (!m)
2299 goto_err;
2300 kh_val(c->tags_used, k) = m;
2301
2302 cram_codec *c;
2303
2304 // Use a block content id based on the tag id.
2305 // Codec type depends on tag data type.
2306 switch(aux[2]) {
2307 case 'Z': case 'H':
2308 // string as byte_array_stop
2309 c = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
2310 E_BYTE_ARRAY, (void *)i2,
2311 fd->version, &fd->vv);
2312 break;
2313
2314 case 'A': case 'c': case 'C': {
2315 // byte array len, 1 byte
2316 cram_byte_array_len_encoder e;
2317 cram_stats st;
2318
2319 if (CRAM_MAJOR_VERS(fd->version) <= 3) {
2320 e.len_encoding = E_HUFFMAN;
2321 e.len_dat = NULL; // will get codes from st
2322 } else {
2323 e.len_encoding = E_CONST_INT;
2324 e.len_dat = NULL; // will get codes from st
2325 }
2326 memset(&st, 0, sizeof(st));
2327 if (cram_stats_add(&st, 1) < 0) goto block_err;
2328 cram_stats_encoding(fd, &st);
2329
2330 e.val_encoding = E_EXTERNAL;
2331 e.val_dat = (void *)sk;
2332
2333 c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
2334 E_BYTE_ARRAY, (void *)&e,
2335 fd->version, &fd->vv);
2336 break;
2337 }
2338
2339 case 's': case 'S': {
2340 // byte array len, 2 byte
2341 cram_byte_array_len_encoder e;
2342 cram_stats st;
2343
2344 if (CRAM_MAJOR_VERS(fd->version) <= 3) {
2345 e.len_encoding = E_HUFFMAN;
2346 e.len_dat = NULL; // will get codes from st
2347 } else {
2348 e.len_encoding = E_CONST_INT;
2349 e.len_dat = NULL; // will get codes from st
2350 }
2351 memset(&st, 0, sizeof(st));
2352 if (cram_stats_add(&st, 2) < 0) goto block_err;
2353 cram_stats_encoding(fd, &st);
2354
2355 e.val_encoding = E_EXTERNAL;
2356 e.val_dat = (void *)sk;
2357
2358 c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
2359 E_BYTE_ARRAY, (void *)&e,
2360 fd->version, &fd->vv);
2361 break;
2362 }
2363 case 'i': case 'I': case 'f': {
2364 // byte array len, 4 byte
2365 cram_byte_array_len_encoder e;
2366 cram_stats st;
2367
2368 if (CRAM_MAJOR_VERS(fd->version) <= 3) {
2369 e.len_encoding = E_HUFFMAN;
2370 e.len_dat = NULL; // will get codes from st
2371 } else {
2372 e.len_encoding = E_CONST_INT;
2373 e.len_dat = NULL; // will get codes from st
2374 }
2375 memset(&st, 0, sizeof(st));
2376 if (cram_stats_add(&st, 4) < 0) goto block_err;
2377 cram_stats_encoding(fd, &st);
2378
2379 e.val_encoding = E_EXTERNAL;
2380 e.val_dat = (void *)sk;
2381
2382 c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
2383 E_BYTE_ARRAY, (void *)&e,
2384 fd->version, &fd->vv);
2385 break;
2386 }
2387
2388 case 'B': {
2389 // Byte array of variable size, but we generate our tag
2390 // byte stream at the wrong stage (during reading and not
2391 // after slice header construction). So we use
2392 // BYTE_ARRAY_LEN with the length codec being external
2393 // too.
2394 cram_byte_array_len_encoder e;
2395
2396 e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4
2397 ? E_VARINT_UNSIGNED
2398 : E_EXTERNAL;
2399 e.len_dat = (void *)sk; // or key+128 for len?
2400
2401 e.val_encoding = E_EXTERNAL;
2402 e.val_dat = (void *)sk;
2403
2404 c = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
2405 E_BYTE_ARRAY, (void *)&e,
2406 fd->version, &fd->vv);
2407 break;
2408 }
2409
2410 default:
2411 hts_log_error("Unsupported SAM aux type '%c'", aux[2]);
2412 c = NULL;
2413 }
2414
2415 if (!c)
2416 goto_err;
2417
2418 m->codec = c;
2419
2420 // Link to fd-global tag metrics
2421 pthread_mutex_lock(&fd->metrics_lock);
2422 m->m = k_global ? (cram_metrics *)kh_val(fd->tags_used, k_global) : NULL;
2423 pthread_mutex_unlock(&fd->metrics_lock);
2424 }
2425
2426 cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k);
2427 if (!tm) goto_err;
2428 cram_codec *codec = tm->codec;
2429 if (!tm->codec) goto_err;
2430
2431 switch(aux[2]) {
2432 case 'A': case 'C': case 'c':
2433 if (!tm->blk) {
2434 if (!(tm->blk = cram_new_block(EXTERNAL, key)))
2435 return NULL;
2436 codec->u.e_byte_array_len.val_codec->out = tm->blk;
2437 }
2438
2439 aux+=3;
2440 //codec->encode(s, codec, aux, 1);
2441 // Functionally equivalent, but less code.
2442 BLOCK_APPEND_CHAR(tm->blk, *aux);
2443 aux++;
2444 break;
2445
2446 case 'S': case 's':
2447 if (!tm->blk) {
2448 if (!(tm->blk = cram_new_block(EXTERNAL, key)))
2449 return NULL;
2450 codec->u.e_byte_array_len.val_codec->out = tm->blk;
2451 }
2452
2453 aux+=3;
2454 //codec->encode(s, codec, aux, 2);
2455 BLOCK_APPEND(tm->blk, aux, 2);
2456 aux+=2;
2457 break;
2458
2459 case 'I': case 'i': case 'f':
2460 if (!tm->blk) {
2461 if (!(tm->blk = cram_new_block(EXTERNAL, key)))
2462 return NULL;
2463 codec->u.e_byte_array_len.val_codec->out = tm->blk;
2464 }
2465
2466 aux+=3;
2467 //codec->encode(s, codec, aux, 4);
2468 BLOCK_APPEND(tm->blk, aux, 4);
2469 aux+=4;
2470 break;
2471
2472 case 'd':
2473 if (!tm->blk) {
2474 if (!(tm->blk = cram_new_block(EXTERNAL, key)))
2475 return NULL;
2476 codec->u.e_byte_array_len.val_codec->out = tm->blk;
2477 }
2478
2479 aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
2480 //codec->encode(s, codec, aux, 8);
2481 BLOCK_APPEND(tm->blk, aux, 8);
2482 aux+=8;
2483 break;
2484
2485 case 'Z': case 'H':
2486 {
2487 if (!tm->blk) {
2488 if (!(tm->blk = cram_new_block(EXTERNAL, key)))
2489 return NULL;
2490 codec->out = tm->blk;
2491 }
2492
2493 char *aux_s;
2494 aux += 3;
2495 aux_s = aux;
2496 while (*aux++);
2497 if (codec->encode(s, codec, aux_s, aux - aux_s) < 0)
2498 return NULL;
2499 }
2500 break;
2501
2502 case 'B': {
2503 int type = aux[3], blen;
2504 uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) +
2505 (((unsigned char *)aux)[5]<< 8) +
2506 (((unsigned char *)aux)[6]<<16) +
2507 (((unsigned char *)aux)[7]<<24));
2508 if (!tm->blk) {
2509 if (!(tm->blk = cram_new_block(EXTERNAL, key)))
2510 return NULL;
2511 if (codec->u.e_byte_array_len.val_codec->codec == E_XDELTA) {
2512 if (!(tm->blk2 = cram_new_block(EXTERNAL, key+128)))
2513 return NULL;
2514 codec->u.e_byte_array_len.len_codec->out = tm->blk2;
2515 codec->u.e_byte_array_len.val_codec->u.e_xdelta.sub_codec->out = tm->blk;
2516 } else {
2517 codec->u.e_byte_array_len.len_codec->out = tm->blk;
2518 codec->u.e_byte_array_len.val_codec->out = tm->blk;
2519 }
2520 }
2521
2522 // skip TN field
2523 aux+=3;
2524
2525 // We use BYTE_ARRAY_LEN with external length, so store that first
2526 switch (type) {
2527 case 'c': case 'C':
2528 blen = count;
2529 break;
2530 case 's': case 'S':
2531 blen = 2*count;
2532 break;
2533 case 'i': case 'I': case 'f':
2534 blen = 4*count;
2535 break;
2536 default:
2537 hts_log_error("Unknown sub-type '%c' for aux type 'B'", type);
2538 return NULL;
2539 }
2540
2541 blen += 5; // sub-type & length
2542
2543 if (codec->encode(s, codec, aux, blen) < 0)
2544 return NULL;
2545 aux += blen;
2546 break;
2547 }
2548 default:
2549 hts_log_error("Unknown aux type '%c'", aux[2]);
2550 return NULL;
2551 }
2552 tm->blk->m = tm->m;
2553 }
2554
2555 // FIXME: sort BLOCK_DATA(td_b) by char[3] triples
2556
2557 // And and increment TD hash entry
2558 BLOCK_APPEND_CHAR(td_b, 0);
2559
2560 // Duplicate key as BLOCK_DATA() can be realloced to a new pointer.
2561 key = string_ndup(c->comp_hdr->TD_keys,
2562 (char *)BLOCK_DATA(td_b) + TD_blk_size,
2563 BLOCK_SIZE(td_b) - TD_blk_size);
2564 if (!key)
2565 goto block_err;
2566 k = kh_put(m_s2i, c->comp_hdr->TD_hash, key, &new);
2567 if (new < 0) {
2568 return NULL;
2569 } else if (new == 0) {
2570 BLOCK_SIZE(td_b) = TD_blk_size;
2571 } else {
2572 kh_val(c->comp_hdr->TD_hash, k) = c->comp_hdr->nTL;
2573 c->comp_hdr->nTL++;
2574 }
2575
2576 cr->TL = kh_val(c->comp_hdr->TD_hash, k);
2577 if (cram_stats_add(c->stats[DS_TL], cr->TL) < 0)
2578 goto block_err;
2579
2580 if (err) *err = 0;
2581 return rg;
2582
2583 err:
2584 block_err:
2585 return NULL;
2586 }
2587
2588 /*
2589 * During cram_next_container or before the final flush at end of
2590 * file, we update the current slice headers and increment the slice
2591 * number to the next slice.
2592 *
2593 * See cram_next_container() and cram_close().
2594 */
cram_update_curr_slice(cram_container * c)2595 void cram_update_curr_slice(cram_container *c) {
2596 cram_slice *s = c->slice;
2597 if (c->multi_seq) {
2598 s->hdr->ref_seq_id = -2;
2599 s->hdr->ref_seq_start = 0;
2600 s->hdr->ref_seq_span = 0;
2601 } else {
2602 s->hdr->ref_seq_id = c->curr_ref;
2603 s->hdr->ref_seq_start = c->first_base;
2604 s->hdr->ref_seq_span = MAX(0, c->last_base - c->first_base + 1);
2605 }
2606 s->hdr->num_records = c->curr_rec;
2607
2608 if (c->curr_slice == 0) {
2609 if (c->ref_seq_id != s->hdr->ref_seq_id)
2610 c->ref_seq_id = s->hdr->ref_seq_id;
2611 c->ref_seq_start = c->first_base;
2612 }
2613
2614 c->curr_slice++;
2615 }
2616
2617 /*
2618 * Handles creation of a new container or new slice, flushing any
2619 * existing containers when appropriate.
2620 *
2621 * Really this is next slice, which may or may not lead to a new container.
2622 *
2623 * Returns cram_container pointer on success
2624 * NULL on failure.
2625 */
cram_next_container(cram_fd * fd,bam_seq_t * b)2626 static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) {
2627 cram_container *c = fd->ctr;
2628 int i;
2629
2630 /* First occurrence */
2631 if (c->curr_ref == -2)
2632 c->curr_ref = bam_ref(b);
2633
2634 if (c->slice)
2635 cram_update_curr_slice(c);
2636
2637 /* Flush container */
2638 if (c->curr_slice == c->max_slice ||
2639 (bam_ref(b) != c->curr_ref && !c->multi_seq)) {
2640 c->ref_seq_span = fd->last_base - c->ref_seq_start + 1;
2641 hts_log_info("Flush container %d/%"PRId64"..%"PRId64,
2642 c->ref_seq_id, c->ref_seq_start,
2643 c->ref_seq_start + c->ref_seq_span -1);
2644
2645 /* Encode slices */
2646 if (-1 == cram_flush_container_mt(fd, c))
2647 return NULL;
2648 if (!fd->pool) {
2649 // Move to sep func, as we need cram_flush_container for
2650 // the closing phase to flush the partial container.
2651 for (i = 0; i < c->max_slice; i++) {
2652 cram_free_slice(c->slices[i]);
2653 c->slices[i] = NULL;
2654 }
2655
2656 c->slice = NULL;
2657 c->curr_slice = 0;
2658
2659 /* Easy approach for purposes of freeing stats */
2660 cram_free_container(c);
2661 }
2662
2663 c = fd->ctr = cram_new_container(fd->seqs_per_slice,
2664 fd->slices_per_container);
2665 if (!c)
2666 return NULL;
2667 c->record_counter = fd->record_counter;
2668 c->curr_ref = bam_ref(b);
2669 }
2670
2671 c->last_pos = c->first_base = c->last_base = bam_pos(b)+1;
2672
2673 /* New slice */
2674 c->slice = c->slices[c->curr_slice] =
2675 cram_new_slice(MAPPED_SLICE, c->max_rec);
2676 if (!c->slice)
2677 return NULL;
2678
2679 if (c->multi_seq) {
2680 c->slice->hdr->ref_seq_id = -2;
2681 c->slice->hdr->ref_seq_start = 0;
2682 c->slice->last_apos = 1;
2683 } else {
2684 c->slice->hdr->ref_seq_id = bam_ref(b);
2685 // wrong for unsorted data, will fix during encoding.
2686 c->slice->hdr->ref_seq_start = bam_pos(b)+1;
2687 c->slice->last_apos = bam_pos(b)+1;
2688 }
2689
2690 c->curr_rec = 0;
2691 c->s_num_bases = 0;
2692 c->n_mapped = 0;
2693
2694 // QO field: 0 implies original orientation, 1 implies sequence orientation
2695 // 1 is often preferable for NovaSeq, but impact is slight. ~0.5% diff.
2696 // Conversely other data sets it's often better than 1% saving for 0.
2697 // Short of trying both and learning, for now we use use 0 for V4, 1 for V3.
2698 c->qs_seq_orient = CRAM_MAJOR_VERS(fd->version) >= 4 ? 0 : 1;
2699
2700 return c;
2701 }
2702
2703 /*
2704 * Converts a single bam record into a cram record.
2705 * Possibly used within a thread.
2706 *
2707 * Returns 0 on success;
2708 * -1 on failure
2709 */
process_one_read(cram_fd * fd,cram_container * c,cram_slice * s,cram_record * cr,bam_seq_t * b,int rnum,kstring_t * MD)2710 static int process_one_read(cram_fd *fd, cram_container *c,
2711 cram_slice *s, cram_record *cr,
2712 bam_seq_t *b, int rnum, kstring_t *MD) {
2713 int i, fake_qual = -1, NM = 0;
2714 char *cp, *rg;
2715 char *ref, *seq, *qual;
2716
2717 // Any places with N in seq and/or reference can lead to ambiguous
2718 // interpretation of the SAM NM:i tag. So we store these verbatim
2719 // to ensure valid data round-trips the same regardless of who
2720 // defines it as valid.
2721 // Similarly when alignments go beyond end of the reference.
2722 int verbatim_NM = fd->store_nm;
2723 int verbatim_MD = fd->store_md;
2724
2725 // FIXME: multi-ref containers
2726
2727 ref = c->ref;
2728 cr->flags = bam_flag(b);
2729 cr->len = bam_seq_len(b);
2730 if (!bam_aux_get(b, "MD"))
2731 MD = NULL;
2732 else
2733 MD->l = 0;
2734
2735 //fprintf(stderr, "%s => %d\n", rg ? rg : "\"\"", cr->rg);
2736
2737 cr->ref_id = bam_ref(b);
2738 if (cram_stats_add(c->stats[DS_RI], cr->ref_id) < 0)
2739 goto block_err;
2740 if (cram_stats_add(c->stats[DS_BF], fd->cram_flag_swap[cr->flags & 0xfff]) < 0)
2741 goto block_err;
2742
2743 // Non reference based encoding means storing the bases verbatim as features, which in
2744 // turn means every base also has a quality already stored.
2745 if (!fd->no_ref || CRAM_MAJOR_VERS(fd->version) >= 3)
2746 cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES;
2747
2748 if (cr->len <= 0 && CRAM_MAJOR_VERS(fd->version) >= 3)
2749 cr->cram_flags |= CRAM_FLAG_NO_SEQ;
2750 //cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK);
2751
2752 c->num_bases += cr->len;
2753 cr->apos = bam_pos(b)+1;
2754 if (c->pos_sorted) {
2755 if (cr->apos < s->last_apos && !fd->ap_delta) {
2756 c->pos_sorted = 0;
2757 } else {
2758 if (cram_stats_add(c->stats[DS_AP], cr->apos - s->last_apos) < 0)
2759 goto block_err;
2760 s->last_apos = cr->apos;
2761 }
2762 } else {
2763 //cram_stats_add(c->stats[DS_AP], cr->apos);
2764 }
2765 c->max_apos += (cr->apos > c->max_apos) * (cr->apos - c->max_apos);
2766
2767 /*
2768 * This seqs_ds is largely pointless and it could reuse the same memory
2769 * over and over.
2770 * s->base_blk is what we need for encoding.
2771 */
2772 cr->seq = BLOCK_SIZE(s->seqs_blk);
2773 cr->qual = BLOCK_SIZE(s->qual_blk);
2774 BLOCK_GROW(s->seqs_blk, cr->len+1);
2775 BLOCK_GROW(s->qual_blk, cr->len);
2776
2777 // Convert BAM nibble encoded sequence to string of base pairs
2778 seq = cp = (char *)BLOCK_END(s->seqs_blk);
2779 *seq = 0;
2780 nibble2base(bam_seq(b), cp, cr->len);
2781 BLOCK_SIZE(s->seqs_blk) += cr->len;
2782
2783 qual = cp = (char *)bam_qual(b);
2784
2785
2786 /* Copy and parse */
2787 if (!(cr->flags & BAM_FUNMAP)) {
2788 uint32_t *cig_to, *cig_from;
2789 int64_t apos = cr->apos-1, spos = 0;
2790 int64_t MD_last = apos; // last position of edit in MD tag
2791
2792 cr->cigar = s->ncigar;
2793 cr->ncigar = bam_cigar_len(b);
2794 while (cr->cigar + cr->ncigar >= s->cigar_alloc) {
2795 s->cigar_alloc = s->cigar_alloc ? s->cigar_alloc*2 : 1024;
2796 s->cigar = realloc(s->cigar, s->cigar_alloc * sizeof(*s->cigar));
2797 if (!s->cigar)
2798 return -1;
2799 }
2800
2801 cig_to = (uint32_t *)s->cigar;
2802 cig_from = (uint32_t *)bam_cigar(b);
2803
2804 cr->feature = 0;
2805 cr->nfeature = 0;
2806 for (i = 0; i < cr->ncigar; i++) {
2807 enum cigar_op cig_op = cig_from[i] & BAM_CIGAR_MASK;
2808 uint32_t cig_len = cig_from[i] >> BAM_CIGAR_SHIFT;
2809 cig_to[i] = cig_from[i];
2810
2811 /* Can also generate events from here for CRAM diffs */
2812
2813 switch (cig_op) {
2814 int l;
2815
2816 // Don't trust = and X ops to be correct.
2817 case BAM_CMATCH:
2818 case BAM_CBASE_MATCH:
2819 case BAM_CBASE_MISMATCH:
2820 //fprintf(stderr, "\nBAM_CMATCH\nR: %.*s\nS: %.*s\n",
2821 // cig_len, &ref[apos], cig_len, &seq[spos]);
2822 l = 0;
2823 if (!fd->no_ref && cr->len) {
2824 int end = cig_len+apos < c->ref_end
2825 ? cig_len : c->ref_end - apos;
2826 char *sp = &seq[spos];
2827 char *rp = &ref[apos];
2828 char *qp = &qual[spos];
2829 if (end > cr->len) {
2830 hts_log_error("CIGAR and query sequence are of different length");
2831 return -1;
2832 }
2833 for (l = 0; l < end; l++) {
2834 // This case is just too disputed and different tools
2835 // interpret these in different ways. We give up and
2836 // store verbatim.
2837 if (rp[l] == 'N' && sp[l] == 'N')
2838 verbatim_NM = verbatim_MD = 1;
2839 if (rp[l] != sp[l]) {
2840 // Build our own MD tag if one is on the sequence, so
2841 // we can ensure it matches and thus can be discarded.
2842 if (MD && ref) {
2843 if (kputuw(apos+l - MD_last, MD) < 0) goto err;
2844 if (kputc(rp[l], MD) < 0) goto err;
2845 MD_last = apos+l+1;
2846 }
2847 NM++;
2848 if (!sp[l])
2849 break;
2850 if (0 && CRAM_MAJOR_VERS(fd->version) >= 3) {
2851 #if 0
2852 // Disabled for the time being as it doesn't
2853 // seem to gain us much.
2854 int ol=l;
2855 while (l<end && rp[l] != sp[l])
2856 l++;
2857 if (l-ol > 1) {
2858 if (cram_add_bases(fd, c, s, cr, spos+ol,
2859 l-ol, &seq[spos+ol]))
2860 return -1;
2861 l--;
2862 } else {
2863 l = ol;
2864 if (cram_add_substitution(fd, c, s, cr,
2865 spos+l, sp[l],
2866 qp[l], rp[l]))
2867 return -1;
2868 }
2869 #else
2870 // With urmap pushed to the limit and lots
2871 // of unaligned data (should be soft-clipped)
2872 // this saves ~2-7%. Worth it?
2873 int nl = l;
2874 int max_end = nl, max_score = 0, score = 0;
2875 while (nl < end) {
2876 if (rp[nl] != sp[nl]) {
2877 score += 3;
2878 if (max_score < score) {
2879 max_score = score;
2880 max_end = nl;
2881 }
2882 } else {
2883 score--;
2884 if (score < -2 ||
2885 max_score - score > 7)
2886 break;
2887 }
2888 nl++;
2889 }
2890 if (max_score > 20) {
2891 cram_add_bases(fd, c, s, cr, spos+l,
2892 max_end-l, &seq[spos+l]);
2893 l = max_end-1;
2894 } else {
2895 while (l < nl) {
2896 if (rp[l] != sp[l])
2897 cram_add_substitution(fd, c, s,
2898 cr, spos+l,
2899 sp[l], qp[l],
2900 rp[l]);
2901 l++;
2902 }
2903 l--;
2904 }
2905 #endif
2906 } else {
2907 if (cram_add_substitution(fd, c, s, cr, spos+l,
2908 sp[l], qp[l], rp[l]))
2909 return -1;
2910 }
2911 }
2912 }
2913 spos += l;
2914 apos += l;
2915 }
2916
2917 if (l < cig_len && cr->len) {
2918 if (fd->no_ref) {
2919 if (CRAM_MAJOR_VERS(fd->version) == 3) {
2920 if (cram_add_bases(fd, c, s, cr, spos,
2921 cig_len-l, &seq[spos]))
2922 return -1;
2923 spos += cig_len-l;
2924 } else {
2925 for (; l < cig_len && seq[spos]; l++, spos++) {
2926 if (cram_add_base(fd, c, s, cr, spos,
2927 seq[spos], qual[spos]))
2928 return -1;
2929 }
2930 }
2931 } else {
2932 /* off end of sequence or non-ref based output */
2933 verbatim_NM = verbatim_MD = 1;
2934 for (; l < cig_len && seq[spos]; l++, spos++) {
2935 if (cram_add_base(fd, c, s, cr, spos,
2936 seq[spos], qual[spos]))
2937 return -1;
2938 }
2939 }
2940 apos += cig_len;
2941 } else if (!cr->len) {
2942 /* Seq "*" */
2943 verbatim_NM = verbatim_MD = 1;
2944 apos += cig_len;
2945 spos += cig_len;
2946 }
2947 break;
2948
2949 case BAM_CDEL:
2950 if (MD && ref) {
2951 if (kputuw(apos - MD_last, MD) < 0) goto err;
2952 if (apos < c->ref_end) {
2953 if (kputc_('^', MD) < 0) goto err;
2954 if (kputsn(&ref[apos], MIN(c->ref_end - apos, cig_len), MD) < 0)
2955 goto err;
2956 }
2957 }
2958 NM += cig_len;
2959
2960 if (cram_add_deletion(c, s, cr, spos, cig_len, &seq[spos]))
2961 return -1;
2962 apos += cig_len;
2963 MD_last = apos;
2964 break;
2965
2966 case BAM_CREF_SKIP:
2967 if (cram_add_skip(c, s, cr, spos, cig_len, &seq[spos]))
2968 return -1;
2969 apos += cig_len;
2970 MD_last += cig_len;
2971 break;
2972
2973 case BAM_CINS:
2974 if (cram_add_insertion(c, s, cr, spos, cig_len,
2975 cr->len ? &seq[spos] : NULL))
2976 return -1;
2977 if (fd->no_ref && cr->len) {
2978 for (l = 0; l < cig_len; l++, spos++) {
2979 cram_add_quality(fd, c, s, cr, spos, qual[spos]);
2980 }
2981 } else {
2982 spos += cig_len;
2983 }
2984 NM += cig_len;
2985 break;
2986
2987 case BAM_CSOFT_CLIP:
2988 if (cram_add_softclip(c, s, cr, spos, cig_len,
2989 cr->len ? &seq[spos] : NULL,
2990 fd->version))
2991 return -1;
2992
2993 if (fd->no_ref &&
2994 !(cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
2995 if (cr->len) {
2996 for (l = 0; l < cig_len; l++, spos++) {
2997 cram_add_quality(fd, c, s, cr, spos, qual[spos]);
2998 }
2999 } else {
3000 for (l = 0; l < cig_len; l++, spos++) {
3001 cram_add_quality(fd, c, s, cr, spos, -1);
3002 }
3003 }
3004 } else {
3005 spos += cig_len;
3006 }
3007 break;
3008
3009 case BAM_CHARD_CLIP:
3010 if (cram_add_hardclip(c, s, cr, spos, cig_len, &seq[spos]))
3011 return -1;
3012 break;
3013
3014 case BAM_CPAD:
3015 if (cram_add_pad(c, s, cr, spos, cig_len, &seq[spos]))
3016 return -1;
3017 break;
3018
3019 default:
3020 hts_log_error("Unknown CIGAR op code %d", cig_op);
3021 return -1;
3022 }
3023 }
3024 if (cr->len && spos != cr->len) {
3025 hts_log_error("CIGAR and query sequence are of different length");
3026 return -1;
3027 }
3028 fake_qual = spos;
3029 cr->aend = fd->no_ref ? apos : MIN(apos, c->ref_end);
3030 if (cram_stats_add(c->stats[DS_FN], cr->nfeature) < 0)
3031 goto block_err;
3032
3033 if (MD && ref)
3034 if (kputuw(apos - MD_last, MD) < 0) goto err;
3035 } else {
3036 // Unmapped
3037 cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES;
3038 cr->cigar = 0;
3039 cr->ncigar = 0;
3040 cr->nfeature = 0;
3041 cr->aend = cr->apos;
3042 for (i = 0; i < cr->len; i++)
3043 if (cram_stats_add(c->stats[DS_BA], seq[i]) < 0)
3044 goto block_err;
3045 fake_qual = 0;
3046 }
3047
3048 cr->ntags = 0; //cram_stats_add(c->stats[DS_TC], cr->ntags);
3049 int err = 0;
3050 rg = cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD, &err);
3051 if (err)
3052 goto block_err;
3053
3054 /* Read group, identified earlier */
3055 if (rg) {
3056 sam_hrec_rg_t *brg = sam_hrecs_find_rg(fd->header->hrecs, rg);
3057 cr->rg = brg ? brg->id : -1;
3058 } else if (CRAM_MAJOR_VERS(fd->version) == 1) {
3059 sam_hrec_rg_t *brg = sam_hrecs_find_rg(fd->header->hrecs, "UNKNOWN");
3060 assert(brg);
3061 } else {
3062 cr->rg = -1;
3063 }
3064 if (cram_stats_add(c->stats[DS_RG], cr->rg) < 0)
3065 goto block_err;
3066
3067 /*
3068 * Append to the qual block now. We do this here as
3069 * cram_add_substitution() can generate BA/QS events which need to
3070 * be in the qual block before we append the rest of the data.
3071 */
3072 if (cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES) {
3073 /* Special case of seq "*" */
3074 if (cr->len == 0) {
3075 cr->len = fake_qual;
3076 BLOCK_GROW(s->qual_blk, cr->len);
3077 cp = (char *)BLOCK_END(s->qual_blk);
3078 memset(cp, 255, cr->len);
3079 } else {
3080 BLOCK_GROW(s->qual_blk, cr->len);
3081 cp = (char *)BLOCK_END(s->qual_blk);
3082 char *from = (char *)&bam_qual(b)[0];
3083 char *to = &cp[0];
3084 memcpy(to, from, cr->len);
3085
3086 // Store quality in original orientation for better compression.
3087 if (!c->qs_seq_orient) {
3088 if (cr->flags & BAM_FREVERSE) {
3089 int i, j;
3090 for (i = 0, j = cr->len-1; i < j; i++, j--) {
3091 unsigned char c;
3092 c = to[i];
3093 to[i] = to[j];
3094 to[j] = c;
3095 }
3096 }
3097 }
3098 }
3099 BLOCK_SIZE(s->qual_blk) += cr->len;
3100 } else {
3101 if (cr->len == 0)
3102 cr->len = fake_qual >= 0 ? fake_qual : cr->aend - cr->apos + 1;
3103 }
3104
3105 if (cram_stats_add(c->stats[DS_RL], cr->len) < 0)
3106 goto block_err;
3107
3108 /* Now we know apos and aend both, update mate-pair information */
3109 {
3110 int new;
3111 khint_t k;
3112 int sec = (cr->flags & BAM_FSECONDARY) ? 1 : 0;
3113
3114 //fprintf(stderr, "Checking %"PRId64"/%.*s\t", rnum,
3115 // cr->name_len, DSTRING_STR(s->name_ds)+cr->name);
3116 if (cr->flags & BAM_FPAIRED) {
3117 char *key = string_ndup(s->pair_keys, bam_name(b), bam_name_len(b));
3118 if (!key)
3119 return -1;
3120
3121 k = kh_put(m_s2i, s->pair[sec], key, &new);
3122 if (-1 == new)
3123 return -1;
3124 else if (new > 0)
3125 kh_val(s->pair[sec], k) = rnum;
3126 } else {
3127 new = 1;
3128 k = 0; // Prevents false-positive warning from gcc -Og
3129 }
3130
3131 if (new == 0) {
3132 cram_record *p = &s->crecs[kh_val(s->pair[sec], k)];
3133 int64_t aleft, aright;
3134 int sign;
3135
3136 aleft = MIN(cr->apos, p->apos);
3137 aright = MAX(cr->aend, p->aend);
3138 if (cr->apos < p->apos) {
3139 sign = 1;
3140 } else if (cr->apos > p->apos) {
3141 sign = -1;
3142 } else if (cr->flags & BAM_FREAD1) {
3143 sign = 1;
3144 } else {
3145 sign = -1;
3146 }
3147
3148 // This vs p: tlen, matepos, flags. Permit TLEN 0 and/or TLEN +/-
3149 // a small amount, if appropriate options set.
3150 if ((!fd->tlen_zero && MAX(bam_mate_pos(b)+1, 0) != p->apos) &&
3151 !(fd->tlen_zero && bam_mate_pos(b) == 0))
3152 goto detached;
3153
3154 if (((bam_flag(b) & BAM_FMUNMAP) != 0) !=
3155 ((p->flags & BAM_FUNMAP) != 0))
3156 goto detached;
3157
3158 if (((bam_flag(b) & BAM_FMREVERSE) != 0) !=
3159 ((p->flags & BAM_FREVERSE) != 0))
3160 goto detached;
3161
3162
3163 // p vs this: tlen, matepos, flags
3164 if (p->ref_id != cr->ref_id &&
3165 !(fd->tlen_zero && p->ref_id == -1))
3166 goto detached;
3167
3168 if (p->mate_pos != cr->apos &&
3169 !(fd->tlen_zero && p->mate_pos == 0))
3170 goto detached;
3171
3172 if (((p->flags & BAM_FMUNMAP) != 0) !=
3173 ((p->mate_flags & CRAM_M_UNMAP) != 0))
3174 goto detached;
3175
3176 if (((p->flags & BAM_FMREVERSE) != 0) !=
3177 ((p->mate_flags & CRAM_M_REVERSE) != 0))
3178 goto detached;
3179
3180 // Supplementary reads are just too ill defined
3181 if ((cr->flags & BAM_FSUPPLEMENTARY) ||
3182 (p->flags & BAM_FSUPPLEMENTARY))
3183 goto detached;
3184
3185 // When in lossy name mode, if a read isn't detached we
3186 // cannot store the name. The corollary is that when we
3187 // must store the name, it must be detached (inefficient).
3188 if (fd->lossy_read_names &&
3189 (!(cr->cram_flags & CRAM_FLAG_DISCARD_NAME) ||
3190 !((p->cram_flags & CRAM_FLAG_DISCARD_NAME))))
3191 goto detached;
3192
3193 // Now check TLEN. We do this last as sometimes it's the
3194 // only thing that differs. In CRAM4 we have a better way
3195 // of handling this that doesn't break detached status
3196 int explicit_tlen = 0;
3197 int tflag1 = ((bam_ins_size(b) &&
3198 llabs(bam_ins_size(b) - sign*(aright-aleft+1))
3199 > fd->tlen_approx)
3200 || (!bam_ins_size(b) && !fd->tlen_zero));
3201
3202 int tflag2 = ((p->tlen && llabs(p->tlen - -sign*(aright-aleft+1))
3203 > fd->tlen_approx)
3204 || (!p->tlen && !fd->tlen_zero));
3205
3206 if (tflag1 || tflag2) {
3207 if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3208 explicit_tlen = CRAM_FLAG_EXPLICIT_TLEN;
3209 } else {
3210 // Stil do detached for unmapped data in CRAM4 as this
3211 // also impacts RNEXT calculation.
3212 goto detached;
3213 }
3214 }
3215
3216 /*
3217 * The fields below are unused when encoding this read as it is
3218 * no longer detached. In theory they may get referred to when
3219 * processing a 3rd or 4th read in this template?, so we set them
3220 * here just to be sure.
3221 *
3222 * They do not need cram_stats_add() calls those as they are
3223 * not emitted.
3224 */
3225 cr->mate_pos = p->apos;
3226 cram_stats_add(c->stats[DS_NP], cr->mate_pos);
3227 cr->tlen = explicit_tlen ? bam_ins_size(b) : sign*(aright-aleft+1);
3228 cram_stats_add(c->stats[DS_TS], cr->tlen);
3229 cr->mate_flags =
3230 ((p->flags & BAM_FMUNMAP) == BAM_FMUNMAP) * CRAM_M_UNMAP +
3231 ((p->flags & BAM_FMREVERSE) == BAM_FMREVERSE) * CRAM_M_REVERSE;
3232
3233 // Decrement statistics aggregated earlier
3234 if (p->cram_flags & CRAM_FLAG_STATS_ADDED) {
3235 cram_stats_del(c->stats[DS_NP], p->mate_pos);
3236 cram_stats_del(c->stats[DS_MF], p->mate_flags);
3237 if (!(p->cram_flags & CRAM_FLAG_EXPLICIT_TLEN))
3238 cram_stats_del(c->stats[DS_TS], p->tlen);
3239 cram_stats_del(c->stats[DS_NS], p->mate_ref_id);
3240 }
3241
3242 /* Similarly we could correct the p-> values too, but these will no
3243 * longer have any code that refers back to them as the new 'p'
3244 * for this template is our current 'cr'.
3245 */
3246 //p->mate_pos = cr->apos;
3247 //p->mate_flags =
3248 // ((cr->flags & BAM_FMUNMAP) == BAM_FMUNMAP) * CRAM_M_UNMAP +
3249 // ((cr->flags & BAM_FMREVERSE) == BAM_FMREVERSE)* CRAM_M_REVERSE;
3250 //p->tlen = p->apos - cr->aend;
3251
3252 // Clear detached from cr flags
3253 cr->cram_flags &= ~CRAM_FLAG_DETACHED;
3254 cr->cram_flags |= explicit_tlen;
3255 if (cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK) < 0)
3256 goto block_err;
3257
3258 // Clear detached from p flags and set downstream
3259 if (p->cram_flags & CRAM_FLAG_STATS_ADDED) {
3260 cram_stats_del(c->stats[DS_CF], p->cram_flags & CRAM_FLAG_MASK);
3261 p->cram_flags &= ~CRAM_FLAG_STATS_ADDED;
3262 }
3263
3264 p->cram_flags &= ~CRAM_FLAG_DETACHED;
3265 p->cram_flags |= CRAM_FLAG_MATE_DOWNSTREAM | explicit_tlen;;
3266 if (cram_stats_add(c->stats[DS_CF], p->cram_flags & CRAM_FLAG_MASK) < 0)
3267 goto block_err;
3268
3269 p->mate_line = rnum - (kh_val(s->pair[sec], k) + 1);
3270 if (cram_stats_add(c->stats[DS_NF], p->mate_line) < 0)
3271 goto block_err;
3272
3273 kh_val(s->pair[sec], k) = rnum;
3274 } else {
3275 detached:
3276 //fprintf(stderr, "unpaired\n");
3277
3278 /* Derive mate flags from this flag */
3279 cr->mate_flags = 0;
3280 if (bam_flag(b) & BAM_FMUNMAP)
3281 cr->mate_flags |= CRAM_M_UNMAP;
3282 if (bam_flag(b) & BAM_FMREVERSE)
3283 cr->mate_flags |= CRAM_M_REVERSE;
3284
3285 if (cram_stats_add(c->stats[DS_MF], cr->mate_flags) < 0)
3286 goto block_err;
3287
3288 cr->mate_pos = MAX(bam_mate_pos(b)+1, 0);
3289 if (cram_stats_add(c->stats[DS_NP], cr->mate_pos) < 0)
3290 goto block_err;
3291
3292 cr->tlen = bam_ins_size(b);
3293 if (cram_stats_add(c->stats[DS_TS], cr->tlen) < 0)
3294 goto block_err;
3295
3296 cr->cram_flags |= CRAM_FLAG_DETACHED;
3297 if (cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK) < 0)
3298 goto block_err;
3299 if (cram_stats_add(c->stats[DS_NS], bam_mate_ref(b)) < 0)
3300 goto block_err;
3301
3302 cr->cram_flags |= CRAM_FLAG_STATS_ADDED;
3303 }
3304 }
3305
3306 cr->mqual = bam_map_qual(b);
3307 if (cram_stats_add(c->stats[DS_MQ], cr->mqual) < 0)
3308 goto block_err;
3309
3310 cr->mate_ref_id = bam_mate_ref(b);
3311
3312 if (!(bam_flag(b) & BAM_FUNMAP)) {
3313 if (c->first_base > cr->apos)
3314 c->first_base = cr->apos;
3315
3316 if (c->last_base < cr->aend)
3317 c->last_base = cr->aend;
3318 }
3319
3320 return 0;
3321
3322 block_err:
3323 err:
3324 return -1;
3325 }
3326
3327 /*
3328 * Write iterator: put BAM format sequences into a CRAM file.
3329 * We buffer up a containers worth of data at a time.
3330 *
3331 * Returns 0 on success
3332 * -1 on failure
3333 */
cram_put_bam_seq(cram_fd * fd,bam_seq_t * b)3334 int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) {
3335 cram_container *c;
3336
3337 if (!fd->ctr) {
3338 fd->ctr = cram_new_container(fd->seqs_per_slice,
3339 fd->slices_per_container);
3340 if (!fd->ctr)
3341 return -1;
3342 fd->ctr->record_counter = fd->record_counter;
3343 }
3344 c = fd->ctr;
3345
3346 if (!c->slice || c->curr_rec == c->max_rec ||
3347 (bam_ref(b) != c->curr_ref && c->curr_ref >= -1) ||
3348 (c->s_num_bases >= fd->bases_per_slice)) {
3349 int slice_rec, curr_rec, multi_seq = fd->multi_seq == 1;
3350 int curr_ref = c->slice ? c->curr_ref : bam_ref(b);
3351
3352 /*
3353 * Start packing slices when we routinely have under 1/4tr full.
3354 *
3355 * This option isn't available if we choose to embed references
3356 * since we can only have one per slice.
3357 *
3358 * The multi_seq var here refers to our intention for the next slice.
3359 * This slice has already been encoded so we output as-is.
3360 */
3361 if (fd->multi_seq == -1 && c->curr_rec < c->max_rec/4+10 &&
3362 fd->last_slice && fd->last_slice < c->max_rec/4+10 &&
3363 !fd->embed_ref) {
3364 if (!c->multi_seq)
3365 hts_log_info("Multi-ref enabled for next container");
3366 multi_seq = 1;
3367 } else if (fd->multi_seq == 1) {
3368 pthread_mutex_lock(&fd->metrics_lock);
3369 if (fd->last_RI_count <= c->max_slice && fd->multi_seq_user != 1) {
3370 multi_seq = 0;
3371 hts_log_info("Multi-ref disabled for next container");
3372 }
3373 pthread_mutex_unlock(&fd->metrics_lock);
3374 }
3375
3376 slice_rec = c->slice_rec;
3377 curr_rec = c->curr_rec;
3378
3379 if (CRAM_MAJOR_VERS(fd->version) == 1 ||
3380 c->curr_rec == c->max_rec || fd->multi_seq != 1 || !c->slice ||
3381 c->s_num_bases >= fd->bases_per_slice) {
3382 if (NULL == (c = cram_next_container(fd, b))) {
3383 if (fd->ctr) {
3384 // prevent cram_close attempting to flush
3385 cram_free_container(fd->ctr);
3386 fd->ctr = NULL;
3387 }
3388 return -1;
3389 }
3390 }
3391
3392 /*
3393 * Due to our processing order, some things we've already done we
3394 * cannot easily undo. So when we first notice we should be packing
3395 * multiple sequences per container we emit the small partial
3396 * container as-is and then start a fresh one in a different mode.
3397 */
3398 if (multi_seq == 0 && fd->multi_seq == 1 && fd->multi_seq_user == -1) {
3399 // User selected auto-mode, we're currently using multi-seq, but
3400 // have detected we don't need to. Switch back to auto.
3401 fd->multi_seq = -1;
3402 } else if (multi_seq) {
3403 // We detected we need multi-seq
3404 fd->multi_seq = 1;
3405 c->multi_seq = 1;
3406 c->pos_sorted = 0;
3407
3408 if (!c->refs_used) {
3409 pthread_mutex_lock(&fd->ref_lock);
3410 c->refs_used = calloc(fd->refs->nref, sizeof(int));
3411 pthread_mutex_unlock(&fd->ref_lock);
3412 if (!c->refs_used)
3413 return -1;
3414 }
3415 }
3416
3417 fd->last_slice = curr_rec - slice_rec;
3418 c->slice_rec = c->curr_rec;
3419
3420 // Have we seen this reference before?
3421 if (bam_ref(b) >= 0 && curr_ref >= 0 && bam_ref(b) != curr_ref && !fd->embed_ref &&
3422 !fd->unsorted && multi_seq) {
3423
3424 if (!c->refs_used) {
3425 pthread_mutex_lock(&fd->ref_lock);
3426 c->refs_used = calloc(fd->refs->nref, sizeof(int));
3427 pthread_mutex_unlock(&fd->ref_lock);
3428 if (!c->refs_used)
3429 return -1;
3430 } else if (c->refs_used && c->refs_used[bam_ref(b)]) {
3431 pthread_mutex_lock(&fd->ref_lock);
3432 fd->unsorted = 1;
3433 pthread_mutex_unlock(&fd->ref_lock);
3434 fd->multi_seq = 1;
3435 }
3436 }
3437
3438 c->curr_ref = bam_ref(b);
3439 if (c->refs_used && c->curr_ref >= 0) c->refs_used[c->curr_ref]++;
3440 }
3441
3442 if (!c->bams) {
3443 /* First time through, allocate a set of bam pointers */
3444 pthread_mutex_lock(&fd->bam_list_lock);
3445 if (fd->bl) {
3446 spare_bams *spare = fd->bl;
3447 c->bams = spare->bams;
3448 fd->bl = spare->next;
3449 free(spare);
3450 } else {
3451 c->bams = calloc(c->max_c_rec, sizeof(bam_seq_t *));
3452 if (!c->bams) {
3453 pthread_mutex_unlock(&fd->bam_list_lock);
3454 return -1;
3455 }
3456 }
3457 pthread_mutex_unlock(&fd->bam_list_lock);
3458 }
3459
3460 /* Copy or alloc+copy the bam record, for later encoding */
3461 if (c->bams[c->curr_c_rec]) {
3462 if (bam_copy1(c->bams[c->curr_c_rec], b) == NULL)
3463 return -1;
3464 } else {
3465 c->bams[c->curr_c_rec] = bam_dup1(b);
3466 if (c->bams[c->curr_c_rec] == NULL)
3467 return -1;
3468 }
3469 c->curr_rec++;
3470 c->curr_c_rec++;
3471 c->s_num_bases += bam_seq_len(b);
3472 c->n_mapped += (bam_flag(b) & BAM_FUNMAP) ? 0 : 1;
3473 fd->record_counter++;
3474
3475 return 0;
3476 }
3477