1 /*
2 Copyright (c) 2012-2020 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 
8    1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10 
11    2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14 
15    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18 
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30 
31 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
32 #include <config.h>
33 
34 #include <stdio.h>
35 #include <errno.h>
36 #include <assert.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <strings.h>
40 #include <zlib.h>
41 #include <sys/types.h>
42 #include <sys/stat.h>
43 #include <math.h>
44 #include <inttypes.h>
45 
46 #include "cram.h"
47 #include "os.h"
48 #include "../sam_internal.h" // for nibble2base
49 #include "../htslib/hts.h"
50 #include "../htslib/hts_endian.h"
51 
52 KHASH_MAP_INIT_STR(m_s2u64, uint64_t)
53 
54 #define Z_CRAM_STRAT Z_FILTERED
55 //#define Z_CRAM_STRAT Z_RLE
56 //#define Z_CRAM_STRAT Z_HUFFMAN_ONLY
57 //#define Z_CRAM_STRAT Z_DEFAULT_STRATEGY
58 
59 static int process_one_read(cram_fd *fd, cram_container *c,
60                             cram_slice *s, cram_record *cr,
61                             bam_seq_t *b, int rnum, kstring_t *MD);
62 
63 /*
64  * Returns index of val into key.
65  * Basically strchr(key, val)-key;
66  */
sub_idx(char * key,char val)67 static int sub_idx(char *key, char val) {
68     int i;
69 
70     for (i = 0; *key && *key++ != val; i++);
71     return i;
72 }
73 
74 /*
75  * Encodes a compression header block into a generic cram_block structure.
76  *
77  * Returns cram_block ptr on success
78  *         NULL on failure
79  */
cram_encode_compression_header(cram_fd * fd,cram_container * c,cram_block_compression_hdr * h)80 cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
81                                            cram_block_compression_hdr *h) {
82     cram_block *cb  = cram_new_block(COMPRESSION_HEADER, 0);
83     cram_block *map = cram_new_block(COMPRESSION_HEADER, 0);
84     int i, mc, r = 0;
85 
86     if (!cb || !map)
87         return NULL;
88 
89     /*
90      * This is a concatenation of several blocks of data:
91      * header + landmarks, preservation map, read encoding map, and the tag
92      * encoding map.
93      * All 4 are variable sized and we need to know how large these are
94      * before creating the compression header itself as this starts with
95      * the total size (stored as a variable length string).
96      */
97 
98     // Duplicated from container itself, and removed in 1.1
99     if (CRAM_MAJOR_VERS(fd->version) == 1) {
100         r |= itf8_put_blk(cb, h->ref_seq_id);
101         r |= itf8_put_blk(cb, h->ref_seq_start);
102         r |= itf8_put_blk(cb, h->ref_seq_span);
103         r |= itf8_put_blk(cb, h->num_records);
104         r |= itf8_put_blk(cb, h->num_landmarks);
105         for (i = 0; i < h->num_landmarks; i++) {
106             r |= itf8_put_blk(cb, h->landmark[i]);
107         }
108     }
109 
110     if (h->preservation_map) {
111         kh_destroy(map, h->preservation_map);
112         h->preservation_map = NULL;
113     }
114 
115     /* Create in-memory preservation map */
116     /* FIXME: should create this when we create the container */
117     if (c->num_records > 0) {
118         khint_t k;
119         int r;
120 
121         if (!(h->preservation_map = kh_init(map)))
122             return NULL;
123 
124         k = kh_put(map, h->preservation_map, "RN", &r);
125         if (-1 == r) return NULL;
126         kh_val(h->preservation_map, k).i = !fd->lossy_read_names;
127 
128         if (CRAM_MAJOR_VERS(fd->version) == 1) {
129             k = kh_put(map, h->preservation_map, "PI", &r);
130             if (-1 == r) return NULL;
131             kh_val(h->preservation_map, k).i = 0;
132 
133             k = kh_put(map, h->preservation_map, "UI", &r);
134             if (-1 == r) return NULL;
135             kh_val(h->preservation_map, k).i = 1;
136 
137             k = kh_put(map, h->preservation_map, "MI", &r);
138             if (-1 == r) return NULL;
139             kh_val(h->preservation_map, k).i = 1;
140 
141         } else {
142             // Technically SM was in 1.0, but wasn't in Java impl.
143             k = kh_put(map, h->preservation_map, "SM", &r);
144             if (-1 == r) return NULL;
145             kh_val(h->preservation_map, k).i = 0;
146 
147             k = kh_put(map, h->preservation_map, "TD", &r);
148             if (-1 == r) return NULL;
149             kh_val(h->preservation_map, k).i = 0;
150 
151             k = kh_put(map, h->preservation_map, "AP", &r);
152             if (-1 == r) return NULL;
153             kh_val(h->preservation_map, k).i = h->AP_delta;
154 
155             if (CRAM_MAJOR_VERS(fd->version) >= 4) {
156                 k = kh_put(map, h->preservation_map, "QO", &r);
157                 if (-1 == r) return NULL;
158                 kh_val(h->preservation_map, k).i = h->qs_seq_orient;
159             }
160 
161             if (fd->no_ref || fd->embed_ref) {
162                 // Reference Required == No
163                 k = kh_put(map, h->preservation_map, "RR", &r);
164                 if (-1 == r) return NULL;
165                 kh_val(h->preservation_map, k).i = 0;
166             }
167         }
168     }
169 
170     /* Encode preservation map; could collapse this and above into one */
171     mc = 0;
172     BLOCK_SIZE(map) = 0;
173     if (h->preservation_map) {
174         khint_t k;
175 
176         for (k = kh_begin(h->preservation_map);
177              k != kh_end(h->preservation_map);
178              k++) {
179             const char *key;
180             khash_t(map) *pmap = h->preservation_map;
181 
182 
183             if (!kh_exist(pmap, k))
184                 continue;
185 
186             key = kh_key(pmap, k);
187             BLOCK_APPEND(map, key, 2);
188 
189             switch(CRAM_KEY(key[0], key[1])) {
190             case CRAM_KEY('M','I'):
191             case CRAM_KEY('U','I'):
192             case CRAM_KEY('P','I'):
193             case CRAM_KEY('A','P'):
194             case CRAM_KEY('R','N'):
195             case CRAM_KEY('R','R'):
196             case CRAM_KEY('Q','O'):
197                 BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
198                 break;
199 
200             case CRAM_KEY('S','M'): {
201                 char smat[5], *mp = smat;
202                 *mp++ =
203                     (sub_idx("CGTN", h->substitution_matrix[0][0]) << 6) |
204                     (sub_idx("CGTN", h->substitution_matrix[0][1]) << 4) |
205                     (sub_idx("CGTN", h->substitution_matrix[0][2]) << 2) |
206                     (sub_idx("CGTN", h->substitution_matrix[0][3]) << 0);
207                 *mp++ =
208                     (sub_idx("AGTN", h->substitution_matrix[1][0]) << 6) |
209                     (sub_idx("AGTN", h->substitution_matrix[1][1]) << 4) |
210                     (sub_idx("AGTN", h->substitution_matrix[1][2]) << 2) |
211                     (sub_idx("AGTN", h->substitution_matrix[1][3]) << 0);
212                 *mp++ =
213                     (sub_idx("ACTN", h->substitution_matrix[2][0]) << 6) |
214                     (sub_idx("ACTN", h->substitution_matrix[2][1]) << 4) |
215                     (sub_idx("ACTN", h->substitution_matrix[2][2]) << 2) |
216                     (sub_idx("ACTN", h->substitution_matrix[2][3]) << 0);
217                 *mp++ =
218                     (sub_idx("ACGN", h->substitution_matrix[3][0]) << 6) |
219                     (sub_idx("ACGN", h->substitution_matrix[3][1]) << 4) |
220                     (sub_idx("ACGN", h->substitution_matrix[3][2]) << 2) |
221                     (sub_idx("ACGN", h->substitution_matrix[3][3]) << 0);
222                 *mp++ =
223                     (sub_idx("ACGT", h->substitution_matrix[4][0]) << 6) |
224                     (sub_idx("ACGT", h->substitution_matrix[4][1]) << 4) |
225                     (sub_idx("ACGT", h->substitution_matrix[4][2]) << 2) |
226                     (sub_idx("ACGT", h->substitution_matrix[4][3]) << 0);
227                 BLOCK_APPEND(map, smat, 5);
228                 break;
229             }
230 
231             case CRAM_KEY('T','D'): {
232                 r |= (fd->vv.varint_put32_blk(map, BLOCK_SIZE(h->TD_blk)) <= 0);
233                 BLOCK_APPEND(map,
234                              BLOCK_DATA(h->TD_blk),
235                              BLOCK_SIZE(h->TD_blk));
236                 break;
237             }
238 
239             default:
240                 hts_log_warning("Unknown preservation key '%.2s'", key);
241                 break;
242             }
243 
244             mc++;
245         }
246     }
247     r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0);
248     r |= (fd->vv.varint_put32_blk(cb, mc) <= 0);
249     BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
250 
251     /* rec encoding map */
252     mc = 0;
253     BLOCK_SIZE(map) = 0;
254     if (h->codecs[DS_BF]) {
255         if (-1 == h->codecs[DS_BF]->store(h->codecs[DS_BF], map, "BF",
256                                           fd->version))
257             return NULL;
258         mc++;
259     }
260     if (h->codecs[DS_CF]) {
261         if (-1 == h->codecs[DS_CF]->store(h->codecs[DS_CF], map, "CF",
262                                           fd->version))
263             return NULL;
264         mc++;
265     }
266     if (h->codecs[DS_RL]) {
267         if (-1 == h->codecs[DS_RL]->store(h->codecs[DS_RL], map, "RL",
268                                           fd->version))
269             return NULL;
270         mc++;
271     }
272     if (h->codecs[DS_AP]) {
273         if (-1 == h->codecs[DS_AP]->store(h->codecs[DS_AP], map, "AP",
274                                           fd->version))
275             return NULL;
276         mc++;
277     }
278     if (h->codecs[DS_RG]) {
279         if (-1 == h->codecs[DS_RG]->store(h->codecs[DS_RG], map, "RG",
280                                           fd->version))
281             return NULL;
282         mc++;
283     }
284     if (h->codecs[DS_MF]) {
285         if (-1 == h->codecs[DS_MF]->store(h->codecs[DS_MF], map, "MF",
286                                           fd->version))
287             return NULL;
288         mc++;
289     }
290     if (h->codecs[DS_NS]) {
291         if (-1 == h->codecs[DS_NS]->store(h->codecs[DS_NS], map, "NS",
292                                           fd->version))
293             return NULL;
294         mc++;
295     }
296     if (h->codecs[DS_NP]) {
297         if (-1 == h->codecs[DS_NP]->store(h->codecs[DS_NP], map, "NP",
298                                           fd->version))
299             return NULL;
300         mc++;
301     }
302     if (h->codecs[DS_TS]) {
303         if (-1 == h->codecs[DS_TS]->store(h->codecs[DS_TS], map, "TS",
304                                           fd->version))
305             return NULL;
306         mc++;
307     }
308     if (h->codecs[DS_NF]) {
309         if (-1 == h->codecs[DS_NF]->store(h->codecs[DS_NF], map, "NF",
310                                           fd->version))
311             return NULL;
312         mc++;
313     }
314     if (h->codecs[DS_TC]) {
315         if (-1 == h->codecs[DS_TC]->store(h->codecs[DS_TC], map, "TC",
316                                           fd->version))
317             return NULL;
318         mc++;
319     }
320     if (h->codecs[DS_TN]) {
321         if (-1 == h->codecs[DS_TN]->store(h->codecs[DS_TN], map, "TN",
322                                           fd->version))
323             return NULL;
324         mc++;
325     }
326     if (h->codecs[DS_TL]) {
327         if (-1 == h->codecs[DS_TL]->store(h->codecs[DS_TL], map, "TL",
328                                           fd->version))
329             return NULL;
330         mc++;
331     }
332     if (h->codecs[DS_FN]) {
333         if (-1 == h->codecs[DS_FN]->store(h->codecs[DS_FN], map, "FN",
334                                           fd->version))
335             return NULL;
336         mc++;
337     }
338     if (h->codecs[DS_FC]) {
339         if (-1 == h->codecs[DS_FC]->store(h->codecs[DS_FC], map, "FC",
340                                           fd->version))
341             return NULL;
342         mc++;
343     }
344     if (h->codecs[DS_FP]) {
345         if (-1 == h->codecs[DS_FP]->store(h->codecs[DS_FP], map, "FP",
346                                           fd->version))
347             return NULL;
348         mc++;
349     }
350     if (h->codecs[DS_BS]) {
351         if (-1 == h->codecs[DS_BS]->store(h->codecs[DS_BS], map, "BS",
352                                           fd->version))
353             return NULL;
354         mc++;
355     }
356     if (h->codecs[DS_IN]) {
357         if (-1 == h->codecs[DS_IN]->store(h->codecs[DS_IN], map, "IN",
358                                           fd->version))
359             return NULL;
360         mc++;
361     }
362     if (h->codecs[DS_DL]) {
363         if (-1 == h->codecs[DS_DL]->store(h->codecs[DS_DL], map, "DL",
364                                           fd->version))
365             return NULL;
366         mc++;
367     }
368     if (h->codecs[DS_BA]) {
369         if (-1 == h->codecs[DS_BA]->store(h->codecs[DS_BA], map, "BA",
370                                           fd->version))
371             return NULL;
372         mc++;
373     }
374     if (h->codecs[DS_BB]) {
375         if (-1 == h->codecs[DS_BB]->store(h->codecs[DS_BB], map, "BB",
376                                           fd->version))
377             return NULL;
378         mc++;
379     }
380     if (h->codecs[DS_MQ]) {
381         if (-1 == h->codecs[DS_MQ]->store(h->codecs[DS_MQ], map, "MQ",
382                                           fd->version))
383             return NULL;
384         mc++;
385     }
386     if (h->codecs[DS_RN]) {
387         if (-1 == h->codecs[DS_RN]->store(h->codecs[DS_RN], map, "RN",
388                                           fd->version))
389             return NULL;
390         mc++;
391     }
392     if (h->codecs[DS_QS]) {
393         if (-1 == h->codecs[DS_QS]->store(h->codecs[DS_QS], map, "QS",
394                                           fd->version))
395             return NULL;
396         mc++;
397     }
398     if (h->codecs[DS_QQ]) {
399         if (-1 == h->codecs[DS_QQ]->store(h->codecs[DS_QQ], map, "QQ",
400                                           fd->version))
401             return NULL;
402         mc++;
403     }
404     if (h->codecs[DS_RI]) {
405         if (-1 == h->codecs[DS_RI]->store(h->codecs[DS_RI], map, "RI",
406                                           fd->version))
407             return NULL;
408         mc++;
409     }
410     if (CRAM_MAJOR_VERS(fd->version) != 1) {
411         if (h->codecs[DS_SC]) {
412             if (-1 == h->codecs[DS_SC]->store(h->codecs[DS_SC], map, "SC",
413                                               fd->version))
414                 return NULL;
415             mc++;
416         }
417         if (h->codecs[DS_RS]) {
418             if (-1 == h->codecs[DS_RS]->store(h->codecs[DS_RS], map, "RS",
419                                               fd->version))
420                 return NULL;
421             mc++;
422         }
423         if (h->codecs[DS_PD]) {
424             if (-1 == h->codecs[DS_PD]->store(h->codecs[DS_PD], map, "PD",
425                                               fd->version))
426                 return NULL;
427             mc++;
428         }
429         if (h->codecs[DS_HC]) {
430             if (-1 == h->codecs[DS_HC]->store(h->codecs[DS_HC], map, "HC",
431                                               fd->version))
432                 return NULL;
433             mc++;
434         }
435     }
436     if (h->codecs[DS_TM]) {
437         if (-1 == h->codecs[DS_TM]->store(h->codecs[DS_TM], map, "TM",
438                                           fd->version))
439             return NULL;
440         mc++;
441     }
442     if (h->codecs[DS_TV]) {
443         if (-1 == h->codecs[DS_TV]->store(h->codecs[DS_TV], map, "TV",
444                                           fd->version))
445             return NULL;
446         mc++;
447     }
448     r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0);
449     r |= (fd->vv.varint_put32_blk(cb, mc) <= 0);
450     BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
451 
452     /* tag encoding map */
453     mc = 0;
454     BLOCK_SIZE(map) = 0;
455     if (c->tags_used) {
456         khint_t k;
457 
458         for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
459             int key;
460             if (!kh_exist(c->tags_used, k))
461                 continue;
462 
463             key = kh_key(c->tags_used, k);
464             cram_codec *cd = kh_val(c->tags_used, k)->codec;
465 
466             r |= (fd->vv.varint_put32_blk(map, key) <= 0);
467             if (-1 == cd->store(cd, map, NULL, fd->version))
468                 return NULL;
469 
470             mc++;
471         }
472     }
473 
474     r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0);
475     r |= (fd->vv.varint_put32_blk(cb, mc) <= 0);
476     BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
477 
478     hts_log_info("Wrote compression block header in %d bytes", (int)BLOCK_SIZE(cb));
479 
480     BLOCK_UPLEN(cb);
481 
482     cram_free_block(map);
483 
484     if (r >= 0)
485         return cb;
486 
487  block_err:
488     return NULL;
489 }
490 
491 
492 /*
493  * Encodes a slice compression header.
494  *
495  * Returns cram_block on success
496  *         NULL on failure
497  */
cram_encode_slice_header(cram_fd * fd,cram_slice * s)498 cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) {
499     char *buf;
500     char *cp;
501     cram_block *b = cram_new_block(MAPPED_SLICE, 0);
502     int j;
503 
504     if (!b)
505         return NULL;
506 
507     cp = buf = malloc(22+16+5*(8+s->hdr->num_blocks));
508     if (NULL == buf) {
509         cram_free_block(b);
510         return NULL;
511     }
512 
513     cp += fd->vv.varint_put32s(cp, NULL, s->hdr->ref_seq_id);
514     if (CRAM_MAJOR_VERS(fd->version) >= 4) {
515         cp += fd->vv.varint_put64(cp, NULL, s->hdr->ref_seq_start);
516         cp += fd->vv.varint_put64(cp, NULL, s->hdr->ref_seq_span);
517     } else {
518         cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_seq_start);
519         cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_seq_span);
520     }
521     cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_records);
522     if (CRAM_MAJOR_VERS(fd->version) == 2)
523         cp += fd->vv.varint_put32(cp, NULL, s->hdr->record_counter);
524     else if (CRAM_MAJOR_VERS(fd->version) >= 3)
525         cp += fd->vv.varint_put64(cp, NULL, s->hdr->record_counter);
526     cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_blocks);
527     cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_content_ids);
528     for (j = 0; j < s->hdr->num_content_ids; j++) {
529         cp += fd->vv.varint_put32(cp, NULL, s->hdr->block_content_ids[j]);
530     }
531     if (s->hdr->content_type == MAPPED_SLICE)
532         cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_base_id);
533 
534     if (CRAM_MAJOR_VERS(fd->version) != 1) {
535         memcpy(cp, s->hdr->md5, 16); cp += 16;
536     }
537 
538     assert(cp-buf <= 22+16+5*(8+s->hdr->num_blocks));
539 
540     b->data = (unsigned char *)buf;
541     b->comp_size = b->uncomp_size = cp-buf;
542 
543     return b;
544 }
545 
546 
547 /*
548  * Encodes a single read.
549  *
550  * Returns 0 on success
551  *        -1 on failure
552  */
cram_encode_slice_read(cram_fd * fd,cram_container * c,cram_block_compression_hdr * h,cram_slice * s,cram_record * cr,int64_t * last_pos)553 static int cram_encode_slice_read(cram_fd *fd,
554                                   cram_container *c,
555                                   cram_block_compression_hdr *h,
556                                   cram_slice *s,
557                                   cram_record *cr,
558                                   int64_t *last_pos) {
559     int r = 0;
560     int32_t i32;
561     int64_t i64;
562     unsigned char uc;
563 
564     //fprintf(stderr, "Encode seq %d, %d/%d FN=%d, %s\n", rec, core->byte, core->bit, cr->nfeature, s->name_ds->str + cr->name);
565 
566     //printf("BF=0x%x\n", cr->flags);
567     //      bf = cram_flag_swap[cr->flags];
568     i32 = fd->cram_flag_swap[cr->flags & 0xfff];
569     r |= h->codecs[DS_BF]->encode(s, h->codecs[DS_BF], (char *)&i32, 1);
570 
571     i32 = cr->cram_flags & CRAM_FLAG_MASK;
572     r |= h->codecs[DS_CF]->encode(s, h->codecs[DS_CF], (char *)&i32, 1);
573 
574     if (CRAM_MAJOR_VERS(fd->version) != 1 && s->hdr->ref_seq_id == -2)
575         r |= h->codecs[DS_RI]->encode(s, h->codecs[DS_RI], (char *)&cr->ref_id, 1);
576 
577     r |= h->codecs[DS_RL]->encode(s, h->codecs[DS_RL], (char *)&cr->len, 1);
578 
579     if (c->pos_sorted) {
580         if (CRAM_MAJOR_VERS(fd->version) >= 4) {
581             i64 = cr->apos - *last_pos;
582             r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1);
583         } else {
584             i32 = cr->apos - *last_pos;
585             r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1);
586         }
587         *last_pos = cr->apos;
588     } else {
589         if (CRAM_MAJOR_VERS(fd->version) >= 4) {
590             i64 = cr->apos;
591             r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1);
592         } else {
593             i32 = cr->apos;
594             r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1);
595         }
596     }
597 
598     r |= h->codecs[DS_RG]->encode(s, h->codecs[DS_RG], (char *)&cr->rg, 1);
599 
600     if (cr->cram_flags & CRAM_FLAG_DETACHED) {
601         i32 = cr->mate_flags;
602         r |= h->codecs[DS_MF]->encode(s, h->codecs[DS_MF], (char *)&i32, 1);
603 
604         r |= h->codecs[DS_NS]->encode(s, h->codecs[DS_NS],
605                                       (char *)&cr->mate_ref_id, 1);
606 
607         if (CRAM_MAJOR_VERS(fd->version) >= 4) {
608             r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP],
609                                           (char *)&cr->mate_pos, 1);
610             r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
611                                           (char *)&cr->tlen, 1);
612         } else {
613             i32 = cr->mate_pos;
614             r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP],
615                                           (char *)&i32, 1);
616             i32 = cr->tlen;
617             r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
618                                           (char *)&i32, 1);
619         }
620     } else {
621         if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) {
622             r |= h->codecs[DS_NF]->encode(s, h->codecs[DS_NF],
623                                           (char *)&cr->mate_line, 1);
624         }
625         if (cr->cram_flags & CRAM_FLAG_EXPLICIT_TLEN) {
626             if (CRAM_MAJOR_VERS(fd->version) >= 4) {
627                 r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
628                                               (char *)&cr->tlen, 1);
629             }
630         }
631     }
632 
633     /* Aux tags */
634     if (CRAM_MAJOR_VERS(fd->version) == 1) {
635         int j;
636         uc = cr->ntags;
637         r |= h->codecs[DS_TC]->encode(s, h->codecs[DS_TC], (char *)&uc, 1);
638 
639         for (j = 0; j < cr->ntags; j++) {
640             uint32_t i32 = s->TN[cr->TN_idx + j]; // id
641             r |= h->codecs[DS_TN]->encode(s, h->codecs[DS_TN], (char *)&i32, 1);
642         }
643     } else {
644         r |= h->codecs[DS_TL]->encode(s, h->codecs[DS_TL], (char *)&cr->TL, 1);
645     }
646 
647     // qual
648     // QS codec : Already stored in block[2].
649 
650     // features (diffs)
651     if (!(cr->flags & BAM_FUNMAP)) {
652         int prev_pos = 0, j;
653 
654         r |= h->codecs[DS_FN]->encode(s, h->codecs[DS_FN],
655                                       (char *)&cr->nfeature, 1);
656         for (j = 0; j < cr->nfeature; j++) {
657             cram_feature *f = &s->features[cr->feature + j];
658 
659             uc = f->X.code;
660             r |= h->codecs[DS_FC]->encode(s, h->codecs[DS_FC], (char *)&uc, 1);
661             i32 = f->X.pos - prev_pos;
662             r |= h->codecs[DS_FP]->encode(s, h->codecs[DS_FP], (char *)&i32, 1);
663             prev_pos = f->X.pos;
664 
665             switch(f->X.code) {
666                 //char *seq;
667 
668             case 'X':
669                 //fprintf(stderr, "    FC=%c FP=%d base=%d\n", f->X.code, i32, f->X.base);
670 
671                 uc = f->X.base;
672                 r |= h->codecs[DS_BS]->encode(s, h->codecs[DS_BS],
673                                               (char *)&uc, 1);
674                 break;
675             case 'S':
676                 // Already done
677                 //r |= h->codecs[DS_SC]->encode(s, h->codecs[DS_SC],
678                 //                              BLOCK_DATA(s->soft_blk) + f->S.seq_idx,
679                 //                              f->S.len);
680 
681                 //if (CRAM_MAJOR_VERS(fd->version) >= 3) {
682                 //    r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
683                 //                                  BLOCK_DATA(s->seqs_blk) + f->S.seq_idx,
684                 //                                  f->S.len);
685                 //}
686                 break;
687             case 'I':
688                 //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
689                 //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN],
690                 //                           seq, f->S.len);
691                 //if (CRAM_MAJOR_VERS(fd->version) >= 3) {
692                 //    r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
693                 //                                  BLOCK_DATA(s->seqs_blk) + f->I.seq_idx,
694                 //                                  f->I.len);
695                 //}
696                 break;
697             case 'i':
698                 uc = f->i.base;
699                 r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA],
700                                               (char *)&uc, 1);
701                 //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
702                 //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN],
703                 //                           seq, 1);
704                 break;
705             case 'D':
706                 i32 = f->D.len;
707                 r |= h->codecs[DS_DL]->encode(s, h->codecs[DS_DL],
708                                               (char *)&i32, 1);
709                 break;
710 
711             case 'B':
712                 //                  // Used when we try to store a non ACGTN base or an N
713                 //                  // that aligns against a non ACGTN reference
714 
715                 uc  = f->B.base;
716                 r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA],
717                                               (char *)&uc, 1);
718 
719                 //                  Already added
720                 //                  uc  = f->B.qual;
721                 //                  r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS],
722                 //                                           (char *)&uc, 1);
723                 break;
724 
725             case 'b':
726                 // string of bases
727                 r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
728                                               (char *)BLOCK_DATA(s->seqs_blk)
729                                                       + f->b.seq_idx,
730                                               f->b.len);
731                 break;
732 
733             case 'Q':
734                 //                  Already added
735                 //                  uc  = f->B.qual;
736                 //                  r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS],
737                 //                                           (char *)&uc, 1);
738                 break;
739 
740             case 'N':
741                 i32 = f->N.len;
742                 r |= h->codecs[DS_RS]->encode(s, h->codecs[DS_RS],
743                                               (char *)&i32, 1);
744                 break;
745 
746             case 'P':
747                 i32 = f->P.len;
748                 r |= h->codecs[DS_PD]->encode(s, h->codecs[DS_PD],
749                                               (char *)&i32, 1);
750                 break;
751 
752             case 'H':
753                 i32 = f->H.len;
754                 r |= h->codecs[DS_HC]->encode(s, h->codecs[DS_HC],
755                                               (char *)&i32, 1);
756                 break;
757 
758 
759             default:
760                 hts_log_error("Unhandled feature code %c", f->X.code);
761                 return -1;
762             }
763         }
764 
765         r |= h->codecs[DS_MQ]->encode(s, h->codecs[DS_MQ],
766                                       (char *)&cr->mqual, 1);
767     } else {
768         char *seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
769         if (cr->len)
770             r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA], seq, cr->len);
771     }
772 
773     return r ? -1 : 0;
774 }
775 
776 
777 /*
778  * Applies various compression methods to specific blocks, depending on
779  * known observations of how data series compress.
780  *
781  * Returns 0 on success
782  *        -1 on failure
783  */
cram_compress_slice(cram_fd * fd,cram_container * c,cram_slice * s)784 static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) {
785     int level = fd->level, i;
786     int method = 1<<GZIP | 1<<GZIP_RLE, methodF = method;
787     int v31_or_above = (fd->version >= (3<<8)+1);
788 
789     /* Compress the CORE Block too, with minimal zlib level */
790     if (level > 5 && s->block[0]->uncomp_size > 500)
791         cram_compress_block2(fd, s, s->block[0], NULL, 1<<GZIP, 1);
792 
793     if (fd->use_bz2)
794         method |= 1<<BZIP2;
795 
796     int method_rans   = (1<<RANS0) | (1<<RANS1);
797     int method_ranspr = method_rans;
798 
799     if (fd->use_rans) {
800         method_ranspr = (1<<RANS_PR0)   | (1<<RANS_PR1);
801         if (level > 1)
802             method_ranspr |=
803                   (1<<RANS_PR64)  | (1<<RANS_PR9)
804                 | (1<<RANS_PR128) | (1<<RANS_PR193);
805         if (level > 5)
806             method_ranspr |= (1<<RANS_PR129) | (1<<RANS_PR192);
807     }
808 
809     if (fd->use_rans) {
810         methodF |= v31_or_above ? method_ranspr : method_rans;
811         method  |= v31_or_above ? method_ranspr : method_rans;
812     }
813 
814     int method_arith   = 0;
815     if (fd->use_arith) {
816         method_arith = (1<<ARITH_PR0)   | (1<<ARITH_PR1);
817         if (level > 1)
818             method_arith |=
819                   (1<<ARITH_PR64)  | (1<<ARITH_PR9)
820                 | (1<<ARITH_PR128) | (1<<ARITH_PR129)
821                 | (1<<ARITH_PR192) | (1u<<ARITH_PR193);
822     }
823     if (fd->use_arith && v31_or_above) {
824         methodF |= method_arith;
825         method  |= method_arith;
826     }
827 
828     if (fd->use_lzma)
829         method |= (1<<LZMA);
830 
831     /* Faster method for data series we only need entropy encoding on */
832     methodF = method & ~(1<<GZIP | 1<<BZIP2 | 1<<LZMA);
833     if (level >= 5) {
834         method |= 1<<GZIP_1;
835         methodF = method;
836     }
837     if (level == 1) {
838         method &= ~(1<<GZIP);
839         method |=   1<<GZIP_1;
840         methodF = method;
841     }
842 
843     int qmethod  = method;
844     int qmethodF = method;
845     if (v31_or_above && fd->use_fqz) {
846         qmethod  |= 1<<FQZ;
847         qmethodF |= 1<<FQZ;
848         if (fd->level > 4) {
849             qmethod  |= 1<<FQZ_b;
850             qmethodF |= 1<<FQZ_b;
851         }
852         if (fd->level > 6) {
853             qmethod  |= (1<<FQZ_c) | (1<<FQZ_d);
854             qmethodF |= (1<<FQZ_c) | (1<<FQZ_d);
855         }
856     }
857 
858     pthread_mutex_lock(&fd->metrics_lock);
859     for (i = 0; i < DS_END; i++)
860         if (c->stats[i] && c->stats[i]->nvals > 16)
861             fd->m[i]->unpackable = 1;
862     pthread_mutex_unlock(&fd->metrics_lock);
863 
864     /* Specific compression methods for certain block types */
865     if (cram_compress_block2(fd, s, s->block[DS_IN], fd->m[DS_IN], //IN (seq)
866                              method, level))
867         return -1;
868 
869     if (fd->level == 0) {
870         /* Do nothing */
871     } else if (fd->level == 1) {
872         if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS],
873                                  qmethodF, 1))
874             return -1;
875         for (i = DS_aux; i <= DS_aux_oz; i++) {
876             if (s->block[i])
877                 if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
878                                          method, 1))
879                     return -1;
880         }
881     } else if (fd->level < 3) {
882         if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS],
883                                  qmethod, 1))
884             return -1;
885         if (cram_compress_block2(fd, s, s->block[DS_BA], fd->m[DS_BA],
886                                  method, 1))
887             return -1;
888         if (s->block[DS_BB])
889             if (cram_compress_block2(fd, s, s->block[DS_BB], fd->m[DS_BB],
890                                      method, 1))
891                 return -1;
892         for (i = DS_aux; i <= DS_aux_oz; i++) {
893             if (s->block[i])
894                 if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
895                                          method, level))
896                     return -1;
897         }
898     } else {
899         if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS],
900                                  qmethod, level))
901             return -1;
902         if (cram_compress_block2(fd, s, s->block[DS_BA], fd->m[DS_BA],
903                                  method, level))
904             return -1;
905         if (s->block[DS_BB])
906             if (cram_compress_block2(fd, s, s->block[DS_BB], fd->m[DS_BB],
907                                      method, level))
908                 return -1;
909         for (i = DS_aux; i <= DS_aux_oz; i++) {
910             if (s->block[i])
911                 if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
912                                          method, level))
913                     return -1;
914         }
915     }
916 
917     // NAME: best is generally xz, bzip2, zlib then rans1
918     int method_rn = method & ~(method_rans | method_ranspr | 1<<GZIP_RLE);
919     if (fd->version >= (3<<8)+1 && fd->use_tok)
920         method_rn |= fd->use_arith ? (1<<TOKA) : (1<<TOK3);
921     if (cram_compress_block2(fd, s, s->block[DS_RN], fd->m[DS_RN],
922                              method_rn, level))
923         return -1;
924 
925     // NS shows strong local correlation as rearrangements are localised
926     if (s->block[DS_NS] && s->block[DS_NS] != s->block[0])
927         if (cram_compress_block2(fd, s, s->block[DS_NS], fd->m[DS_NS],
928                                  method, level))
929             return -1;
930 
931 
932     /*
933      * Compress any auxiliary tags with their own per-tag metrics
934      */
935     {
936         int i;
937         for (i = 0; i < s->naux_block; i++) {
938             if (!s->aux_block[i] || s->aux_block[i] == s->block[0])
939                 continue;
940 
941             if (s->aux_block[i]->method != RAW)
942                 continue;
943 
944             if (cram_compress_block2(fd, s, s->aux_block[i], s->aux_block[i]->m,
945                                      method, level))
946                 return -1;
947         }
948     }
949 
950     /*
951      * Minimal compression of any block still uncompressed, bar CORE
952      */
953     {
954         int i;
955         for (i = 1; i < s->hdr->num_blocks && i < DS_END; i++) {
956             if (!s->block[i] || s->block[i] == s->block[0])
957                 continue;
958 
959             if (s->block[i]->method != RAW)
960                 continue;
961 
962             if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
963                                     methodF, level))
964                 return -1;
965         }
966     }
967 
968     return 0;
969 }
970 
971 /*
972  * Allocates a block associated with the cram codec associated with
973  * data series ds_id or the internal codec_id (depending on codec
974  * type).
975  *
976  * The ds_ids are what end up written to disk as an external block.
977  * The c_ids are internal and used when daisy-chaining transforms
978  * such as MAP and RLE.  These blocks are also allocated, but
979  * are ephemeral in nature.  (The codecs themselves cannot allocate
980  * these as the same codec pointer may be operating on multiple slices
981  * if we're using a multi-slice container.)
982  *
983  * Returns 0 on success
984  *        -1 on failure
985  */
cram_allocate_block(cram_codec * codec,cram_slice * s,int ds_id)986 static int cram_allocate_block(cram_codec *codec, cram_slice *s, int ds_id) {
987     if (!codec)
988         return 0;
989 
990     switch(codec->codec) {
991     // Codecs which are hard-coded to use the CORE block
992     case E_GOLOMB:
993     case E_HUFFMAN:
994     case E_BETA:
995     case E_SUBEXP:
996     case E_GOLOMB_RICE:
997     case E_GAMMA:
998         codec->out = s->block[0];
999         break;
1000 
1001     // Codecs which don't use external blocks
1002     case E_CONST_BYTE:
1003     case E_CONST_INT:
1004        codec->out = NULL;
1005        break;
1006 
1007     // Codecs that emit directly to external blocks
1008     case E_EXTERNAL:
1009     case E_VARINT_UNSIGNED:
1010     case E_VARINT_SIGNED:
1011         if (!(s->block[ds_id] = cram_new_block(EXTERNAL, ds_id)))
1012             return -1;
1013         codec->u.external.content_id = ds_id;
1014         codec->out = s->block[ds_id];
1015         break;
1016 
1017     case E_BYTE_ARRAY_STOP: // Why no sub-codec?
1018         if (!(s->block[ds_id] = cram_new_block(EXTERNAL, ds_id)))
1019             return -1;
1020         codec->u.byte_array_stop.content_id = ds_id;
1021         codec->out = s->block[ds_id];
1022         break;
1023 
1024 
1025     // Codecs that contain sub-codecs which may in turn emit to external blocks
1026     case E_BYTE_ARRAY_LEN: {
1027         cram_codec *bal = codec->u.e_byte_array_len.len_codec;
1028         if (cram_allocate_block(bal, s, bal->u.external.content_id))
1029             return -1;
1030         bal = codec->u.e_byte_array_len.val_codec;
1031         if (cram_allocate_block(bal, s, bal->u.external.content_id))
1032             return -1;
1033 
1034         break;
1035     }
1036 
1037     case E_XRLE:
1038         if (cram_allocate_block(codec->u.e_xrle.len_codec, s, ds_id))
1039                                 //ds_id == DS_QS ? DS_QS_len : ds_id))
1040             return -1;
1041         if (cram_allocate_block(codec->u.e_xrle.lit_codec, s, ds_id))
1042             return -1;
1043 
1044         break;
1045 
1046     case E_XPACK:
1047         if (cram_allocate_block(codec->u.e_xpack.sub_codec, s, ds_id))
1048             return -1;
1049         codec->out = cram_new_block(0, 0); // ephemeral
1050         if (!codec->out)
1051             return -1;
1052 
1053         break;
1054 
1055     case E_XDELTA:
1056         if (cram_allocate_block(codec->u.e_xdelta.sub_codec, s, ds_id))
1057             return -1;
1058         codec->out = cram_new_block(0, 0); // ephemeral
1059         if (!codec->out)
1060             return -1;
1061 
1062         break;
1063 
1064     default:
1065         break;
1066     }
1067 
1068     return 0;
1069 }
1070 
1071 /*
1072  * Encodes a single slice from a container
1073  *
1074  * Returns 0 on success
1075  *        -1 on failure
1076  */
cram_encode_slice(cram_fd * fd,cram_container * c,cram_block_compression_hdr * h,cram_slice * s)1077 static int cram_encode_slice(cram_fd *fd, cram_container *c,
1078                              cram_block_compression_hdr *h, cram_slice *s) {
1079     int rec, r = 0;
1080     int64_t last_pos;
1081     int embed_ref;
1082     enum cram_DS_ID id;
1083 
1084     embed_ref = fd->embed_ref && s->hdr->ref_seq_id != -1 ? 1 : 0;
1085 
1086     /*
1087      * Slice external blocks:
1088      * ID 0 => base calls (insertions, soft-clip)
1089      * ID 1 => qualities
1090      * ID 2 => names
1091      * ID 3 => TS (insert size), NP (next frag)
1092      * ID 4 => tag values
1093      * ID 6 => tag IDs (TN), if CRAM_V1.0
1094      * ID 7 => TD tag dictionary, if !CRAM_V1.0
1095      */
1096 
1097     /* Create cram slice header */
1098     s->hdr->ref_base_id = embed_ref && s->hdr->ref_seq_span > 0
1099         ? DS_ref
1100         : (CRAM_MAJOR_VERS(fd->version) >= 4 ? 0 : -1);
1101     s->hdr->record_counter = c->num_records + c->record_counter;
1102     c->num_records += s->hdr->num_records;
1103 
1104     int ntags = c->tags_used ? c->tags_used->n_occupied : 0;
1105     s->block = calloc(DS_END + ntags*2, sizeof(s->block[0]));
1106     s->hdr->block_content_ids = malloc(DS_END * sizeof(int32_t));
1107     if (!s->block || !s->hdr->block_content_ids)
1108         return -1;
1109 
1110     // Create first fixed blocks, always external.
1111     // CORE
1112     if (!(s->block[0] = cram_new_block(CORE, 0)))
1113         return -1;
1114 
1115     // TN block for CRAM v1
1116     if (CRAM_MAJOR_VERS(fd->version) == 1) {
1117         if (h->codecs[DS_TN]->codec == E_EXTERNAL) {
1118             if (!(s->block[DS_TN] = cram_new_block(EXTERNAL,DS_TN))) return -1;
1119             h->codecs[DS_TN]->u.external.content_id = DS_TN;
1120         } else {
1121             s->block[DS_TN] = s->block[0];
1122         }
1123     }
1124 
1125     // Embedded reference
1126     if (embed_ref) {
1127         if (!(s->block[DS_ref] = cram_new_block(EXTERNAL, DS_ref)))
1128             return -1;
1129         s->ref_id = DS_ref; // needed?
1130         BLOCK_APPEND(s->block[DS_ref],
1131                      c->ref + s->hdr->ref_seq_start - c->ref_start,
1132                      s->hdr->ref_seq_span);
1133     }
1134 
1135     /*
1136      * All the data-series blocks if appropriate.
1137      */
1138     for (id = DS_QS; id < DS_TN; id++) {
1139         if (cram_allocate_block(h->codecs[id], s, id) < 0)
1140             return -1;
1141     }
1142 
1143     /*
1144      * Add in the external tag blocks too.
1145      */
1146     if (c->tags_used) {
1147         int n;
1148         s->hdr->num_blocks = DS_END;
1149         for (n = 0; n < s->naux_block; n++)
1150             s->block[s->hdr->num_blocks++] = s->aux_block[n];
1151     }
1152 
1153     /* Encode reads */
1154     last_pos = s->hdr->ref_seq_start;
1155     for (rec = 0; rec < s->hdr->num_records; rec++) {
1156         cram_record *cr = &s->crecs[rec];
1157         if (cram_encode_slice_read(fd, c, h, s, cr, &last_pos) == -1)
1158             return -1;
1159     }
1160 
1161     s->block[0]->uncomp_size = s->block[0]->byte + (s->block[0]->bit < 7);
1162     s->block[0]->comp_size = s->block[0]->uncomp_size;
1163 
1164     // Make sure the fixed blocks point to the correct sources
1165     if (s->block[DS_IN]) cram_free_block(s->block[DS_IN]);
1166     s->block[DS_IN] = s->base_blk; s->base_blk = NULL;
1167     if (s->block[DS_QS]) cram_free_block(s->block[DS_QS]);
1168     s->block[DS_QS] = s->qual_blk; s->qual_blk = NULL;
1169     if (s->block[DS_RN]) cram_free_block(s->block[DS_RN]);
1170     s->block[DS_RN] = s->name_blk; s->name_blk = NULL;
1171     if (s->block[DS_SC]) cram_free_block(s->block[DS_SC]);
1172     s->block[DS_SC] = s->soft_blk; s->soft_blk = NULL;
1173 
1174     // Finalise any data transforms.
1175     for (id = DS_QS; id < DS_TN; id++) {
1176        if (h->codecs[id] && h->codecs[id]->flush)
1177            h->codecs[id]->flush(h->codecs[id]);
1178     }
1179 
1180     // Ensure block sizes are up to date.
1181     for (id = 1; id < s->hdr->num_blocks; id++) {
1182         if (!s->block[id] || s->block[id] == s->block[0])
1183             continue;
1184 
1185         if (s->block[id]->uncomp_size == 0)
1186             BLOCK_UPLEN(s->block[id]);
1187     }
1188 
1189     // Compress it all
1190     if (cram_compress_slice(fd, c, s) == -1)
1191         return -1;
1192 
1193     // Collapse empty blocks and create hdr_block
1194     {
1195         int i, j;
1196 
1197         s->hdr->block_content_ids = realloc(s->hdr->block_content_ids,
1198                                             s->hdr->num_blocks * sizeof(int32_t));
1199         if (!s->hdr->block_content_ids)
1200             return -1;
1201 
1202         for (i = j = 1; i < s->hdr->num_blocks; i++) {
1203             if (!s->block[i] || s->block[i] == s->block[0])
1204                 continue;
1205             if (s->block[i]->uncomp_size == 0) {
1206                 cram_free_block(s->block[i]);
1207                 s->block[i] = NULL;
1208                 continue;
1209             }
1210             s->block[j] = s->block[i];
1211             s->hdr->block_content_ids[j-1] = s->block[i]->content_id;
1212             j++;
1213         }
1214         s->hdr->num_content_ids = j-1;
1215         s->hdr->num_blocks = j;
1216 
1217         if (!(s->hdr_block = cram_encode_slice_header(fd, s)))
1218             return -1;
1219     }
1220 
1221     return r ? -1 : 0;
1222 
1223  block_err:
1224     return -1;
1225 }
1226 
1227 /*
1228  * Returns the number of expected read names for this record.
1229  */
expected_template_count(bam_seq_t * b)1230 static int expected_template_count(bam_seq_t *b) {
1231     int expected = bam_flag(b) & BAM_FPAIRED ? 2 : 1;
1232 
1233     uint8_t *TC = (uint8_t *)bam_aux_get(b, "TC");
1234     if (TC) {
1235         int n = bam_aux2i(TC);
1236         if (expected < n)
1237             expected = n;
1238     }
1239 
1240     if (!TC && bam_aux_get(b, "SA")) {
1241         // We could count the semicolons, but we'd have to do this for
1242         // read1, read2 and read(not-1-or-2) combining the results
1243         // together.  This is a cheap and safe alternative for now.
1244         expected = INT_MAX;
1245     }
1246 
1247     return expected;
1248 }
1249 
1250 /*
1251  * Lossily reject read names.
1252  *
1253  * The rule here is that if all reads for this template reside in the
1254  * same slice then we can lose the name.  Otherwise we keep them as we
1255  * do not know when (or if) the other reads will turn up.
1256  *
1257  * Note there may be only 1 read (non-paired library) or more than 2
1258  * reads (paired library with supplementary reads), or other weird
1259  * setups.  We need to know how many are expected.  Ways to guess:
1260  *
1261  * - Flags (0x1 - has > 1 read)
1262  * - TC aux field (not mandatory)
1263  * - SA tags (count semicolons, NB per fragment so sum - hard)
1264  * - RNEXT/PNEXT uniqueness count. (not implemented, tricky)
1265  *
1266  * Returns 0 on success
1267  *        -1 on failure
1268  */
lossy_read_names(cram_fd * fd,cram_container * c,cram_slice * s,int bam_start)1269 static int lossy_read_names(cram_fd *fd, cram_container *c, cram_slice *s,
1270                             int bam_start) {
1271     int r1, r2, ret = -1;
1272 
1273     // Initialise cram_flags
1274     for (r2 = 0; r2 < s->hdr->num_records; r2++)
1275         s->crecs[r2].cram_flags = 0;
1276 
1277     if (!fd->lossy_read_names)
1278         return 0;
1279 
1280     khash_t(m_s2u64) *names = kh_init(m_s2u64);
1281     if (!names)
1282         goto fail;
1283 
1284     // 1: Iterate through names to count frequency
1285     for (r1 = bam_start, r2 = 0; r2 < s->hdr->num_records; r1++, r2++) {
1286         //cram_record *cr = &s->crecs[r2];
1287         bam_seq_t *b = c->bams[r1];
1288         khint_t k;
1289         int n;
1290         uint64_t e;
1291         union {
1292             uint64_t i64;
1293             struct {
1294                 int32_t e,c; // expected & observed counts.
1295             } counts;
1296         } u;
1297 
1298         e = expected_template_count(b);
1299         u.counts.e = e; u.counts.c = 1;
1300 
1301         k = kh_put(m_s2u64, names, bam_name(b), &n);
1302         if (n == -1)
1303             goto fail;
1304 
1305         if (n == 0) {
1306             // not a new name
1307             u.i64 = kh_val(names, k);
1308             if (u.counts.e != e) {
1309                 // different expectation or already hit the max
1310                 //fprintf(stderr, "Err computing no. %s recs\n", bam_name(b));
1311                 kh_val(names, k) = 0;
1312             } else {
1313                 u.counts.c++;
1314                 if (u.counts.e == u.counts.c) {
1315                     // Reached expected count.
1316                     kh_val(names, k) = -1;
1317                 } else {
1318                     kh_val(names, k) = u.i64;
1319                 }
1320             }
1321         } else {
1322             // new name
1323             kh_val(names, k) = u.i64;
1324         }
1325     }
1326 
1327     // 2: Remove names if all present (hd.i == -1)
1328     for (r1 = bam_start, r2 = 0; r2 < s->hdr->num_records; r1++, r2++) {
1329         cram_record *cr = &s->crecs[r2];
1330         bam_seq_t *b = c->bams[r1];
1331         khint_t k;
1332 
1333         k = kh_get(m_s2u64, names, bam_name(b));
1334 
1335         if (k == kh_end(names))
1336             goto fail;
1337 
1338         if (kh_val(names, k) == -1)
1339             cr->cram_flags = CRAM_FLAG_DISCARD_NAME;
1340     }
1341 
1342     ret = 0;
1343  fail: // ret==-1
1344 
1345     if (names)
1346         kh_destroy(m_s2u64, names);
1347 
1348     return ret;
1349 }
1350 
1351 /*
1352  * Adds the reading names.  We do this here as a separate pass rather
1353  * than per record in the process_one_read calls as that function can
1354  * go back and change the CRAM_FLAG_DETACHED status of a previously
1355  * processed read if it subsequently determines the TLEN field is
1356  * incorrect.  Given DETACHED reads always try to decode read names,
1357  * we need to know their status before generating the read-name block.
1358  *
1359  * Output is an update s->name_blk, and cr->name / cr->name_len
1360  * fields.
1361  */
add_read_names(cram_fd * fd,cram_container * c,cram_slice * s,int bam_start)1362 static int add_read_names(cram_fd *fd, cram_container *c, cram_slice *s,
1363                           int bam_start) {
1364     int r1, r2;
1365     int keep_names = !fd->lossy_read_names;
1366 
1367     for (r1 = bam_start, r2 = 0;
1368          r1 < c->curr_c_rec && r2 < s->hdr->num_records;
1369          r1++, r2++) {
1370         cram_record *cr = &s->crecs[r2];
1371         bam_seq_t *b = c->bams[r1];
1372 
1373         cr->name        = BLOCK_SIZE(s->name_blk);
1374         if ((cr->cram_flags & CRAM_FLAG_DETACHED) || keep_names) {
1375             if (CRAM_MAJOR_VERS(fd->version) >= 4
1376                 && (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM)
1377                 && cr->mate_line) {
1378                 // Dedup read names in V4
1379                 BLOCK_APPEND(s->name_blk, "\0", 1);
1380                 cr->name_len    = 1;
1381             } else {
1382                 BLOCK_APPEND(s->name_blk, bam_name(b), bam_name_len(b));
1383                 cr->name_len    = bam_name_len(b);
1384             }
1385         } else {
1386             // Can only discard duplicate names if not detached
1387             cr->name_len = 0;
1388         }
1389 
1390         if (cram_stats_add(c->stats[DS_RN], cr->name_len) < 0)
1391             goto block_err;
1392     }
1393 
1394     return 0;
1395 
1396  block_err:
1397     return -1;
1398 }
1399 
1400 /*
1401  * Encodes all slices in a container into blocks.
1402  * Returns 0 on success
1403  *        -1 on failure
1404  */
cram_encode_container(cram_fd * fd,cram_container * c)1405 int cram_encode_container(cram_fd *fd, cram_container *c) {
1406     int i, j, slice_offset;
1407     cram_block_compression_hdr *h = c->comp_hdr;
1408     cram_block *c_hdr;
1409     int multi_ref = 0;
1410     int r1, r2, sn, nref;
1411     spare_bams *spares;
1412 
1413     if (CRAM_MAJOR_VERS(fd->version) == 1)
1414         goto err;
1415 
1416 //#define goto_err {fprintf(stderr, "ERR at %s:%d\n", __FILE__, __LINE__);goto err;}
1417 #define goto_err goto err
1418 
1419     /* Cache references up-front if we have unsorted access patterns */
1420     pthread_mutex_lock(&fd->ref_lock);
1421     nref = fd->refs->nref;
1422     pthread_mutex_unlock(&fd->ref_lock);
1423 
1424     if (!fd->no_ref && c->refs_used) {
1425         for (i = 0; i < nref; i++) {
1426             if (c->refs_used[i])
1427                 cram_get_ref(fd, i, 1, 0);
1428         }
1429     }
1430 
1431     /* To create M5 strings */
1432     /* Fetch reference sequence */
1433     if (!fd->no_ref) {
1434         if (!c->bams || !c->bams[0])
1435             goto_err;
1436         bam_seq_t *b = c->bams[0];
1437 
1438         char *ref = cram_get_ref(fd, bam_ref(b), 1, 0);
1439         if (!ref && bam_ref(b) >= 0) {
1440             hts_log_error("Failed to load reference #%d", bam_ref(b));
1441             return -1;
1442         }
1443         if ((c->ref_id = bam_ref(b)) >= 0) {
1444             c->ref_seq_id = c->ref_id;
1445             c->ref       = fd->refs->ref_id[c->ref_seq_id]->seq;
1446             c->ref_start = 1;
1447             c->ref_end   = fd->refs->ref_id[c->ref_seq_id]->length;
1448         } else {
1449             c->ref_seq_id = c->ref_id; // FIXME remove one var!
1450         }
1451     } else {
1452         c->ref_id = bam_ref(c->bams[0]);
1453         cram_ref_incr(fd->refs, c->ref_id);
1454         c->ref_seq_id = c->ref_id;
1455     }
1456 
1457     /* Turn bams into cram_records and gather basic stats */
1458     for (r1 = sn = 0; r1 < c->curr_c_rec; sn++) {
1459         cram_slice *s = c->slices[sn];
1460         int64_t first_base = INT64_MAX, last_base = INT64_MIN;
1461 
1462         int r1_start = r1;
1463 
1464         assert(sn < c->curr_slice);
1465 
1466         // Discover which read names *may* be safely removed.
1467         // Ie which ones have all their records in this slice.
1468         if (lossy_read_names(fd, c, s, r1_start) != 0)
1469             return -1;
1470 
1471         // Tracking of MD tags so we can spot when the auto-generated values
1472         // will differ from the current stored ones.  The kstring here is
1473         // simply to avoid excessive malloc and free calls.  All initialisation
1474         // is done within process_one_read().
1475         kstring_t MD = {0};
1476 
1477         // Iterate through records creating the cram blocks for some
1478         // fields and just gathering stats for others.
1479         for (r2 = 0; r1 < c->curr_c_rec && r2 < s->hdr->num_records; r1++, r2++) {
1480             cram_record *cr = &s->crecs[r2];
1481             bam_seq_t *b = c->bams[r1];
1482 
1483             /* If multi-ref we need to cope with changing reference per seq */
1484             if (c->multi_seq && !fd->no_ref) {
1485                 if (bam_ref(b) != c->ref_seq_id && bam_ref(b) >= 0) {
1486                     if (c->ref_seq_id >= 0)
1487                         cram_ref_decr(fd->refs, c->ref_seq_id);
1488 
1489                     if (!cram_get_ref(fd, bam_ref(b), 1, 0)) {
1490                         hts_log_error("Failed to load reference #%d", bam_ref(b));
1491                         free(MD.s);
1492                         return -1;
1493                     }
1494 
1495                     c->ref_seq_id = bam_ref(b); // overwritten later by -2
1496                     if (!fd->refs->ref_id[c->ref_seq_id]->seq)
1497                         return -1;
1498                     c->ref       = fd->refs->ref_id[c->ref_seq_id]->seq;
1499                     c->ref_start = 1;
1500                     c->ref_end   = fd->refs->ref_id[c->ref_seq_id]->length;
1501                 }
1502             }
1503 
1504             if (process_one_read(fd, c, s, cr, b, r2, &MD) != 0) {
1505                 free(MD.s);
1506                 return -1;
1507             }
1508 
1509             if (first_base > cr->apos)
1510                 first_base = cr->apos;
1511 
1512             if (last_base < cr->aend)
1513                 last_base = cr->aend;
1514         }
1515         free(MD.s);
1516 
1517         // Process_one_read doesn't add read names as it can change
1518         // its mind during the loop on the CRAM_FLAG_DETACHED setting
1519         // of earlier records (if it detects the auto-generation of
1520         // TLEN is incorrect).  This affects which read-names can be
1521         // lossily compressed, so we do these in another pass.
1522         if (add_read_names(fd, c, s, r1_start) < 0)
1523             return -1;
1524 
1525         if (c->multi_seq) {
1526             s->hdr->ref_seq_id    = -2;
1527             s->hdr->ref_seq_start = 0;
1528             s->hdr->ref_seq_span  = 0;
1529         } else {
1530             s->hdr->ref_seq_id    = c->ref_id;
1531             s->hdr->ref_seq_start = first_base;
1532             s->hdr->ref_seq_span  = MAX(0, last_base - first_base + 1);
1533         }
1534         s->hdr->num_records = r2;
1535 
1536         // Processed a slice, now stash the aux blocks so the next
1537         // slice can start aggregating them from the start again.
1538         if (c->tags_used->n_occupied) {
1539             int ntags = c->tags_used->n_occupied;
1540             s->aux_block = calloc(ntags*2, sizeof(*s->aux_block));
1541             if (!s->aux_block)
1542                 return -1;
1543 
1544             khint_t k;
1545 
1546             s->naux_block = 0;
1547             for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
1548                 if (!kh_exist(c->tags_used, k))
1549                     continue;
1550 
1551                 cram_tag_map *tm = kh_val(c->tags_used, k);
1552                 if (!tm) goto_err;
1553                 if (!tm->blk) continue;
1554                 s->aux_block[s->naux_block++] = tm->blk;
1555                 tm->blk = NULL;
1556                 if (!tm->blk2) continue;
1557                 s->aux_block[s->naux_block++] = tm->blk2;
1558                 tm->blk2 = NULL;
1559             }
1560             assert(s->naux_block <= 2*c->tags_used->n_occupied);
1561         }
1562     }
1563 
1564     if (c->multi_seq && !fd->no_ref) {
1565         if (c->ref_seq_id >= 0)
1566             cram_ref_decr(fd->refs, c->ref_seq_id);
1567     }
1568 
1569     /* Link our bams[] array onto the spare bam list for reuse */
1570     spares = malloc(sizeof(*spares));
1571     if (!spares) goto_err;
1572     pthread_mutex_lock(&fd->bam_list_lock);
1573     spares->bams = c->bams;
1574     spares->next = fd->bl;
1575     fd->bl = spares;
1576     pthread_mutex_unlock(&fd->bam_list_lock);
1577     c->bams = NULL;
1578 
1579     /* Detect if a multi-seq container */
1580     cram_stats_encoding(fd, c->stats[DS_RI]);
1581     multi_ref = c->stats[DS_RI]->nvals > 1;
1582     pthread_mutex_lock(&fd->metrics_lock);
1583     fd->last_RI_count = c->stats[DS_RI]->nvals;
1584     pthread_mutex_unlock(&fd->metrics_lock);
1585 
1586 
1587     if (multi_ref) {
1588         hts_log_info("Multi-ref container");
1589         c->ref_seq_id = -2;
1590         c->ref_seq_start = 0;
1591         c->ref_seq_span = 0;
1592     }
1593 
1594 
1595     /* Compute MD5s */
1596     int is_v4 = CRAM_MAJOR_VERS(fd->version) >= 4 ? 1 : 0;
1597     for (i = 0; i < c->curr_slice; i++) {
1598         cram_slice *s = c->slices[i];
1599 
1600         if (CRAM_MAJOR_VERS(fd->version) != 1) {
1601             if (s->hdr->ref_seq_id >= 0 && c->multi_seq == 0 && !fd->no_ref) {
1602                 hts_md5_context *md5 = hts_md5_init();
1603                 if (!md5)
1604                     return -1;
1605                 hts_md5_update(md5,
1606                                c->ref + s->hdr->ref_seq_start - c->ref_start,
1607                                s->hdr->ref_seq_span);
1608                 hts_md5_final(s->hdr->md5, md5);
1609                 hts_md5_destroy(md5);
1610             } else {
1611                 memset(s->hdr->md5, 0, 16);
1612             }
1613         }
1614     }
1615 
1616     c->num_records = 0;
1617     c->num_blocks = 1; // cram_block_compression_hdr
1618     c->length = 0;
1619 
1620     //fprintf(stderr, "=== BF ===\n");
1621     h->codecs[DS_BF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BF]),
1622                                          c->stats[DS_BF], E_INT, NULL,
1623                                          fd->version, &fd->vv);
1624     if (c->stats[DS_BF]->nvals && !h->codecs[DS_BF]) goto_err;
1625 
1626     //fprintf(stderr, "=== CF ===\n");
1627     h->codecs[DS_CF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_CF]),
1628                                          c->stats[DS_CF], E_INT, NULL,
1629                                          fd->version, &fd->vv);
1630     if (c->stats[DS_CF]->nvals && !h->codecs[DS_CF]) goto_err;
1631 
1632     //fprintf(stderr, "=== RN ===\n");
1633     //h->codecs[DS_RN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RN]),
1634     //                                     c->stats[DS_RN], E_BYTE_ARRAY, NULL,
1635     //                                     fd->version);
1636 
1637     //fprintf(stderr, "=== AP ===\n");
1638     if (c->pos_sorted || CRAM_MAJOR_VERS(fd->version) >= 4) {
1639         if (c->pos_sorted)
1640             h->codecs[DS_AP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_AP]),
1641                                                  c->stats[DS_AP],
1642                                                  is_v4 ? E_LONG : E_INT,
1643                                                  NULL, fd->version, &fd->vv);
1644         else
1645             // Unsorted data has no stats, but hard-code VARINT_SIGNED / EXT.
1646             h->codecs[DS_AP] = cram_encoder_init(is_v4 ? E_VARINT_SIGNED
1647                                                        : E_EXTERNAL,
1648                                                  NULL,
1649                                                  is_v4 ? E_LONG : E_INT,
1650                                                  NULL, fd->version, &fd->vv);
1651     } else {
1652         // Removed BETA in v4.0.
1653         // Should we consider dropping use of it for 3.0 too?
1654         int p[2] = {0, c->max_apos};
1655         h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL,
1656                                              is_v4 ? E_LONG : E_INT,
1657                                              p, fd->version, &fd->vv);
1658 //      cram_xdelta_encoder e;
1659 //      e.word_size = is_v4 ? 8 : 4;
1660 //      e.sub_encoding = E_EXTERNAL;
1661 //      e.sub_codec_dat = (void *)DS_AP;
1662 //
1663 //      h->codecs[DS_AP] = cram_encoder_init(E_XDELTA, NULL,
1664 //                                           is_v4 ? E_LONG : E_INT,
1665 //                                           &e, fd->version, &fd->vv);
1666     }
1667     if (!h->codecs[DS_AP]) goto_err;
1668 
1669     //fprintf(stderr, "=== RG ===\n");
1670     h->codecs[DS_RG] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RG]),
1671                                          c->stats[DS_RG],
1672                                          E_INT,
1673                                          NULL,
1674                                          fd->version, &fd->vv);
1675     if (c->stats[DS_RG]->nvals && !h->codecs[DS_RG]) goto_err;
1676 
1677     //fprintf(stderr, "=== MQ ===\n");
1678     h->codecs[DS_MQ] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MQ]),
1679                                          c->stats[DS_MQ], E_INT, NULL,
1680                                          fd->version, &fd->vv);
1681     if (c->stats[DS_MQ]->nvals && !h->codecs[DS_MQ]) goto_err;
1682 
1683     //fprintf(stderr, "=== NS ===\n");
1684     h->codecs[DS_NS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NS]),
1685                                          c->stats[DS_NS], E_INT, NULL,
1686                                          fd->version, &fd->vv);
1687     if (c->stats[DS_NS]->nvals && !h->codecs[DS_NS]) goto_err;
1688 
1689     //fprintf(stderr, "=== MF ===\n");
1690     h->codecs[DS_MF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MF]),
1691                                          c->stats[DS_MF], E_INT, NULL,
1692                                          fd->version, &fd->vv);
1693     if (c->stats[DS_MF]->nvals && !h->codecs[DS_MF]) goto_err;
1694 
1695     //fprintf(stderr, "=== TS ===\n");
1696     h->codecs[DS_TS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TS]),
1697                                          c->stats[DS_TS],
1698                                          is_v4 ? E_LONG : E_INT,
1699                                          NULL, fd->version, &fd->vv);
1700     if (c->stats[DS_TS]->nvals && !h->codecs[DS_TS]) goto_err;
1701 
1702     //fprintf(stderr, "=== NP ===\n");
1703     h->codecs[DS_NP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NP]),
1704                                          c->stats[DS_NP],
1705                                          is_v4 ? E_LONG : E_INT,
1706                                          NULL, fd->version, &fd->vv);
1707     if (c->stats[DS_NP]->nvals && !h->codecs[DS_NP]) goto_err;
1708 
1709     //fprintf(stderr, "=== NF ===\n");
1710     h->codecs[DS_NF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NF]),
1711                                          c->stats[DS_NF], E_INT, NULL,
1712                                          fd->version, &fd->vv);
1713     if (c->stats[DS_NF]->nvals && !h->codecs[DS_NF]) goto_err;
1714 
1715     //fprintf(stderr, "=== RL ===\n");
1716     h->codecs[DS_RL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RL]),
1717                                          c->stats[DS_RL], E_INT, NULL,
1718                                          fd->version, &fd->vv);
1719     if (c->stats[DS_RL]->nvals && !h->codecs[DS_RL]) goto_err;
1720 
1721     //fprintf(stderr, "=== FN ===\n");
1722     h->codecs[DS_FN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FN]),
1723                                          c->stats[DS_FN], E_INT, NULL,
1724                                          fd->version, &fd->vv);
1725     if (c->stats[DS_FN]->nvals && !h->codecs[DS_FN]) goto_err;
1726 
1727     //fprintf(stderr, "=== FC ===\n");
1728     h->codecs[DS_FC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FC]),
1729                                          c->stats[DS_FC], E_BYTE, NULL,
1730                                          fd->version, &fd->vv);
1731     if (c->stats[DS_FC]->nvals && !h->codecs[DS_FC]) goto_err;
1732 
1733     //fprintf(stderr, "=== FP ===\n");
1734     h->codecs[DS_FP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FP]),
1735                                          c->stats[DS_FP], E_INT, NULL,
1736                                          fd->version, &fd->vv);
1737     if (c->stats[DS_FP]->nvals && !h->codecs[DS_FP]) goto_err;
1738 
1739     //fprintf(stderr, "=== DL ===\n");
1740     h->codecs[DS_DL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_DL]),
1741                                          c->stats[DS_DL], E_INT, NULL,
1742                                          fd->version, &fd->vv);
1743     if (c->stats[DS_DL]->nvals && !h->codecs[DS_DL]) goto_err;
1744 
1745     //fprintf(stderr, "=== BA ===\n");
1746     h->codecs[DS_BA] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BA]),
1747                                          c->stats[DS_BA], E_BYTE, NULL,
1748                                          fd->version, &fd->vv);
1749     if (c->stats[DS_BA]->nvals && !h->codecs[DS_BA]) goto_err;
1750 
1751     if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1752         cram_byte_array_len_encoder e;
1753 
1754         e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4
1755             ? E_VARINT_UNSIGNED
1756             : E_EXTERNAL;
1757         e.len_dat = (void *)DS_BB_len;
1758         //e.len_dat = (void *)DS_BB;
1759 
1760         e.val_encoding = E_EXTERNAL;
1761         e.val_dat = (void *)DS_BB;
1762 
1763         h->codecs[DS_BB] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
1764                                              E_BYTE_ARRAY, (void *)&e,
1765                                              fd->version, &fd->vv);
1766         if (!h->codecs[DS_BB]) goto_err;
1767     } else {
1768         h->codecs[DS_BB] = NULL;
1769     }
1770 
1771     //fprintf(stderr, "=== BS ===\n");
1772     h->codecs[DS_BS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BS]),
1773                                          c->stats[DS_BS], E_BYTE, NULL,
1774                                          fd->version, &fd->vv);
1775     if (c->stats[DS_BS]->nvals && !h->codecs[DS_BS]) goto_err;
1776 
1777     if (CRAM_MAJOR_VERS(fd->version) == 1) {
1778         h->codecs[DS_TL] = NULL;
1779         h->codecs[DS_RI] = NULL;
1780         h->codecs[DS_RS] = NULL;
1781         h->codecs[DS_PD] = NULL;
1782         h->codecs[DS_HC] = NULL;
1783         h->codecs[DS_SC] = NULL;
1784 
1785         //fprintf(stderr, "=== TC ===\n");
1786         h->codecs[DS_TC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TC]),
1787                                              c->stats[DS_TC], E_BYTE, NULL,
1788                                              fd->version, &fd->vv);
1789         if (c->stats[DS_TC]->nvals && !h->codecs[DS_TC]) goto_err;
1790 
1791         //fprintf(stderr, "=== TN ===\n");
1792         h->codecs[DS_TN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TN]),
1793                                              c->stats[DS_TN], E_INT, NULL,
1794                                              fd->version, &fd->vv);
1795         if (c->stats[DS_TN]->nvals && !h->codecs[DS_TN]) goto_err;
1796     } else {
1797         h->codecs[DS_TC] = NULL;
1798         h->codecs[DS_TN] = NULL;
1799 
1800         //fprintf(stderr, "=== TL ===\n");
1801         h->codecs[DS_TL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TL]),
1802                                              c->stats[DS_TL], E_INT, NULL,
1803                                              fd->version, &fd->vv);
1804         if (c->stats[DS_TL]->nvals && !h->codecs[DS_TL]) goto_err;
1805 
1806 
1807         //fprintf(stderr, "=== RI ===\n");
1808         h->codecs[DS_RI] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RI]),
1809                                              c->stats[DS_RI], E_INT, NULL,
1810                                              fd->version, &fd->vv);
1811         if (c->stats[DS_RI]->nvals && !h->codecs[DS_RI]) goto_err;
1812 
1813         //fprintf(stderr, "=== RS ===\n");
1814         h->codecs[DS_RS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RS]),
1815                                              c->stats[DS_RS], E_INT, NULL,
1816                                              fd->version, &fd->vv);
1817         if (c->stats[DS_RS]->nvals && !h->codecs[DS_RS]) goto_err;
1818 
1819         //fprintf(stderr, "=== PD ===\n");
1820         h->codecs[DS_PD] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_PD]),
1821                                              c->stats[DS_PD], E_INT, NULL,
1822                                              fd->version, &fd->vv);
1823         if (c->stats[DS_PD]->nvals && !h->codecs[DS_PD]) goto_err;
1824 
1825         //fprintf(stderr, "=== HC ===\n");
1826         h->codecs[DS_HC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_HC]),
1827                                              c->stats[DS_HC], E_INT, NULL,
1828                                              fd->version, &fd->vv);
1829         if (c->stats[DS_HC]->nvals && !h->codecs[DS_HC]) goto_err;
1830 
1831         //fprintf(stderr, "=== SC ===\n");
1832         if (1) {
1833             int i2[2] = {0, DS_SC};
1834 
1835             h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
1836                                                  E_BYTE_ARRAY, (void *)i2,
1837                                                  fd->version, &fd->vv);
1838         } else {
1839             // Appears to be no practical benefit to using this method,
1840             // but it may work better if we start mixing SC, IN and BB
1841             // elements into the same external block.
1842             cram_byte_array_len_encoder e;
1843 
1844             e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4
1845                 ? E_VARINT_UNSIGNED
1846                 : E_EXTERNAL;
1847             e.len_dat = (void *)DS_SC_len;
1848 
1849             e.val_encoding = E_EXTERNAL;
1850             e.val_dat = (void *)DS_SC;
1851 
1852             h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
1853                                                  E_BYTE_ARRAY, (void *)&e,
1854                                                  fd->version, &fd->vv);
1855         }
1856         if (!h->codecs[DS_SC]) goto_err;
1857     }
1858 
1859     //fprintf(stderr, "=== IN ===\n");
1860     {
1861         int i2[2] = {0, DS_IN};
1862         h->codecs[DS_IN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
1863                                              E_BYTE_ARRAY, (void *)i2,
1864                                              fd->version, &fd->vv);
1865         if (!h->codecs[DS_IN]) goto_err;
1866     }
1867 
1868     h->codecs[DS_QS] = cram_encoder_init(E_EXTERNAL, NULL, E_BYTE,
1869                                          (void *)DS_QS,
1870                                          fd->version, &fd->vv);
1871     if (!h->codecs[DS_QS]) goto_err;
1872     {
1873         int i2[2] = {0, DS_RN};
1874         h->codecs[DS_RN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
1875                                              E_BYTE_ARRAY, (void *)i2,
1876                                              fd->version, &fd->vv);
1877         if (!h->codecs[DS_RN]) goto_err;
1878     }
1879 
1880 
1881     /* Encode slices */
1882     for (i = 0; i < c->curr_slice; i++) {
1883         hts_log_info("Encode slice %d", i);
1884 
1885         if (cram_encode_slice(fd, c, h, c->slices[i]) != 0)
1886             return -1;
1887     }
1888 
1889     /* Create compression header */
1890     {
1891         h->ref_seq_id    = c->ref_seq_id;
1892         h->ref_seq_start = c->ref_seq_start;
1893         h->ref_seq_span  = c->ref_seq_span;
1894         h->num_records   = c->num_records;
1895         h->qs_seq_orient = c->qs_seq_orient;
1896         // slight misnomer - sorted or treat as-if sorted (ap_delta force to 1)
1897         h->AP_delta      = c->pos_sorted;
1898         memcpy(h->substitution_matrix, CRAM_SUBST_MATRIX, 20);
1899 
1900         if (!(c_hdr = cram_encode_compression_header(fd, c, h)))
1901             return -1;
1902     }
1903 
1904     /* Compute landmarks */
1905     /* Fill out slice landmarks */
1906     c->num_landmarks = c->curr_slice;
1907     c->landmark = malloc(c->num_landmarks * sizeof(*c->landmark));
1908     if (!c->landmark)
1909         return -1;
1910 
1911     /*
1912      * Slice offset starts after the first block, so we need to simulate
1913      * writing it to work out the correct offset
1914      */
1915     {
1916         slice_offset = c_hdr->method == RAW
1917             ? c_hdr->uncomp_size
1918             : c_hdr->comp_size;
1919         slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
1920             fd->vv.varint_size(c_hdr->content_id) +
1921             fd->vv.varint_size(c_hdr->comp_size) +
1922             fd->vv.varint_size(c_hdr->uncomp_size);
1923     }
1924 
1925     c->ref_seq_id    = c->slices[0]->hdr->ref_seq_id;
1926     c->ref_seq_start = c->slices[0]->hdr->ref_seq_start;
1927     c->ref_seq_span  = c->slices[0]->hdr->ref_seq_span;
1928     for (i = 0; i < c->curr_slice; i++) {
1929         cram_slice *s = c->slices[i];
1930 
1931         c->num_blocks += s->hdr->num_blocks + 1; // slice header
1932         c->landmark[i] = slice_offset;
1933 
1934         if (s->hdr->ref_seq_start + s->hdr->ref_seq_span >
1935             c->ref_seq_start + c->ref_seq_span) {
1936             c->ref_seq_span = s->hdr->ref_seq_start + s->hdr->ref_seq_span
1937                 - c->ref_seq_start;
1938         }
1939 
1940         slice_offset += s->hdr_block->method == RAW
1941             ? s->hdr_block->uncomp_size
1942             : s->hdr_block->comp_size;
1943 
1944         slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
1945             fd->vv.varint_size(s->hdr_block->content_id) +
1946             fd->vv.varint_size(s->hdr_block->comp_size) +
1947             fd->vv.varint_size(s->hdr_block->uncomp_size);
1948 
1949         for (j = 0; j < s->hdr->num_blocks; j++) {
1950             slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
1951                 fd->vv.varint_size(s->block[j]->content_id) +
1952                 fd->vv.varint_size(s->block[j]->comp_size) +
1953                 fd->vv.varint_size(s->block[j]->uncomp_size);
1954 
1955             slice_offset += s->block[j]->method == RAW
1956                 ? s->block[j]->uncomp_size
1957                 : s->block[j]->comp_size;
1958         }
1959     }
1960     c->length += slice_offset; // just past the final slice
1961 
1962     c->comp_hdr_block = c_hdr;
1963 
1964     if (c->ref_seq_id >= 0) {
1965         cram_ref_decr(fd->refs, c->ref_seq_id);
1966     }
1967 
1968     /* Cache references up-front if we have unsorted access patterns */
1969     if (!fd->no_ref && c->refs_used) {
1970         for (i = 0; i < fd->refs->nref; i++) {
1971             if (c->refs_used[i])
1972                 cram_ref_decr(fd->refs, i);
1973         }
1974     }
1975 
1976     return 0;
1977 
1978  err:
1979     return -1;
1980 }
1981 
1982 
1983 /*
1984  * Adds a feature code to a read within a slice. For purposes of minimising
1985  * memory allocations and fragmentation we have one array of features for all
1986  * reads within the slice. We return the index into this array for this new
1987  * feature.
1988  *
1989  * Returns feature index on success
1990  *         -1 on failure.
1991  */
cram_add_feature(cram_container * c,cram_slice * s,cram_record * r,cram_feature * f)1992 static int cram_add_feature(cram_container *c, cram_slice *s,
1993                             cram_record *r, cram_feature *f) {
1994     if (s->nfeatures >= s->afeatures) {
1995         s->afeatures = s->afeatures ? s->afeatures*2 : 1024;
1996         s->features = realloc(s->features, s->afeatures*sizeof(*s->features));
1997         if (!s->features)
1998             return -1;
1999     }
2000 
2001     if (!r->nfeature++) {
2002         r->feature = s->nfeatures;
2003         if (cram_stats_add(c->stats[DS_FP], f->X.pos) < 0)
2004             return -1;
2005     } else {
2006         if (cram_stats_add(c->stats[DS_FP],
2007                            f->X.pos - s->features[r->feature + r->nfeature-2].X.pos) < 0)
2008             return -1;
2009 
2010     }
2011     if (cram_stats_add(c->stats[DS_FC], f->X.code) < 0)
2012         return -1;
2013 
2014     s->features[s->nfeatures++] = *f;
2015 
2016     return 0;
2017 }
2018 
cram_add_substitution(cram_fd * fd,cram_container * c,cram_slice * s,cram_record * r,int pos,char base,char qual,char ref)2019 static int cram_add_substitution(cram_fd *fd, cram_container *c,
2020                                  cram_slice *s, cram_record *r,
2021                                  int pos, char base, char qual, char ref) {
2022     cram_feature f;
2023 
2024     // seq=ACGTN vs ref=ACGT or seq=ACGT vs ref=ACGTN
2025     if (fd->L2[(uc)base]<4 || (fd->L2[(uc)base]<5 && fd->L2[(uc)ref]<4)) {
2026         f.X.pos = pos+1;
2027         f.X.code = 'X';
2028         f.X.base = fd->cram_sub_matrix[ref&0x1f][base&0x1f];
2029         if (cram_stats_add(c->stats[DS_BS], f.X.base) < 0)
2030             return -1;
2031     } else {
2032         f.B.pos = pos+1;
2033         f.B.code = 'B';
2034         f.B.base = base;
2035         f.B.qual = qual;
2036         if (cram_stats_add(c->stats[DS_BA], f.B.base) < 0) return -1;
2037         if (cram_stats_add(c->stats[DS_QS], f.B.qual) < 0) return -1;
2038         BLOCK_APPEND_CHAR(s->qual_blk, qual);
2039     }
2040     return cram_add_feature(c, s, r, &f);
2041 
2042  block_err:
2043     return -1;
2044 }
2045 
cram_add_bases(cram_fd * fd,cram_container * c,cram_slice * s,cram_record * r,int pos,int len,char * base)2046 static int cram_add_bases(cram_fd *fd, cram_container *c,
2047                           cram_slice *s, cram_record *r,
2048                           int pos, int len, char *base) {
2049     cram_feature f;
2050 
2051     f.b.pos = pos+1;
2052     f.b.code = 'b';
2053     f.b.seq_idx = base - (char *)BLOCK_DATA(s->seqs_blk);
2054     f.b.len = len;
2055 
2056     return cram_add_feature(c, s, r, &f);
2057 }
2058 
cram_add_base(cram_fd * fd,cram_container * c,cram_slice * s,cram_record * r,int pos,char base,char qual)2059 static int cram_add_base(cram_fd *fd, cram_container *c,
2060                          cram_slice *s, cram_record *r,
2061                          int pos, char base, char qual) {
2062     cram_feature f;
2063     f.B.pos = pos+1;
2064     f.B.code = 'B';
2065     f.B.base = base;
2066     f.B.qual = qual;
2067     if (cram_stats_add(c->stats[DS_BA], base) < 0) return -1;
2068     if (cram_stats_add(c->stats[DS_QS], qual) < 0) return -1;
2069     BLOCK_APPEND_CHAR(s->qual_blk, qual);
2070     return cram_add_feature(c, s, r, &f);
2071 
2072  block_err:
2073     return -1;
2074 }
2075 
cram_add_quality(cram_fd * fd,cram_container * c,cram_slice * s,cram_record * r,int pos,char qual)2076 static int cram_add_quality(cram_fd *fd, cram_container *c,
2077                             cram_slice *s, cram_record *r,
2078                             int pos, char qual) {
2079     cram_feature f;
2080     f.Q.pos = pos+1;
2081     f.Q.code = 'Q';
2082     f.Q.qual = qual;
2083     if (cram_stats_add(c->stats[DS_QS], qual) < 0) return -1;
2084     BLOCK_APPEND_CHAR(s->qual_blk, qual);
2085     return cram_add_feature(c, s, r, &f);
2086 
2087  block_err:
2088     return -1;
2089 }
2090 
cram_add_deletion(cram_container * c,cram_slice * s,cram_record * r,int pos,int len,char * base)2091 static int cram_add_deletion(cram_container *c, cram_slice *s, cram_record *r,
2092                              int pos, int len, char *base) {
2093     cram_feature f;
2094     f.D.pos = pos+1;
2095     f.D.code = 'D';
2096     f.D.len = len;
2097     if (cram_stats_add(c->stats[DS_DL], len) < 0) return -1;
2098     return cram_add_feature(c, s, r, &f);
2099 }
2100 
cram_add_softclip(cram_container * c,cram_slice * s,cram_record * r,int pos,int len,char * base,int version)2101 static int cram_add_softclip(cram_container *c, cram_slice *s, cram_record *r,
2102                              int pos, int len, char *base, int version) {
2103     cram_feature f;
2104     f.S.pos = pos+1;
2105     f.S.code = 'S';
2106     f.S.len = len;
2107     switch (CRAM_MAJOR_VERS(version)) {
2108     case 1:
2109         f.S.seq_idx = BLOCK_SIZE(s->base_blk);
2110         BLOCK_APPEND(s->base_blk, base, len);
2111         BLOCK_APPEND_CHAR(s->base_blk, '\0');
2112         break;
2113 
2114     case 2:
2115     default:
2116         f.S.seq_idx = BLOCK_SIZE(s->soft_blk);
2117         if (base) {
2118             BLOCK_APPEND(s->soft_blk, base, len);
2119         } else {
2120             int i;
2121             for (i = 0; i < len; i++)
2122                 BLOCK_APPEND_CHAR(s->soft_blk, 'N');
2123         }
2124         BLOCK_APPEND_CHAR(s->soft_blk, '\0');
2125         break;
2126 
2127         //default:
2128         //    // v3.0 onwards uses BB data-series
2129         //    f.S.seq_idx = BLOCK_SIZE(s->soft_blk);
2130     }
2131     return cram_add_feature(c, s, r, &f);
2132 
2133  block_err:
2134     return -1;
2135 }
2136 
cram_add_hardclip(cram_container * c,cram_slice * s,cram_record * r,int pos,int len,char * base)2137 static int cram_add_hardclip(cram_container *c, cram_slice *s, cram_record *r,
2138                              int pos, int len, char *base) {
2139     cram_feature f;
2140     f.S.pos = pos+1;
2141     f.S.code = 'H';
2142     f.S.len = len;
2143     if (cram_stats_add(c->stats[DS_HC], len) < 0) return -1;
2144     return cram_add_feature(c, s, r, &f);
2145 }
2146 
cram_add_skip(cram_container * c,cram_slice * s,cram_record * r,int pos,int len,char * base)2147 static int cram_add_skip(cram_container *c, cram_slice *s, cram_record *r,
2148                          int pos, int len, char *base) {
2149     cram_feature f;
2150     f.S.pos = pos+1;
2151     f.S.code = 'N';
2152     f.S.len = len;
2153     if (cram_stats_add(c->stats[DS_RS], len) < 0) return -1;
2154     return cram_add_feature(c, s, r, &f);
2155 }
2156 
cram_add_pad(cram_container * c,cram_slice * s,cram_record * r,int pos,int len,char * base)2157 static int cram_add_pad(cram_container *c, cram_slice *s, cram_record *r,
2158                         int pos, int len, char *base) {
2159     cram_feature f;
2160     f.S.pos = pos+1;
2161     f.S.code = 'P';
2162     f.S.len = len;
2163     if (cram_stats_add(c->stats[DS_PD], len) < 0) return -1;
2164     return cram_add_feature(c, s, r, &f);
2165 }
2166 
cram_add_insertion(cram_container * c,cram_slice * s,cram_record * r,int pos,int len,char * base)2167 static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r,
2168                               int pos, int len, char *base) {
2169     cram_feature f;
2170     f.I.pos = pos+1;
2171     if (len == 1) {
2172         char b = base ? *base : 'N';
2173         f.i.code = 'i';
2174         f.i.base = b;
2175         if (cram_stats_add(c->stats[DS_BA], b) < 0) return -1;
2176     } else {
2177         f.I.code = 'I';
2178         f.I.len = len;
2179         f.S.seq_idx = BLOCK_SIZE(s->base_blk);
2180         if (base) {
2181             BLOCK_APPEND(s->base_blk, base, len);
2182         } else {
2183             int i;
2184             for (i = 0; i < len; i++)
2185                 BLOCK_APPEND_CHAR(s->base_blk, 'N');
2186         }
2187         BLOCK_APPEND_CHAR(s->base_blk, '\0');
2188     }
2189     return cram_add_feature(c, s, r, &f);
2190 
2191  block_err:
2192     return -1;
2193 }
2194 
2195 /*
2196  * Encodes auxiliary data. Largely duplicated from above, but done so to
2197  * keep it simple and avoid a myriad of version ifs.
2198  *
2199  * Returns the read-group parsed out of the BAM aux fields on success
2200  *         NULL on failure or no rg present, also sets "*err" to non-zero
2201  */
cram_encode_aux(cram_fd * fd,bam_seq_t * b,cram_container * c,cram_slice * s,cram_record * cr,int verbatim_NM,int verbatim_MD,int NM,kstring_t * MD,int * err)2202 static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c,
2203                              cram_slice *s, cram_record *cr,
2204                              int verbatim_NM, int verbatim_MD,
2205                              int NM, kstring_t *MD,
2206                              int *err) {
2207     char *aux, *orig, *rg = NULL;
2208     int aux_size = bam_get_l_aux(b);
2209     cram_block *td_b = c->comp_hdr->TD_blk;
2210     int TD_blk_size = BLOCK_SIZE(td_b), new;
2211     char *key;
2212     khint_t k;
2213 
2214     if (err) *err = 1;
2215 
2216     orig = aux = (char *)bam_aux(b);
2217 
2218     // Copy aux keys to td_b and aux values to slice aux blocks
2219     while (aux - orig < aux_size && aux[0] != 0) {
2220         int r;
2221 
2222         // RG:Z
2223         if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') {
2224             rg = &aux[3];
2225             while (*aux++);
2226             if (CRAM_MAJOR_VERS(fd->version) >= 4)
2227                 BLOCK_APPEND(td_b, "RG*", 3);
2228             continue;
2229         }
2230 
2231         // MD:Z
2232         if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') {
2233             if (cr->len && !fd->no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_MD) {
2234                 if (MD && MD->s && strncasecmp(MD->s, aux+3, orig + aux_size - (aux+3)) == 0) {
2235                     while (*aux++);
2236                     if (CRAM_MAJOR_VERS(fd->version) >= 4)
2237                         BLOCK_APPEND(td_b, "MD*", 3);
2238                     continue;
2239                 }
2240             }
2241         }
2242 
2243         // NM:i
2244         if (aux[0] == 'N' && aux[1] == 'M') {
2245             if (cr->len && !fd->no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_NM) {
2246                 int NM_ = bam_aux2i((uint8_t *)aux+2);
2247                 if (NM_ == NM) {
2248                     switch(aux[2]) {
2249                     case 'A': case 'C': case 'c': aux+=4; break;
2250                     case 'S': case 's':           aux+=5; break;
2251                     case 'I': case 'i': case 'f': aux+=7; break;
2252                     default:
2253                         hts_log_error("Unhandled type code for NM tag");
2254                         return NULL;
2255                     }
2256                     if (CRAM_MAJOR_VERS(fd->version) >= 4)
2257                         BLOCK_APPEND(td_b, "NM*", 3);
2258                     continue;
2259                 }
2260             }
2261         }
2262 
2263         BLOCK_APPEND(td_b, aux, 3);
2264 
2265         // Container level tags_used, for TD series
2266         // Maps integer key ('X0i') to cram_tag_map struct.
2267         int key = (aux[0]<<16)|(aux[1]<<8)|aux[2];
2268         k = kh_put(m_tagmap, c->tags_used, key, &r);
2269         if (-1 == r)
2270             return NULL;
2271         else if (r != 0)
2272             kh_val(c->tags_used, k) = NULL;
2273 
2274         if (r == 1) {
2275             khint_t k_global;
2276 
2277             // Global tags_used for cram_metrics support
2278             pthread_mutex_lock(&fd->metrics_lock);
2279             k_global = kh_put(m_metrics, fd->tags_used, key, &r);
2280             if (-1 == r) {
2281                 pthread_mutex_unlock(&fd->metrics_lock);
2282                 return NULL;
2283             }
2284             if (r >= 1) {
2285                 kh_val(fd->tags_used, k_global) = cram_new_metrics();
2286                 if (!kh_val(fd->tags_used, k_global)) {
2287                     kh_del(m_metrics, fd->tags_used, k_global);
2288                     pthread_mutex_unlock(&fd->metrics_lock);
2289                     goto err;
2290                 }
2291             }
2292 
2293             pthread_mutex_unlock(&fd->metrics_lock);
2294 
2295             int i2[2] = {'\t',key};
2296             size_t sk = key;
2297             cram_tag_map *m = calloc(1, sizeof(*m));
2298             if (!m)
2299                 goto_err;
2300             kh_val(c->tags_used, k) = m;
2301 
2302             cram_codec *c;
2303 
2304             // Use a block content id based on the tag id.
2305             // Codec type depends on tag data type.
2306             switch(aux[2]) {
2307             case 'Z': case 'H':
2308                 // string as byte_array_stop
2309                 c = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
2310                                       E_BYTE_ARRAY, (void *)i2,
2311                                       fd->version, &fd->vv);
2312                 break;
2313 
2314             case 'A': case 'c': case 'C': {
2315                 // byte array len, 1 byte
2316                 cram_byte_array_len_encoder e;
2317                 cram_stats st;
2318 
2319                 if (CRAM_MAJOR_VERS(fd->version) <= 3) {
2320                     e.len_encoding = E_HUFFMAN;
2321                     e.len_dat = NULL; // will get codes from st
2322                 } else {
2323                     e.len_encoding = E_CONST_INT;
2324                     e.len_dat = NULL; // will get codes from st
2325                 }
2326                 memset(&st, 0, sizeof(st));
2327                 if (cram_stats_add(&st, 1) < 0) goto block_err;
2328                 cram_stats_encoding(fd, &st);
2329 
2330                 e.val_encoding = E_EXTERNAL;
2331                 e.val_dat = (void *)sk;
2332 
2333                 c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
2334                                       E_BYTE_ARRAY, (void *)&e,
2335                                       fd->version, &fd->vv);
2336                 break;
2337             }
2338 
2339             case 's': case 'S': {
2340                 // byte array len, 2 byte
2341                 cram_byte_array_len_encoder e;
2342                 cram_stats st;
2343 
2344                 if (CRAM_MAJOR_VERS(fd->version) <= 3) {
2345                     e.len_encoding = E_HUFFMAN;
2346                     e.len_dat = NULL; // will get codes from st
2347                 } else {
2348                     e.len_encoding = E_CONST_INT;
2349                     e.len_dat = NULL; // will get codes from st
2350                 }
2351                 memset(&st, 0, sizeof(st));
2352                 if (cram_stats_add(&st, 2) < 0) goto block_err;
2353                 cram_stats_encoding(fd, &st);
2354 
2355                 e.val_encoding = E_EXTERNAL;
2356                 e.val_dat = (void *)sk;
2357 
2358                 c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
2359                                       E_BYTE_ARRAY, (void *)&e,
2360                                       fd->version, &fd->vv);
2361                 break;
2362             }
2363             case 'i': case 'I': case 'f': {
2364                 // byte array len, 4 byte
2365                 cram_byte_array_len_encoder e;
2366                 cram_stats st;
2367 
2368                 if (CRAM_MAJOR_VERS(fd->version) <= 3) {
2369                     e.len_encoding = E_HUFFMAN;
2370                     e.len_dat = NULL; // will get codes from st
2371                 } else {
2372                     e.len_encoding = E_CONST_INT;
2373                     e.len_dat = NULL; // will get codes from st
2374                 }
2375                 memset(&st, 0, sizeof(st));
2376                 if (cram_stats_add(&st, 4) < 0) goto block_err;
2377                 cram_stats_encoding(fd, &st);
2378 
2379                 e.val_encoding = E_EXTERNAL;
2380                 e.val_dat = (void *)sk;
2381 
2382                 c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
2383                                       E_BYTE_ARRAY, (void *)&e,
2384                                       fd->version, &fd->vv);
2385                 break;
2386             }
2387 
2388             case 'B': {
2389                 // Byte array of variable size, but we generate our tag
2390                 // byte stream at the wrong stage (during reading and not
2391                 // after slice header construction). So we use
2392                 // BYTE_ARRAY_LEN with the length codec being external
2393                 // too.
2394                 cram_byte_array_len_encoder e;
2395 
2396                 e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4
2397                     ? E_VARINT_UNSIGNED
2398                     : E_EXTERNAL;
2399                 e.len_dat = (void *)sk; // or key+128 for len?
2400 
2401                 e.val_encoding = E_EXTERNAL;
2402                 e.val_dat = (void *)sk;
2403 
2404                 c = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
2405                                       E_BYTE_ARRAY, (void *)&e,
2406                                       fd->version, &fd->vv);
2407                 break;
2408             }
2409 
2410             default:
2411                 hts_log_error("Unsupported SAM aux type '%c'", aux[2]);
2412                 c = NULL;
2413             }
2414 
2415             if (!c)
2416                 goto_err;
2417 
2418             m->codec = c;
2419 
2420             // Link to fd-global tag metrics
2421             pthread_mutex_lock(&fd->metrics_lock);
2422             m->m = k_global ? (cram_metrics *)kh_val(fd->tags_used, k_global) : NULL;
2423             pthread_mutex_unlock(&fd->metrics_lock);
2424         }
2425 
2426         cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k);
2427         if (!tm) goto_err;
2428         cram_codec *codec = tm->codec;
2429         if (!tm->codec) goto_err;
2430 
2431         switch(aux[2]) {
2432         case 'A': case 'C': case 'c':
2433             if (!tm->blk) {
2434                 if (!(tm->blk = cram_new_block(EXTERNAL, key)))
2435                     return NULL;
2436                 codec->u.e_byte_array_len.val_codec->out = tm->blk;
2437             }
2438 
2439             aux+=3;
2440             //codec->encode(s, codec, aux, 1);
2441             // Functionally equivalent, but less code.
2442             BLOCK_APPEND_CHAR(tm->blk, *aux);
2443             aux++;
2444             break;
2445 
2446         case 'S': case 's':
2447             if (!tm->blk) {
2448                 if (!(tm->blk = cram_new_block(EXTERNAL, key)))
2449                     return NULL;
2450                 codec->u.e_byte_array_len.val_codec->out = tm->blk;
2451             }
2452 
2453             aux+=3;
2454             //codec->encode(s, codec, aux, 2);
2455             BLOCK_APPEND(tm->blk, aux, 2);
2456             aux+=2;
2457             break;
2458 
2459         case 'I': case 'i': case 'f':
2460             if (!tm->blk) {
2461                 if (!(tm->blk = cram_new_block(EXTERNAL, key)))
2462                     return NULL;
2463                 codec->u.e_byte_array_len.val_codec->out = tm->blk;
2464             }
2465 
2466             aux+=3;
2467             //codec->encode(s, codec, aux, 4);
2468             BLOCK_APPEND(tm->blk, aux, 4);
2469             aux+=4;
2470             break;
2471 
2472         case 'd':
2473             if (!tm->blk) {
2474                 if (!(tm->blk = cram_new_block(EXTERNAL, key)))
2475                     return NULL;
2476                 codec->u.e_byte_array_len.val_codec->out = tm->blk;
2477             }
2478 
2479             aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
2480             //codec->encode(s, codec, aux, 8);
2481             BLOCK_APPEND(tm->blk, aux, 8);
2482             aux+=8;
2483             break;
2484 
2485         case 'Z': case 'H':
2486             {
2487                 if (!tm->blk) {
2488                     if (!(tm->blk = cram_new_block(EXTERNAL, key)))
2489                         return NULL;
2490                     codec->out = tm->blk;
2491                 }
2492 
2493                 char *aux_s;
2494                 aux += 3;
2495                 aux_s = aux;
2496                 while (*aux++);
2497                 if (codec->encode(s, codec, aux_s, aux - aux_s) < 0)
2498                     return NULL;
2499             }
2500             break;
2501 
2502         case 'B': {
2503             int type = aux[3], blen;
2504             uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) +
2505                                         (((unsigned char *)aux)[5]<< 8) +
2506                                         (((unsigned char *)aux)[6]<<16) +
2507                                         (((unsigned char *)aux)[7]<<24));
2508             if (!tm->blk) {
2509                 if (!(tm->blk = cram_new_block(EXTERNAL, key)))
2510                     return NULL;
2511                 if (codec->u.e_byte_array_len.val_codec->codec == E_XDELTA) {
2512                     if (!(tm->blk2 = cram_new_block(EXTERNAL, key+128)))
2513                         return NULL;
2514                     codec->u.e_byte_array_len.len_codec->out = tm->blk2;
2515                     codec->u.e_byte_array_len.val_codec->u.e_xdelta.sub_codec->out = tm->blk;
2516                 } else {
2517                     codec->u.e_byte_array_len.len_codec->out = tm->blk;
2518                     codec->u.e_byte_array_len.val_codec->out = tm->blk;
2519                 }
2520             }
2521 
2522             // skip TN field
2523             aux+=3;
2524 
2525             // We use BYTE_ARRAY_LEN with external length, so store that first
2526             switch (type) {
2527             case 'c': case 'C':
2528                 blen = count;
2529                 break;
2530             case 's': case 'S':
2531                 blen = 2*count;
2532                 break;
2533             case 'i': case 'I': case 'f':
2534                 blen = 4*count;
2535                 break;
2536             default:
2537                 hts_log_error("Unknown sub-type '%c' for aux type 'B'", type);
2538                 return NULL;
2539             }
2540 
2541             blen += 5; // sub-type & length
2542 
2543             if (codec->encode(s, codec, aux, blen) < 0)
2544                 return NULL;
2545             aux += blen;
2546             break;
2547         }
2548         default:
2549             hts_log_error("Unknown aux type '%c'", aux[2]);
2550             return NULL;
2551         }
2552         tm->blk->m = tm->m;
2553     }
2554 
2555     // FIXME: sort BLOCK_DATA(td_b) by char[3] triples
2556 
2557     // And and increment TD hash entry
2558     BLOCK_APPEND_CHAR(td_b, 0);
2559 
2560     // Duplicate key as BLOCK_DATA() can be realloced to a new pointer.
2561     key = string_ndup(c->comp_hdr->TD_keys,
2562                       (char *)BLOCK_DATA(td_b) + TD_blk_size,
2563                       BLOCK_SIZE(td_b) - TD_blk_size);
2564     if (!key)
2565         goto block_err;
2566     k = kh_put(m_s2i, c->comp_hdr->TD_hash, key, &new);
2567     if (new < 0) {
2568         return NULL;
2569     } else if (new == 0) {
2570         BLOCK_SIZE(td_b) = TD_blk_size;
2571     } else {
2572         kh_val(c->comp_hdr->TD_hash, k) = c->comp_hdr->nTL;
2573         c->comp_hdr->nTL++;
2574     }
2575 
2576     cr->TL = kh_val(c->comp_hdr->TD_hash, k);
2577     if (cram_stats_add(c->stats[DS_TL], cr->TL) < 0)
2578         goto block_err;
2579 
2580     if (err) *err = 0;
2581     return rg;
2582 
2583  err:
2584  block_err:
2585     return NULL;
2586 }
2587 
2588 /*
2589  * During cram_next_container or before the final flush at end of
2590  * file, we update the current slice headers and increment the slice
2591  * number to the next slice.
2592  *
2593  * See cram_next_container() and cram_close().
2594  */
cram_update_curr_slice(cram_container * c)2595 void cram_update_curr_slice(cram_container *c) {
2596     cram_slice *s = c->slice;
2597     if (c->multi_seq) {
2598         s->hdr->ref_seq_id    = -2;
2599         s->hdr->ref_seq_start = 0;
2600         s->hdr->ref_seq_span  = 0;
2601     } else {
2602         s->hdr->ref_seq_id    = c->curr_ref;
2603         s->hdr->ref_seq_start = c->first_base;
2604         s->hdr->ref_seq_span  = MAX(0, c->last_base - c->first_base + 1);
2605     }
2606     s->hdr->num_records   = c->curr_rec;
2607 
2608     if (c->curr_slice == 0) {
2609         if (c->ref_seq_id != s->hdr->ref_seq_id)
2610             c->ref_seq_id  = s->hdr->ref_seq_id;
2611         c->ref_seq_start = c->first_base;
2612     }
2613 
2614     c->curr_slice++;
2615 }
2616 
2617 /*
2618  * Handles creation of a new container or new slice, flushing any
2619  * existing containers when appropriate.
2620  *
2621  * Really this is next slice, which may or may not lead to a new container.
2622  *
2623  * Returns cram_container pointer on success
2624  *         NULL on failure.
2625  */
cram_next_container(cram_fd * fd,bam_seq_t * b)2626 static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) {
2627     cram_container *c = fd->ctr;
2628     int i;
2629 
2630     /* First occurrence */
2631     if (c->curr_ref == -2)
2632         c->curr_ref = bam_ref(b);
2633 
2634     if (c->slice)
2635         cram_update_curr_slice(c);
2636 
2637     /* Flush container */
2638     if (c->curr_slice == c->max_slice ||
2639         (bam_ref(b) != c->curr_ref && !c->multi_seq)) {
2640         c->ref_seq_span = fd->last_base - c->ref_seq_start + 1;
2641         hts_log_info("Flush container %d/%"PRId64"..%"PRId64,
2642                      c->ref_seq_id, c->ref_seq_start,
2643                      c->ref_seq_start + c->ref_seq_span -1);
2644 
2645         /* Encode slices */
2646         if (-1 == cram_flush_container_mt(fd, c))
2647             return NULL;
2648         if (!fd->pool) {
2649             // Move to sep func, as we need cram_flush_container for
2650             // the closing phase to flush the partial container.
2651             for (i = 0; i < c->max_slice; i++) {
2652                 cram_free_slice(c->slices[i]);
2653                 c->slices[i] = NULL;
2654             }
2655 
2656             c->slice = NULL;
2657             c->curr_slice = 0;
2658 
2659             /* Easy approach for purposes of freeing stats */
2660             cram_free_container(c);
2661         }
2662 
2663         c = fd->ctr = cram_new_container(fd->seqs_per_slice,
2664                                          fd->slices_per_container);
2665         if (!c)
2666             return NULL;
2667         c->record_counter = fd->record_counter;
2668         c->curr_ref = bam_ref(b);
2669     }
2670 
2671     c->last_pos = c->first_base = c->last_base = bam_pos(b)+1;
2672 
2673     /* New slice */
2674     c->slice = c->slices[c->curr_slice] =
2675         cram_new_slice(MAPPED_SLICE, c->max_rec);
2676     if (!c->slice)
2677         return NULL;
2678 
2679     if (c->multi_seq) {
2680         c->slice->hdr->ref_seq_id = -2;
2681         c->slice->hdr->ref_seq_start = 0;
2682         c->slice->last_apos = 1;
2683     } else {
2684         c->slice->hdr->ref_seq_id = bam_ref(b);
2685         // wrong for unsorted data, will fix during encoding.
2686         c->slice->hdr->ref_seq_start = bam_pos(b)+1;
2687         c->slice->last_apos = bam_pos(b)+1;
2688     }
2689 
2690     c->curr_rec = 0;
2691     c->s_num_bases = 0;
2692     c->n_mapped = 0;
2693 
2694     // QO field: 0 implies original orientation, 1 implies sequence orientation
2695     // 1 is often preferable for NovaSeq, but impact is slight. ~0.5% diff.
2696     // Conversely other data sets it's often better than 1% saving for 0.
2697     // Short of trying both and learning, for now we use use 0 for V4, 1 for V3.
2698     c->qs_seq_orient = CRAM_MAJOR_VERS(fd->version) >= 4 ? 0 : 1;
2699 
2700     return c;
2701 }
2702 
2703 /*
2704  * Converts a single bam record into a cram record.
2705  * Possibly used within a thread.
2706  *
2707  * Returns 0 on success;
2708  *        -1 on failure
2709  */
process_one_read(cram_fd * fd,cram_container * c,cram_slice * s,cram_record * cr,bam_seq_t * b,int rnum,kstring_t * MD)2710 static int process_one_read(cram_fd *fd, cram_container *c,
2711                             cram_slice *s, cram_record *cr,
2712                             bam_seq_t *b, int rnum, kstring_t *MD) {
2713     int i, fake_qual = -1, NM = 0;
2714     char *cp, *rg;
2715     char *ref, *seq, *qual;
2716 
2717     // Any places with N in seq and/or reference can lead to ambiguous
2718     // interpretation of the SAM NM:i tag.  So we store these verbatim
2719     // to ensure valid data round-trips the same regardless of who
2720     // defines it as valid.
2721     // Similarly when alignments go beyond end of the reference.
2722     int verbatim_NM = fd->store_nm;
2723     int verbatim_MD = fd->store_md;
2724 
2725     // FIXME: multi-ref containers
2726 
2727     ref = c->ref;
2728     cr->flags       = bam_flag(b);
2729     cr->len         = bam_seq_len(b);
2730     if (!bam_aux_get(b, "MD"))
2731         MD = NULL;
2732     else
2733         MD->l = 0;
2734 
2735     //fprintf(stderr, "%s => %d\n", rg ? rg : "\"\"", cr->rg);
2736 
2737     cr->ref_id      = bam_ref(b);
2738     if (cram_stats_add(c->stats[DS_RI], cr->ref_id) < 0)
2739         goto block_err;
2740     if (cram_stats_add(c->stats[DS_BF], fd->cram_flag_swap[cr->flags & 0xfff]) < 0)
2741         goto block_err;
2742 
2743     // Non reference based encoding means storing the bases verbatim as features, which in
2744     // turn means every base also has a quality already stored.
2745     if (!fd->no_ref || CRAM_MAJOR_VERS(fd->version) >= 3)
2746         cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES;
2747 
2748     if (cr->len <= 0 && CRAM_MAJOR_VERS(fd->version) >= 3)
2749         cr->cram_flags |= CRAM_FLAG_NO_SEQ;
2750     //cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK);
2751 
2752     c->num_bases   += cr->len;
2753     cr->apos        = bam_pos(b)+1;
2754     if (c->pos_sorted) {
2755         if (cr->apos < s->last_apos && !fd->ap_delta) {
2756             c->pos_sorted = 0;
2757         } else {
2758             if (cram_stats_add(c->stats[DS_AP], cr->apos - s->last_apos) < 0)
2759                 goto block_err;
2760             s->last_apos = cr->apos;
2761         }
2762     } else {
2763         //cram_stats_add(c->stats[DS_AP], cr->apos);
2764     }
2765     c->max_apos += (cr->apos > c->max_apos) * (cr->apos - c->max_apos);
2766 
2767     /*
2768      * This seqs_ds is largely pointless and it could reuse the same memory
2769      * over and over.
2770      * s->base_blk is what we need for encoding.
2771      */
2772     cr->seq         = BLOCK_SIZE(s->seqs_blk);
2773     cr->qual        = BLOCK_SIZE(s->qual_blk);
2774     BLOCK_GROW(s->seqs_blk, cr->len+1);
2775     BLOCK_GROW(s->qual_blk, cr->len);
2776 
2777     // Convert BAM nibble encoded sequence to string of base pairs
2778     seq = cp = (char *)BLOCK_END(s->seqs_blk);
2779     *seq = 0;
2780     nibble2base(bam_seq(b), cp, cr->len);
2781     BLOCK_SIZE(s->seqs_blk) += cr->len;
2782 
2783     qual = cp = (char *)bam_qual(b);
2784 
2785 
2786     /* Copy and parse */
2787     if (!(cr->flags & BAM_FUNMAP)) {
2788         uint32_t *cig_to, *cig_from;
2789         int64_t apos = cr->apos-1, spos = 0;
2790         int64_t MD_last = apos; // last position of edit in MD tag
2791 
2792         cr->cigar       = s->ncigar;
2793         cr->ncigar      = bam_cigar_len(b);
2794         while (cr->cigar + cr->ncigar >= s->cigar_alloc) {
2795             s->cigar_alloc = s->cigar_alloc ? s->cigar_alloc*2 : 1024;
2796             s->cigar = realloc(s->cigar, s->cigar_alloc * sizeof(*s->cigar));
2797             if (!s->cigar)
2798                 return -1;
2799         }
2800 
2801         cig_to = (uint32_t *)s->cigar;
2802         cig_from = (uint32_t *)bam_cigar(b);
2803 
2804         cr->feature = 0;
2805         cr->nfeature = 0;
2806         for (i = 0; i < cr->ncigar; i++) {
2807             enum cigar_op cig_op = cig_from[i] & BAM_CIGAR_MASK;
2808             uint32_t cig_len = cig_from[i] >> BAM_CIGAR_SHIFT;
2809             cig_to[i] = cig_from[i];
2810 
2811             /* Can also generate events from here for CRAM diffs */
2812 
2813             switch (cig_op) {
2814                 int l;
2815 
2816                 // Don't trust = and X ops to be correct.
2817             case BAM_CMATCH:
2818             case BAM_CBASE_MATCH:
2819             case BAM_CBASE_MISMATCH:
2820                 //fprintf(stderr, "\nBAM_CMATCH\nR: %.*s\nS: %.*s\n",
2821                 //      cig_len, &ref[apos], cig_len, &seq[spos]);
2822                 l = 0;
2823                 if (!fd->no_ref && cr->len) {
2824                     int end = cig_len+apos < c->ref_end
2825                         ? cig_len : c->ref_end - apos;
2826                     char *sp = &seq[spos];
2827                     char *rp = &ref[apos];
2828                     char *qp = &qual[spos];
2829                     if (end > cr->len) {
2830                         hts_log_error("CIGAR and query sequence are of different length");
2831                         return -1;
2832                     }
2833                     for (l = 0; l < end; l++) {
2834                         // This case is just too disputed and different tools
2835                         // interpret these in different ways.  We give up and
2836                         // store verbatim.
2837                         if (rp[l] == 'N' && sp[l] == 'N')
2838                             verbatim_NM = verbatim_MD = 1;
2839                         if (rp[l] != sp[l]) {
2840                             // Build our own MD tag if one is on the sequence, so
2841                             // we can ensure it matches and thus can be discarded.
2842                             if (MD && ref) {
2843                                 if (kputuw(apos+l - MD_last, MD) < 0) goto err;
2844                                 if (kputc(rp[l], MD) < 0) goto err;
2845                                 MD_last = apos+l+1;
2846                             }
2847                             NM++;
2848                             if (!sp[l])
2849                                 break;
2850                             if (0 && CRAM_MAJOR_VERS(fd->version) >= 3) {
2851 #if 0
2852                                 // Disabled for the time being as it doesn't
2853                                 // seem to gain us much.
2854                                 int ol=l;
2855                                 while (l<end && rp[l] != sp[l])
2856                                     l++;
2857                                 if (l-ol > 1) {
2858                                     if (cram_add_bases(fd, c, s, cr, spos+ol,
2859                                                        l-ol, &seq[spos+ol]))
2860                                         return -1;
2861                                     l--;
2862                                 } else {
2863                                     l = ol;
2864                                     if (cram_add_substitution(fd, c, s, cr,
2865                                                               spos+l, sp[l],
2866                                                               qp[l], rp[l]))
2867                                         return -1;
2868                                 }
2869 #else
2870                                 // With urmap pushed to the limit and lots
2871                                 // of unaligned data (should be soft-clipped)
2872                                 // this saves ~2-7%. Worth it?
2873                                 int nl = l;
2874                                 int max_end = nl, max_score = 0, score = 0;
2875                                 while (nl < end) {
2876                                     if (rp[nl] != sp[nl]) {
2877                                         score += 3;
2878                                         if (max_score < score) {
2879                                             max_score = score;
2880                                             max_end = nl;
2881                                         }
2882                                     } else {
2883                                         score--;
2884                                         if (score < -2 ||
2885                                             max_score - score > 7)
2886                                             break;
2887                                     }
2888                                     nl++;
2889                                 }
2890                                 if (max_score > 20) {
2891                                     cram_add_bases(fd, c, s, cr, spos+l,
2892                                                    max_end-l, &seq[spos+l]);
2893                                     l = max_end-1;
2894                                 } else {
2895                                     while (l < nl) {
2896                                         if (rp[l] != sp[l])
2897                                             cram_add_substitution(fd, c, s,
2898                                                                   cr, spos+l,
2899                                                                   sp[l], qp[l],
2900                                                                   rp[l]);
2901                                         l++;
2902                                     }
2903                                     l--;
2904                                 }
2905 #endif
2906                             } else {
2907                                 if (cram_add_substitution(fd, c, s, cr, spos+l,
2908                                                           sp[l], qp[l], rp[l]))
2909                                     return -1;
2910                             }
2911                         }
2912                     }
2913                     spos += l;
2914                     apos += l;
2915                 }
2916 
2917                 if (l < cig_len && cr->len) {
2918                     if (fd->no_ref) {
2919                         if (CRAM_MAJOR_VERS(fd->version) == 3) {
2920                             if (cram_add_bases(fd, c, s, cr, spos,
2921                                                cig_len-l, &seq[spos]))
2922                                 return -1;
2923                             spos += cig_len-l;
2924                         } else {
2925                             for (; l < cig_len && seq[spos]; l++, spos++) {
2926                                 if (cram_add_base(fd, c, s, cr, spos,
2927                                                   seq[spos], qual[spos]))
2928                                     return -1;
2929                             }
2930                         }
2931                     } else {
2932                         /* off end of sequence or non-ref based output */
2933                         verbatim_NM = verbatim_MD = 1;
2934                         for (; l < cig_len && seq[spos]; l++, spos++) {
2935                             if (cram_add_base(fd, c, s, cr, spos,
2936                                               seq[spos], qual[spos]))
2937                                 return -1;
2938                         }
2939                     }
2940                     apos += cig_len;
2941                 } else if (!cr->len) {
2942                     /* Seq "*" */
2943                     verbatim_NM = verbatim_MD = 1;
2944                     apos += cig_len;
2945                     spos += cig_len;
2946                 }
2947                 break;
2948 
2949             case BAM_CDEL:
2950                 if (MD && ref) {
2951                     if (kputuw(apos - MD_last, MD) < 0) goto err;
2952                     if (apos < c->ref_end) {
2953                         if (kputc_('^', MD) < 0) goto err;
2954                         if (kputsn(&ref[apos], MIN(c->ref_end - apos, cig_len), MD) < 0)
2955                             goto err;
2956                     }
2957                 }
2958                 NM += cig_len;
2959 
2960                 if (cram_add_deletion(c, s, cr, spos, cig_len, &seq[spos]))
2961                     return -1;
2962                 apos += cig_len;
2963                 MD_last = apos;
2964                 break;
2965 
2966             case BAM_CREF_SKIP:
2967                 if (cram_add_skip(c, s, cr, spos, cig_len, &seq[spos]))
2968                     return -1;
2969                 apos += cig_len;
2970                 MD_last += cig_len;
2971                 break;
2972 
2973             case BAM_CINS:
2974                 if (cram_add_insertion(c, s, cr, spos, cig_len,
2975                                        cr->len ? &seq[spos] : NULL))
2976                     return -1;
2977                 if (fd->no_ref && cr->len) {
2978                     for (l = 0; l < cig_len; l++, spos++) {
2979                         cram_add_quality(fd, c, s, cr, spos, qual[spos]);
2980                     }
2981                 } else {
2982                     spos += cig_len;
2983                 }
2984                 NM += cig_len;
2985                 break;
2986 
2987             case BAM_CSOFT_CLIP:
2988                 if (cram_add_softclip(c, s, cr, spos, cig_len,
2989                                       cr->len ? &seq[spos] : NULL,
2990                                       fd->version))
2991                     return -1;
2992 
2993                 if (fd->no_ref &&
2994                     !(cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
2995                     if (cr->len) {
2996                         for (l = 0; l < cig_len; l++, spos++) {
2997                             cram_add_quality(fd, c, s, cr, spos, qual[spos]);
2998                         }
2999                     } else {
3000                         for (l = 0; l < cig_len; l++, spos++) {
3001                             cram_add_quality(fd, c, s, cr, spos, -1);
3002                         }
3003                     }
3004                 } else {
3005                     spos += cig_len;
3006                 }
3007                 break;
3008 
3009             case BAM_CHARD_CLIP:
3010                 if (cram_add_hardclip(c, s, cr, spos, cig_len, &seq[spos]))
3011                     return -1;
3012                 break;
3013 
3014             case BAM_CPAD:
3015                 if (cram_add_pad(c, s, cr, spos, cig_len, &seq[spos]))
3016                     return -1;
3017                 break;
3018 
3019             default:
3020                 hts_log_error("Unknown CIGAR op code %d", cig_op);
3021                 return -1;
3022             }
3023         }
3024         if (cr->len && spos != cr->len) {
3025             hts_log_error("CIGAR and query sequence are of different length");
3026             return -1;
3027         }
3028         fake_qual = spos;
3029         cr->aend = fd->no_ref ? apos : MIN(apos, c->ref_end);
3030         if (cram_stats_add(c->stats[DS_FN], cr->nfeature) < 0)
3031             goto block_err;
3032 
3033         if (MD && ref)
3034             if (kputuw(apos - MD_last, MD) < 0) goto err;
3035     } else {
3036         // Unmapped
3037         cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES;
3038         cr->cigar  = 0;
3039         cr->ncigar = 0;
3040         cr->nfeature = 0;
3041         cr->aend = cr->apos;
3042         for (i = 0; i < cr->len; i++)
3043             if (cram_stats_add(c->stats[DS_BA], seq[i]) < 0)
3044                 goto block_err;
3045         fake_qual = 0;
3046     }
3047 
3048     cr->ntags      = 0; //cram_stats_add(c->stats[DS_TC], cr->ntags);
3049     int err = 0;
3050     rg = cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD, &err);
3051     if (err)
3052         goto block_err;
3053 
3054     /* Read group, identified earlier */
3055     if (rg) {
3056         sam_hrec_rg_t *brg = sam_hrecs_find_rg(fd->header->hrecs, rg);
3057         cr->rg = brg ? brg->id : -1;
3058     } else if (CRAM_MAJOR_VERS(fd->version) == 1) {
3059         sam_hrec_rg_t *brg = sam_hrecs_find_rg(fd->header->hrecs, "UNKNOWN");
3060         assert(brg);
3061     } else {
3062         cr->rg = -1;
3063     }
3064     if (cram_stats_add(c->stats[DS_RG], cr->rg) < 0)
3065         goto block_err;
3066 
3067     /*
3068      * Append to the qual block now. We do this here as
3069      * cram_add_substitution() can generate BA/QS events which need to
3070      * be in the qual block before we append the rest of the data.
3071      */
3072     if (cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES) {
3073         /* Special case of seq "*" */
3074         if (cr->len == 0) {
3075             cr->len = fake_qual;
3076             BLOCK_GROW(s->qual_blk, cr->len);
3077             cp = (char *)BLOCK_END(s->qual_blk);
3078             memset(cp, 255, cr->len);
3079         } else {
3080             BLOCK_GROW(s->qual_blk, cr->len);
3081             cp = (char *)BLOCK_END(s->qual_blk);
3082             char *from = (char *)&bam_qual(b)[0];
3083             char *to = &cp[0];
3084             memcpy(to, from, cr->len);
3085 
3086             // Store quality in original orientation for better compression.
3087             if (!c->qs_seq_orient) {
3088                 if (cr->flags & BAM_FREVERSE) {
3089                     int i, j;
3090                     for (i = 0, j = cr->len-1; i < j; i++, j--) {
3091                         unsigned char c;
3092                         c = to[i];
3093                         to[i] = to[j];
3094                         to[j] = c;
3095                     }
3096                 }
3097             }
3098         }
3099         BLOCK_SIZE(s->qual_blk) += cr->len;
3100     } else {
3101         if (cr->len == 0)
3102             cr->len = fake_qual >= 0 ? fake_qual : cr->aend - cr->apos + 1;
3103     }
3104 
3105     if (cram_stats_add(c->stats[DS_RL], cr->len) < 0)
3106         goto block_err;
3107 
3108     /* Now we know apos and aend both, update mate-pair information */
3109     {
3110         int new;
3111         khint_t k;
3112         int sec = (cr->flags & BAM_FSECONDARY) ? 1 : 0;
3113 
3114         //fprintf(stderr, "Checking %"PRId64"/%.*s\t", rnum,
3115         //      cr->name_len, DSTRING_STR(s->name_ds)+cr->name);
3116         if (cr->flags & BAM_FPAIRED) {
3117             char *key = string_ndup(s->pair_keys, bam_name(b), bam_name_len(b));
3118             if (!key)
3119                 return -1;
3120 
3121             k = kh_put(m_s2i, s->pair[sec], key, &new);
3122             if (-1 == new)
3123                 return -1;
3124             else if (new > 0)
3125                 kh_val(s->pair[sec], k) = rnum;
3126         } else {
3127             new = 1;
3128             k = 0; // Prevents false-positive warning from gcc -Og
3129         }
3130 
3131         if (new == 0) {
3132             cram_record *p = &s->crecs[kh_val(s->pair[sec], k)];
3133             int64_t aleft, aright;
3134             int sign;
3135 
3136             aleft = MIN(cr->apos, p->apos);
3137             aright = MAX(cr->aend, p->aend);
3138             if (cr->apos < p->apos) {
3139                 sign = 1;
3140             } else if (cr->apos > p->apos) {
3141                 sign = -1;
3142             } else if (cr->flags & BAM_FREAD1) {
3143                 sign = 1;
3144             } else {
3145                 sign = -1;
3146             }
3147 
3148             // This vs p: tlen, matepos, flags. Permit TLEN 0 and/or TLEN +/-
3149             // a small amount, if appropriate options set.
3150             if ((!fd->tlen_zero && MAX(bam_mate_pos(b)+1, 0) != p->apos) &&
3151                 !(fd->tlen_zero && bam_mate_pos(b) == 0))
3152                 goto detached;
3153 
3154             if (((bam_flag(b) & BAM_FMUNMAP) != 0) !=
3155                 ((p->flags & BAM_FUNMAP) != 0))
3156                 goto detached;
3157 
3158             if (((bam_flag(b) & BAM_FMREVERSE) != 0) !=
3159                 ((p->flags & BAM_FREVERSE) != 0))
3160                 goto detached;
3161 
3162 
3163             // p vs this: tlen, matepos, flags
3164             if (p->ref_id != cr->ref_id &&
3165                 !(fd->tlen_zero && p->ref_id == -1))
3166                 goto detached;
3167 
3168             if (p->mate_pos != cr->apos &&
3169                 !(fd->tlen_zero && p->mate_pos == 0))
3170                 goto detached;
3171 
3172             if (((p->flags & BAM_FMUNMAP) != 0) !=
3173                 ((p->mate_flags & CRAM_M_UNMAP) != 0))
3174                 goto detached;
3175 
3176             if (((p->flags & BAM_FMREVERSE) != 0) !=
3177                 ((p->mate_flags & CRAM_M_REVERSE) != 0))
3178                 goto detached;
3179 
3180             // Supplementary reads are just too ill defined
3181             if ((cr->flags & BAM_FSUPPLEMENTARY) ||
3182                 (p->flags & BAM_FSUPPLEMENTARY))
3183                 goto detached;
3184 
3185             // When in lossy name mode, if a read isn't detached we
3186             // cannot store the name.  The corollary is that when we
3187             // must store the name, it must be detached (inefficient).
3188             if (fd->lossy_read_names &&
3189                 (!(cr->cram_flags & CRAM_FLAG_DISCARD_NAME) ||
3190                  !((p->cram_flags & CRAM_FLAG_DISCARD_NAME))))
3191                 goto detached;
3192 
3193             // Now check TLEN.  We do this last as sometimes it's the
3194             // only thing that differs.  In CRAM4 we have a better way
3195             // of handling this that doesn't break detached status
3196             int explicit_tlen = 0;
3197             int tflag1 = ((bam_ins_size(b) &&
3198                            llabs(bam_ins_size(b) - sign*(aright-aleft+1))
3199                            > fd->tlen_approx)
3200                           || (!bam_ins_size(b) && !fd->tlen_zero));
3201 
3202             int tflag2 = ((p->tlen && llabs(p->tlen - -sign*(aright-aleft+1))
3203                            > fd->tlen_approx)
3204                           || (!p->tlen && !fd->tlen_zero));
3205 
3206             if (tflag1 || tflag2) {
3207                 if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3208                     explicit_tlen = CRAM_FLAG_EXPLICIT_TLEN;
3209                 } else {
3210                     // Stil do detached for unmapped data in CRAM4 as this
3211                     // also impacts RNEXT calculation.
3212                     goto detached;
3213                 }
3214             }
3215 
3216             /*
3217              * The fields below are unused when encoding this read as it is
3218              * no longer detached.  In theory they may get referred to when
3219              * processing a 3rd or 4th read in this template?, so we set them
3220              * here just to be sure.
3221              *
3222              * They do not need cram_stats_add() calls those as they are
3223              * not emitted.
3224              */
3225             cr->mate_pos = p->apos;
3226             cram_stats_add(c->stats[DS_NP], cr->mate_pos);
3227             cr->tlen = explicit_tlen ? bam_ins_size(b) : sign*(aright-aleft+1);
3228             cram_stats_add(c->stats[DS_TS], cr->tlen);
3229             cr->mate_flags =
3230                 ((p->flags & BAM_FMUNMAP)   == BAM_FMUNMAP)   * CRAM_M_UNMAP +
3231                 ((p->flags & BAM_FMREVERSE) == BAM_FMREVERSE) * CRAM_M_REVERSE;
3232 
3233             // Decrement statistics aggregated earlier
3234             if (p->cram_flags & CRAM_FLAG_STATS_ADDED) {
3235                 cram_stats_del(c->stats[DS_NP], p->mate_pos);
3236                 cram_stats_del(c->stats[DS_MF], p->mate_flags);
3237                 if (!(p->cram_flags & CRAM_FLAG_EXPLICIT_TLEN))
3238                     cram_stats_del(c->stats[DS_TS], p->tlen);
3239                 cram_stats_del(c->stats[DS_NS], p->mate_ref_id);
3240             }
3241 
3242             /* Similarly we could correct the p-> values too, but these will no
3243              * longer have any code that refers back to them as the new 'p'
3244              * for this template is our current 'cr'.
3245              */
3246             //p->mate_pos = cr->apos;
3247             //p->mate_flags =
3248             //  ((cr->flags & BAM_FMUNMAP)   == BAM_FMUNMAP)  * CRAM_M_UNMAP +
3249             //  ((cr->flags & BAM_FMREVERSE) == BAM_FMREVERSE)* CRAM_M_REVERSE;
3250             //p->tlen = p->apos - cr->aend;
3251 
3252             // Clear detached from cr flags
3253             cr->cram_flags &= ~CRAM_FLAG_DETACHED;
3254             cr->cram_flags |= explicit_tlen;
3255             if (cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK) < 0)
3256                 goto block_err;
3257 
3258             // Clear detached from p flags and set downstream
3259             if (p->cram_flags & CRAM_FLAG_STATS_ADDED) {
3260                 cram_stats_del(c->stats[DS_CF], p->cram_flags & CRAM_FLAG_MASK);
3261                 p->cram_flags &= ~CRAM_FLAG_STATS_ADDED;
3262             }
3263 
3264             p->cram_flags  &= ~CRAM_FLAG_DETACHED;
3265             p->cram_flags  |=  CRAM_FLAG_MATE_DOWNSTREAM | explicit_tlen;;
3266             if (cram_stats_add(c->stats[DS_CF], p->cram_flags & CRAM_FLAG_MASK) < 0)
3267                 goto block_err;
3268 
3269             p->mate_line = rnum - (kh_val(s->pair[sec], k) + 1);
3270             if (cram_stats_add(c->stats[DS_NF], p->mate_line) < 0)
3271                 goto block_err;
3272 
3273             kh_val(s->pair[sec], k) = rnum;
3274         } else {
3275         detached:
3276             //fprintf(stderr, "unpaired\n");
3277 
3278             /* Derive mate flags from this flag */
3279             cr->mate_flags = 0;
3280             if (bam_flag(b) & BAM_FMUNMAP)
3281                 cr->mate_flags |= CRAM_M_UNMAP;
3282             if (bam_flag(b) & BAM_FMREVERSE)
3283                 cr->mate_flags |= CRAM_M_REVERSE;
3284 
3285             if (cram_stats_add(c->stats[DS_MF], cr->mate_flags) < 0)
3286                 goto block_err;
3287 
3288             cr->mate_pos    = MAX(bam_mate_pos(b)+1, 0);
3289             if (cram_stats_add(c->stats[DS_NP], cr->mate_pos) < 0)
3290                 goto block_err;
3291 
3292             cr->tlen        = bam_ins_size(b);
3293             if (cram_stats_add(c->stats[DS_TS], cr->tlen) < 0)
3294                 goto block_err;
3295 
3296             cr->cram_flags |= CRAM_FLAG_DETACHED;
3297             if (cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK) < 0)
3298                 goto block_err;
3299             if (cram_stats_add(c->stats[DS_NS], bam_mate_ref(b)) < 0)
3300                 goto block_err;
3301 
3302             cr->cram_flags |= CRAM_FLAG_STATS_ADDED;
3303         }
3304     }
3305 
3306     cr->mqual       = bam_map_qual(b);
3307     if (cram_stats_add(c->stats[DS_MQ], cr->mqual) < 0)
3308         goto block_err;
3309 
3310     cr->mate_ref_id = bam_mate_ref(b);
3311 
3312     if (!(bam_flag(b) & BAM_FUNMAP)) {
3313         if (c->first_base > cr->apos)
3314             c->first_base = cr->apos;
3315 
3316         if (c->last_base < cr->aend)
3317             c->last_base = cr->aend;
3318     }
3319 
3320     return 0;
3321 
3322  block_err:
3323  err:
3324     return -1;
3325 }
3326 
3327 /*
3328  * Write iterator: put BAM format sequences into a CRAM file.
3329  * We buffer up a containers worth of data at a time.
3330  *
3331  * Returns 0 on success
3332  *        -1 on failure
3333  */
cram_put_bam_seq(cram_fd * fd,bam_seq_t * b)3334 int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) {
3335     cram_container *c;
3336 
3337     if (!fd->ctr) {
3338         fd->ctr = cram_new_container(fd->seqs_per_slice,
3339                                      fd->slices_per_container);
3340         if (!fd->ctr)
3341             return -1;
3342         fd->ctr->record_counter = fd->record_counter;
3343     }
3344     c = fd->ctr;
3345 
3346     if (!c->slice || c->curr_rec == c->max_rec ||
3347         (bam_ref(b) != c->curr_ref && c->curr_ref >= -1) ||
3348         (c->s_num_bases >= fd->bases_per_slice)) {
3349         int slice_rec, curr_rec, multi_seq = fd->multi_seq == 1;
3350         int curr_ref = c->slice ? c->curr_ref : bam_ref(b);
3351 
3352         /*
3353          * Start packing slices when we routinely have under 1/4tr full.
3354          *
3355          * This option isn't available if we choose to embed references
3356          * since we can only have one per slice.
3357          *
3358          * The multi_seq var here refers to our intention for the next slice.
3359          * This slice has already been encoded so we output as-is.
3360          */
3361         if (fd->multi_seq == -1 && c->curr_rec < c->max_rec/4+10 &&
3362             fd->last_slice && fd->last_slice < c->max_rec/4+10 &&
3363             !fd->embed_ref) {
3364             if (!c->multi_seq)
3365                 hts_log_info("Multi-ref enabled for next container");
3366             multi_seq = 1;
3367         } else if (fd->multi_seq == 1) {
3368             pthread_mutex_lock(&fd->metrics_lock);
3369             if (fd->last_RI_count <= c->max_slice && fd->multi_seq_user != 1) {
3370                 multi_seq = 0;
3371                 hts_log_info("Multi-ref disabled for next container");
3372             }
3373             pthread_mutex_unlock(&fd->metrics_lock);
3374         }
3375 
3376         slice_rec = c->slice_rec;
3377         curr_rec  = c->curr_rec;
3378 
3379         if (CRAM_MAJOR_VERS(fd->version) == 1 ||
3380             c->curr_rec == c->max_rec || fd->multi_seq != 1 || !c->slice ||
3381             c->s_num_bases >= fd->bases_per_slice) {
3382             if (NULL == (c = cram_next_container(fd, b))) {
3383                 if (fd->ctr) {
3384                     // prevent cram_close attempting to flush
3385                     cram_free_container(fd->ctr);
3386                     fd->ctr = NULL;
3387                 }
3388                 return -1;
3389             }
3390         }
3391 
3392         /*
3393          * Due to our processing order, some things we've already done we
3394          * cannot easily undo. So when we first notice we should be packing
3395          * multiple sequences per container we emit the small partial
3396          * container as-is and then start a fresh one in a different mode.
3397          */
3398         if (multi_seq == 0 && fd->multi_seq == 1 && fd->multi_seq_user == -1) {
3399             // User selected auto-mode, we're currently using multi-seq, but
3400             // have detected we don't need to.  Switch back to auto.
3401             fd->multi_seq = -1;
3402         } else if (multi_seq) {
3403             // We detected we need multi-seq
3404             fd->multi_seq = 1;
3405             c->multi_seq = 1;
3406             c->pos_sorted = 0;
3407 
3408             if (!c->refs_used) {
3409                 pthread_mutex_lock(&fd->ref_lock);
3410                 c->refs_used = calloc(fd->refs->nref, sizeof(int));
3411                 pthread_mutex_unlock(&fd->ref_lock);
3412                 if (!c->refs_used)
3413                     return -1;
3414             }
3415         }
3416 
3417         fd->last_slice = curr_rec - slice_rec;
3418         c->slice_rec = c->curr_rec;
3419 
3420         // Have we seen this reference before?
3421         if (bam_ref(b) >= 0 && curr_ref >= 0 && bam_ref(b) != curr_ref && !fd->embed_ref &&
3422             !fd->unsorted && multi_seq) {
3423 
3424             if (!c->refs_used) {
3425                 pthread_mutex_lock(&fd->ref_lock);
3426                 c->refs_used = calloc(fd->refs->nref, sizeof(int));
3427                 pthread_mutex_unlock(&fd->ref_lock);
3428                 if (!c->refs_used)
3429                     return -1;
3430             } else if (c->refs_used && c->refs_used[bam_ref(b)]) {
3431                 pthread_mutex_lock(&fd->ref_lock);
3432                 fd->unsorted = 1;
3433                 pthread_mutex_unlock(&fd->ref_lock);
3434                 fd->multi_seq = 1;
3435             }
3436         }
3437 
3438         c->curr_ref = bam_ref(b);
3439         if (c->refs_used && c->curr_ref >= 0) c->refs_used[c->curr_ref]++;
3440     }
3441 
3442     if (!c->bams) {
3443         /* First time through, allocate a set of bam pointers */
3444         pthread_mutex_lock(&fd->bam_list_lock);
3445         if (fd->bl) {
3446             spare_bams *spare = fd->bl;
3447             c->bams = spare->bams;
3448             fd->bl = spare->next;
3449             free(spare);
3450         } else {
3451             c->bams = calloc(c->max_c_rec, sizeof(bam_seq_t *));
3452             if (!c->bams) {
3453                 pthread_mutex_unlock(&fd->bam_list_lock);
3454                 return -1;
3455             }
3456         }
3457         pthread_mutex_unlock(&fd->bam_list_lock);
3458     }
3459 
3460     /* Copy or alloc+copy the bam record, for later encoding */
3461     if (c->bams[c->curr_c_rec]) {
3462         if (bam_copy1(c->bams[c->curr_c_rec], b) == NULL)
3463             return -1;
3464     } else {
3465         c->bams[c->curr_c_rec] = bam_dup1(b);
3466         if (c->bams[c->curr_c_rec] == NULL)
3467             return -1;
3468     }
3469     c->curr_rec++;
3470     c->curr_c_rec++;
3471     c->s_num_bases += bam_seq_len(b);
3472     c->n_mapped += (bam_flag(b) & BAM_FUNMAP) ? 0 : 1;
3473     fd->record_counter++;
3474 
3475     return 0;
3476 }
3477