1 /*
2 Copyright (c) 2012-2021 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 
8    1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10 
11    2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14 
15    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18 
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30 
31 /*
32  * CRAM I/O primitives.
33  *
34  * - ITF8 encoding and decoding.
35  * - Block based I/O
36  * - Zlib inflating and deflating (memory)
37  * - CRAM basic data structure reading and writing
38  * - File opening / closing
39  * - Reference sequence handling
40  */
41 
42 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
43 #include <config.h>
44 
45 #include <stdio.h>
46 #include <errno.h>
47 #include <assert.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #include <unistd.h>
51 #include <zlib.h>
52 #ifdef HAVE_LIBBZ2
53 #include <bzlib.h>
54 #endif
55 #ifdef HAVE_LIBLZMA
56 #ifdef HAVE_LZMA_H
57 #include <lzma.h>
58 #else
59 #include "../os/lzma_stub.h"
60 #endif
61 #endif
62 #include <sys/types.h>
63 #include <sys/stat.h>
64 #include <math.h>
65 #include <stdint.h>
66 
67 #ifdef HAVE_LIBDEFLATE
68 #include <libdeflate.h>
69 #define crc32(a,b,c) libdeflate_crc32((a),(b),(c))
70 #endif
71 
72 #include "cram.h"
73 #include "os.h"
74 #include "../htslib/hts.h"
75 #include "open_trace_file.h"
76 
77 #if defined(HAVE_EXTERNAL_LIBHTSCODECS)
78 #include <htscodecs/rANS_static.h>
79 #include <htscodecs/rANS_static4x16.h>
80 #include <htscodecs/arith_dynamic.h>
81 #include <htscodecs/tokenise_name3.h>
82 #include <htscodecs/fqzcomp_qual.h>
83 #include <htscodecs/varint.h> // CRAM v4.0 variable-size integers
84 #else
85 #include "../htscodecs/htscodecs/rANS_static.h"
86 #include "../htscodecs/htscodecs/rANS_static4x16.h"
87 #include "../htscodecs/htscodecs/arith_dynamic.h"
88 #include "../htscodecs/htscodecs/tokenise_name3.h"
89 #include "../htscodecs/htscodecs/fqzcomp_qual.h"
90 #include "../htscodecs/htscodecs/varint.h"
91 #endif
92 
93 //#define REF_DEBUG
94 
95 #ifdef REF_DEBUG
96 #include <sys/syscall.h>
97 #define gettid() (int)syscall(SYS_gettid)
98 
99 #define RP(...) fprintf (stderr, __VA_ARGS__)
100 #else
101 #define RP(...)
102 #endif
103 
104 #include "../htslib/hfile.h"
105 #include "../htslib/bgzf.h"
106 #include "../htslib/faidx.h"
107 #include "../hts_internal.h"
108 
109 #ifndef PATH_MAX
110 #define PATH_MAX FILENAME_MAX
111 #endif
112 
113 #define TRIAL_SPAN 70
114 #define NTRIALS 3
115 
116 #define CRAM_DEFAULT_LEVEL 5
117 
118 /* ----------------------------------------------------------------------
119  * ITF8 encoding and decoding.
120  *
121  * Also see the itf8_get and itf8_put macros in cram_io.h
122  */
123 
124 /*
125  * LEGACY: consider using itf8_decode_crc.
126  *
127  * Reads an integer in ITF-8 encoding from 'cp' and stores it in
128  * *val.
129  *
130  * Returns the number of bytes read on success
131  *        -1 on failure
132  */
itf8_decode(cram_fd * fd,int32_t * val_p)133 int itf8_decode(cram_fd *fd, int32_t *val_p) {
134     static int nbytes[16] = {
135         0,0,0,0, 0,0,0,0,                               // 0000xxxx - 0111xxxx
136         1,1,1,1,                                        // 1000xxxx - 1011xxxx
137         2,2,                                            // 1100xxxx - 1101xxxx
138         3,                                              // 1110xxxx
139         4,                                              // 1111xxxx
140     };
141 
142     static int nbits[16] = {
143         0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
144         0x3f, 0x3f, 0x3f, 0x3f,                         // 1000xxxx - 1011xxxx
145         0x1f, 0x1f,                                     // 1100xxxx - 1101xxxx
146         0x0f,                                           // 1110xxxx
147         0x0f,                                           // 1111xxxx
148     };
149 
150     int32_t val = hgetc(fd->fp);
151     if (val == -1)
152         return -1;
153 
154     int i = nbytes[val>>4];
155     val &= nbits[val>>4];
156 
157     switch(i) {
158     case 0:
159         *val_p = val;
160         return 1;
161 
162     case 1:
163         val = (val<<8) | (unsigned char)hgetc(fd->fp);
164         *val_p = val;
165         return 2;
166 
167     case 2:
168         val = (val<<8) | (unsigned char)hgetc(fd->fp);
169         val = (val<<8) | (unsigned char)hgetc(fd->fp);
170         *val_p = val;
171         return 3;
172 
173     case 3:
174         val = (val<<8) | (unsigned char)hgetc(fd->fp);
175         val = (val<<8) | (unsigned char)hgetc(fd->fp);
176         val = (val<<8) | (unsigned char)hgetc(fd->fp);
177         *val_p = val;
178         return 4;
179 
180     case 4: // really 3.5 more, why make it different?
181         val = (val<<8) | (unsigned char)hgetc(fd->fp);
182         val = (val<<8) | (unsigned char)hgetc(fd->fp);
183         val = (val<<8) | (unsigned char)hgetc(fd->fp);
184         val = (val<<4) | (((unsigned char)hgetc(fd->fp)) & 0x0f);
185         *val_p = val;
186     }
187 
188     return 5;
189 }
190 
itf8_decode_crc(cram_fd * fd,int32_t * val_p,uint32_t * crc)191 int itf8_decode_crc(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
192     static int nbytes[16] = {
193         0,0,0,0, 0,0,0,0,                               // 0000xxxx - 0111xxxx
194         1,1,1,1,                                        // 1000xxxx - 1011xxxx
195         2,2,                                            // 1100xxxx - 1101xxxx
196         3,                                              // 1110xxxx
197         4,                                              // 1111xxxx
198     };
199 
200     static int nbits[16] = {
201         0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
202         0x3f, 0x3f, 0x3f, 0x3f,                         // 1000xxxx - 1011xxxx
203         0x1f, 0x1f,                                     // 1100xxxx - 1101xxxx
204         0x0f,                                           // 1110xxxx
205         0x0f,                                           // 1111xxxx
206     };
207     unsigned char c[5];
208 
209     int32_t val = hgetc(fd->fp);
210     if (val == -1)
211         return -1;
212 
213     c[0]=val;
214 
215     int i = nbytes[val>>4];
216     val &= nbits[val>>4];
217 
218     if (i > 0) {
219         if (hread(fd->fp, &c[1], i) < i)
220             return -1;
221     }
222 
223     switch(i) {
224     case 0:
225         *val_p = val;
226         *crc = crc32(*crc, c, 1);
227         return 1;
228 
229     case 1:
230         val = (val<<8) | c[1];
231         *val_p = val;
232         *crc = crc32(*crc, c, 2);
233         return 2;
234 
235     case 2:
236         val = (val<<8) | c[1];
237         val = (val<<8) | c[2];
238         *val_p = val;
239         *crc = crc32(*crc, c, 3);
240         return 3;
241 
242     case 3:
243         val = (val<<8) | c[1];
244         val = (val<<8) | c[2];
245         val = (val<<8) | c[3];
246         *val_p = val;
247         *crc = crc32(*crc, c, 4);
248         return 4;
249 
250     case 4: // really 3.5 more, why make it different?
251         {
252             uint32_t uv = val;
253             uv = (uv<<8) |  c[1];
254             uv = (uv<<8) |  c[2];
255             uv = (uv<<8) |  c[3];
256             uv = (uv<<4) | (c[4] & 0x0f);
257             // Avoid implementation-defined behaviour on negative values
258             *val_p = uv < 0x80000000UL ? (int32_t) uv : -((int32_t) (0xffffffffUL - uv)) - 1;
259             *crc = crc32(*crc, c, 5);
260         }
261     }
262 
263     return 5;
264 }
265 
266 /*
267  * Stores a value to memory in ITF-8 format.
268  *
269  * Returns the number of bytes required to store the number.
270  * This is a maximum of 5 bytes.
271  */
itf8_put(char * cp,int32_t val)272 static inline int itf8_put(char *cp, int32_t val) {
273     unsigned char *up = (unsigned char *)cp;
274     if        (!(val & ~0x00000007f)) { // 1 byte
275         *up = val;
276         return 1;
277     } else if (!(val & ~0x00003fff)) { // 2 byte
278         *up++ = (val >> 8 ) | 0x80;
279         *up   = val & 0xff;
280         return 2;
281     } else if (!(val & ~0x01fffff)) { // 3 byte
282         *up++ = (val >> 16) | 0xc0;
283         *up++ = (val >> 8 ) & 0xff;
284         *up   = val & 0xff;
285         return 3;
286     } else if (!(val & ~0x0fffffff)) { // 4 byte
287         *up++ = (val >> 24) | 0xe0;
288         *up++ = (val >> 16) & 0xff;
289         *up++ = (val >> 8 ) & 0xff;
290         *up   = val & 0xff;
291         return 4;
292     } else {                           // 5 byte
293         *up++ = 0xf0 | ((val>>28) & 0xff);
294         *up++ = (val >> 20) & 0xff;
295         *up++ = (val >> 12) & 0xff;
296         *up++ = (val >> 4 ) & 0xff;
297         *up = val & 0x0f;
298         return 5;
299     }
300 }
301 
302 
303 /* 64-bit itf8 variant */
ltf8_put(char * cp,int64_t val)304 static inline int ltf8_put(char *cp, int64_t val) {
305     unsigned char *up = (unsigned char *)cp;
306     if        (!(val & ~((1LL<<7)-1))) {
307         *up = val;
308         return 1;
309     } else if (!(val & ~((1LL<<(6+8))-1))) {
310         *up++ = (val >> 8 ) | 0x80;
311         *up   = val & 0xff;
312         return 2;
313     } else if (!(val & ~((1LL<<(5+2*8))-1))) {
314         *up++ = (val >> 16) | 0xc0;
315         *up++ = (val >> 8 ) & 0xff;
316         *up   = val & 0xff;
317         return 3;
318     } else if (!(val & ~((1LL<<(4+3*8))-1))) {
319         *up++ = (val >> 24) | 0xe0;
320         *up++ = (val >> 16) & 0xff;
321         *up++ = (val >> 8 ) & 0xff;
322         *up   = val & 0xff;
323         return 4;
324     } else if (!(val & ~((1LL<<(3+4*8))-1))) {
325         *up++ = (val >> 32) | 0xf0;
326         *up++ = (val >> 24) & 0xff;
327         *up++ = (val >> 16) & 0xff;
328         *up++ = (val >> 8 ) & 0xff;
329         *up   = val & 0xff;
330         return 5;
331     } else if (!(val & ~((1LL<<(2+5*8))-1))) {
332         *up++ = (val >> 40) | 0xf8;
333         *up++ = (val >> 32) & 0xff;
334         *up++ = (val >> 24) & 0xff;
335         *up++ = (val >> 16) & 0xff;
336         *up++ = (val >> 8 ) & 0xff;
337         *up   = val & 0xff;
338         return 6;
339     } else if (!(val & ~((1LL<<(1+6*8))-1))) {
340         *up++ = (val >> 48) | 0xfc;
341         *up++ = (val >> 40) & 0xff;
342         *up++ = (val >> 32) & 0xff;
343         *up++ = (val >> 24) & 0xff;
344         *up++ = (val >> 16) & 0xff;
345         *up++ = (val >> 8 ) & 0xff;
346         *up   = val & 0xff;
347         return 7;
348     } else if (!(val & ~((1LL<<(7*8))-1))) {
349         *up++ = (val >> 56) | 0xfe;
350         *up++ = (val >> 48) & 0xff;
351         *up++ = (val >> 40) & 0xff;
352         *up++ = (val >> 32) & 0xff;
353         *up++ = (val >> 24) & 0xff;
354         *up++ = (val >> 16) & 0xff;
355         *up++ = (val >> 8 ) & 0xff;
356         *up   = val & 0xff;
357         return 8;
358     } else {
359         *up++ = 0xff;
360         *up++ = (val >> 56) & 0xff;
361         *up++ = (val >> 48) & 0xff;
362         *up++ = (val >> 40) & 0xff;
363         *up++ = (val >> 32) & 0xff;
364         *up++ = (val >> 24) & 0xff;
365         *up++ = (val >> 16) & 0xff;
366         *up++ = (val >> 8 ) & 0xff;
367         *up   = val & 0xff;
368         return 9;
369     }
370 }
371 
372 /*
373  * Encodes and writes a single integer in ITF-8 format.
374  * Returns 0 on success
375  *        -1 on failure
376  */
itf8_encode(cram_fd * fd,int32_t val)377 int itf8_encode(cram_fd *fd, int32_t val) {
378     char buf[5];
379     int len = itf8_put(buf, val);
380     return hwrite(fd->fp, buf, len) == len ? 0 : -1;
381 }
382 
383 const int itf8_bytes[16] = {
384     1, 1, 1, 1,  1, 1, 1, 1,
385     2, 2, 2, 2,  3, 3, 4, 5
386 };
387 
388 const int ltf8_bytes[256] = {
389     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
390     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
391     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
392     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
393 
394     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
395     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
396     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
397     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
398 
399     2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
400     2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
401     2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
402     2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
403 
404     3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,
405     3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,
406 
407     4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
408 
409     5, 5, 5, 5,  5, 5, 5, 5,  6, 6, 6, 6,  7, 7, 8, 9
410 };
411 
412 /*
413  * LEGACY: consider using ltf8_decode_crc.
414  */
ltf8_decode(cram_fd * fd,int64_t * val_p)415 int ltf8_decode(cram_fd *fd, int64_t *val_p) {
416     int c = hgetc(fd->fp);
417     int64_t val = (unsigned char)c;
418     if (c == -1)
419         return -1;
420 
421     if (val < 0x80) {
422         *val_p =   val;
423         return 1;
424 
425     } else if (val < 0xc0) {
426         val = (val<<8) | (unsigned char)hgetc(fd->fp);
427         *val_p = val & (((1LL<<(6+8)))-1);
428         return 2;
429 
430     } else if (val < 0xe0) {
431         val = (val<<8) | (unsigned char)hgetc(fd->fp);
432         val = (val<<8) | (unsigned char)hgetc(fd->fp);
433         *val_p = val & ((1LL<<(5+2*8))-1);
434         return 3;
435 
436     } else if (val < 0xf0) {
437         val = (val<<8) | (unsigned char)hgetc(fd->fp);
438         val = (val<<8) | (unsigned char)hgetc(fd->fp);
439         val = (val<<8) | (unsigned char)hgetc(fd->fp);
440         *val_p = val & ((1LL<<(4+3*8))-1);
441         return 4;
442 
443     } else if (val < 0xf8) {
444         val = (val<<8) | (unsigned char)hgetc(fd->fp);
445         val = (val<<8) | (unsigned char)hgetc(fd->fp);
446         val = (val<<8) | (unsigned char)hgetc(fd->fp);
447         val = (val<<8) | (unsigned char)hgetc(fd->fp);
448         *val_p = val & ((1LL<<(3+4*8))-1);
449         return 5;
450 
451     } else if (val < 0xfc) {
452         val = (val<<8) | (unsigned char)hgetc(fd->fp);
453         val = (val<<8) | (unsigned char)hgetc(fd->fp);
454         val = (val<<8) | (unsigned char)hgetc(fd->fp);
455         val = (val<<8) | (unsigned char)hgetc(fd->fp);
456         val = (val<<8) | (unsigned char)hgetc(fd->fp);
457         *val_p = val & ((1LL<<(2+5*8))-1);
458         return 6;
459 
460     } else if (val < 0xfe) {
461         val = (val<<8) | (unsigned char)hgetc(fd->fp);
462         val = (val<<8) | (unsigned char)hgetc(fd->fp);
463         val = (val<<8) | (unsigned char)hgetc(fd->fp);
464         val = (val<<8) | (unsigned char)hgetc(fd->fp);
465         val = (val<<8) | (unsigned char)hgetc(fd->fp);
466         val = (val<<8) | (unsigned char)hgetc(fd->fp);
467         *val_p = val & ((1LL<<(1+6*8))-1);
468         return 7;
469 
470     } else if (val < 0xff) {
471         val = (val<<8) | (unsigned char)hgetc(fd->fp);
472         val = (val<<8) | (unsigned char)hgetc(fd->fp);
473         val = (val<<8) | (unsigned char)hgetc(fd->fp);
474         val = (val<<8) | (unsigned char)hgetc(fd->fp);
475         val = (val<<8) | (unsigned char)hgetc(fd->fp);
476         val = (val<<8) | (unsigned char)hgetc(fd->fp);
477         val = (val<<8) | (unsigned char)hgetc(fd->fp);
478         *val_p = val & ((1LL<<(7*8))-1);
479         return 8;
480 
481     } else {
482         val = (val<<8) | (unsigned char)hgetc(fd->fp);
483         val = (val<<8) | (unsigned char)hgetc(fd->fp);
484         val = (val<<8) | (unsigned char)hgetc(fd->fp);
485         val = (val<<8) | (unsigned char)hgetc(fd->fp);
486         val = (val<<8) | (unsigned char)hgetc(fd->fp);
487         val = (val<<8) | (unsigned char)hgetc(fd->fp);
488         val = (val<<8) | (unsigned char)hgetc(fd->fp);
489         val = (val<<8) | (unsigned char)hgetc(fd->fp);
490         *val_p = val;
491     }
492 
493     return 9;
494 }
495 
ltf8_decode_crc(cram_fd * fd,int64_t * val_p,uint32_t * crc)496 int ltf8_decode_crc(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
497     unsigned char c[9];
498     int64_t val = hgetc(fd->fp);
499     if (val < 0)
500         return -1;
501 
502     c[0] = val;
503 
504     if (val < 0x80) {
505         *val_p =   val;
506         *crc = crc32(*crc, c, 1);
507         return 1;
508 
509     } else if (val < 0xc0) {
510         int v = hgetc(fd->fp);
511         if (v < 0)
512             return -1;
513         val = (val<<8) | (c[1]=v);
514         *val_p = val & (((1LL<<(6+8)))-1);
515         *crc = crc32(*crc, c, 2);
516         return 2;
517 
518     } else if (val < 0xe0) {
519         if (hread(fd->fp, &c[1], 2) < 2)
520             return -1;
521         val = (val<<8) | c[1];
522         val = (val<<8) | c[2];
523         *val_p = val & ((1LL<<(5+2*8))-1);
524         *crc = crc32(*crc, c, 3);
525         return 3;
526 
527     } else if (val < 0xf0) {
528         if (hread(fd->fp, &c[1], 3) < 3)
529             return -1;
530         val = (val<<8) | c[1];
531         val = (val<<8) | c[2];
532         val = (val<<8) | c[3];
533         *val_p = val & ((1LL<<(4+3*8))-1);
534         *crc = crc32(*crc, c, 4);
535         return 4;
536 
537     } else if (val < 0xf8) {
538         if (hread(fd->fp, &c[1], 4) < 4)
539             return -1;
540         val = (val<<8) | c[1];
541         val = (val<<8) | c[2];
542         val = (val<<8) | c[3];
543         val = (val<<8) | c[4];
544         *val_p = val & ((1LL<<(3+4*8))-1);
545         *crc = crc32(*crc, c, 5);
546         return 5;
547 
548     } else if (val < 0xfc) {
549         if (hread(fd->fp, &c[1], 5) < 5)
550             return -1;
551         val = (val<<8) | c[1];
552         val = (val<<8) | c[2];
553         val = (val<<8) | c[3];
554         val = (val<<8) | c[4];
555         val = (val<<8) | c[5];
556         *val_p = val & ((1LL<<(2+5*8))-1);
557         *crc = crc32(*crc, c, 6);
558         return 6;
559 
560     } else if (val < 0xfe) {
561         if (hread(fd->fp, &c[1], 6) < 6)
562             return -1;
563         val = (val<<8) | c[1];
564         val = (val<<8) | c[2];
565         val = (val<<8) | c[3];
566         val = (val<<8) | c[4];
567         val = (val<<8) | c[5];
568         val = (val<<8) | c[6];
569         *val_p = val & ((1LL<<(1+6*8))-1);
570         *crc = crc32(*crc, c, 7);
571         return 7;
572 
573     } else if (val < 0xff) {
574         uint64_t uval = val;
575         if (hread(fd->fp, &c[1], 7) < 7)
576             return -1;
577         uval = (uval<<8) | c[1];
578         uval = (uval<<8) | c[2];
579         uval = (uval<<8) | c[3];
580         uval = (uval<<8) | c[4];
581         uval = (uval<<8) | c[5];
582         uval = (uval<<8) | c[6];
583         uval = (uval<<8) | c[7];
584         *val_p = uval & ((1ULL<<(7*8))-1);
585         *crc = crc32(*crc, c, 8);
586         return 8;
587 
588     } else {
589         uint64_t uval;
590         if (hread(fd->fp, &c[1], 8) < 8)
591             return -1;
592         uval =             c[1];
593         uval = (uval<<8) | c[2];
594         uval = (uval<<8) | c[3];
595         uval = (uval<<8) | c[4];
596         uval = (uval<<8) | c[5];
597         uval = (uval<<8) | c[6];
598         uval = (uval<<8) | c[7];
599         uval = (uval<<8) | c[8];
600         *crc = crc32(*crc, c, 9);
601         // Avoid implementation-defined behaviour on negative values
602         *val_p = c[1] < 0x80 ? (int64_t) uval : -((int64_t) (0xffffffffffffffffULL - uval)) - 1;
603     }
604 
605     return 9;
606 }
607 
608 /*
609  * Pushes a value in ITF8 format onto the end of a block.
610  * This shouldn't be used for high-volume data as it is not the fastest
611  * method.
612  *
613  * Returns the number of bytes written
614  */
itf8_put_blk(cram_block * blk,int32_t val)615 int itf8_put_blk(cram_block *blk, int32_t val) {
616     char buf[5];
617     int sz;
618 
619     sz = itf8_put(buf, val);
620     BLOCK_APPEND(blk, buf, sz);
621     return sz;
622 
623  block_err:
624     return -1;
625 }
626 
ltf8_put_blk(cram_block * blk,int64_t val)627 int ltf8_put_blk(cram_block *blk, int64_t val) {
628     char buf[9];
629     int sz;
630 
631     sz = ltf8_put(buf, val);
632     BLOCK_APPEND(blk, buf, sz);
633     return sz;
634 
635  block_err:
636     return -1;
637 }
638 
safe_itf8_get(char ** cp,const char * endp,int * err)639 static int64_t safe_itf8_get(char **cp, const char *endp, int *err) {
640     const unsigned char *up = (unsigned char *)*cp;
641 
642     if (endp && endp - *cp < 5 &&
643         (*cp >= endp || endp - *cp < itf8_bytes[up[0]>>4])) {
644         if (err) *err = 1;
645         return 0;
646     }
647 
648     if (up[0] < 0x80) {
649         (*cp)++;
650         return up[0];
651     } else if (up[0] < 0xc0) {
652         (*cp)+=2;
653         return ((up[0] <<8) |  up[1])                           & 0x3fff;
654     } else if (up[0] < 0xe0) {
655         (*cp)+=3;
656         return ((up[0]<<16) | (up[1]<< 8) |  up[2])             & 0x1fffff;
657     } else if (up[0] < 0xf0) {
658         (*cp)+=4;
659         uint32_t uv = (((uint32_t)up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff;
660         return (int32_t)uv;
661     } else {
662         (*cp)+=5;
663         uint32_t uv = (((uint32_t)up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f);
664         return (int32_t)uv;
665     }
666 }
667 
safe_ltf8_get(char ** cp,const char * endp,int * err)668 static int64_t safe_ltf8_get(char **cp, const char *endp, int *err) {
669     unsigned char *up = (unsigned char *)*cp;
670 
671     if (endp && endp - *cp < 9 &&
672         (*cp >= endp || endp - *cp < ltf8_bytes[up[0]])) {
673         if (err) *err = 1;
674         return 0;
675     }
676 
677     if (up[0] < 0x80) {
678         (*cp)++;
679         return up[0];
680     } else if (up[0] < 0xc0) {
681         (*cp)+=2;
682         return (((uint64_t)up[0]<< 8) |
683                  (uint64_t)up[1]) & (((1LL<<(6+8)))-1);
684     } else if (up[0] < 0xe0) {
685         (*cp)+=3;
686         return (((uint64_t)up[0]<<16) |
687                 ((uint64_t)up[1]<< 8) |
688                  (uint64_t)up[2]) & ((1LL<<(5+2*8))-1);
689     } else if (up[0] < 0xf0) {
690         (*cp)+=4;
691         return (((uint64_t)up[0]<<24) |
692                 ((uint64_t)up[1]<<16) |
693                 ((uint64_t)up[2]<< 8) |
694                  (uint64_t)up[3]) & ((1LL<<(4+3*8))-1);
695     } else if (up[0] < 0xf8) {
696         (*cp)+=5;
697         return (((uint64_t)up[0]<<32) |
698                 ((uint64_t)up[1]<<24) |
699                 ((uint64_t)up[2]<<16) |
700                 ((uint64_t)up[3]<< 8) |
701                  (uint64_t)up[4]) & ((1LL<<(3+4*8))-1);
702     } else if (up[0] < 0xfc) {
703         (*cp)+=6;
704         return (((uint64_t)up[0]<<40) |
705                 ((uint64_t)up[1]<<32) |
706                 ((uint64_t)up[2]<<24) |
707                 ((uint64_t)up[3]<<16) |
708                 ((uint64_t)up[4]<< 8) |
709                  (uint64_t)up[5]) & ((1LL<<(2+5*8))-1);
710     } else if (up[0] < 0xfe) {
711         (*cp)+=7;
712         return (((uint64_t)up[0]<<48) |
713                 ((uint64_t)up[1]<<40) |
714                 ((uint64_t)up[2]<<32) |
715                 ((uint64_t)up[3]<<24) |
716                 ((uint64_t)up[4]<<16) |
717                 ((uint64_t)up[5]<< 8) |
718                  (uint64_t)up[6]) & ((1LL<<(1+6*8))-1);
719     } else if (up[0] < 0xff) {
720         (*cp)+=8;
721         return (((uint64_t)up[1]<<48) |
722                 ((uint64_t)up[2]<<40) |
723                 ((uint64_t)up[3]<<32) |
724                 ((uint64_t)up[4]<<24) |
725                 ((uint64_t)up[5]<<16) |
726                 ((uint64_t)up[6]<< 8) |
727                  (uint64_t)up[7]) & ((1LL<<(7*8))-1);
728     } else {
729         (*cp)+=9;
730         return (((uint64_t)up[1]<<56) |
731                 ((uint64_t)up[2]<<48) |
732                 ((uint64_t)up[3]<<40) |
733                 ((uint64_t)up[4]<<32) |
734                 ((uint64_t)up[5]<<24) |
735                 ((uint64_t)up[6]<<16) |
736                 ((uint64_t)up[7]<< 8) |
737                  (uint64_t)up[8]);
738     }
739 }
740 
741 // Wrapper for now
safe_itf8_put(char * cp,char * cp_end,int32_t val)742 static int safe_itf8_put(char *cp, char *cp_end, int32_t val) {
743     return itf8_put(cp, val);
744 }
745 
safe_ltf8_put(char * cp,char * cp_end,int64_t val)746 static int safe_ltf8_put(char *cp, char *cp_end, int64_t val) {
747     return ltf8_put(cp, val);
748 }
749 
itf8_size(int64_t v)750 static int itf8_size(int64_t v) {
751     return ((!((v)&~0x7f))?1:(!((v)&~0x3fff))?2:(!((v)&~0x1fffff))?3:(!((v)&~0xfffffff))?4:5);
752 }
753 
754 //-----------------------------------------------------------------------------
755 
756 // CRAM v4.0 onwards uses a different variable sized integer encoding
757 // that is size agnostic.
758 
759 // Local interface to varint.h inline version, so we can use in func ptr.
760 // Note a lot of these use the unsigned interface but take signed int64_t.
761 // This is because the old CRAM ITF8 inteface had signed -1 as unsigned
762 // 0xffffffff.
uint7_size(int64_t v)763 static int uint7_size(int64_t v) {
764     return var_size_u64(v);
765 }
766 
uint7_get_32(char ** cp,const char * endp,int * err)767 static int64_t uint7_get_32(char **cp, const char *endp, int *err) {
768     uint32_t val;
769     int nb = var_get_u32((uint8_t *)(*cp), (const uint8_t *)endp, &val);
770     (*cp) += nb;
771     if (!nb && err) *err = 1;
772     return val;
773 }
774 
sint7_get_32(char ** cp,const char * endp,int * err)775 static int64_t sint7_get_32(char **cp, const char *endp, int *err) {
776     int32_t val;
777     int nb = var_get_s32((uint8_t *)(*cp), (const uint8_t *)endp, &val);
778     (*cp) += nb;
779     if (!nb && err) *err = 1;
780     return val;
781 }
782 
uint7_get_64(char ** cp,const char * endp,int * err)783 static int64_t uint7_get_64(char **cp, const char *endp, int *err) {
784     uint64_t val;
785     int nb = var_get_u64((uint8_t *)(*cp), (const uint8_t *)endp, &val);
786     (*cp) += nb;
787     if (!nb && err) *err = 1;
788     return val;
789 }
790 
sint7_get_64(char ** cp,const char * endp,int * err)791 static int64_t sint7_get_64(char **cp, const char *endp, int *err) {
792     int64_t val;
793     int nb = var_get_s64((uint8_t *)(*cp), (const uint8_t *)endp, &val);
794     (*cp) += nb;
795     if (!nb && err) *err = 1;
796     return val;
797 }
798 
uint7_put_32(char * cp,char * endp,int32_t val)799 static int uint7_put_32(char *cp, char *endp, int32_t val) {
800     return var_put_u32((uint8_t *)cp, (uint8_t *)endp, val);
801 }
802 
sint7_put_32(char * cp,char * endp,int32_t val)803 static int sint7_put_32(char *cp, char *endp, int32_t val) {
804     return var_put_s32((uint8_t *)cp, (uint8_t *)endp, val);
805 }
806 
uint7_put_64(char * cp,char * endp,int64_t val)807 static int uint7_put_64(char *cp, char *endp, int64_t val) {
808     return var_put_u64((uint8_t *)cp, (uint8_t *)endp, val);
809 }
810 
sint7_put_64(char * cp,char * endp,int64_t val)811 static int sint7_put_64(char *cp, char *endp, int64_t val) {
812     return var_put_s64((uint8_t *)cp, (uint8_t *)endp, val);
813 }
814 
815 // Put direct to to cram_block
uint7_put_blk_32(cram_block * blk,int32_t v)816 static int uint7_put_blk_32(cram_block *blk, int32_t v) {
817     uint8_t buf[10];
818     int sz = var_put_u32(buf, buf+10, v);
819     BLOCK_APPEND(blk, buf, sz);
820     return sz;
821 
822  block_err:
823     return -1;
824 }
825 
sint7_put_blk_32(cram_block * blk,int32_t v)826 static int sint7_put_blk_32(cram_block *blk, int32_t v) {
827     uint8_t buf[10];
828     int sz = var_put_s32(buf, buf+10, v);
829     BLOCK_APPEND(blk, buf, sz);
830     return sz;
831 
832  block_err:
833     return -1;
834 }
835 
uint7_put_blk_64(cram_block * blk,int64_t v)836 static int uint7_put_blk_64(cram_block *blk, int64_t v) {
837     uint8_t buf[10];
838     int sz = var_put_u64(buf, buf+10, v);
839     BLOCK_APPEND(blk, buf, sz);
840     return sz;
841 
842  block_err:
843     return -1;
844 }
845 
sint7_put_blk_64(cram_block * blk,int64_t v)846 static int sint7_put_blk_64(cram_block *blk, int64_t v) {
847     uint8_t buf[10];
848     int sz = var_put_s64(buf, buf+10, v);
849     BLOCK_APPEND(blk, buf, sz);
850     return sz;
851 
852  block_err:
853     return -1;
854 }
855 
856 // Decode 32-bits with CRC update from cram_fd
uint7_decode_crc32(cram_fd * fd,int32_t * val_p,uint32_t * crc)857 static int uint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
858     uint8_t b[5], i = 0;
859     int c;
860     uint32_t v = 0;
861 
862 #ifdef VARINT2
863     b[0] = hgetc(fd->fp);
864     if (b[0] < 177) {
865     } else if (b[0] < 241) {
866         b[1] = hgetc(fd->fp);
867     } else if (b[0] < 249) {
868         b[1] = hgetc(fd->fp);
869         b[2] = hgetc(fd->fp);
870     } else {
871         int n = b[0]+2, z = 1;
872         while (n-- >= 249)
873             b[z++] = hgetc(fd->fp);
874     }
875     i = var_get_u32(b, NULL, &v);
876 #else
877 //    // Little endian
878 //    int s = 0;
879 //    do {
880 //        b[i++] = c = hgetc(fd->fp);
881 //        if (c < 0)
882 //            return -1;
883 //        v |= (c & 0x7f) << s;
884 //      s += 7;
885 //    } while (i < 5 && (c & 0x80));
886 
887     // Big endian, see also htscodecs/varint.h
888     do {
889         b[i++] = c = hgetc(fd->fp);
890         if (c < 0)
891             return -1;
892         v = (v<<7) | (c & 0x7f);
893     } while (i < 5 && (c & 0x80));
894 #endif
895     *crc = crc32(*crc, b, i);
896 
897     *val_p = v;
898     return i;
899 }
900 
901 // Decode 32-bits with CRC update from cram_fd
sint7_decode_crc32(cram_fd * fd,int32_t * val_p,uint32_t * crc)902 static int sint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
903     uint8_t b[5], i = 0;
904     int c;
905     uint32_t v = 0;
906 
907 #ifdef VARINT2
908     b[0] = hgetc(fd->fp);
909     if (b[0] < 177) {
910     } else if (b[0] < 241) {
911         b[1] = hgetc(fd->fp);
912     } else if (b[0] < 249) {
913         b[1] = hgetc(fd->fp);
914         b[2] = hgetc(fd->fp);
915     } else {
916         int n = b[0]+2, z = 1;
917         while (n-- >= 249)
918             b[z++] = hgetc(fd->fp);
919     }
920     i = var_get_u32(b, NULL, &v);
921 #else
922 //    // Little endian
923 //    int s = 0;
924 //    do {
925 //        b[i++] = c = hgetc(fd->fp);
926 //        if (c < 0)
927 //            return -1;
928 //        v |= (c & 0x7f) << s;
929 //      s += 7;
930 //    } while (i < 5 && (c & 0x80));
931 
932     // Big endian, see also htscodecs/varint.h
933     do {
934         b[i++] = c = hgetc(fd->fp);
935         if (c < 0)
936             return -1;
937         v = (v<<7) | (c & 0x7f);
938     } while (i < 5 && (c & 0x80));
939 #endif
940     *crc = crc32(*crc, b, i);
941 
942     *val_p = (v>>1) ^ -(v&1);
943     return i;
944 }
945 
946 
947 // Decode 64-bits with CRC update from cram_fd
uint7_decode_crc64(cram_fd * fd,int64_t * val_p,uint32_t * crc)948 static int uint7_decode_crc64(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
949     uint8_t b[10], i = 0;
950     int c;
951     uint64_t v = 0;
952 
953 #ifdef VARINT2
954     b[0] = hgetc(fd->fp);
955     if (b[0] < 177) {
956     } else if (b[0] < 241) {
957         b[1] = hgetc(fd->fp);
958     } else if (b[0] < 249) {
959         b[1] = hgetc(fd->fp);
960         b[2] = hgetc(fd->fp);
961     } else {
962         int n = b[0]+2, z = 1;
963         while (n-- >= 249)
964             b[z++] = hgetc(fd->fp);
965     }
966     i = var_get_u64(b, NULL, &v);
967 #else
968 //    // Little endian
969 //    int s = 0;
970 //    do {
971 //        b[i++] = c = hgetc(fd->fp);
972 //        if (c < 0)
973 //            return -1;
974 //        v |= (c & 0x7f) << s;
975 //      s += 7;
976 //    } while (i < 10 && (c & 0x80));
977 
978     // Big endian, see also htscodecs/varint.h
979     do {
980         b[i++] = c = hgetc(fd->fp);
981         if (c < 0)
982             return -1;
983         v = (v<<7) | (c & 0x7f);
984     } while (i < 5 && (c & 0x80));
985 #endif
986     *crc = crc32(*crc, b, i);
987 
988     *val_p = v;
989     return i;
990 }
991 
992 //-----------------------------------------------------------------------------
993 
994 /*
995  * Decodes a 32-bit little endian value from fd and stores in val.
996  *
997  * Returns the number of bytes read on success
998  *         -1 on failure
999  */
int32_decode(cram_fd * fd,int32_t * val)1000 static int int32_decode(cram_fd *fd, int32_t *val) {
1001     int32_t i;
1002     if (4 != hread(fd->fp, &i, 4))
1003         return -1;
1004 
1005     *val = le_int4(i);
1006     return 4;
1007 }
1008 
1009 /*
1010  * Encodes a 32-bit little endian value 'val' and writes to fd.
1011  *
1012  * Returns the number of bytes written on success
1013  *         -1 on failure
1014  */
int32_encode(cram_fd * fd,int32_t val)1015 static int int32_encode(cram_fd *fd, int32_t val) {
1016     uint32_t v = le_int4(val);
1017     if (4 != hwrite(fd->fp, &v, 4))
1018         return -1;
1019 
1020     return 4;
1021 }
1022 
1023 /* As int32_decoded/encode, but from/to blocks instead of cram_fd */
int32_get_blk(cram_block * b,int32_t * val)1024 int int32_get_blk(cram_block *b, int32_t *val) {
1025     if (b->uncomp_size - BLOCK_SIZE(b) < 4)
1026         return -1;
1027 
1028     uint32_t v =
1029          ((uint32_t) b->data[b->byte  ])        |
1030         (((uint32_t) b->data[b->byte+1]) <<  8) |
1031         (((uint32_t) b->data[b->byte+2]) << 16) |
1032         (((uint32_t) b->data[b->byte+3]) << 24);
1033     // Avoid implementation-defined behaviour on negative values
1034     *val = v < 0x80000000U ? (int32_t) v : -((int32_t) (0xffffffffU - v)) - 1;
1035     BLOCK_SIZE(b) += 4;
1036     return 4;
1037 }
1038 
1039 /* As int32_decoded/encode, but from/to blocks instead of cram_fd */
int32_put_blk(cram_block * b,int32_t val)1040 int int32_put_blk(cram_block *b, int32_t val) {
1041     unsigned char cp[4];
1042     uint32_t v = val;
1043     cp[0] = ( v      & 0xff);
1044     cp[1] = ((v>>8)  & 0xff);
1045     cp[2] = ((v>>16) & 0xff);
1046     cp[3] = ((v>>24) & 0xff);
1047 
1048     BLOCK_APPEND(b, cp, 4);
1049     return 0;
1050 
1051  block_err:
1052     return -1;
1053 }
1054 
1055 #ifdef HAVE_LIBDEFLATE
1056 /* ----------------------------------------------------------------------
1057  * libdeflate compression code, with interface to match
1058  * zlib_mem_{in,de}flate for simplicity elsewhere.
1059  */
1060 
1061 // Named the same as the version that uses zlib as we always use libdeflate for
1062 // decompression when available.
zlib_mem_inflate(char * cdata,size_t csize,size_t * size)1063 char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
1064     struct libdeflate_decompressor *z = libdeflate_alloc_decompressor();
1065     if (!z) {
1066         hts_log_error("Call to libdeflate_alloc_decompressor failed");
1067         return NULL;
1068     }
1069 
1070     uint8_t *data = NULL, *new_data;
1071     if (!*size)
1072         *size = csize*2;
1073     for(;;) {
1074         new_data = realloc(data, *size);
1075         if (!new_data) {
1076             hts_log_error("Memory allocation failure");
1077             goto fail;
1078         }
1079         data = new_data;
1080 
1081         int ret = libdeflate_gzip_decompress(z, cdata, csize, data, *size, size);
1082 
1083         // Auto grow output buffer size if needed and try again.
1084         // Fortunately for all bar one call of this we know the size already.
1085         if (ret == LIBDEFLATE_INSUFFICIENT_SPACE) {
1086             (*size) *= 1.5;
1087             continue;
1088         }
1089 
1090         if (ret != LIBDEFLATE_SUCCESS) {
1091             hts_log_error("Inflate operation failed: %d", ret);
1092             goto fail;
1093         } else {
1094             break;
1095         }
1096     }
1097 
1098     libdeflate_free_decompressor(z);
1099     return (char *)data;
1100 
1101  fail:
1102     libdeflate_free_decompressor(z);
1103     free(data);
1104     return NULL;
1105 }
1106 
1107 // Named differently as we use both zlib/libdeflate for compression.
libdeflate_deflate(char * data,size_t size,size_t * cdata_size,int level,int strat)1108 static char *libdeflate_deflate(char *data, size_t size, size_t *cdata_size,
1109                                 int level, int strat) {
1110     level = level > 0 ? level : 6; // libdeflate doesn't honour -1 as default
1111     level *= 1.2; // NB levels go up to 12 here; 5 onwards is +1
1112     if (level >= 8) level += level/8; // 8->10, 9->12
1113     if (level > 12) level = 12;
1114 
1115     if (strat == Z_RLE) // not supported by libdeflate
1116         level = 1;
1117 
1118     struct libdeflate_compressor *z = libdeflate_alloc_compressor(level);
1119     if (!z) {
1120         hts_log_error("Call to libdeflate_alloc_compressor failed");
1121         return NULL;
1122     }
1123 
1124     unsigned char *cdata = NULL; /* Compressed output */
1125     size_t cdata_alloc;
1126     cdata = malloc(cdata_alloc = size*1.05+100);
1127     if (!cdata) {
1128         hts_log_error("Memory allocation failure");
1129         libdeflate_free_compressor(z);
1130         return NULL;
1131     }
1132 
1133     *cdata_size = libdeflate_gzip_compress(z, data, size, cdata, cdata_alloc);
1134     libdeflate_free_compressor(z);
1135 
1136     if (*cdata_size == 0) {
1137         hts_log_error("Call to libdeflate_gzip_compress failed");
1138         free(cdata);
1139         return NULL;
1140     }
1141 
1142     return (char *)cdata;
1143 }
1144 
1145 #else
1146 
1147 /* ----------------------------------------------------------------------
1148  * zlib compression code - from Gap5's tg_iface_g.c
1149  * They're static here as they're only used within the cram_compress_block
1150  * and cram_uncompress_block functions, which are the external interface.
1151  */
zlib_mem_inflate(char * cdata,size_t csize,size_t * size)1152 char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
1153     z_stream s;
1154     unsigned char *data = NULL; /* Uncompressed output */
1155     int data_alloc = 0;
1156     int err;
1157 
1158     /* Starting point at uncompressed size, and scale after that */
1159     data = malloc(data_alloc = csize*1.2+100);
1160     if (!data)
1161         return NULL;
1162 
1163     /* Initialise zlib stream */
1164     s.zalloc = Z_NULL; /* use default allocation functions */
1165     s.zfree  = Z_NULL;
1166     s.opaque = Z_NULL;
1167     s.next_in  = (unsigned char *)cdata;
1168     s.avail_in = csize;
1169     s.total_in = 0;
1170     s.next_out  = data;
1171     s.avail_out = data_alloc;
1172     s.total_out = 0;
1173 
1174     //err = inflateInit(&s);
1175     err = inflateInit2(&s, 15 + 32);
1176     if (err != Z_OK) {
1177         hts_log_error("Call to zlib inflateInit failed: %s", s.msg);
1178         free(data);
1179         return NULL;
1180     }
1181 
1182     /* Decode to 'data' array */
1183     for (;s.avail_in;) {
1184         unsigned char *data_tmp;
1185         int alloc_inc;
1186 
1187         s.next_out = &data[s.total_out];
1188         err = inflate(&s, Z_NO_FLUSH);
1189         if (err == Z_STREAM_END)
1190             break;
1191 
1192         if (err != Z_OK) {
1193             hts_log_error("Call to zlib inflate failed: %s", s.msg);
1194             free(data);
1195             inflateEnd(&s);
1196             return NULL;
1197         }
1198 
1199         /* More to come, so realloc based on growth so far */
1200         alloc_inc = (double)s.avail_in/s.total_in * s.total_out + 100;
1201         data = realloc((data_tmp = data), data_alloc += alloc_inc);
1202         if (!data) {
1203             free(data_tmp);
1204             inflateEnd(&s);
1205             return NULL;
1206         }
1207         s.avail_out += alloc_inc;
1208     }
1209     inflateEnd(&s);
1210 
1211     *size = s.total_out;
1212     return (char *)data;
1213 }
1214 #endif
1215 
zlib_mem_deflate(char * data,size_t size,size_t * cdata_size,int level,int strat)1216 static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size,
1217                               int level, int strat) {
1218     z_stream s;
1219     unsigned char *cdata = NULL; /* Compressed output */
1220     int cdata_alloc = 0;
1221     int cdata_pos = 0;
1222     int err;
1223 
1224     cdata = malloc(cdata_alloc = size*1.05+100);
1225     if (!cdata)
1226         return NULL;
1227     cdata_pos = 0;
1228 
1229     /* Initialise zlib stream */
1230     s.zalloc = Z_NULL; /* use default allocation functions */
1231     s.zfree  = Z_NULL;
1232     s.opaque = Z_NULL;
1233     s.next_in  = (unsigned char *)data;
1234     s.avail_in = size;
1235     s.total_in = 0;
1236     s.next_out  = cdata;
1237     s.avail_out = cdata_alloc;
1238     s.total_out = 0;
1239     s.data_type = Z_BINARY;
1240 
1241     err = deflateInit2(&s, level, Z_DEFLATED, 15|16, 9, strat);
1242     if (err != Z_OK) {
1243         hts_log_error("Call to zlib deflateInit2 failed: %s", s.msg);
1244         return NULL;
1245     }
1246 
1247     /* Encode to 'cdata' array */
1248     for (;s.avail_in;) {
1249         s.next_out = &cdata[cdata_pos];
1250         s.avail_out = cdata_alloc - cdata_pos;
1251         if (cdata_alloc - cdata_pos <= 0) {
1252             hts_log_error("Deflate produced larger output than expected");
1253             return NULL;
1254         }
1255         err = deflate(&s, Z_NO_FLUSH);
1256         cdata_pos = cdata_alloc - s.avail_out;
1257         if (err != Z_OK) {
1258             hts_log_error("Call to zlib deflate failed: %s", s.msg);
1259             break;
1260         }
1261     }
1262     if (deflate(&s, Z_FINISH) != Z_STREAM_END) {
1263         hts_log_error("Call to zlib deflate failed: %s", s.msg);
1264     }
1265     *cdata_size = s.total_out;
1266 
1267     if (deflateEnd(&s) != Z_OK) {
1268         hts_log_error("Call to zlib deflate failed: %s", s.msg);
1269     }
1270     return (char *)cdata;
1271 }
1272 
1273 #ifdef HAVE_LIBLZMA
1274 /* ------------------------------------------------------------------------ */
1275 /*
1276  * Data compression routines using liblzma (xz)
1277  *
1278  * On a test set this shrunk the main db from 136157104 bytes to 114796168, but
1279  * caused tg_index to grow from 2m43.707s to 15m3.961s. Exporting as bfastq
1280  * went from 18.3s to 36.3s. So decompression suffers too, but not as bad
1281  * as compression times.
1282  *
1283  * For now we disable this functionality. If it's to be reenabled make sure you
1284  * improve the mem_inflate implementation as it's just a test hack at the
1285  * moment.
1286  */
1287 
lzma_mem_deflate(char * data,size_t size,size_t * cdata_size,int level)1288 static char *lzma_mem_deflate(char *data, size_t size, size_t *cdata_size,
1289                               int level) {
1290     char *out;
1291     size_t out_size = lzma_stream_buffer_bound(size);
1292     *cdata_size = 0;
1293 
1294     out = malloc(out_size);
1295 
1296     /* Single call compression */
1297     if (LZMA_OK != lzma_easy_buffer_encode(level, LZMA_CHECK_CRC32, NULL,
1298                                            (uint8_t *)data, size,
1299                                            (uint8_t *)out, cdata_size,
1300                                            out_size))
1301         return NULL;
1302 
1303     return out;
1304 }
1305 
lzma_mem_inflate(char * cdata,size_t csize,size_t * size)1306 static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) {
1307     lzma_stream strm = LZMA_STREAM_INIT;
1308     size_t out_size = 0, out_pos = 0;
1309     char *out = NULL, *new_out;
1310     int r;
1311 
1312     /* Initiate the decoder */
1313     if (LZMA_OK != lzma_stream_decoder(&strm, lzma_easy_decoder_memusage(9), 0))
1314         return NULL;
1315 
1316     /* Decode loop */
1317     strm.avail_in = csize;
1318     strm.next_in = (uint8_t *)cdata;
1319 
1320     for (;strm.avail_in;) {
1321         if (strm.avail_in > out_size - out_pos) {
1322             out_size += strm.avail_in * 4 + 32768;
1323             new_out = realloc(out, out_size);
1324             if (!new_out)
1325                 goto fail;
1326             out = new_out;
1327         }
1328         strm.avail_out = out_size - out_pos;
1329         strm.next_out = (uint8_t *)&out[out_pos];
1330 
1331         r = lzma_code(&strm, LZMA_RUN);
1332         if (LZMA_OK != r && LZMA_STREAM_END != r) {
1333             hts_log_error("LZMA decode failure (error %d)", r);
1334             goto fail;
1335         }
1336 
1337         out_pos = strm.total_out;
1338 
1339         if (r == LZMA_STREAM_END)
1340             break;
1341     }
1342 
1343     /* finish up any unflushed data; necessary? */
1344     r = lzma_code(&strm, LZMA_FINISH);
1345     if (r != LZMA_OK && r != LZMA_STREAM_END) {
1346         hts_log_error("Call to lzma_code failed with error %d", r);
1347         goto fail;
1348     }
1349 
1350     new_out = realloc(out, strm.total_out > 0 ? strm.total_out : 1);
1351     if (new_out)
1352         out = new_out;
1353     *size = strm.total_out;
1354 
1355     lzma_end(&strm);
1356 
1357     return out;
1358 
1359  fail:
1360     lzma_end(&strm);
1361     free(out);
1362     return NULL;
1363 }
1364 #endif
1365 
1366 /* ----------------------------------------------------------------------
1367  * CRAM blocks - the dynamically growable data block. We have code to
1368  * create, update, (un)compress and read/write.
1369  *
1370  * These are derived from the deflate_interlaced.c blocks, but with the
1371  * CRAM extension of content types and IDs.
1372  */
1373 
1374 /*
1375  * Allocates a new cram_block structure with a specified content_type and
1376  * id.
1377  *
1378  * Returns block pointer on success
1379  *         NULL on failure
1380  */
cram_new_block(enum cram_content_type content_type,int content_id)1381 cram_block *cram_new_block(enum cram_content_type content_type,
1382                            int content_id) {
1383     cram_block *b = malloc(sizeof(*b));
1384     if (!b)
1385         return NULL;
1386     b->method = b->orig_method = RAW;
1387     b->content_type = content_type;
1388     b->content_id = content_id;
1389     b->comp_size = 0;
1390     b->uncomp_size = 0;
1391     b->data = NULL;
1392     b->alloc = 0;
1393     b->byte = 0;
1394     b->bit = 7; // MSB
1395     b->crc32 = 0;
1396     b->idx = 0;
1397     b->m = NULL;
1398 
1399     return b;
1400 }
1401 
1402 /*
1403  * Reads a block from a cram file.
1404  * Returns cram_block pointer on success.
1405  *         NULL on failure
1406  */
cram_read_block(cram_fd * fd)1407 cram_block *cram_read_block(cram_fd *fd) {
1408     cram_block *b = malloc(sizeof(*b));
1409     unsigned char c;
1410     uint32_t crc = 0;
1411     if (!b)
1412         return NULL;
1413 
1414     //fprintf(stderr, "Block at %d\n", (int)ftell(fd->fp));
1415 
1416     if (-1 == (b->method      = hgetc(fd->fp))) { free(b); return NULL; }
1417     c = b->method; crc = crc32(crc, &c, 1);
1418     if (-1 == (b->content_type= hgetc(fd->fp))) { free(b); return NULL; }
1419     c = b->content_type; crc = crc32(crc, &c, 1);
1420     if (-1 == fd->vv.varint_decode32_crc(fd, &b->content_id, &crc))  { free(b); return NULL; }
1421     if (-1 == fd->vv.varint_decode32_crc(fd, &b->comp_size, &crc))   { free(b); return NULL; }
1422     if (-1 == fd->vv.varint_decode32_crc(fd, &b->uncomp_size, &crc)) { free(b); return NULL; }
1423 
1424     //fprintf(stderr, "  method %d, ctype %d, cid %d, csize %d, ucsize %d\n",
1425     //      b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size);
1426 
1427     if (b->method == RAW) {
1428         if (b->uncomp_size < 0 || b->comp_size != b->uncomp_size) {
1429             free(b);
1430             return NULL;
1431         }
1432         b->alloc = b->uncomp_size;
1433         if (!(b->data = malloc(b->uncomp_size))){ free(b); return NULL; }
1434         if (b->uncomp_size != hread(fd->fp, b->data, b->uncomp_size)) {
1435             free(b->data);
1436             free(b);
1437             return NULL;
1438         }
1439     } else {
1440         if (b->comp_size < 0 || b->uncomp_size < 0) {
1441             free(b);
1442             return NULL;
1443         }
1444         b->alloc = b->comp_size;
1445         if (!(b->data = malloc(b->comp_size)))  { free(b); return NULL; }
1446         if (b->comp_size != hread(fd->fp, b->data, b->comp_size)) {
1447             free(b->data);
1448             free(b);
1449             return NULL;
1450         }
1451     }
1452 
1453     if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1454         if (-1 == int32_decode(fd, (int32_t *)&b->crc32)) {
1455             free(b->data);
1456             free(b);
1457             return NULL;
1458         }
1459 
1460         b->crc32_checked = fd->ignore_md5;
1461         b->crc_part = crc;
1462     } else {
1463         b->crc32_checked = 1; // CRC not present
1464     }
1465 
1466     b->orig_method = b->method;
1467     b->idx = 0;
1468     b->byte = 0;
1469     b->bit = 7; // MSB
1470 
1471     return b;
1472 }
1473 
1474 
1475 /*
1476  * Computes the size of a cram block, including the block
1477  * header itself.
1478  */
cram_block_size(cram_block * b)1479 uint32_t cram_block_size(cram_block *b) {
1480     unsigned char dat[100], *cp = dat;;
1481     uint32_t sz;
1482 
1483     *cp++ = b->method;
1484     *cp++ = b->content_type;
1485     cp += itf8_put((char*)cp, b->content_id);
1486     cp += itf8_put((char*)cp, b->comp_size);
1487     cp += itf8_put((char*)cp, b->uncomp_size);
1488 
1489     sz = cp-dat + 4;
1490     sz += b->method == RAW ? b->uncomp_size : b->comp_size;
1491 
1492     return sz;
1493 }
1494 
1495 /*
1496  * Writes a CRAM block.
1497  * Returns 0 on success
1498  *        -1 on failure
1499  */
cram_write_block(cram_fd * fd,cram_block * b)1500 int cram_write_block(cram_fd *fd, cram_block *b) {
1501     char vardata[100];
1502     int vardata_o = 0;
1503 
1504     assert(b->method != RAW || (b->comp_size == b->uncomp_size));
1505 
1506     if (hputc(b->method,       fd->fp)  == EOF) return -1;
1507     if (hputc(b->content_type, fd->fp)  == EOF) return -1;
1508     vardata_o += fd->vv.varint_put32(vardata          , vardata+100, b->content_id);
1509     vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->comp_size);
1510     vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->uncomp_size);
1511     if (vardata_o != hwrite(fd->fp, vardata, vardata_o))
1512         return -1;
1513 
1514     if (b->data) {
1515         if (b->method == RAW) {
1516             if (b->uncomp_size != hwrite(fd->fp, b->data, b->uncomp_size))
1517                 return -1;
1518         } else {
1519             if (b->comp_size != hwrite(fd->fp, b->data, b->comp_size))
1520                 return -1;
1521         }
1522     } else {
1523         // Absent blocks should be size 0
1524         assert(b->method == RAW && b->uncomp_size == 0);
1525     }
1526 
1527     if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1528         char dat[100], *cp = (char *)dat;
1529         uint32_t crc;
1530 
1531         *cp++ = b->method;
1532         *cp++ = b->content_type;
1533         cp += fd->vv.varint_put32(cp, dat+100, b->content_id);
1534         cp += fd->vv.varint_put32(cp, dat+100, b->comp_size);
1535         cp += fd->vv.varint_put32(cp, dat+100, b->uncomp_size);
1536         crc = crc32(0L, (uc *)dat, cp-dat);
1537 
1538         if (b->method == RAW) {
1539             b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->uncomp_size);
1540         } else {
1541             b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->comp_size);
1542         }
1543 
1544         if (-1 == int32_encode(fd, b->crc32))
1545             return -1;
1546     }
1547 
1548     return 0;
1549 }
1550 
1551 /*
1552  * Frees a CRAM block, deallocating internal data too.
1553  */
cram_free_block(cram_block * b)1554 void cram_free_block(cram_block *b) {
1555     if (!b)
1556         return;
1557     if (b->data)
1558         free(b->data);
1559     free(b);
1560 }
1561 
1562 /*
1563  * Uncompresses a CRAM block, if compressed.
1564  */
cram_uncompress_block(cram_block * b)1565 int cram_uncompress_block(cram_block *b) {
1566     char *uncomp;
1567     size_t uncomp_size = 0;
1568 
1569     if (b->crc32_checked == 0) {
1570         uint32_t crc = crc32(b->crc_part, b->data ? b->data : (uc *)"", b->alloc);
1571         b->crc32_checked = 1;
1572         if (crc != b->crc32) {
1573             hts_log_error("Block CRC32 failure");
1574             return -1;
1575         }
1576     }
1577 
1578     if (b->uncomp_size == 0) {
1579         // blank block
1580         b->method = RAW;
1581         return 0;
1582     }
1583     assert(b->uncomp_size >= 0); // cram_read_block should ensure this
1584 
1585     switch (b->method) {
1586     case RAW:
1587         return 0;
1588 
1589     case GZIP:
1590         uncomp_size = b->uncomp_size;
1591         uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
1592 
1593         if (!uncomp)
1594             return -1;
1595         if (uncomp_size != b->uncomp_size) {
1596             free(uncomp);
1597             return -1;
1598         }
1599         free(b->data);
1600         b->data = (unsigned char *)uncomp;
1601         b->alloc = uncomp_size;
1602         b->method = RAW;
1603         break;
1604 
1605 #ifdef HAVE_LIBBZ2
1606     case BZIP2: {
1607         unsigned int usize = b->uncomp_size;
1608         if (!(uncomp = malloc(usize)))
1609             return -1;
1610         if (BZ_OK != BZ2_bzBuffToBuffDecompress(uncomp, &usize,
1611                                                 (char *)b->data, b->comp_size,
1612                                                 0, 0)) {
1613             free(uncomp);
1614             return -1;
1615         }
1616         free(b->data);
1617         b->data = (unsigned char *)uncomp;
1618         b->alloc = usize;
1619         b->method = RAW;
1620         b->uncomp_size = usize; // Just in case it differs
1621         break;
1622     }
1623 #else
1624     case BZIP2:
1625         hts_log_error("Bzip2 compression is not compiled into this version. Please rebuild and try again");
1626         return -1;
1627 #endif
1628 
1629 #ifdef HAVE_LIBLZMA
1630     case LZMA:
1631         uncomp = lzma_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
1632         if (!uncomp)
1633             return -1;
1634         if (uncomp_size != b->uncomp_size) {
1635             free(uncomp);
1636             return -1;
1637         }
1638         free(b->data);
1639         b->data = (unsigned char *)uncomp;
1640         b->alloc = uncomp_size;
1641         b->method = RAW;
1642         break;
1643 #else
1644     case LZMA:
1645         hts_log_error("Lzma compression is not compiled into this version. Please rebuild and try again");
1646         return -1;
1647         break;
1648 #endif
1649 
1650     case RANS: {
1651         unsigned int usize = b->uncomp_size, usize2;
1652         uncomp = (char *)rans_uncompress(b->data, b->comp_size, &usize2);
1653         if (!uncomp)
1654             return -1;
1655         if (usize != usize2) {
1656             free(uncomp);
1657             return -1;
1658         }
1659         free(b->data);
1660         b->data = (unsigned char *)uncomp;
1661         b->alloc = usize2;
1662         b->method = RAW;
1663         b->uncomp_size = usize2; // Just in case it differs
1664         //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1665         break;
1666     }
1667 
1668     case FQZ: {
1669         uncomp_size = b->uncomp_size;
1670         uncomp = fqz_decompress((char *)b->data, b->comp_size, &uncomp_size, NULL, 0);
1671         if (!uncomp)
1672             return -1;
1673         free(b->data);
1674         b->data = (unsigned char *)uncomp;
1675         b->alloc = uncomp_size;
1676         b->method = RAW;
1677         b->uncomp_size = uncomp_size;
1678         break;
1679     }
1680 
1681     case RANS_PR0: {
1682         unsigned int usize = b->uncomp_size, usize2;
1683         uncomp = (char *)rans_uncompress_4x16(b->data, b->comp_size, &usize2);
1684         if (!uncomp)
1685             return -1;
1686         if (usize != usize2) {
1687             free(uncomp);
1688             return -1;
1689         }
1690         b->orig_method = RANS_PR0 + (b->data[0]&1)
1691             + 2*((b->data[0]&0x40)>0) + 4*((b->data[0]&0x80)>0);
1692         free(b->data);
1693         b->data = (unsigned char *)uncomp;
1694         b->alloc = usize2;
1695         b->method = RAW;
1696         b->uncomp_size = usize2; // Just incase it differs
1697         //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1698         break;
1699     }
1700 
1701     case ARITH_PR0: {
1702         unsigned int usize = b->uncomp_size, usize2;
1703         uncomp = (char *)arith_uncompress_to(b->data, b->comp_size, NULL, &usize2);
1704         if (!uncomp)
1705             return -1;
1706         if (usize != usize2) {
1707             free(uncomp);
1708             return -1;
1709         }
1710         b->orig_method = ARITH_PR0 + (b->data[0]&1)
1711             + 2*((b->data[0]&0x40)>0) + 4*((b->data[0]&0x80)>0);
1712         free(b->data);
1713         b->data = (unsigned char *)uncomp;
1714         b->alloc = usize2;
1715         b->method = RAW;
1716         b->uncomp_size = usize2; // Just incase it differs
1717         //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1718         break;
1719     }
1720 
1721     case TOK3: {
1722         uint32_t out_len;
1723         uint8_t *cp = decode_names(b->data, b->comp_size, &out_len);
1724         if (!cp)
1725             return -1;
1726         b->orig_method = TOK3;
1727         b->method = RAW;
1728         free(b->data);
1729         b->data = cp;
1730         b->alloc = out_len;
1731         b->uncomp_size = out_len;
1732         break;
1733     }
1734 
1735     default:
1736         return -1;
1737     }
1738 
1739     return 0;
1740 }
1741 
cram_compress_by_method(cram_slice * s,char * in,size_t in_size,int content_id,size_t * out_size,enum cram_block_method_int method,int level,int strat)1742 static char *cram_compress_by_method(cram_slice *s, char *in, size_t in_size,
1743                                      int content_id, size_t *out_size,
1744                                      enum cram_block_method_int method,
1745                                      int level, int strat) {
1746     switch (method) {
1747     case GZIP:
1748     case GZIP_RLE:
1749     case GZIP_1:
1750         // Read names bizarrely benefit from zlib over libdeflate for
1751         // mid-range compression levels.  Focusing purely of ratio or
1752         // speed, libdeflate still wins.  It also seems to win for
1753         // other data series too.
1754         //
1755         // Eg RN at level 5;  libdeflate=55.9MB  zlib=51.6MB
1756 #ifdef HAVE_LIBDEFLATE
1757         if (content_id == DS_RN && level >= 4 && level <= 7)
1758             return zlib_mem_deflate(in, in_size, out_size, level, strat);
1759         else
1760             return libdeflate_deflate(in, in_size, out_size, level, strat);
1761 #else
1762         return zlib_mem_deflate(in, in_size, out_size, level, strat);
1763 #endif
1764 
1765     case BZIP2: {
1766 #ifdef HAVE_LIBBZ2
1767         unsigned int comp_size = in_size*1.01 + 600;
1768         char *comp = malloc(comp_size);
1769         if (!comp)
1770             return NULL;
1771 
1772         if (BZ_OK != BZ2_bzBuffToBuffCompress(comp, &comp_size,
1773                                               in, in_size,
1774                                               level, 0, 30)) {
1775             free(comp);
1776             return NULL;
1777         }
1778         *out_size = comp_size;
1779         return comp;
1780 #else
1781         return NULL;
1782 #endif
1783     }
1784 
1785     case FQZ:
1786     case FQZ_b:
1787     case FQZ_c:
1788     case FQZ_d: {
1789         // Extract the necessary portion of the slice into an fqz_slice struct.
1790         // These previously were the same thing, but this permits us to detach
1791         // the codec from the rest of this CRAM implementation.
1792         fqz_slice *f = malloc(2*s->hdr->num_records * sizeof(uint32_t) + sizeof(fqz_slice));
1793         if (!f)
1794             return NULL;
1795         f->num_records = s->hdr->num_records;
1796         f->len = (uint32_t *)(((char *)f) + sizeof(fqz_slice));
1797         f->flags = f->len + s->hdr->num_records;
1798         int i;
1799         for (i = 0; i < s->hdr->num_records; i++) {
1800             f->flags[i] = s->crecs[i].flags;
1801             f->len[i] = (i+1 < s->hdr->num_records
1802                          ? s->crecs[i+1].qual - s->crecs[i].qual
1803                          : s->block[DS_QS]->uncomp_size - s->crecs[i].qual);
1804         }
1805         char *comp = fqz_compress(strat & 0xff /* cram vers */, f,
1806                                   in, in_size, out_size, strat >> 8, NULL);
1807         free(f);
1808         return comp;
1809     }
1810 
1811     case LZMA:
1812 #ifdef HAVE_LIBLZMA
1813         return lzma_mem_deflate(in, in_size, out_size, level);
1814 #else
1815         return NULL;
1816 #endif
1817 
1818     case RANS0:
1819     case RANS1: {
1820         unsigned int out_size_i;
1821         unsigned char *cp;
1822         cp = rans_compress((unsigned char *)in, in_size, &out_size_i,
1823                            method == RANS0 ? 0 : 1);
1824         *out_size = out_size_i;
1825         return (char *)cp;
1826     }
1827 
1828     case RANS_PR0:
1829     case RANS_PR1:
1830     case RANS_PR64:
1831     case RANS_PR9:
1832     case RANS_PR128:
1833     case RANS_PR129:
1834     case RANS_PR192:
1835     case RANS_PR193: {
1836         unsigned int out_size_i;
1837         unsigned char *cp;
1838 
1839         // see enum cram_block. We map RANS_* methods to order bit-fields
1840         static int methmap[] = { 1, 64,9, 128,129, 192,193 };
1841 
1842         cp = rans_compress_4x16((unsigned char *)in, in_size, &out_size_i,
1843                                 method == RANS_PR0 ? 0 : methmap[method - RANS_PR1]);
1844         *out_size = out_size_i;
1845         return (char *)cp;
1846     }
1847 
1848     case ARITH_PR0:
1849     case ARITH_PR1:
1850     case ARITH_PR64:
1851     case ARITH_PR9:
1852     case ARITH_PR128:
1853     case ARITH_PR129:
1854     case ARITH_PR192:
1855     case ARITH_PR193: {
1856         unsigned int out_size_i;
1857         unsigned char *cp;
1858 
1859         // see enum cram_block. We map ARITH_* methods to order bit-fields
1860         static int methmap[] = { 1, 64,9, 128,129, 192,193 };
1861 
1862         cp = arith_compress_to((unsigned char *)in, in_size, NULL, &out_size_i,
1863                                method == ARITH_PR0 ? 0 : methmap[method - ARITH_PR1]);
1864         *out_size = out_size_i;
1865         return (char *)cp;
1866     }
1867 
1868     case TOK3:
1869     case TOKA: {
1870         int out_len;
1871         int lev = level;
1872         if (method == TOK3 && lev > 3)
1873             lev = 3;
1874         uint8_t *cp = encode_names(in, in_size, lev, strat, &out_len, NULL);
1875         *out_size = out_len;
1876         return (char *)cp;
1877     }
1878 
1879     case RAW:
1880         break;
1881 
1882     default:
1883         return NULL;
1884     }
1885 
1886     return NULL;
1887 }
1888 
1889 
1890 /*
1891  * Compresses a block using one of two different zlib strategies. If we only
1892  * want one choice set strat2 to be -1.
1893  *
1894  * The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
1895  * or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
1896  * significantly faster.
1897  *
1898  * Method and level -1 implies defaults, as specified in cram_fd.
1899  */
cram_compress_block2(cram_fd * fd,cram_slice * s,cram_block * b,cram_metrics * metrics,int method,int level)1900 int cram_compress_block2(cram_fd *fd, cram_slice *s,
1901                          cram_block *b, cram_metrics *metrics,
1902                          int method, int level) {
1903 
1904     if (!b)
1905         return 0;
1906 
1907     char *comp = NULL;
1908     size_t comp_size = 0;
1909     int strat;
1910 
1911     // Internally we have parameterised methods that externally map
1912     // to the same CRAM method value.
1913     // See enum_cram_block_method_int in cram_structs.h.
1914     int methmap[] = {
1915         // Externally defined values
1916         RAW, GZIP, BZIP2, LZMA, RANS, RANSPR, ARITH, FQZ, TOK3,
1917 
1918         // Reserved for possible expansion
1919         0, 0,
1920 
1921         // Internally parameterised versions matching back to above
1922         // external values
1923         GZIP, GZIP,
1924         FQZ, FQZ, FQZ,
1925         RANS,
1926         RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR,
1927         TOK3,
1928         ARITH,  ARITH,  ARITH,  ARITH,  ARITH,  ARITH,  ARITH,
1929     };
1930 
1931     if (b->method != RAW) {
1932         // Maybe already compressed if s->block[0] was compressed and
1933         // we have e.g. s->block[DS_BA] set to s->block[0] due to only
1934         // one base type present and hence using E_HUFFMAN on block 0.
1935         // A second explicit attempt to compress the same block then
1936         // occurs.
1937         return 0;
1938     }
1939 
1940     if (method == -1) {
1941         method = 1<<GZIP;
1942         if (fd->use_bz2)
1943             method |= 1<<BZIP2;
1944         if (fd->use_lzma)
1945             method |= 1<<LZMA;
1946     }
1947 
1948     if (level == -1)
1949         level = fd->level;
1950 
1951     //fprintf(stderr, "IN: block %d, sz %d\n", b->content_id, b->uncomp_size);
1952 
1953     if (method == RAW || level == 0 || b->uncomp_size == 0) {
1954         b->method = RAW;
1955         b->comp_size = b->uncomp_size;
1956         //fprintf(stderr, "Skip block id %d\n", b->content_id);
1957         return 0;
1958     }
1959 
1960 #ifndef ABS
1961 #    define ABS(a) ((a)>=0?(a):-(a))
1962 #endif
1963 
1964     if (metrics) {
1965         pthread_mutex_lock(&fd->metrics_lock);
1966         // Sudden changes in size trigger a retrial.  These are mainly
1967         // triggered when switching to sorted / unsorted, where the number
1968         // of elements in a slice radically changes.
1969         //
1970         // We also get large fluctuations based on genome coordinate for
1971         // e.g. SA:Z and SC series, but we consider the typical scale of
1972         // delta between blocks and use this to look for abnormality.
1973         if (metrics->input_avg_sz &&
1974             (b->uncomp_size + 1000 > 4*(metrics->input_avg_sz+1000) ||
1975              b->uncomp_size + 1000 < (metrics->input_avg_sz+1000)/4) &&
1976             ABS(b->uncomp_size-metrics->input_avg_sz)
1977                 > 10*metrics->input_avg_delta) {
1978             metrics->next_trial = 0;
1979         }
1980 
1981         if (metrics->trial > 0 || --metrics->next_trial <= 0) {
1982             int m, unpackable = metrics->unpackable;
1983             size_t sz_best = b->uncomp_size;
1984             size_t sz[CRAM_MAX_METHOD] = {0};
1985             int method_best = 0; // RAW
1986             char *c_best = NULL, *c = NULL;
1987 
1988             metrics->input_avg_delta =
1989                 0.9 * (metrics->input_avg_delta +
1990                        ABS(b->uncomp_size - metrics->input_avg_sz));
1991 
1992             metrics->input_avg_sz += b->uncomp_size*.2;
1993             metrics->input_avg_sz *= 0.8;
1994 
1995             if (metrics->revised_method)
1996                 method = metrics->revised_method;
1997             else
1998                 metrics->revised_method = method;
1999 
2000             if (metrics->next_trial <= 0) {
2001                 metrics->next_trial = TRIAL_SPAN;
2002                 metrics->trial = NTRIALS;
2003                 for (m = 0; m < CRAM_MAX_METHOD; m++)
2004                     metrics->sz[m] /= 2;
2005                 metrics->unpackable = 0;
2006             }
2007 
2008             // Compress this block using the best method
2009             if (unpackable && CRAM_MAJOR_VERS(fd->version) > 3) {
2010                 // No point trying bit-pack if 17+ symbols.
2011                 if (method & (1<<RANS_PR128))
2012                     method = (method|(1<<RANS_PR0))&~(1<<RANS_PR128);
2013                 if (method & (1<<RANS_PR129))
2014                     method = (method|(1<<RANS_PR1))&~(1<<RANS_PR129);
2015                 if (method & (1<<RANS_PR192))
2016                     method = (method|(1<<RANS_PR64))&~(1<<RANS_PR192);
2017                 if (method & (1<<RANS_PR193))
2018                     method = (method|(1<<RANS_PR64)|(1<<RANS_PR1))&~(1<<RANS_PR193);
2019 
2020                 if (method & (1<<ARITH_PR128))
2021                     method = (method|(1<<ARITH_PR0))&~(1<<ARITH_PR128);
2022                 if (method & (1<<ARITH_PR129))
2023                     method = (method|(1<<ARITH_PR1))&~(1<<ARITH_PR129);
2024                 if (method & (1<<ARITH_PR192))
2025                     method = (method|(1<<ARITH_PR64))&~(1<<ARITH_PR192);
2026                 if (method & (1u<<ARITH_PR193))
2027                     method = (method|(1<<ARITH_PR64)|(1<<ARITH_PR1))&~(1u<<ARITH_PR193);
2028             }
2029 
2030             // Libdeflate doesn't have a Z_RLE strategy.
2031             // We treat it as level 1, but iff we haven't also
2032             // explicitly listed that in the method list.
2033 #ifdef HAVE_LIBDEFLATE
2034             if ((method & (1<<GZIP_RLE)) && (method & (1<<GZIP_1)))
2035                 method &= ~(1<<GZIP_RLE);
2036 #endif
2037 
2038             pthread_mutex_unlock(&fd->metrics_lock);
2039 
2040             for (m = 0; m < CRAM_MAX_METHOD; m++) {
2041                 if (method & (1u<<m)) {
2042                     int lvl = level;
2043                     switch (m) {
2044                     case GZIP:     strat = Z_FILTERED; break;
2045                     case GZIP_1:   strat = Z_DEFAULT_STRATEGY; lvl = 1; break;
2046                     case GZIP_RLE: strat = Z_RLE; break;
2047                     case FQZ:      strat = CRAM_MAJOR_VERS(fd->version); break;
2048                     case FQZ_b:    strat = CRAM_MAJOR_VERS(fd->version)+256; break;
2049                     case FQZ_c:    strat = CRAM_MAJOR_VERS(fd->version)+2*256; break;
2050                     case FQZ_d:    strat = CRAM_MAJOR_VERS(fd->version)+3*256; break;
2051                     case TOK3:     strat = 0; break;
2052                     case TOKA:     strat = 1; break;
2053                     default:       strat = 0;
2054                     }
2055 
2056                     c = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2057                                                 b->content_id, &sz[m], m, lvl, strat);
2058 
2059                     if (c && sz_best > sz[m]) {
2060                         sz_best = sz[m];
2061                         method_best = m;
2062                         if (c_best)
2063                             free(c_best);
2064                         c_best = c;
2065                     } else if (c) {
2066                         free(c);
2067                     } else {
2068                         sz[m] = b->uncomp_size*2+1000; // arbitrarily worse than raw
2069                     }
2070                 } else {
2071                     sz[m] = b->uncomp_size*2+1000; // arbitrarily worse than raw
2072                 }
2073             }
2074 
2075             if (c_best) {
2076                 free(b->data);
2077                 b->data = (unsigned char *)c_best;
2078                 b->method = method_best; // adjusted to methmap[method_best] later
2079                 b->comp_size = sz_best;
2080             }
2081 
2082             // Accumulate stats for all methods tried
2083             pthread_mutex_lock(&fd->metrics_lock);
2084             for (m = 0; m < CRAM_MAX_METHOD; m++)
2085                 // don't be overly sure on small blocks.
2086                 // +2000 means eg bzip2 vs gzip (1.07 to 1.04) or gz vs rans1
2087                 // needs to be at least 60 bytes smaller to overcome the
2088                 // fixed size addition.
2089                 metrics->sz[m] += sz[m]+2000;
2090 
2091             // When enough trials performed, find the best on average
2092             if (--metrics->trial == 0) {
2093                 int best_method = RAW;
2094                 int best_sz = INT_MAX;
2095 
2096                 // Relative costs of methods. See enum_cram_block_method_int
2097                 // and methmap
2098                 double meth_cost[32] = {
2099                     // Externally defined methods
2100                     1,    // 0  raw
2101                     1.04, // 1  gzip (Z_FILTERED)
2102                     1.07, // 2  bzip2
2103                     1.08, // 3  lzma
2104                     1.00, // 4  rans    (O0)
2105                     1.00, // 5  ranspr  (O0)
2106                     1.04, // 6  arithpr (O0)
2107                     1.05, // 7  fqz
2108                     1.05, // 8  tok3 (rans)
2109                     1.00, 1.00, // 9,10 reserved
2110 
2111                     // Paramterised versions of above
2112                     1.01, // gzip rle
2113                     1.01, // gzip -1
2114 
2115                     1.05, 1.05, 1.05, // FQZ_b,c,d
2116 
2117                     1.01, // rans O1
2118 
2119                     1.01, // rans_pr1
2120                     1.00, // rans_pr64; if smaller, usually fast
2121                     1.03, // rans_pr65/9
2122                     1.00, // rans_pr128
2123                     1.01, // rans_pr129
2124                     1.00, // rans_pr192
2125                     1.01, // rans_pr193
2126 
2127                     1.07, // tok3 arith
2128 
2129                     1.04, // arith_pr1
2130                     1.04, // arith_pr64
2131                     1.04, // arith_pr9
2132                     1.03, // arith_pr128
2133                     1.04, // arith_pr129
2134                     1.04, // arith_pr192
2135                     1.04, // arith_pr193
2136                 };
2137 
2138                 // Scale methods by cost based on compression level
2139                 if (fd->level <= 1) {
2140                     for (m = 0; m < CRAM_MAX_METHOD; m++)
2141                         metrics->sz[m] *= 1+(meth_cost[m]-1)*4;
2142                 } else if (fd->level <= 3) {
2143                     for (m = 0; m < CRAM_MAX_METHOD; m++)
2144                         metrics->sz[m] *= 1+(meth_cost[m]-1);
2145                 } else if (fd->level <= 6) {
2146                     for (m = 0; m < CRAM_MAX_METHOD; m++)
2147                         metrics->sz[m] *= 1+(meth_cost[m]-1)/2;
2148                 } else if (fd->level <= 7) {
2149                     for (m = 0; m < CRAM_MAX_METHOD; m++)
2150                         metrics->sz[m] *= 1+(meth_cost[m]-1)/3;
2151                 } // else cost is ignored
2152 
2153                 // Ensure these are never used; BSC and ZSTD
2154                 metrics->sz[9] = metrics->sz[10] = INT_MAX;
2155 
2156                 for (m = 0; m < CRAM_MAX_METHOD; m++) {
2157                     if ((!metrics->sz[m]) || (!(method & (1u<<m))))
2158                         continue;
2159 
2160                     if (best_sz > metrics->sz[m])
2161                         best_sz = metrics->sz[m], best_method = m;
2162                 }
2163 
2164                 if (best_method != metrics->method) {
2165                     //metrics->trial = (NTRIALS+1)/2; // be sure
2166                     //metrics->next_trial /= 1.5;
2167                     metrics->consistency = 0;
2168                 } else {
2169                     metrics->next_trial *= MIN(2, 1+metrics->consistency/4.0);
2170                     metrics->consistency++;
2171                 }
2172 
2173                 metrics->method = best_method;
2174                 switch (best_method) {
2175                 case GZIP:     strat = Z_FILTERED; break;
2176                 case GZIP_1:   strat = Z_DEFAULT_STRATEGY; break;
2177                 case GZIP_RLE: strat = Z_RLE; break;
2178                 case FQZ:      strat = CRAM_MAJOR_VERS(fd->version); break;
2179                 case FQZ_b:    strat = CRAM_MAJOR_VERS(fd->version)+256; break;
2180                 case FQZ_c:    strat = CRAM_MAJOR_VERS(fd->version)+2*256; break;
2181                 case FQZ_d:    strat = CRAM_MAJOR_VERS(fd->version)+3*256; break;
2182                 default:       strat = 0;
2183                 }
2184                 metrics->strat  = strat;
2185 
2186                 // If we see at least MAXFAIL trials in a row for a specific
2187                 // compression method with more than MAXDELTA aggregate
2188                 // size then we drop this from the list of methods used
2189                 // for this block type.
2190 #define MAXDELTA 0.20
2191 #define MAXFAILS 4
2192                 for (m = 0; m < CRAM_MAX_METHOD; m++) {
2193                     if (best_method == m) {
2194                         metrics->cnt[m] = 0;
2195                         metrics->extra[m] = 0;
2196                     } else if (best_sz < metrics->sz[m]) {
2197                         double r = (double)metrics->sz[m] / best_sz - 1;
2198                         int mul = 1+(fd->level>=7);
2199                         if (++metrics->cnt[m] >= MAXFAILS*mul &&
2200                             (metrics->extra[m] += r) >= MAXDELTA*mul)
2201                             method &= ~(1u<<m);
2202 
2203                         // Special case for fqzcomp as it rarely changes
2204                         if (m == FQZ || m == FQZ_b || m == FQZ_c || m == FQZ_d) {
2205                             if (metrics->sz[m] > best_sz)
2206                                 method &= ~(1u<<m);
2207                         }
2208                     }
2209                 }
2210 
2211                 //if (fd->verbose > 1 && method != metrics->revised_method)
2212                 //    fprintf(stderr, "%d: revising method from %x to %x\n",
2213                 //          b->content_id, metrics->revised_method, method);
2214                 metrics->revised_method = method;
2215             }
2216             pthread_mutex_unlock(&fd->metrics_lock);
2217         } else {
2218             metrics->input_avg_delta =
2219                 0.9 * (metrics->input_avg_delta +
2220                        ABS(b->uncomp_size - metrics->input_avg_sz));
2221 
2222             metrics->input_avg_sz += b->uncomp_size*.2;
2223             metrics->input_avg_sz *= 0.8;
2224 
2225             strat = metrics->strat;
2226             method = metrics->method;
2227 
2228             pthread_mutex_unlock(&fd->metrics_lock);
2229             comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2230                                            b->content_id, &comp_size, method,
2231                                            method == GZIP_1 ? 1 : level,
2232                                            strat);
2233             if (!comp)
2234                 return -1;
2235 
2236             if (comp_size < b->uncomp_size) {
2237                 free(b->data);
2238                 b->data = (unsigned char *)comp;
2239                 b->comp_size = comp_size;
2240                 b->method = method;
2241             } else {
2242                 free(comp);
2243             }
2244         }
2245 
2246     } else {
2247         // no cached metrics, so just do zlib?
2248         comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2249                                        b->content_id, &comp_size, GZIP, level, Z_FILTERED);
2250         if (!comp) {
2251             hts_log_error("Compression failed!");
2252             return -1;
2253         }
2254 
2255         if (comp_size < b->uncomp_size) {
2256             free(b->data);
2257             b->data = (unsigned char *)comp;
2258             b->comp_size = comp_size;
2259             b->method = GZIP;
2260         } else {
2261             free(comp);
2262         }
2263         strat = Z_FILTERED;
2264     }
2265 
2266     hts_log_info("Compressed block ID %d from %d to %d by method %s",
2267                  b->content_id, b->uncomp_size, b->comp_size,
2268                  cram_block_method2str(b->method));
2269 
2270     b->method = methmap[b->method];
2271 
2272     return 0;
2273 }
cram_compress_block(cram_fd * fd,cram_block * b,cram_metrics * metrics,int method,int level)2274 int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
2275                         int method, int level) {
2276     return cram_compress_block2(fd, NULL, b, metrics, method, level);
2277 }
2278 
cram_new_metrics(void)2279 cram_metrics *cram_new_metrics(void) {
2280     cram_metrics *m = calloc(1, sizeof(*m));
2281     if (!m)
2282         return NULL;
2283     m->trial = NTRIALS-1;
2284     m->next_trial = TRIAL_SPAN/2; // learn quicker at start
2285     m->method = RAW;
2286     m->strat = 0;
2287     m->revised_method = 0;
2288     m->unpackable = 0;
2289 
2290     return m;
2291 }
2292 
cram_block_method2str(enum cram_block_method_int m)2293 char *cram_block_method2str(enum cram_block_method_int m) {
2294     switch(m) {
2295     case RAW:         return "RAW";
2296     case GZIP:        return "GZIP";
2297     case BZIP2:       return "BZIP2";
2298     case LZMA:        return "LZMA";
2299     case RANS0:       return "RANS0";
2300     case RANS1:       return "RANS1";
2301     case GZIP_RLE:    return "GZIP_RLE";
2302     case GZIP_1:      return "GZIP_1";
2303     case FQZ:         return "FQZ";
2304     case FQZ_b:       return "FQZ_b";
2305     case FQZ_c:       return "FQZ_c";
2306     case FQZ_d:       return "FQZ_d";
2307     case RANS_PR0:    return "RANS_PR0";
2308     case RANS_PR1:    return "RANS_PR1";
2309     case RANS_PR64:   return "RANS_PR64";
2310     case RANS_PR9:    return "RANS_PR9";
2311     case RANS_PR128:  return "RANS_PR128";
2312     case RANS_PR129:  return "RANS_PR129";
2313     case RANS_PR192:  return "RANS_PR192";
2314     case RANS_PR193:  return "RANS_PR193";
2315     case TOK3:        return "TOK3_R";
2316     case TOKA:        return "TOK3_A";
2317     case ARITH_PR0:   return "ARITH_PR0";
2318     case ARITH_PR1:   return "ARITH_PR1";
2319     case ARITH_PR64:  return "ARITH_PR64";
2320     case ARITH_PR9:   return "ARITH_PR9";
2321     case ARITH_PR128: return "ARITH_PR128";
2322     case ARITH_PR129: return "ARITH_PR129";
2323     case ARITH_PR192: return "ARITH_PR192";
2324     case ARITH_PR193: return "ARITH_PR193";
2325     case BM_ERROR: break;
2326     }
2327     return "?";
2328 }
2329 
cram_content_type2str(enum cram_content_type t)2330 char *cram_content_type2str(enum cram_content_type t) {
2331     switch (t) {
2332     case FILE_HEADER:         return "FILE_HEADER";
2333     case COMPRESSION_HEADER:  return "COMPRESSION_HEADER";
2334     case MAPPED_SLICE:        return "MAPPED_SLICE";
2335     case UNMAPPED_SLICE:      return "UNMAPPED_SLICE";
2336     case EXTERNAL:            return "EXTERNAL";
2337     case CORE:                return "CORE";
2338     case CT_ERROR:            break;
2339     }
2340     return "?";
2341 }
2342 
2343 /* ----------------------------------------------------------------------
2344  * Reference sequence handling
2345  *
2346  * These revolve around the refs_t structure, which may potentially be
2347  * shared between multiple cram_fd.
2348  *
2349  * We start with refs_create() to allocate an empty refs_t and then
2350  * populate it with @SQ line data using refs_from_header(). This is done on
2351  * cram_open().  Also at start up we can call cram_load_reference() which
2352  * is used with "scramble -r foo.fa". This replaces the fd->refs with the
2353  * new one specified. In either case refs2id() is then called which
2354  * maps ref_entry names to @SQ ids (refs_t->ref_id[]).
2355  *
2356  * Later, possibly within a thread, we will want to know the actual ref
2357  * seq itself, obtained by calling cram_get_ref().  This may use the
2358  * UR: or M5: fields or the filename specified in the original
2359  * cram_load_reference() call.
2360  *
2361  * Given the potential for multi-threaded reference usage, we have
2362  * reference counting (sorry for the confusing double use of "ref") to
2363  * track the number of callers interested in any specific reference.
2364  */
2365 
2366 /*
2367  * Frees/unmaps a reference sequence and associated file handles.
2368  */
ref_entry_free_seq(ref_entry * e)2369 static void ref_entry_free_seq(ref_entry *e) {
2370     if (e->mf)
2371         mfclose(e->mf);
2372     if (e->seq && !e->mf)
2373         free(e->seq);
2374 
2375     e->seq = NULL;
2376     e->mf = NULL;
2377 }
2378 
refs_free(refs_t * r)2379 void refs_free(refs_t *r) {
2380     RP("refs_free()\n");
2381 
2382     if (--r->count > 0)
2383         return;
2384 
2385     if (!r)
2386         return;
2387 
2388     if (r->pool)
2389         string_pool_destroy(r->pool);
2390 
2391     if (r->h_meta) {
2392         khint_t k;
2393 
2394         for (k = kh_begin(r->h_meta); k != kh_end(r->h_meta); k++) {
2395             ref_entry *e;
2396 
2397             if (!kh_exist(r->h_meta, k))
2398                 continue;
2399             if (!(e = kh_val(r->h_meta, k)))
2400                 continue;
2401             ref_entry_free_seq(e);
2402             free(e);
2403         }
2404 
2405         kh_destroy(refs, r->h_meta);
2406     }
2407 
2408     if (r->ref_id)
2409         free(r->ref_id);
2410 
2411     if (r->fp)
2412         bgzf_close(r->fp);
2413 
2414     pthread_mutex_destroy(&r->lock);
2415 
2416     free(r);
2417 }
2418 
refs_create(void)2419 static refs_t *refs_create(void) {
2420     refs_t *r = calloc(1, sizeof(*r));
2421 
2422     RP("refs_create()\n");
2423 
2424     if (!r)
2425         return NULL;
2426 
2427     if (!(r->pool = string_pool_create(8192)))
2428         goto err;
2429 
2430     r->ref_id = NULL; // see refs2id() to populate.
2431     r->count = 1;
2432     r->last = NULL;
2433     r->last_id = -1;
2434 
2435     if (!(r->h_meta = kh_init(refs)))
2436         goto err;
2437 
2438     pthread_mutex_init(&r->lock, NULL);
2439 
2440     return r;
2441 
2442  err:
2443     refs_free(r);
2444     return NULL;
2445 }
2446 
2447 /*
2448  * Opens a reference fasta file as a BGZF stream, allowing for
2449  * compressed files.  It automatically builds a .fai file if
2450  * required and if compressed a .gzi bgzf index too.
2451  *
2452  * Returns a BGZF handle on success;
2453  *         NULL on failure.
2454  */
bgzf_open_ref(char * fn,char * mode,int is_md5)2455 static BGZF *bgzf_open_ref(char *fn, char *mode, int is_md5) {
2456     BGZF *fp;
2457 
2458     if (!is_md5 && !hisremote(fn)) {
2459         char fai_file[PATH_MAX];
2460 
2461         snprintf(fai_file, PATH_MAX, "%s.fai", fn);
2462         if (access(fai_file, R_OK) != 0)
2463             if (fai_build(fn) != 0)
2464                 return NULL;
2465     }
2466 
2467     if (!(fp = bgzf_open(fn, mode))) {
2468         perror(fn);
2469         return NULL;
2470     }
2471 
2472     if (fp->is_compressed == 1 && bgzf_index_load(fp, fn, ".gzi") < 0) {
2473         hts_log_error("Unable to load .gzi index '%s.gzi'", fn);
2474         bgzf_close(fp);
2475         return NULL;
2476     }
2477 
2478     return fp;
2479 }
2480 
2481 /*
2482  * Loads a FAI file for a reference.fasta.
2483  * "is_err" indicates whether failure to load is worthy of emitting an
2484  * error message. In some cases (eg with embedded references) we
2485  * speculatively load, just in case, and silently ignore errors.
2486  *
2487  * Returns the refs_t struct on success (maybe newly allocated);
2488  *         NULL on failure
2489  */
refs_load_fai(refs_t * r_orig,const char * fn,int is_err)2490 static refs_t *refs_load_fai(refs_t *r_orig, const char *fn, int is_err) {
2491     hFILE *fp = NULL;
2492     char fai_fn[PATH_MAX];
2493     char line[8192];
2494     refs_t *r = r_orig;
2495     size_t fn_l = strlen(fn);
2496     int id = 0, id_alloc = 0;
2497 
2498     RP("refs_load_fai %s\n", fn);
2499 
2500     if (!r)
2501         if (!(r = refs_create()))
2502             goto err;
2503 
2504     if (r->fp)
2505         if (bgzf_close(r->fp) != 0)
2506             goto err;
2507     r->fp = NULL;
2508 
2509     /* Look for a FASTA##idx##FAI format */
2510     char *fn_delim = strstr(fn, HTS_IDX_DELIM);
2511     if (fn_delim) {
2512         if (!(r->fn = string_ndup(r->pool, fn, fn_delim - fn)))
2513             goto err;
2514         fn_delim += strlen(HTS_IDX_DELIM);
2515         snprintf(fai_fn, PATH_MAX, "%s", fn_delim);
2516     } else {
2517         /* An index file was provided, instead of the actual reference file */
2518         if (fn_l > 4 && strcmp(&fn[fn_l-4], ".fai") == 0) {
2519             if (!r->fn) {
2520                 if (!(r->fn = string_ndup(r->pool, fn, fn_l-4)))
2521                     goto err;
2522             }
2523             snprintf(fai_fn, PATH_MAX, "%s", fn);
2524         } else {
2525         /* Only the reference file provided. Get the index file name from it */
2526             if (!(r->fn = string_dup(r->pool, fn)))
2527                 goto err;
2528             sprintf(fai_fn, "%.*s.fai", PATH_MAX-5, fn);
2529         }
2530     }
2531 
2532     if (!(r->fp = bgzf_open_ref(r->fn, "r", 0))) {
2533         hts_log_error("Failed to open reference file '%s'", r->fn);
2534         goto err;
2535     }
2536 
2537     if (!(fp = hopen(fai_fn, "r"))) {
2538         hts_log_error("Failed to open index file '%s'", fai_fn);
2539         if (is_err)
2540             perror(fai_fn);
2541         goto err;
2542     }
2543     while (hgets(line, 8192, fp) != NULL) {
2544         ref_entry *e = malloc(sizeof(*e));
2545         char *cp;
2546         int n;
2547         khint_t k;
2548 
2549         if (!e)
2550             return NULL;
2551 
2552         // id
2553         for (cp = line; *cp && !isspace_c(*cp); cp++)
2554             ;
2555         *cp++ = 0;
2556         e->name = string_dup(r->pool, line);
2557 
2558         // length
2559         while (*cp && isspace_c(*cp))
2560             cp++;
2561         e->length = strtoll(cp, &cp, 10);
2562 
2563         // offset
2564         while (*cp && isspace_c(*cp))
2565             cp++;
2566         e->offset = strtoll(cp, &cp, 10);
2567 
2568         // bases per line
2569         while (*cp && isspace_c(*cp))
2570             cp++;
2571         e->bases_per_line = strtol(cp, &cp, 10);
2572 
2573         // line length
2574         while (*cp && isspace_c(*cp))
2575             cp++;
2576         e->line_length = strtol(cp, &cp, 10);
2577 
2578         // filename
2579         e->fn = r->fn;
2580 
2581         e->count = 0;
2582         e->seq = NULL;
2583         e->mf = NULL;
2584         e->is_md5 = 0;
2585 
2586         k = kh_put(refs, r->h_meta, e->name, &n);
2587         if (-1 == n)  {
2588             free(e);
2589             return NULL;
2590         }
2591 
2592         if (n) {
2593             kh_val(r->h_meta, k) = e;
2594         } else {
2595             ref_entry *re = kh_val(r->h_meta, k);
2596             if (re && (re->count != 0 || re->length != 0)) {
2597                 /* Keep old */
2598                 free(e);
2599             } else {
2600                 /* Replace old */
2601                 if (re)
2602                     free(re);
2603                 kh_val(r->h_meta, k) = e;
2604             }
2605         }
2606 
2607         if (id >= id_alloc) {
2608             ref_entry **new_refs;
2609             int x;
2610 
2611             id_alloc = id_alloc ?id_alloc*2 : 16;
2612             new_refs = realloc(r->ref_id, id_alloc * sizeof(*r->ref_id));
2613             if (!new_refs)
2614                 goto err;
2615             r->ref_id = new_refs;
2616 
2617             for (x = id; x < id_alloc; x++)
2618                 r->ref_id[x] = NULL;
2619         }
2620         r->ref_id[id] = e;
2621         r->nref = ++id;
2622     }
2623 
2624     if(hclose(fp) < 0)
2625         goto err;
2626     return r;
2627 
2628  err:
2629     if (fp)
2630         hclose_abruptly(fp);
2631 
2632     if (!r_orig)
2633         refs_free(r);
2634 
2635     return NULL;
2636 }
2637 
2638 /*
2639  * Verifies that the CRAM @SQ lines and .fai files match.
2640  */
sanitise_SQ_lines(cram_fd * fd)2641 static void sanitise_SQ_lines(cram_fd *fd) {
2642     int i;
2643 
2644     if (!fd->header || !fd->header->hrecs)
2645         return;
2646 
2647     if (!fd->refs || !fd->refs->h_meta)
2648         return;
2649 
2650     for (i = 0; i < fd->header->hrecs->nref; i++) {
2651         const char *name = fd->header->hrecs->ref[i].name;
2652         khint_t k = kh_get(refs, fd->refs->h_meta, name);
2653         ref_entry *r;
2654 
2655         // We may have @SQ lines which have no known .fai, but do not
2656         // in themselves pose a problem because they are unused in the file.
2657         if (k == kh_end(fd->refs->h_meta))
2658             continue;
2659 
2660         if (!(r = (ref_entry *)kh_val(fd->refs->h_meta, k)))
2661             continue;
2662 
2663         if (r->length && r->length != fd->header->hrecs->ref[i].len) {
2664             assert(strcmp(r->name, fd->header->hrecs->ref[i].name) == 0);
2665 
2666             // Should we also check MD5sums here to ensure the correct
2667             // reference was given?
2668             hts_log_warning("Header @SQ length mismatch for ref %s, %"PRIhts_pos" vs %d",
2669                             r->name, fd->header->hrecs->ref[i].len, (int)r->length);
2670 
2671             // Fixing the parsed @SQ header will make MD:Z: strings work
2672             // and also stop it producing N for the sequence.
2673             fd->header->hrecs->ref[i].len = r->length;
2674         }
2675     }
2676 }
2677 
2678 /*
2679  * Indexes references by the order they appear in a BAM file. This may not
2680  * necessarily be the same order they appear in the fasta reference file.
2681  *
2682  * Returns 0 on success
2683  *        -1 on failure
2684  */
refs2id(refs_t * r,sam_hdr_t * hdr)2685 int refs2id(refs_t *r, sam_hdr_t *hdr) {
2686     int i;
2687     sam_hrecs_t *h = hdr->hrecs;
2688 
2689     if (r->ref_id)
2690         free(r->ref_id);
2691     if (r->last)
2692         r->last = NULL;
2693 
2694     r->ref_id = calloc(h->nref, sizeof(*r->ref_id));
2695     if (!r->ref_id)
2696         return -1;
2697 
2698     r->nref = h->nref;
2699     for (i = 0; i < h->nref; i++) {
2700         khint_t k = kh_get(refs, r->h_meta, h->ref[i].name);
2701         if (k != kh_end(r->h_meta)) {
2702             r->ref_id[i] = kh_val(r->h_meta, k);
2703         } else {
2704             hts_log_warning("Unable to find ref name '%s'", h->ref[i].name);
2705         }
2706     }
2707 
2708     return 0;
2709 }
2710 
2711 /*
2712  * Generates refs_t entries based on @SQ lines in the header.
2713  * Returns 0 on success
2714  *         -1 on failure
2715  */
refs_from_header(cram_fd * fd)2716 static int refs_from_header(cram_fd *fd) {
2717     if (!fd)
2718         return -1;
2719 
2720     refs_t *r = fd->refs;
2721     if (!r)
2722         return -1;
2723 
2724     sam_hdr_t *h = fd->header;
2725     if (!h)
2726         return 0;
2727 
2728     if (!h->hrecs) {
2729         if (-1 == sam_hdr_fill_hrecs(h))
2730             return -1;
2731     }
2732 
2733     if (h->hrecs->nref == 0)
2734         return 0;
2735 
2736     //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode);
2737 
2738     /* Existing refs are fine, as long as they're compatible with the hdr. */
2739     ref_entry **new_ref_id = realloc(r->ref_id, (r->nref + h->hrecs->nref) * sizeof(*r->ref_id));
2740     if (!new_ref_id)
2741         return -1;
2742     r->ref_id = new_ref_id;
2743 
2744     int i, j;
2745     /* Copy info from h->ref[i] over to r */
2746     for (i = 0, j = r->nref; i < h->hrecs->nref; i++) {
2747         sam_hrec_type_t *ty;
2748         sam_hrec_tag_t *tag;
2749         khint_t k;
2750         int n;
2751 
2752         k = kh_get(refs, r->h_meta, h->hrecs->ref[i].name);
2753         if (k != kh_end(r->h_meta))
2754             // Ref already known about
2755             continue;
2756 
2757         if (!(r->ref_id[j] = calloc(1, sizeof(ref_entry))))
2758             return -1;
2759 
2760         if (!h->hrecs->ref[i].name)
2761             return -1;
2762 
2763         r->ref_id[j]->name = string_dup(r->pool, h->hrecs->ref[i].name);
2764         if (!r->ref_id[j]->name) return -1;
2765         r->ref_id[j]->length = 0; // marker for not yet loaded
2766 
2767         /* Initialise likely filename if known */
2768         if ((ty = sam_hrecs_find_type_id(h->hrecs, "SQ", "SN", h->hrecs->ref[i].name))) {
2769             if ((tag = sam_hrecs_find_key(ty, "M5", NULL))) {
2770                 r->ref_id[j]->fn = string_dup(r->pool, tag->str+3);
2771                 //fprintf(stderr, "Tagging @SQ %s / %s\n", r->ref_id[h]->name, r->ref_id[h]->fn);
2772             }
2773         }
2774 
2775         k = kh_put(refs, r->h_meta, r->ref_id[j]->name, &n);
2776         if (n <= 0) // already exists or error
2777             return -1;
2778         kh_val(r->h_meta, k) = r->ref_id[j];
2779 
2780         j++;
2781     }
2782     r->nref = j;
2783 
2784     return 0;
2785 }
2786 
2787 /*
2788  * Attaches a header to a cram_fd.
2789  *
2790  * This should be used when creating a new cram_fd for writing where
2791  * we have a header already constructed (eg from a file we've read
2792  * in).
2793  */
cram_set_header2(cram_fd * fd,const sam_hdr_t * hdr)2794 int cram_set_header2(cram_fd *fd, const sam_hdr_t *hdr) {
2795     if (!fd || !hdr )
2796         return -1;
2797 
2798     if (fd->header != hdr) {
2799         if (fd->header)
2800             sam_hdr_destroy(fd->header);
2801         fd->header = sam_hdr_dup(hdr);
2802         if (!fd->header)
2803             return -1;
2804     }
2805     return refs_from_header(fd);
2806 }
2807 
cram_set_header(cram_fd * fd,sam_hdr_t * hdr)2808 int cram_set_header(cram_fd *fd, sam_hdr_t *hdr) {
2809     return cram_set_header2(fd, hdr);
2810 }
2811 
2812 /*
2813  * Returns whether the path refers to a directory.
2814  */
is_directory(char * fn)2815 static int is_directory(char *fn) {
2816     struct stat buf;
2817     if ( stat(fn,&buf) ) return 0;
2818     return S_ISDIR(buf.st_mode);
2819 }
2820 
2821 /*
2822  * Converts a directory and a filename into an expanded path, replacing %s
2823  * in directory with the filename and %[0-9]+s with portions of the filename
2824  * Any remaining parts of filename are added to the end with /%s.
2825  */
expand_cache_path(char * path,char * dir,const char * fn)2826 static int expand_cache_path(char *path, char *dir, const char *fn) {
2827     char *cp, *start = path;
2828     size_t len;
2829     size_t sz = PATH_MAX;
2830 
2831     while ((cp = strchr(dir, '%'))) {
2832         if (cp-dir >= sz) return -1;
2833         strncpy(path, dir, cp-dir);
2834         path += cp-dir;
2835         sz -= cp-dir;
2836 
2837         if (*++cp == 's') {
2838             len = strlen(fn);
2839             if (len >= sz) return -1;
2840             strcpy(path, fn);
2841             path += len;
2842             sz -= len;
2843             fn += len;
2844             cp++;
2845         } else if (*cp >= '0' && *cp <= '9') {
2846             char *endp;
2847             long l;
2848 
2849             l = strtol(cp, &endp, 10);
2850             l = MIN(l, strlen(fn));
2851             if (*endp == 's') {
2852                 if (l >= sz) return -1;
2853                 strncpy(path, fn, l);
2854                 path += l;
2855                 fn += l;
2856                 sz -= l;
2857                 *path = 0;
2858                 cp = endp+1;
2859             } else {
2860                 if (sz < 3) return -1;
2861                 *path++ = '%';
2862                 *path++ = *cp++;
2863             }
2864         } else {
2865             if (sz < 3) return -1;
2866             *path++ = '%';
2867             *path++ = *cp++;
2868         }
2869         dir = cp;
2870     }
2871 
2872     len = strlen(dir);
2873     if (len >= sz) return -1;
2874     strcpy(path, dir);
2875     path += len;
2876     sz -= len;
2877 
2878     len = strlen(fn) + ((*fn && path > start && path[-1] != '/') ? 1 : 0);
2879     if (len >= sz) return -1;
2880     if (*fn && path > start && path[-1] != '/')
2881         *path++ = '/';
2882     strcpy(path, fn);
2883     return 0;
2884 }
2885 
2886 /*
2887  * Make the directory containing path and any prefix directories.
2888  */
mkdir_prefix(char * path,int mode)2889 static void mkdir_prefix(char *path, int mode) {
2890     char *cp = strrchr(path, '/');
2891     if (!cp)
2892         return;
2893 
2894     *cp = 0;
2895     if (is_directory(path)) {
2896         *cp = '/';
2897         return;
2898     }
2899 
2900     if (mkdir(path, mode) == 0) {
2901         chmod(path, mode);
2902         *cp = '/';
2903         return;
2904     }
2905 
2906     mkdir_prefix(path, mode);
2907     mkdir(path, mode);
2908     chmod(path, mode);
2909     *cp = '/';
2910 }
2911 
2912 /*
2913  * Return the cache directory to use, based on the first of these
2914  * environment variables to be set to a non-empty value.
2915  */
get_cache_basedir(const char ** extra)2916 static const char *get_cache_basedir(const char **extra) {
2917     char *base;
2918 
2919     *extra = "";
2920 
2921     base = getenv("XDG_CACHE_HOME");
2922     if (base && *base) return base;
2923 
2924     base = getenv("HOME");
2925     if (base && *base) { *extra = "/.cache"; return base; }
2926 
2927     base = getenv("TMPDIR");
2928     if (base && *base) return base;
2929 
2930     base = getenv("TEMP");
2931     if (base && *base) return base;
2932 
2933     return "/tmp";
2934 }
2935 
2936 /*
2937  * Queries the M5 string from the header and attempts to populate the
2938  * reference from this using the REF_PATH environment.
2939  *
2940  * Returns 0 on success
2941  *        -1 on failure
2942  */
cram_populate_ref(cram_fd * fd,int id,ref_entry * r)2943 static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
2944     char *ref_path = getenv("REF_PATH");
2945     sam_hrec_type_t *ty;
2946     sam_hrec_tag_t *tag;
2947     char path[PATH_MAX];
2948     kstring_t path_tmp = KS_INITIALIZE;
2949     char cache[PATH_MAX], cache_root[PATH_MAX];
2950     char *local_cache = getenv("REF_CACHE");
2951     mFILE *mf;
2952     int local_path = 0;
2953 
2954     hts_log_info("Running cram_populate_ref on fd %p, id %d", (void *)fd, id);
2955 
2956     cache_root[0] = '\0';
2957 
2958     if (!ref_path || *ref_path == '\0') {
2959         /*
2960          * If we have no ref path, we use the EBI server.
2961          * However to avoid spamming it we require a local ref cache too.
2962          */
2963         ref_path = "https://www.ebi.ac.uk/ena/cram/md5/%s";
2964         if (!local_cache || *local_cache == '\0') {
2965             const char *extra;
2966             const char *base = get_cache_basedir(&extra);
2967             snprintf(cache_root, PATH_MAX, "%s%s/hts-ref", base, extra);
2968             snprintf(cache,PATH_MAX, "%s%s/hts-ref/%%2s/%%2s/%%s", base, extra);
2969             local_cache = cache;
2970             hts_log_info("Populating local cache: %s", local_cache);
2971         }
2972     }
2973 
2974     if (!r->name)
2975         return -1;
2976 
2977     if (!(ty = sam_hrecs_find_type_id(fd->header->hrecs, "SQ", "SN", r->name)))
2978         return -1;
2979 
2980     if (!(tag = sam_hrecs_find_key(ty, "M5", NULL)))
2981         goto no_M5;
2982 
2983     hts_log_info("Querying ref %s", tag->str+3);
2984 
2985     /* Use cache if available */
2986     if (local_cache && *local_cache) {
2987         if (expand_cache_path(path, local_cache, tag->str+3) == 0)
2988             local_path = 1;
2989     }
2990 
2991 #ifndef HAVE_MMAP
2992     char *path2;
2993     /* Search local files in REF_PATH; we can open them and return as above */
2994     if (!local_path && (path2 = find_path(tag->str+3, ref_path))) {
2995         int len = snprintf(path, PATH_MAX, "%s", path2);
2996         free(path2);
2997         if (len > 0 && len < PATH_MAX) // in case it's too long
2998             local_path = 1;
2999     }
3000 #endif
3001 
3002     /* Found via REF_CACHE or local REF_PATH file */
3003     if (local_path) {
3004         struct stat sb;
3005         BGZF *fp;
3006 
3007         if (0 == stat(path, &sb)
3008             && S_ISREG(sb.st_mode)
3009             && (fp = bgzf_open(path, "r"))) {
3010             r->length = sb.st_size;
3011             r->offset = r->line_length = r->bases_per_line = 0;
3012 
3013             r->fn = string_dup(fd->refs->pool, path);
3014 
3015             if (fd->refs->fp)
3016                 if (bgzf_close(fd->refs->fp) != 0)
3017                     return -1;
3018             fd->refs->fp = fp;
3019             fd->refs->fn = r->fn;
3020             r->is_md5 = 1;
3021 
3022             // Fall back to cram_get_ref() where it'll do the actual
3023             // reading of the file.
3024             return 0;
3025         }
3026     }
3027 
3028 
3029     /* Otherwise search full REF_PATH; slower as loads entire file */
3030     if ((mf = open_path_mfile(tag->str+3, ref_path, NULL))) {
3031         size_t sz;
3032         r->seq = mfsteal(mf, &sz);
3033         if (r->seq) {
3034             r->mf = NULL;
3035         } else {
3036             // keep mf around as we couldn't detach
3037             r->seq = mf->data;
3038             r->mf = mf;
3039         }
3040         r->length = sz;
3041         r->is_md5 = 1;
3042     } else {
3043         refs_t *refs;
3044         const char *fn;
3045 
3046     no_M5:
3047         /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */
3048         if (!(tag = sam_hrecs_find_key(ty, "UR", NULL)))
3049             return -1;
3050 
3051         fn = (strncmp(tag->str+3, "file:", 5) == 0)
3052             ? tag->str+8
3053             : tag->str+3;
3054 
3055         if (fd->refs->fp) {
3056             if (bgzf_close(fd->refs->fp) != 0)
3057                 return -1;
3058             fd->refs->fp = NULL;
3059         }
3060         if (!(refs = refs_load_fai(fd->refs, fn, 0)))
3061             return -1;
3062         sanitise_SQ_lines(fd);
3063 
3064         fd->refs = refs;
3065         if (fd->refs->fp) {
3066             if (bgzf_close(fd->refs->fp) != 0)
3067                 return -1;
3068             fd->refs->fp = NULL;
3069         }
3070 
3071         if (!fd->refs->fn)
3072             return -1;
3073 
3074         if (-1 == refs2id(fd->refs, fd->header))
3075             return -1;
3076         if (!fd->refs->ref_id || !fd->refs->ref_id[id])
3077             return -1;
3078 
3079         // Local copy already, so fall back to cram_get_ref().
3080         return 0;
3081     }
3082 
3083     /* Populate the local disk cache if required */
3084     if (local_cache && *local_cache) {
3085         hFILE *fp;
3086 
3087         if (*cache_root && !is_directory(cache_root)) {
3088             hts_log_warning("Creating reference cache directory %s\n"
3089                             "This may become large; see the samtools(1) manual page REF_CACHE discussion",
3090                             cache_root);
3091         }
3092 
3093         if (expand_cache_path(path, local_cache, tag->str+3) < 0) {
3094             return 0; // Not fatal - we have the data already so keep going.
3095         }
3096         hts_log_info("Writing cache file '%s'", path);
3097         mkdir_prefix(path, 01777);
3098 
3099         fp = hts_open_tmpfile(path, "wx", &path_tmp);
3100         if (!fp) {
3101             perror(path_tmp.s);
3102             free(path_tmp.s);
3103 
3104             // Not fatal - we have the data already so keep going.
3105             return 0;
3106         }
3107 
3108         // Check md5sum
3109         hts_md5_context *md5;
3110         char unsigned md5_buf1[16];
3111         char md5_buf2[33];
3112 
3113         if (!(md5 = hts_md5_init())) {
3114             hclose_abruptly(fp);
3115             unlink(path_tmp.s);
3116             free(path_tmp.s);
3117             return -1;
3118         }
3119         hts_md5_update(md5, r->seq, r->length);
3120         hts_md5_final(md5_buf1, md5);
3121         hts_md5_destroy(md5);
3122         hts_md5_hex(md5_buf2, md5_buf1);
3123 
3124         if (strncmp(tag->str+3, md5_buf2, 32) != 0) {
3125             hts_log_error("Mismatching md5sum for downloaded reference");
3126             hclose_abruptly(fp);
3127             unlink(path_tmp.s);
3128             free(path_tmp.s);
3129             return -1;
3130         }
3131 
3132         ssize_t length_written = hwrite(fp, r->seq, r->length);
3133         if (hclose(fp) < 0 || length_written != r->length ||
3134             chmod(path_tmp.s, 0444) < 0 ||
3135             rename(path_tmp.s, path) < 0) {
3136             hts_log_error("Creating reference at %s failed: %s",
3137                           path, strerror(errno));
3138             unlink(path_tmp.s);
3139         }
3140     }
3141 
3142     free(path_tmp.s);
3143     return 0;
3144 }
3145 
cram_ref_incr_locked(refs_t * r,int id)3146 static void cram_ref_incr_locked(refs_t *r, int id) {
3147     RP("%d INC REF %d, %d %p\n", gettid(), id,
3148        (int)(id>=0 && r->ref_id[id]?r->ref_id[id]->count+1:-999),
3149        id>=0 && r->ref_id[id]?r->ref_id[id]->seq:(char *)1);
3150 
3151     if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq)
3152         return;
3153 
3154     if (r->last_id == id)
3155         r->last_id = -1;
3156 
3157     ++r->ref_id[id]->count;
3158 }
3159 
cram_ref_incr(refs_t * r,int id)3160 void cram_ref_incr(refs_t *r, int id) {
3161     pthread_mutex_lock(&r->lock);
3162     cram_ref_incr_locked(r, id);
3163     pthread_mutex_unlock(&r->lock);
3164 }
3165 
cram_ref_decr_locked(refs_t * r,int id)3166 static void cram_ref_decr_locked(refs_t *r, int id) {
3167     RP("%d DEC REF %d, %d %p\n", gettid(), id,
3168        (int)(id>=0 && r->ref_id[id]?r->ref_id[id]->count-1:-999),
3169        id>=0 && r->ref_id[id]?r->ref_id[id]->seq:(char *)1);
3170 
3171     if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq) {
3172         return;
3173     }
3174 
3175     if (--r->ref_id[id]->count <= 0) {
3176         assert(r->ref_id[id]->count == 0);
3177         if (r->last_id >= 0) {
3178             if (r->ref_id[r->last_id]->count <= 0 &&
3179                 r->ref_id[r->last_id]->seq) {
3180                 RP("%d FREE REF %d (%p)\n", gettid(),
3181                    r->last_id, r->ref_id[r->last_id]->seq);
3182                 ref_entry_free_seq(r->ref_id[r->last_id]);
3183                 if (r->ref_id[r->last_id]->is_md5) r->ref_id[r->last_id]->length = 0;
3184             }
3185         }
3186         r->last_id = id;
3187     }
3188 }
3189 
cram_ref_decr(refs_t * r,int id)3190 void cram_ref_decr(refs_t *r, int id) {
3191     pthread_mutex_lock(&r->lock);
3192     cram_ref_decr_locked(r, id);
3193     pthread_mutex_unlock(&r->lock);
3194 }
3195 
3196 /*
3197  * Used by cram_ref_load and cram_ref_get. The file handle will have
3198  * already been opened, so we can catch it. The ref_entry *e informs us
3199  * of whether this is a multi-line fasta file or a raw MD5 style file.
3200  * Either way we create a single contiguous sequence.
3201  *
3202  * Returns all or part of a reference sequence on success (malloced);
3203  *         NULL on failure.
3204  */
load_ref_portion(BGZF * fp,ref_entry * e,int start,int end)3205 static char *load_ref_portion(BGZF *fp, ref_entry *e, int start, int end) {
3206     off_t offset, len;
3207     char *seq;
3208 
3209     if (end < start)
3210         end = start;
3211 
3212     /*
3213      * Compute locations in file. This is trivial for the MD5 files, but
3214      * is still necessary for the fasta variants.
3215      */
3216     offset = e->line_length
3217         ? e->offset + (start-1)/e->bases_per_line * e->line_length +
3218           (start-1) % e->bases_per_line
3219         : start-1;
3220 
3221     len = (e->line_length
3222            ? e->offset + (end-1)/e->bases_per_line * e->line_length +
3223              (end-1) % e->bases_per_line
3224            : end-1) - offset + 1;
3225 
3226     if (bgzf_useek(fp, offset, SEEK_SET) < 0) {
3227         perror("bgzf_useek() on reference file");
3228         return NULL;
3229     }
3230 
3231     if (len == 0 || !(seq = malloc(len))) {
3232         return NULL;
3233     }
3234 
3235     if (len != bgzf_read(fp, seq, len)) {
3236         perror("bgzf_read() on reference file");
3237         free(seq);
3238         return NULL;
3239     }
3240 
3241     /* Strip white-space if required. */
3242     if (len != end-start+1) {
3243         int i, j;
3244         char *cp = seq;
3245         char *cp_to;
3246 
3247         for (i = j = 0; i < len; i++) {
3248             if (cp[i] >= '!' && cp[i] <= '~')
3249                 cp[j++] = toupper_c(cp[i]);
3250         }
3251         cp_to = cp+j;
3252 
3253         if (cp_to - seq != end-start+1) {
3254             hts_log_error("Malformed reference file");
3255             free(seq);
3256             return NULL;
3257         }
3258     } else {
3259         int i;
3260         for (i = 0; i < len; i++) {
3261             seq[i] = toupper_c(seq[i]);
3262         }
3263     }
3264 
3265     return seq;
3266 }
3267 
3268 /*
3269  * Load the entire reference 'id'.
3270  * This also increments the reference count by 1.
3271  *
3272  * Returns ref_entry on success;
3273  *         NULL on failure
3274  */
cram_ref_load(refs_t * r,int id,int is_md5)3275 ref_entry *cram_ref_load(refs_t *r, int id, int is_md5) {
3276     ref_entry *e = r->ref_id[id];
3277     int start = 1, end = e->length;
3278     char *seq;
3279 
3280     if (e->seq) {
3281         return e;
3282     }
3283 
3284     assert(e->count == 0);
3285 
3286     if (r->last) {
3287 #ifdef REF_DEBUG
3288         int idx = 0;
3289         for (idx = 0; idx < r->nref; idx++)
3290             if (r->last == r->ref_id[idx])
3291                 break;
3292         RP("%d cram_ref_load DECR %d\n", gettid(), idx);
3293 #endif
3294         assert(r->last->count > 0);
3295         if (--r->last->count <= 0) {
3296             RP("%d FREE REF %d (%p)\n", gettid(), id, r->ref_id[id]->seq);
3297             if (r->last->seq)
3298                 ref_entry_free_seq(r->last);
3299         }
3300     }
3301 
3302     if (!r->fn)
3303         return NULL;
3304 
3305     /* Open file if it's not already the current open reference */
3306     if (strcmp(r->fn, e->fn) || r->fp == NULL) {
3307         if (r->fp)
3308             if (bgzf_close(r->fp) != 0)
3309                 return NULL;
3310         r->fn = e->fn;
3311         if (!(r->fp = bgzf_open_ref(r->fn, "r", is_md5)))
3312             return NULL;
3313     }
3314 
3315     RP("%d Loading ref %d (%d..%d)\n", gettid(), id, start, end);
3316 
3317     if (!(seq = load_ref_portion(r->fp, e, start, end))) {
3318         return NULL;
3319     }
3320 
3321     RP("%d Loaded ref %d (%d..%d) = %p\n", gettid(), id, start, end, seq);
3322 
3323     RP("%d INC REF %d, %"PRId64"\n", gettid(), id, (e->count+1));
3324     e->seq = seq;
3325     e->mf = NULL;
3326     e->count++;
3327 
3328     /*
3329      * Also keep track of last used ref so incr/decr loops on the same
3330      * sequence don't cause load/free loops.
3331      */
3332     RP("%d cram_ref_load INCR %d => %"PRId64"\n", gettid(), id, e->count+1);
3333     r->last = e;
3334     e->count++;
3335 
3336     return e;
3337 }
3338 
3339 /*
3340  * Returns a portion of a reference sequence from start to end inclusive.
3341  * The returned pointer is owned by either the cram_file fd or by the
3342  * internal refs_t structure and should not be freed  by the caller.
3343  *
3344  * The difference is whether or not this refs_t is in use by just the one
3345  * cram_fd or by multiples, or whether we have multiple threads accessing
3346  * references. In either case fd->shared will be true and we start using
3347  * reference counting to track the number of users of a specific reference
3348  * sequence.
3349  *
3350  * Otherwise the ref seq returned is allocated as part of cram_fd itself
3351  * and will be freed up on the next call to cram_get_ref or cram_close.
3352  *
3353  * To return the entire reference sequence, specify start as 1 and end
3354  * as 0.
3355  *
3356  * To cease using a reference, call cram_ref_decr().
3357  *
3358  * Returns reference on success,
3359  *         NULL on failure
3360  */
cram_get_ref(cram_fd * fd,int id,int start,int end)3361 char *cram_get_ref(cram_fd *fd, int id, int start, int end) {
3362     ref_entry *r;
3363     char *seq;
3364     int ostart = start;
3365 
3366     if (id == -1 || start < 1)
3367         return NULL;
3368 
3369     /* FIXME: axiomatic query of r->seq being true?
3370      * Or shortcut for unsorted data where we load once and never free?
3371      */
3372 
3373     //fd->shared_ref = 1; // hard code for now to simplify things
3374 
3375     pthread_mutex_lock(&fd->ref_lock);
3376 
3377     RP("%d cram_get_ref on fd %p, id %d, range %d..%d\n", gettid(), fd, id, start, end);
3378 
3379     /*
3380      * Unsorted data implies we want to fetch an entire reference at a time.
3381      * We just deal with this at the moment by claiming we're sharing
3382      * references instead, which has the same requirement.
3383      */
3384     if (fd->unsorted)
3385         fd->shared_ref = 1;
3386 
3387 
3388     /* Sanity checking: does this ID exist? */
3389     if (id >= fd->refs->nref) {
3390         hts_log_error("No reference found for id %d", id);
3391         pthread_mutex_unlock(&fd->ref_lock);
3392         return NULL;
3393     }
3394 
3395     if (!fd->refs || !fd->refs->ref_id[id]) {
3396         hts_log_error("No reference found for id %d", id);
3397         pthread_mutex_unlock(&fd->ref_lock);
3398         return NULL;
3399     }
3400 
3401     if (!(r = fd->refs->ref_id[id])) {
3402         hts_log_error("No reference found for id %d", id);
3403         pthread_mutex_unlock(&fd->ref_lock);
3404         return NULL;
3405     }
3406 
3407 
3408     /*
3409      * It has an entry, but may not have been populated yet.
3410      * Any manually loaded .fai files have their lengths known.
3411      * A ref entry computed from @SQ lines (M5 or UR field) will have
3412      * r->length == 0 unless it's been loaded once and verified that we have
3413      * an on-disk filename for it.
3414      *
3415      * 19 Sep 2013: Moved the lock here as the cram_populate_ref code calls
3416      * open_path_mfile and libcurl, which isn't multi-thread safe unless I
3417      * rewrite my code to have one curl handle per thread.
3418      */
3419     pthread_mutex_lock(&fd->refs->lock);
3420     if (r->length == 0) {
3421         if (cram_populate_ref(fd, id, r) == -1) {
3422             hts_log_error("Failed to populate reference for id %d", id);
3423             pthread_mutex_unlock(&fd->refs->lock);
3424             pthread_mutex_unlock(&fd->ref_lock);
3425             return NULL;
3426         }
3427         r = fd->refs->ref_id[id];
3428         if (fd->unsorted)
3429             cram_ref_incr_locked(fd->refs, id);
3430     }
3431 
3432 
3433     /*
3434      * We now know that we the filename containing the reference, so check
3435      * for limits. If it's over half the reference we'll load all of it in
3436      * memory as this will speed up subsequent calls.
3437      */
3438     if (end < 1)
3439         end = r->length;
3440     if (end >= r->length)
3441         end  = r->length;
3442 
3443     if (end - start >= 0.5*r->length || fd->shared_ref) {
3444         start = 1;
3445         end = r->length;
3446     }
3447 
3448     /*
3449      * Maybe we have it cached already? If so use it.
3450      *
3451      * Alternatively if we don't have the sequence but we're sharing
3452      * references and/or are asking for the entire length of it, then
3453      * load the full reference into the refs structure and return
3454      * a pointer to that one instead.
3455      */
3456     if (fd->shared_ref || r->seq || (start == 1 && end == r->length)) {
3457         char *cp;
3458 
3459         if (id >= 0) {
3460             if (r->seq) {
3461                 cram_ref_incr_locked(fd->refs, id);
3462             } else {
3463                 ref_entry *e;
3464                 if (!(e = cram_ref_load(fd->refs, id, r->is_md5))) {
3465                     pthread_mutex_unlock(&fd->refs->lock);
3466                     pthread_mutex_unlock(&fd->ref_lock);
3467                     return NULL;
3468                 }
3469 
3470                 /* unsorted data implies cache ref indefinitely, to avoid
3471                  * continually loading and unloading.
3472                  */
3473                 if (fd->unsorted)
3474                     cram_ref_incr_locked(fd->refs, id);
3475             }
3476 
3477             fd->ref = NULL; /* We never access it directly */
3478             fd->ref_start = 1;
3479             fd->ref_end   = r->length;
3480             fd->ref_id    = id;
3481 
3482             cp = fd->refs->ref_id[id]->seq + ostart-1;
3483         } else {
3484             fd->ref = NULL;
3485             cp = NULL;
3486         }
3487 
3488         RP("%d cram_get_ref returning for id %d, count %d\n", gettid(), id, (int)r->count);
3489 
3490         pthread_mutex_unlock(&fd->refs->lock);
3491         pthread_mutex_unlock(&fd->ref_lock);
3492         return cp;
3493     }
3494 
3495     /*
3496      * Otherwise we're not sharing, we don't have a copy of it already and
3497      * we're only asking for a small portion of it.
3498      *
3499      * In this case load up just that segment ourselves, freeing any old
3500      * small segments in the process.
3501      */
3502 
3503     /* Unmapped ref ID */
3504     if (id < 0 || !fd->refs->fn) {
3505         if (fd->ref_free) {
3506             free(fd->ref_free);
3507             fd->ref_free = NULL;
3508         }
3509         fd->ref = NULL;
3510         fd->ref_id = id;
3511         pthread_mutex_unlock(&fd->refs->lock);
3512         pthread_mutex_unlock(&fd->ref_lock);
3513         return NULL;
3514     }
3515 
3516     /* Open file if it's not already the current open reference */
3517     if (strcmp(fd->refs->fn, r->fn) || fd->refs->fp == NULL) {
3518         if (fd->refs->fp)
3519             if (bgzf_close(fd->refs->fp) != 0)
3520                 return NULL;
3521         fd->refs->fn = r->fn;
3522         if (!(fd->refs->fp = bgzf_open_ref(fd->refs->fn, "r", r->is_md5))) {
3523             pthread_mutex_unlock(&fd->refs->lock);
3524             pthread_mutex_unlock(&fd->ref_lock);
3525             return NULL;
3526         }
3527     }
3528 
3529     if (!(fd->ref = load_ref_portion(fd->refs->fp, r, start, end))) {
3530         pthread_mutex_unlock(&fd->refs->lock);
3531         pthread_mutex_unlock(&fd->ref_lock);
3532         return NULL;
3533     }
3534 
3535     if (fd->ref_free)
3536         free(fd->ref_free);
3537 
3538     fd->ref_id    = id;
3539     fd->ref_start = start;
3540     fd->ref_end   = end;
3541     fd->ref_free = fd->ref;
3542     seq = fd->ref;
3543 
3544     pthread_mutex_unlock(&fd->refs->lock);
3545     pthread_mutex_unlock(&fd->ref_lock);
3546 
3547     return seq ? seq + ostart - start : NULL;
3548 }
3549 
3550 /*
3551  * If fd has been opened for reading, it may be permitted to specify 'fn'
3552  * as NULL and let the code auto-detect the reference by parsing the
3553  * SAM header @SQ lines.
3554  */
cram_load_reference(cram_fd * fd,char * fn)3555 int cram_load_reference(cram_fd *fd, char *fn) {
3556     int ret = 0;
3557 
3558     if (fn) {
3559         fd->refs = refs_load_fai(fd->refs, fn,
3560                                  !(fd->embed_ref && fd->mode == 'r'));
3561         fn = fd->refs ? fd->refs->fn : NULL;
3562         if (!fn)
3563             ret = -1;
3564         sanitise_SQ_lines(fd);
3565     }
3566     fd->ref_fn = fn;
3567 
3568     if ((!fd->refs || (fd->refs->nref == 0 && !fn)) && fd->header) {
3569         if (fd->refs)
3570             refs_free(fd->refs);
3571         if (!(fd->refs = refs_create()))
3572             return -1;
3573         if (-1 == refs_from_header(fd))
3574             return -1;
3575     }
3576 
3577     if (fd->header)
3578         if (-1 == refs2id(fd->refs, fd->header))
3579             return -1;
3580 
3581     return ret;
3582 }
3583 
3584 /* ----------------------------------------------------------------------
3585  * Containers
3586  */
3587 
3588 /*
3589  * Creates a new container, specifying the maximum number of slices
3590  * and records permitted.
3591  *
3592  * Returns cram_container ptr on success
3593  *         NULL on failure
3594  */
cram_new_container(int nrec,int nslice)3595 cram_container *cram_new_container(int nrec, int nslice) {
3596     cram_container *c = calloc(1, sizeof(*c));
3597     enum cram_DS_ID id;
3598 
3599     if (!c)
3600         return NULL;
3601 
3602     c->curr_ref = -2;
3603 
3604     c->max_c_rec = nrec * nslice;
3605     c->curr_c_rec = 0;
3606 
3607     c->max_rec = nrec;
3608     c->record_counter = 0;
3609     c->num_bases = 0;
3610     c->s_num_bases = 0;
3611 
3612     c->max_slice = nslice;
3613     c->curr_slice = 0;
3614 
3615     c->pos_sorted = 1;
3616     c->max_apos   = 0;
3617     c->multi_seq  = 0;
3618     c->qs_seq_orient = 1;
3619 
3620     c->bams = NULL;
3621 
3622     if (!(c->slices = calloc(nslice != 0 ? nslice : 1, sizeof(cram_slice *))))
3623         goto err;
3624     c->slice = NULL;
3625 
3626     if (!(c->comp_hdr = cram_new_compression_header()))
3627         goto err;
3628     c->comp_hdr_block = NULL;
3629 
3630     for (id = DS_RN; id < DS_TN; id++)
3631         if (!(c->stats[id] = cram_stats_create())) goto err;
3632 
3633     //c->aux_B_stats = cram_stats_create();
3634 
3635     if (!(c->tags_used = kh_init(m_tagmap)))
3636         goto err;
3637     c->refs_used = 0;
3638 
3639     return c;
3640 
3641  err:
3642     if (c) {
3643         if (c->slices)
3644             free(c->slices);
3645         free(c);
3646     }
3647     return NULL;
3648 }
3649 
cram_free_container(cram_container * c)3650 void cram_free_container(cram_container *c) {
3651     enum cram_DS_ID id;
3652     int i;
3653 
3654     if (!c)
3655         return;
3656 
3657     if (c->refs_used)
3658         free(c->refs_used);
3659 
3660     if (c->landmark)
3661         free(c->landmark);
3662 
3663     if (c->comp_hdr)
3664         cram_free_compression_header(c->comp_hdr);
3665 
3666     if (c->comp_hdr_block)
3667         cram_free_block(c->comp_hdr_block);
3668 
3669     // Free the slices; filled out by encoder only
3670     if (c->slices) {
3671         for (i = 0; i < c->max_slice; i++) {
3672             if (c->slices[i])
3673                 cram_free_slice(c->slices[i]);
3674             if (c->slices[i] == c->slice)
3675                 c->slice = NULL;
3676         }
3677         free(c->slices);
3678     }
3679 
3680     // Free the current slice; set by both encoder & decoder
3681     if (c->slice) {
3682         cram_free_slice(c->slice);
3683         c->slice = NULL;
3684     }
3685 
3686     for (id = DS_RN; id < DS_TN; id++)
3687         if (c->stats[id]) cram_stats_free(c->stats[id]);
3688 
3689     //if (c->aux_B_stats) cram_stats_free(c->aux_B_stats);
3690 
3691     if (c->tags_used) {
3692         khint_t k;
3693 
3694         for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
3695             if (!kh_exist(c->tags_used, k))
3696                 continue;
3697 
3698             cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k);
3699             if (tm) {
3700                 cram_codec *c = tm->codec;
3701 
3702                 if (c) c->free(c);
3703                 free(tm);
3704             }
3705         }
3706 
3707         kh_destroy(m_tagmap, c->tags_used);
3708     }
3709 
3710     free(c);
3711 }
3712 
3713 /*
3714  * Reads a container header.
3715  *
3716  * Returns cram_container on success
3717  *         NULL on failure or no container left (fd->err == 0).
3718  */
cram_read_container(cram_fd * fd)3719 cram_container *cram_read_container(cram_fd *fd) {
3720     cram_container c2, *c;
3721     int i, s;
3722     size_t rd = 0;
3723     uint32_t crc = 0;
3724 
3725     fd->err = 0;
3726     fd->eof = 0;
3727 
3728     memset(&c2, 0, sizeof(c2));
3729     if (CRAM_MAJOR_VERS(fd->version) == 1) {
3730         if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc)) == -1) {
3731             fd->eof = fd->empty_container ? 1 : 2;
3732             return NULL;
3733         } else {
3734             rd+=s;
3735         }
3736     } else if (CRAM_MAJOR_VERS(fd->version) < 4) {
3737         uint32_t len;
3738         if ((s = int32_decode(fd, &c2.length)) == -1) {
3739             if (CRAM_MAJOR_VERS(fd->version) == 2 &&
3740                 CRAM_MINOR_VERS(fd->version) == 0)
3741                 fd->eof = 1; // EOF blocks arrived in v2.1
3742             else
3743                 fd->eof = fd->empty_container ? 1 : 2;
3744             return NULL;
3745         } else {
3746             rd+=s;
3747         }
3748         len = le_int4(c2.length);
3749         crc = crc32(0L, (unsigned char *)&len, 4);
3750     } else {
3751         if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc))   == -1) {
3752             fd->eof = fd->empty_container ? 1 : 2;
3753             return NULL;
3754         } else {
3755             rd+=s;
3756         }
3757     }
3758     if ((s = fd->vv.varint_decode32s_crc(fd, &c2.ref_seq_id, &crc))   == -1) return NULL; else rd+=s;
3759     if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3760         int64_t i64;
3761         if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc))== -1) return NULL; else rd+=s;
3762         c2.ref_seq_start = i64;
3763         if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc)) == -1) return NULL; else rd+=s;
3764         c2.ref_seq_span = i64;
3765     } else {
3766         int32_t i32;
3767         if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc))== -1) return NULL; else rd+=s;
3768         c2.ref_seq_start = i32;
3769         if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1) return NULL; else rd+=s;
3770         c2.ref_seq_span = i32;
3771     }
3772     if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_records, &crc))  == -1) return NULL; else rd+=s;
3773 
3774     if (CRAM_MAJOR_VERS(fd->version) == 1) {
3775         c2.record_counter = 0;
3776         c2.num_bases = 0;
3777     } else {
3778         if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3779             if ((s = fd->vv.varint_decode64_crc(fd, &c2.record_counter, &crc)) == -1)
3780                 return NULL;
3781             else
3782                 rd += s;
3783         } else {
3784             int32_t i32;
3785             if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1)
3786                 return NULL;
3787             else
3788                 rd += s;
3789             c2.record_counter = i32;
3790         }
3791 
3792         if ((s = fd->vv.varint_decode64_crc(fd, &c2.num_bases, &crc))== -1)
3793             return NULL;
3794         else
3795             rd += s;
3796     }
3797     if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_blocks, &crc))   == -1)
3798         return NULL;
3799     else
3800         rd+=s;
3801     if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_landmarks, &crc))== -1)
3802         return NULL;
3803     else
3804         rd+=s;
3805 
3806     if (c2.num_landmarks < 0 || c2.num_landmarks >= SIZE_MAX / sizeof(int32_t))
3807         return NULL;
3808 
3809     if (!(c = calloc(1, sizeof(*c))))
3810         return NULL;
3811 
3812     *c = c2;
3813 
3814     if (c->num_landmarks && !(c->landmark = malloc(c->num_landmarks * sizeof(int32_t)))) {
3815         fd->err = errno;
3816         cram_free_container(c);
3817         return NULL;
3818     }
3819     for (i = 0; i < c->num_landmarks; i++) {
3820         if ((s = fd->vv.varint_decode32_crc(fd, &c->landmark[i], &crc)) == -1) {
3821             cram_free_container(c);
3822             return NULL;
3823         } else {
3824             rd += s;
3825         }
3826     }
3827 
3828     if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3829         if (-1 == int32_decode(fd, (int32_t *)&c->crc32)) {
3830             cram_free_container(c);
3831             return NULL;
3832         } else {
3833             rd+=4;
3834         }
3835 
3836         if (crc != c->crc32) {
3837             hts_log_error("Container header CRC32 failure");
3838             cram_free_container(c);
3839             return NULL;
3840         }
3841     }
3842 
3843     c->offset = rd;
3844     c->slices = NULL;
3845     c->slice = NULL;
3846     c->curr_slice = 0;
3847     c->max_slice = c->num_landmarks;
3848     c->slice_rec = 0;
3849     c->curr_rec = 0;
3850     c->max_rec = 0;
3851 
3852     if (c->ref_seq_id == -2) {
3853         c->multi_seq = 1;
3854         fd->multi_seq = 1;
3855     }
3856 
3857     fd->empty_container =
3858         (c->num_records == 0 &&
3859          c->ref_seq_id == -1 &&
3860          c->ref_seq_start == 0x454f46 /* EOF */) ? 1 : 0;
3861 
3862     return c;
3863 }
3864 
3865 
3866 /* MAXIMUM storage size needed for the container. */
cram_container_size(cram_container * c)3867 int cram_container_size(cram_container *c) {
3868     return 55 + 5*c->num_landmarks;
3869 }
3870 
3871 
3872 /*
3873  * Stores the container structure in dat and returns *size as the
3874  * number of bytes written to dat[].  The input size of dat is also
3875  * held in *size and should be initialised to cram_container_size(c).
3876  *
3877  * Returns 0 on success;
3878  *        -1 on failure
3879  */
cram_store_container(cram_fd * fd,cram_container * c,char * dat,int * size)3880 int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size)
3881 {
3882     char *cp = (char *)dat;
3883     int i;
3884 
3885     // Check the input buffer is large enough according to our stated
3886     // requirements. (NOTE: it may actually take less.)
3887     if (cram_container_size(c) > *size)
3888         return -1;
3889 
3890     if (CRAM_MAJOR_VERS(fd->version) == 1) {
3891         cp += itf8_put(cp, c->length);
3892     } else {
3893         *(int32_t *)cp = le_int4(c->length);
3894         cp += 4;
3895     }
3896     if (c->multi_seq) {
3897         cp += fd->vv.varint_put32(cp, NULL, -2);
3898         cp += fd->vv.varint_put32(cp, NULL, 0);
3899         cp += fd->vv.varint_put32(cp, NULL, 0);
3900     } else {
3901         cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id);
3902         if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3903             cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start);
3904             cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span);
3905         } else {
3906             cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start);
3907             cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span);
3908         }
3909     }
3910     cp += fd->vv.varint_put32(cp, NULL, c->num_records);
3911     if (CRAM_MAJOR_VERS(fd->version) == 2) {
3912         cp += fd->vv.varint_put64(cp, NULL, c->record_counter);
3913     } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3914         cp += fd->vv.varint_put32(cp, NULL, c->record_counter);
3915     }
3916     cp += fd->vv.varint_put64(cp, NULL, c->num_bases);
3917     cp += fd->vv.varint_put32(cp, NULL, c->num_blocks);
3918     cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks);
3919     for (i = 0; i < c->num_landmarks; i++)
3920         cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]);
3921 
3922     if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3923         c->crc32 = crc32(0L, (uc *)dat, cp-dat);
3924         cp[0] =  c->crc32        & 0xff;
3925         cp[1] = (c->crc32 >>  8) & 0xff;
3926         cp[2] = (c->crc32 >> 16) & 0xff;
3927         cp[3] = (c->crc32 >> 24) & 0xff;
3928         cp += 4;
3929     }
3930 
3931     *size = cp-dat; // actual used size
3932 
3933     return 0;
3934 }
3935 
3936 
3937 /*
3938  * Writes a container structure.
3939  *
3940  * Returns 0 on success
3941  *        -1 on failure
3942  */
cram_write_container(cram_fd * fd,cram_container * c)3943 int cram_write_container(cram_fd *fd, cram_container *c) {
3944     char buf_a[1024], *buf = buf_a, *cp;
3945     int i;
3946 
3947     if (61 + c->num_landmarks * 10 >= 1024) {
3948         buf = malloc(61 + c->num_landmarks * 10);
3949         if (!buf)
3950             return -1;
3951     }
3952     cp = buf;
3953 
3954     if (CRAM_MAJOR_VERS(fd->version) == 1) {
3955         cp += itf8_put(cp, c->length);
3956     } else if (CRAM_MAJOR_VERS(fd->version) <= 3) {
3957         *(int32_t *)cp = le_int4(c->length);
3958         cp += 4;
3959     } else {
3960         cp += fd->vv.varint_put32(cp, NULL, c->length);
3961     }
3962     if (c->multi_seq) {
3963         cp += fd->vv.varint_put32(cp, NULL, (uint32_t)-2);
3964         cp += fd->vv.varint_put32(cp, NULL, 0);
3965         cp += fd->vv.varint_put32(cp, NULL, 0);
3966     } else {
3967         cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id);
3968         if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3969             cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start);
3970             cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span);
3971         } else {
3972             cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start);
3973             cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span);
3974         }
3975     }
3976     cp += fd->vv.varint_put32(cp, NULL, c->num_records);
3977     if (CRAM_MAJOR_VERS(fd->version) >= 3)
3978         cp += fd->vv.varint_put64(cp, NULL, c->record_counter);
3979     else
3980         cp += fd->vv.varint_put32(cp, NULL, c->record_counter);
3981     cp += fd->vv.varint_put64(cp, NULL, c->num_bases);
3982     cp += fd->vv.varint_put32(cp, NULL, c->num_blocks);
3983     cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks);
3984     for (i = 0; i < c->num_landmarks; i++)
3985         cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]);
3986 
3987     if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3988         c->crc32 = crc32(0L, (uc *)buf, cp-buf);
3989         cp[0] =  c->crc32        & 0xff;
3990         cp[1] = (c->crc32 >>  8) & 0xff;
3991         cp[2] = (c->crc32 >> 16) & 0xff;
3992         cp[3] = (c->crc32 >> 24) & 0xff;
3993         cp += 4;
3994     }
3995 
3996     if (cp-buf != hwrite(fd->fp, buf, cp-buf)) {
3997         if (buf != buf_a)
3998             free(buf);
3999         return -1;
4000     }
4001 
4002     if (buf != buf_a)
4003         free(buf);
4004 
4005     return 0;
4006 }
4007 
4008 // common component shared by cram_flush_container{,_mt}
cram_flush_container2(cram_fd * fd,cram_container * c)4009 static int cram_flush_container2(cram_fd *fd, cram_container *c) {
4010     int i, j;
4011 
4012     if (c->curr_slice > 0 && !c->slices)
4013         return -1;
4014 
4015     //fprintf(stderr, "Writing container %d, sum %u\n", c->record_counter, sum);
4016 
4017     off_t c_offset = htell(fd->fp); // File offset of container
4018 
4019     /* Write the container struct itself */
4020     if (0 != cram_write_container(fd, c))
4021         return -1;
4022 
4023     off_t hdr_size = htell(fd->fp) - c_offset;
4024 
4025     /* And the compression header */
4026     if (0 != cram_write_block(fd, c->comp_hdr_block))
4027         return -1;
4028 
4029     /* Followed by the slice blocks */
4030     off_t file_offset = htell(fd->fp);
4031     for (i = 0; i < c->curr_slice; i++) {
4032         cram_slice *s = c->slices[i];
4033         off_t spos = file_offset - c_offset - hdr_size;
4034 
4035         if (0 != cram_write_block(fd, s->hdr_block))
4036             return -1;
4037 
4038         for (j = 0; j < s->hdr->num_blocks; j++) {
4039             if (0 != cram_write_block(fd, s->block[j]))
4040                 return -1;
4041         }
4042 
4043         file_offset = htell(fd->fp);
4044         off_t sz = file_offset - c_offset - hdr_size - spos;
4045 
4046         if (fd->idxfp) {
4047             if (cram_index_slice(fd, c, s, fd->idxfp, c_offset, spos, sz) < 0)
4048                 return -1;
4049         }
4050     }
4051 
4052     return 0;
4053 }
4054 
4055 /*
4056  * Flushes a completely or partially full container to disk, writing
4057  * container structure, header and blocks. This also calls the encoder
4058  * functions.
4059  *
4060  * Returns 0 on success
4061  *        -1 on failure
4062  */
cram_flush_container(cram_fd * fd,cram_container * c)4063 int cram_flush_container(cram_fd *fd, cram_container *c) {
4064     /* Encode the container blocks and generate compression header */
4065     if (0 != cram_encode_container(fd, c))
4066         return -1;
4067 
4068     return cram_flush_container2(fd, c);
4069 }
4070 
4071 typedef struct {
4072     cram_fd *fd;
4073     cram_container *c;
4074 } cram_job;
4075 
cram_flush_thread(void * arg)4076 void *cram_flush_thread(void *arg) {
4077     cram_job *j = (cram_job *)arg;
4078 
4079     /* Encode the container blocks and generate compression header */
4080     if (0 != cram_encode_container(j->fd, j->c)) {
4081         hts_log_error("Call to cram_encode_container failed");
4082         return NULL;
4083     }
4084 
4085     return arg;
4086 }
4087 
cram_flush_result(cram_fd * fd)4088 static int cram_flush_result(cram_fd *fd) {
4089     int i, ret = 0;
4090     hts_tpool_result *r;
4091     cram_container *lc = NULL;
4092 
4093     // NB: we can have one result per slice, not per container,
4094     // so we need to free the container only after all slices
4095     // within it have been freed.  (Automatic via reference counting.)
4096     while ((r = hts_tpool_next_result(fd->rqueue))) {
4097         cram_job *j = (cram_job *)hts_tpool_result_data(r);
4098         cram_container *c;
4099 
4100         if (!j) {
4101             hts_tpool_delete_result(r, 0);
4102             return -1;
4103         }
4104 
4105         fd = j->fd;
4106         c = j->c;
4107 
4108         if (fd->mode == 'w')
4109             if (0 != cram_flush_container2(fd, c))
4110                 return -1;
4111 
4112         // Free the slices; filled out by encoder only
4113         if (c->slices) {
4114             for (i = 0; i < c->max_slice; i++) {
4115                 if (c->slices[i])
4116                     cram_free_slice(c->slices[i]);
4117                 if (c->slices[i] == c->slice)
4118                     c->slice = NULL;
4119                 c->slices[i] = NULL;
4120             }
4121         }
4122 
4123         // Free the current slice; set by both encoder & decoder
4124         if (c->slice) {
4125             cram_free_slice(c->slice);
4126             c->slice = NULL;
4127         }
4128         c->curr_slice = 0;
4129 
4130         // Our jobs will be in order, so we free the last
4131         // container when our job has switched to a new one.
4132         if (c != lc) {
4133             if (lc) {
4134                 if (fd->ctr == lc)
4135                     fd->ctr = NULL;
4136                 if (fd->ctr_mt == lc)
4137                     fd->ctr_mt = NULL;
4138                 cram_free_container(lc);
4139             }
4140             lc = c;
4141         }
4142 
4143         hts_tpool_delete_result(r, 1);
4144     }
4145     if (lc) {
4146         if (fd->ctr == lc)
4147             fd->ctr = NULL;
4148         if (fd->ctr_mt == lc)
4149             fd->ctr_mt = NULL;
4150         cram_free_container(lc);
4151     }
4152 
4153     return ret;
4154 }
4155 
4156 // Note: called while metrics_lock is held.
4157 // Will be left in this state too, but may temporarily unlock.
reset_metrics(cram_fd * fd)4158 void reset_metrics(cram_fd *fd) {
4159     int i;
4160 
4161     if (fd->pool) {
4162         // If multi-threaded we have multiple blocks being
4163         // compressed already and several on the to-do list
4164         // (fd->rqueue->pending).  It's tricky to reset the
4165         // metrics exactly the correct point, so instead we
4166         // just flush the pool, reset, and then continue again.
4167 
4168         // Don't bother starting a new trial before then though.
4169         for (i = 0; i < DS_END; i++) {
4170             cram_metrics *m = fd->m[i];
4171             if (!m)
4172                 continue;
4173             m->next_trial = 999;
4174         }
4175 
4176         pthread_mutex_unlock(&fd->metrics_lock);
4177         hts_tpool_process_flush(fd->rqueue);
4178         pthread_mutex_lock(&fd->metrics_lock);
4179     }
4180 
4181     for (i = 0; i < DS_END; i++) {
4182         cram_metrics *m = fd->m[i];
4183         if (!m)
4184             continue;
4185 
4186         m->trial = NTRIALS;
4187         m->next_trial = TRIAL_SPAN;
4188         m->revised_method = 0;
4189         m->unpackable = 0;
4190 
4191         memset(m->sz, 0, sizeof(m->sz));
4192     }
4193 }
4194 
cram_flush_container_mt(cram_fd * fd,cram_container * c)4195 int cram_flush_container_mt(cram_fd *fd, cram_container *c) {
4196     cram_job *j;
4197 
4198     // At the junction of mapped to unmapped data the compression
4199     // methods may need to change due to very different statistical
4200     // properties; particularly BA if minhash sorted.
4201     //
4202     // However with threading we'll have several in-flight blocks
4203     // arriving out of order.
4204     //
4205     // So we do one trial reset of NThreads to last for NThreads
4206     // duration to get us over this transition period, followed
4207     // by another retrial of the usual ntrials & trial span.
4208     pthread_mutex_lock(&fd->metrics_lock);
4209     if (c->n_mapped < 0.3*c->curr_rec &&
4210         fd->last_mapped > 0.7*c->max_rec) {
4211         reset_metrics(fd);
4212     }
4213     fd->last_mapped = c->n_mapped * (c->max_rec+1)/(c->curr_rec+1) ;
4214     pthread_mutex_unlock(&fd->metrics_lock);
4215 
4216     if (!fd->pool)
4217         return cram_flush_container(fd, c);
4218 
4219     if (!(j = malloc(sizeof(*j))))
4220         return -1;
4221     j->fd = fd;
4222     j->c = c;
4223 
4224     // Flush the job.  Note our encoder queue may be full, so we
4225     // either have to keep trying in non-blocking mode (what we do) or
4226     // use a dedicated separate thread for draining the queue.
4227     for (;;) {
4228         errno = 0;
4229         hts_tpool_dispatch2(fd->pool, fd->rqueue, cram_flush_thread, j, 1);
4230         int pending = (errno == EAGAIN);
4231         if (cram_flush_result(fd) != 0)
4232             return -1;
4233         if (!pending)
4234             break;
4235 
4236         usleep(1000);
4237     }
4238 
4239     return 0;
4240 }
4241 
4242 /* ----------------------------------------------------------------------
4243  * Compression headers; the first part of the container
4244  */
4245 
4246 /*
4247  * Creates a new blank container compression header
4248  *
4249  * Returns header ptr on success
4250  *         NULL on failure
4251  */
cram_new_compression_header(void)4252 cram_block_compression_hdr *cram_new_compression_header(void) {
4253     cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
4254     if (!hdr)
4255         return NULL;
4256 
4257     if (!(hdr->TD_blk = cram_new_block(CORE, 0))) {
4258         free(hdr);
4259         return NULL;
4260     }
4261 
4262     if (!(hdr->TD_hash = kh_init(m_s2i))) {
4263         cram_free_block(hdr->TD_blk);
4264         free(hdr);
4265         return NULL;
4266     }
4267 
4268     if (!(hdr->TD_keys = string_pool_create(8192))) {
4269         kh_destroy(m_s2i, hdr->TD_hash);
4270         cram_free_block(hdr->TD_blk);
4271         free(hdr);
4272         return NULL;
4273     }
4274 
4275     return hdr;
4276 }
4277 
cram_free_compression_header(cram_block_compression_hdr * hdr)4278 void cram_free_compression_header(cram_block_compression_hdr *hdr) {
4279     int i;
4280 
4281     if (hdr->landmark)
4282         free(hdr->landmark);
4283 
4284     if (hdr->preservation_map)
4285         kh_destroy(map, hdr->preservation_map);
4286 
4287     for (i = 0; i < CRAM_MAP_HASH; i++) {
4288         cram_map *m, *m2;
4289         for (m = hdr->rec_encoding_map[i]; m; m = m2) {
4290             m2 = m->next;
4291             if (m->codec)
4292                 m->codec->free(m->codec);
4293             free(m);
4294         }
4295     }
4296 
4297     for (i = 0; i < CRAM_MAP_HASH; i++) {
4298         cram_map *m, *m2;
4299         for (m = hdr->tag_encoding_map[i]; m; m = m2) {
4300             m2 = m->next;
4301             if (m->codec)
4302                 m->codec->free(m->codec);
4303             free(m);
4304         }
4305     }
4306 
4307     for (i = 0; i < DS_END; i++) {
4308         if (hdr->codecs[i])
4309             hdr->codecs[i]->free(hdr->codecs[i]);
4310     }
4311 
4312     if (hdr->TL)
4313         free(hdr->TL);
4314     if (hdr->TD_blk)
4315         cram_free_block(hdr->TD_blk);
4316     if (hdr->TD_hash)
4317         kh_destroy(m_s2i, hdr->TD_hash);
4318     if (hdr->TD_keys)
4319         string_pool_destroy(hdr->TD_keys);
4320 
4321     free(hdr);
4322 }
4323 
4324 
4325 /* ----------------------------------------------------------------------
4326  * Slices and slice headers
4327  */
4328 
cram_free_slice_header(cram_block_slice_hdr * hdr)4329 void cram_free_slice_header(cram_block_slice_hdr *hdr) {
4330     if (!hdr)
4331         return;
4332 
4333     if (hdr->block_content_ids)
4334         free(hdr->block_content_ids);
4335 
4336     free(hdr);
4337 
4338     return;
4339 }
4340 
cram_free_slice(cram_slice * s)4341 void cram_free_slice(cram_slice *s) {
4342     if (!s)
4343         return;
4344 
4345     if (s->hdr_block)
4346         cram_free_block(s->hdr_block);
4347 
4348     if (s->block) {
4349         int i;
4350 
4351         if (s->hdr) {
4352             for (i = 0; i < s->hdr->num_blocks; i++) {
4353                 if (i > 0 && s->block[i] == s->block[0])
4354                     continue;
4355                 cram_free_block(s->block[i]);
4356             }
4357         }
4358         free(s->block);
4359     }
4360 
4361     if (s->block_by_id)
4362         free(s->block_by_id);
4363 
4364     if (s->hdr)
4365         cram_free_slice_header(s->hdr);
4366 
4367     if (s->seqs_blk)
4368         cram_free_block(s->seqs_blk);
4369 
4370     if (s->qual_blk)
4371         cram_free_block(s->qual_blk);
4372 
4373     if (s->name_blk)
4374         cram_free_block(s->name_blk);
4375 
4376     if (s->aux_blk)
4377         cram_free_block(s->aux_blk);
4378 
4379     if (s->base_blk)
4380         cram_free_block(s->base_blk);
4381 
4382     if (s->soft_blk)
4383         cram_free_block(s->soft_blk);
4384 
4385     if (s->cigar)
4386         free(s->cigar);
4387 
4388     if (s->crecs)
4389         free(s->crecs);
4390 
4391     if (s->features)
4392         free(s->features);
4393 
4394     if (s->TN)
4395         free(s->TN);
4396 
4397     if (s->pair_keys)
4398         string_pool_destroy(s->pair_keys);
4399 
4400     if (s->pair[0])
4401         kh_destroy(m_s2i, s->pair[0]);
4402     if (s->pair[1])
4403         kh_destroy(m_s2i, s->pair[1]);
4404 
4405     if (s->aux_block)
4406         free(s->aux_block);
4407 
4408     free(s);
4409 }
4410 
4411 /*
4412  * Creates a new empty slice in memory, for subsequent writing to
4413  * disk.
4414  *
4415  * Returns cram_slice ptr on success
4416  *         NULL on failure
4417  */
cram_new_slice(enum cram_content_type type,int nrecs)4418 cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) {
4419     cram_slice *s = calloc(1, sizeof(*s));
4420     if (!s)
4421         return NULL;
4422 
4423     if (!(s->hdr = (cram_block_slice_hdr *)calloc(1, sizeof(*s->hdr))))
4424         goto err;
4425     s->hdr->content_type = type;
4426 
4427     s->hdr_block = NULL;
4428     s->block = NULL;
4429     s->block_by_id = NULL;
4430     s->last_apos = 0;
4431     if (!(s->crecs = malloc(nrecs * sizeof(cram_record))))  goto err;
4432     s->cigar_alloc = 1024;
4433     if (!(s->cigar = malloc(s->cigar_alloc * sizeof(*s->cigar)))) goto err;
4434     s->ncigar = 0;
4435 
4436     if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0)))       goto err;
4437     if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS)))   goto err;
4438     if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN)))   goto err;
4439     if (!(s->aux_blk  = cram_new_block(EXTERNAL, DS_aux)))  goto err;
4440     if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN)))   goto err;
4441     if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC)))   goto err;
4442 
4443     s->features = NULL;
4444     s->nfeatures = s->afeatures = 0;
4445 
4446 #ifndef TN_external
4447     s->TN = NULL;
4448     s->nTN = s->aTN = 0;
4449 #endif
4450 
4451     // Volatile keys as we do realloc in dstring
4452     if (!(s->pair_keys = string_pool_create(8192))) goto err;
4453     if (!(s->pair[0] = kh_init(m_s2i)))             goto err;
4454     if (!(s->pair[1] = kh_init(m_s2i)))             goto err;
4455 
4456 #ifdef BA_external
4457     s->BA_len = 0;
4458 #endif
4459 
4460     return s;
4461 
4462  err:
4463     if (s)
4464         cram_free_slice(s);
4465 
4466     return NULL;
4467 }
4468 
4469 /*
4470  * Loads an entire slice.
4471  * FIXME: In 1.0 the native unit of slices within CRAM is broken
4472  * as slices contain references to objects in other slices.
4473  * To work around this while keeping the slice oriented outer loop
4474  * we read all slices and stitch them together into a fake large
4475  * slice instead.
4476  *
4477  * Returns cram_slice ptr on success
4478  *         NULL on failure
4479  */
cram_read_slice(cram_fd * fd)4480 cram_slice *cram_read_slice(cram_fd *fd) {
4481     cram_block *b = cram_read_block(fd);
4482     cram_slice *s = calloc(1, sizeof(*s));
4483     int i, n, max_id, min_id;
4484 
4485     if (!b || !s)
4486         goto err;
4487 
4488     s->hdr_block = b;
4489     switch (b->content_type) {
4490     case MAPPED_SLICE:
4491     case UNMAPPED_SLICE:
4492         if (!(s->hdr = cram_decode_slice_header(fd, b)))
4493             goto err;
4494         break;
4495 
4496     default:
4497         hts_log_error("Unexpected block of type %s",
4498                       cram_content_type2str(b->content_type));
4499         goto err;
4500     }
4501 
4502     if (s->hdr->num_blocks < 1) {
4503         hts_log_error("Slice does not include any data blocks");
4504         goto err;
4505     }
4506 
4507     s->block = calloc(n = s->hdr->num_blocks, sizeof(*s->block));
4508     if (!s->block)
4509         goto err;
4510 
4511     for (max_id = i = 0, min_id = INT_MAX; i < n; i++) {
4512         if (!(s->block[i] = cram_read_block(fd)))
4513             goto err;
4514 
4515         if (s->block[i]->content_type == EXTERNAL) {
4516             if (max_id < s->block[i]->content_id)
4517                 max_id = s->block[i]->content_id;
4518             if (min_id > s->block[i]->content_id)
4519                 min_id = s->block[i]->content_id;
4520         }
4521     }
4522 
4523     if (!(s->block_by_id = calloc(512, sizeof(s->block[0]))))
4524         goto err;
4525 
4526     for (i = 0; i < n; i++) {
4527         if (s->block[i]->content_type != EXTERNAL)
4528             continue;
4529         uint32_t v = s->block[i]->content_id;
4530         if (v >= 256)
4531             v = 256 + v % 251;
4532         s->block_by_id[v] = s->block[i];
4533     }
4534 
4535     /* Initialise encoding/decoding tables */
4536     s->cigar_alloc = 1024;
4537     if (!(s->cigar = malloc(s->cigar_alloc * sizeof(*s->cigar)))) goto err;
4538     s->ncigar = 0;
4539 
4540     if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0)))      goto err;
4541     if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS)))  goto err;
4542     if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN)))  goto err;
4543     if (!(s->aux_blk  = cram_new_block(EXTERNAL, DS_aux))) goto err;
4544     if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN)))  goto err;
4545     if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC)))  goto err;
4546 
4547     s->crecs = NULL;
4548 
4549     s->last_apos = s->hdr->ref_seq_start;
4550     s->decode_md = fd->decode_md;
4551 
4552     return s;
4553 
4554  err:
4555     if (b)
4556         cram_free_block(b);
4557     if (s) {
4558         s->hdr_block = NULL;
4559         cram_free_slice(s);
4560     }
4561     return NULL;
4562 }
4563 
4564 
4565 /* ----------------------------------------------------------------------
4566  * CRAM file definition (header)
4567  */
4568 
4569 /*
4570  * Reads a CRAM file definition structure.
4571  * Returns file_def ptr on success
4572  *         NULL on failure
4573  */
cram_read_file_def(cram_fd * fd)4574 cram_file_def *cram_read_file_def(cram_fd *fd) {
4575     cram_file_def *def = malloc(sizeof(*def));
4576     if (!def)
4577         return NULL;
4578 
4579     if (26 != hread(fd->fp, &def->magic[0], 26)) {
4580         free(def);
4581         return NULL;
4582     }
4583 
4584     if (memcmp(def->magic, "CRAM", 4) != 0) {
4585         free(def);
4586         return NULL;
4587     }
4588 
4589     if (def->major_version > 4) {
4590         hts_log_error("CRAM version number mismatch. Expected 1.x, 2.x, 3.x or 4.x, got %d.%d",
4591                       def->major_version, def->minor_version);
4592         free(def);
4593         return NULL;
4594     }
4595 
4596     fd->first_container += 26;
4597     fd->curr_position = fd->first_container;
4598     fd->last_slice = 0;
4599 
4600     return def;
4601 }
4602 
4603 /*
4604  * Writes a cram_file_def structure to cram_fd.
4605  * Returns 0 on success
4606  *        -1 on failure
4607  */
cram_write_file_def(cram_fd * fd,cram_file_def * def)4608 int cram_write_file_def(cram_fd *fd, cram_file_def *def) {
4609     return (hwrite(fd->fp, &def->magic[0], 26) == 26) ? 0 : -1;
4610 }
4611 
cram_free_file_def(cram_file_def * def)4612 void cram_free_file_def(cram_file_def *def) {
4613     if (def) free(def);
4614 }
4615 
4616 /* ----------------------------------------------------------------------
4617  * SAM header I/O
4618  */
4619 
4620 
4621 /*
4622  * Reads the SAM header from the first CRAM data block.
4623  * Also performs minimal parsing to extract read-group
4624  * and sample information.
4625 
4626  * Returns SAM hdr ptr on success
4627  *         NULL on failure
4628  */
cram_read_SAM_hdr(cram_fd * fd)4629 sam_hdr_t *cram_read_SAM_hdr(cram_fd *fd) {
4630     int32_t header_len;
4631     char *header;
4632     sam_hdr_t *hdr;
4633 
4634     /* 1.1 onwards stores the header in the first block of a container */
4635     if (CRAM_MAJOR_VERS(fd->version) == 1) {
4636         /* Length */
4637         if (-1 == int32_decode(fd, &header_len))
4638             return NULL;
4639 
4640         /* Alloc and read */
4641         if (header_len < 0 || NULL == (header = malloc((size_t) header_len+1)))
4642             return NULL;
4643 
4644         if (header_len != hread(fd->fp, header, header_len)) {
4645             free(header);
4646             return NULL;
4647         }
4648         header[header_len] = '\0';
4649 
4650         fd->first_container += 4 + header_len;
4651     } else {
4652         cram_container *c = cram_read_container(fd);
4653         cram_block *b;
4654         int i;
4655         int64_t len;
4656 
4657         if (!c)
4658             return NULL;
4659 
4660         fd->first_container += c->length + c->offset;
4661         fd->curr_position = fd->first_container;
4662 
4663         if (c->num_blocks < 1) {
4664             cram_free_container(c);
4665             return NULL;
4666         }
4667 
4668         if (!(b = cram_read_block(fd))) {
4669             cram_free_container(c);
4670             return NULL;
4671         }
4672         if (cram_uncompress_block(b) != 0) {
4673             cram_free_container(c);
4674             cram_free_block(b);
4675             return NULL;
4676         }
4677 
4678         len = b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
4679             fd->vv.varint_size(b->content_id) +
4680             fd->vv.varint_size(b->uncomp_size) +
4681             fd->vv.varint_size(b->comp_size);
4682 
4683         /* Extract header from 1st block */
4684         if (-1 == int32_get_blk(b, &header_len) ||
4685             header_len < 0 || /* Spec. says signed...  why? */
4686             b->uncomp_size - 4 < header_len) {
4687             cram_free_container(c);
4688             cram_free_block(b);
4689             return NULL;
4690         }
4691         if (NULL == (header = malloc((size_t) header_len+1))) {
4692             cram_free_container(c);
4693             cram_free_block(b);
4694             return NULL;
4695         }
4696         memcpy(header, BLOCK_END(b), header_len);
4697         header[header_len] = '\0';
4698         cram_free_block(b);
4699 
4700         /* Consume any remaining blocks */
4701         for (i = 1; i < c->num_blocks; i++) {
4702             if (!(b = cram_read_block(fd))) {
4703                 cram_free_container(c);
4704                 free(header);
4705                 return NULL;
4706             }
4707             len += b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
4708                 fd->vv.varint_size(b->content_id) +
4709                 fd->vv.varint_size(b->uncomp_size) +
4710                 fd->vv.varint_size(b->comp_size);
4711             cram_free_block(b);
4712         }
4713 
4714         if (c->length > 0 && len > 0 && c->length > len) {
4715             // Consume padding
4716             char *pads = malloc(c->length - len);
4717             if (!pads) {
4718                 cram_free_container(c);
4719                 free(header);
4720                 return NULL;
4721             }
4722 
4723             if (c->length - len != hread(fd->fp, pads, c->length - len)) {
4724                 cram_free_container(c);
4725                 free(header);
4726                 free(pads);
4727                 return NULL;
4728             }
4729             free(pads);
4730         }
4731 
4732         cram_free_container(c);
4733     }
4734 
4735     /* Parse */
4736     hdr = sam_hdr_init();
4737     if (!hdr) {
4738         free(header);
4739         return NULL;
4740     }
4741 
4742     if (-1 == sam_hdr_add_lines(hdr, header, header_len)) {
4743         free(header);
4744         sam_hdr_destroy(hdr);
4745         return NULL;
4746     }
4747 
4748     hdr->l_text = header_len;
4749     hdr->text = header;
4750 
4751     return hdr;
4752 
4753 }
4754 
4755 /*
4756  * Converts 'in' to a full pathname to store in out.
4757  * Out must be at least PATH_MAX bytes long.
4758  */
full_path(char * out,char * in)4759 static void full_path(char *out, char *in) {
4760     size_t in_l = strlen(in);
4761     if (hisremote(in)) {
4762         if (in_l > PATH_MAX) {
4763             hts_log_error("Reference path is longer than %d", PATH_MAX);
4764             return;
4765         }
4766         strncpy(out, in, PATH_MAX-1);
4767         out[PATH_MAX-1] = 0;
4768         return;
4769     }
4770     if (*in == '/' ||
4771         // Windows paths
4772         (in_l > 3 && toupper_c(*in) >= 'A'  && toupper_c(*in) <= 'Z' &&
4773          in[1] == ':' && (in[2] == '/' || in[2] == '\\'))) {
4774         strncpy(out, in, PATH_MAX-1);
4775         out[PATH_MAX-1] = 0;
4776     } else {
4777         int len;
4778 
4779         // unable to get dir or out+in is too long
4780         if (!getcwd(out, PATH_MAX) ||
4781             (len = strlen(out))+1+strlen(in) >= PATH_MAX) {
4782             strncpy(out, in, PATH_MAX-1);
4783             out[PATH_MAX-1] = 0;
4784             return;
4785         }
4786 
4787         sprintf(out+len, "/%.*s", PATH_MAX - 2 - len, in);
4788 
4789         // FIXME: cope with `pwd`/../../../foo.fa ?
4790     }
4791 }
4792 
4793 /*
4794  * Writes a CRAM SAM header.
4795  * Returns 0 on success
4796  *        -1 on failure
4797  */
cram_write_SAM_hdr(cram_fd * fd,sam_hdr_t * hdr)4798 int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) {
4799     size_t header_len;
4800     int blank_block = (CRAM_MAJOR_VERS(fd->version) >= 3);
4801 
4802     /* Write CRAM MAGIC if not yet written. */
4803     if (fd->file_def->major_version == 0) {
4804         fd->file_def->major_version = CRAM_MAJOR_VERS(fd->version);
4805         fd->file_def->minor_version = CRAM_MINOR_VERS(fd->version);
4806         if (0 != cram_write_file_def(fd, fd->file_def))
4807             return -1;
4808     }
4809 
4810     /* 1.0 requires an UNKNOWN read-group */
4811     if (CRAM_MAJOR_VERS(fd->version) == 1) {
4812         if (!sam_hrecs_find_rg(hdr->hrecs, "UNKNOWN"))
4813             if (sam_hdr_add_line(hdr, "RG",
4814                             "ID", "UNKNOWN", "SM", "UNKNOWN", NULL))
4815                 return -1;
4816     }
4817 
4818     /* Fix M5 strings */
4819     if (fd->refs && !fd->no_ref) {
4820         int i;
4821         for (i = 0; i < hdr->hrecs->nref; i++) {
4822             sam_hrec_type_t *ty;
4823             char *ref;
4824 
4825             if (!(ty = sam_hrecs_find_type_id(hdr->hrecs, "SQ", "SN", hdr->hrecs->ref[i].name)))
4826                 return -1;
4827 
4828             if (!sam_hrecs_find_key(ty, "M5", NULL)) {
4829                 char unsigned buf[16];
4830                 char buf2[33];
4831                 int rlen;
4832                 hts_md5_context *md5;
4833 
4834                 if (!fd->refs ||
4835                     !fd->refs->ref_id ||
4836                     !fd->refs->ref_id[i]) {
4837                     return -1;
4838                 }
4839                 rlen = fd->refs->ref_id[i]->length;
4840                 if (!(md5 = hts_md5_init()))
4841                     return -1;
4842                 ref = cram_get_ref(fd, i, 1, rlen);
4843                 if (NULL == ref) return -1;
4844                 rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */
4845                 hts_md5_update(md5, ref, rlen);
4846                 hts_md5_final(buf, md5);
4847                 hts_md5_destroy(md5);
4848                 cram_ref_decr(fd->refs, i);
4849 
4850                 hts_md5_hex(buf2, buf);
4851                 if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "M5", buf2, NULL))
4852                     return -1;
4853             }
4854 
4855             if (fd->ref_fn) {
4856                 char ref_fn[PATH_MAX];
4857                 full_path(ref_fn, fd->ref_fn);
4858                 if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "UR", ref_fn, NULL))
4859                     return -1;
4860             }
4861         }
4862     }
4863 
4864     /* Length */
4865     header_len = sam_hdr_length(hdr);
4866     if (header_len > INT32_MAX) {
4867         hts_log_error("Header is too long for CRAM format");
4868         return -1;
4869     }
4870     if (CRAM_MAJOR_VERS(fd->version) == 1) {
4871         if (-1 == int32_encode(fd, header_len))
4872             return -1;
4873 
4874         /* Text data */
4875         if (header_len != hwrite(fd->fp, sam_hdr_str(hdr), header_len))
4876             return -1;
4877     } else {
4878         /* Create block(s) inside a container */
4879         cram_block *b = cram_new_block(FILE_HEADER, 0);
4880         cram_container *c = cram_new_container(0, 0);
4881         int padded_length;
4882         char *pads;
4883         int is_cram_3 = (CRAM_MAJOR_VERS(fd->version) >= 3);
4884 
4885         if (!b || !c) {
4886             if (b) cram_free_block(b);
4887             if (c) cram_free_container(c);
4888             return -1;
4889         }
4890 
4891         if (int32_put_blk(b, header_len) < 0)
4892             return -1;
4893         if (header_len)
4894             BLOCK_APPEND(b, sam_hdr_str(hdr), header_len);
4895         BLOCK_UPLEN(b);
4896 
4897         // Compress header block if V3.0 and above
4898         if (CRAM_MAJOR_VERS(fd->version) >= 3)
4899             if (cram_compress_block(fd, b, NULL, -1, -1) < 0)
4900                 return -1;
4901 
4902         if (blank_block) {
4903             c->length = b->comp_size + 2 + 4*is_cram_3 +
4904                 fd->vv.varint_size(b->content_id)   +
4905                 fd->vv.varint_size(b->uncomp_size)  +
4906                 fd->vv.varint_size(b->comp_size);
4907 
4908             c->num_blocks = 2;
4909             c->num_landmarks = 2;
4910             if (!(c->landmark = malloc(2*sizeof(*c->landmark)))) {
4911                 cram_free_block(b);
4912                 cram_free_container(c);
4913                 return -1;
4914             }
4915             c->landmark[0] = 0;
4916             c->landmark[1] = c->length;
4917 
4918             // Plus extra storage for uncompressed secondary blank block
4919             padded_length = MIN(c->length*.5, 10000);
4920             c->length += padded_length + 2 + 4*is_cram_3 +
4921                 fd->vv.varint_size(b->content_id) +
4922                 fd->vv.varint_size(padded_length)*2;
4923         } else {
4924             // Pad the block instead.
4925             c->num_blocks = 1;
4926             c->num_landmarks = 1;
4927             if (!(c->landmark = malloc(sizeof(*c->landmark))))
4928                 return -1;
4929             c->landmark[0] = 0;
4930 
4931             padded_length = MAX(c->length*1.5, 10000) - c->length;
4932 
4933             c->length = b->comp_size + padded_length +
4934                 2 + 4*is_cram_3 +
4935                 fd->vv.varint_size(b->content_id)   +
4936                 fd->vv.varint_size(b->uncomp_size)  +
4937                 fd->vv.varint_size(b->comp_size);
4938 
4939             if (NULL == (pads = calloc(1, padded_length))) {
4940                 cram_free_block(b);
4941                 cram_free_container(c);
4942                 return -1;
4943             }
4944             BLOCK_APPEND(b, pads, padded_length);
4945             BLOCK_UPLEN(b);
4946             free(pads);
4947         }
4948 
4949         if (-1 == cram_write_container(fd, c)) {
4950             cram_free_block(b);
4951             cram_free_container(c);
4952             return -1;
4953         }
4954 
4955         if (-1 == cram_write_block(fd, b)) {
4956             cram_free_block(b);
4957             cram_free_container(c);
4958             return -1;
4959         }
4960 
4961         if (blank_block) {
4962             BLOCK_RESIZE(b, padded_length);
4963             memset(BLOCK_DATA(b), 0, padded_length);
4964             BLOCK_SIZE(b) = padded_length;
4965             BLOCK_UPLEN(b);
4966             b->method = RAW;
4967             if (-1 == cram_write_block(fd, b)) {
4968                 cram_free_block(b);
4969                 cram_free_container(c);
4970                 return -1;
4971             }
4972         }
4973 
4974         cram_free_block(b);
4975         cram_free_container(c);
4976     }
4977 
4978     if (-1 == refs_from_header(fd))
4979         return -1;
4980     if (-1 == refs2id(fd->refs, fd->header))
4981         return -1;
4982 
4983     if (0 != hflush(fd->fp))
4984         return -1;
4985 
4986     RP("=== Finishing saving header ===\n");
4987 
4988     return 0;
4989 
4990  block_err:
4991     return -1;
4992 }
4993 
4994 /* ----------------------------------------------------------------------
4995  * The top-level cram opening, closing and option handling
4996  */
4997 
4998 /*
4999  * Sets CRAM variable sized integer decode function tables.
5000  * CRAM 1, 2, and 3.x all used ITF8 for uint32 and UTF8 for uint64.
5001  * CRAM 4.x uses the same encoding mechanism for 32-bit and 64-bit
5002  * (or anything inbetween), but also now supports signed values.
5003  *
5004  * Version is the CRAM major version number.
5005  * vv is the vector table (probably &cram_fd->vv)
5006  */
cram_init_varint(varint_vec * vv,int version)5007 static void cram_init_varint(varint_vec *vv, int version) {
5008     if (version >= 4) {
5009         vv->varint_get32 = uint7_get_32; // FIXME: varint.h API should be size agnostic
5010         vv->varint_get32s = sint7_get_32;
5011         vv->varint_get64 = uint7_get_64;
5012         vv->varint_get64s = sint7_get_64;
5013         vv->varint_put32 = uint7_put_32;
5014         vv->varint_put32s = sint7_put_32;
5015         vv->varint_put64 = uint7_put_64;
5016         vv->varint_put64s = sint7_put_64;
5017         vv->varint_put32_blk = uint7_put_blk_32;
5018         vv->varint_put32s_blk = sint7_put_blk_32;
5019         vv->varint_put64_blk = uint7_put_blk_64;
5020         vv->varint_put64s_blk = sint7_put_blk_64;
5021         vv->varint_size = uint7_size;
5022         vv->varint_decode32_crc = uint7_decode_crc32;
5023         vv->varint_decode32s_crc = sint7_decode_crc32;
5024         vv->varint_decode64_crc = uint7_decode_crc64;
5025     } else {
5026         vv->varint_get32 = safe_itf8_get;
5027         vv->varint_get32s = safe_itf8_get;
5028         vv->varint_get64 = safe_ltf8_get;
5029         vv->varint_get64s = safe_ltf8_get;
5030         vv->varint_put32 = safe_itf8_put;
5031         vv->varint_put32s = safe_itf8_put;
5032         vv->varint_put64 = safe_ltf8_put;
5033         vv->varint_put64s = safe_ltf8_put;
5034         vv->varint_put32_blk = itf8_put_blk;
5035         vv->varint_put32s_blk = itf8_put_blk;
5036         vv->varint_put64_blk = ltf8_put_blk;
5037         vv->varint_put64s_blk = ltf8_put_blk;
5038         vv->varint_size = itf8_size;
5039         vv->varint_decode32_crc = itf8_decode_crc;
5040         vv->varint_decode32s_crc = itf8_decode_crc;
5041         vv->varint_decode64_crc = ltf8_decode_crc;
5042     }
5043 }
5044 
5045 /*
5046  * Initialises the lookup tables. These could be global statics, but they're
5047  * clumsy to setup in a multi-threaded environment unless we generate
5048  * verbatim code and include that.
5049  */
cram_init_tables(cram_fd * fd)5050 static void cram_init_tables(cram_fd *fd) {
5051     int i;
5052 
5053     memset(fd->L1, 4, 256);
5054     fd->L1['A'] = 0; fd->L1['a'] = 0;
5055     fd->L1['C'] = 1; fd->L1['c'] = 1;
5056     fd->L1['G'] = 2; fd->L1['g'] = 2;
5057     fd->L1['T'] = 3; fd->L1['t'] = 3;
5058 
5059     memset(fd->L2, 5, 256);
5060     fd->L2['A'] = 0; fd->L2['a'] = 0;
5061     fd->L2['C'] = 1; fd->L2['c'] = 1;
5062     fd->L2['G'] = 2; fd->L2['g'] = 2;
5063     fd->L2['T'] = 3; fd->L2['t'] = 3;
5064     fd->L2['N'] = 4; fd->L2['n'] = 4;
5065 
5066     if (CRAM_MAJOR_VERS(fd->version) == 1) {
5067         for (i = 0; i < 0x200; i++) {
5068             int f = 0;
5069 
5070             if (i & CRAM_FPAIRED)      f |= BAM_FPAIRED;
5071             if (i & CRAM_FPROPER_PAIR) f |= BAM_FPROPER_PAIR;
5072             if (i & CRAM_FUNMAP)       f |= BAM_FUNMAP;
5073             if (i & CRAM_FREVERSE)     f |= BAM_FREVERSE;
5074             if (i & CRAM_FREAD1)       f |= BAM_FREAD1;
5075             if (i & CRAM_FREAD2)       f |= BAM_FREAD2;
5076             if (i & CRAM_FSECONDARY)   f |= BAM_FSECONDARY;
5077             if (i & CRAM_FQCFAIL)      f |= BAM_FQCFAIL;
5078             if (i & CRAM_FDUP)         f |= BAM_FDUP;
5079 
5080             fd->bam_flag_swap[i]  = f;
5081         }
5082 
5083         for (i = 0; i < 0x1000; i++) {
5084             int g = 0;
5085 
5086             if (i & BAM_FPAIRED)           g |= CRAM_FPAIRED;
5087             if (i & BAM_FPROPER_PAIR)  g |= CRAM_FPROPER_PAIR;
5088             if (i & BAM_FUNMAP)        g |= CRAM_FUNMAP;
5089             if (i & BAM_FREVERSE)      g |= CRAM_FREVERSE;
5090             if (i & BAM_FREAD1)        g |= CRAM_FREAD1;
5091             if (i & BAM_FREAD2)        g |= CRAM_FREAD2;
5092             if (i & BAM_FSECONDARY)    g |= CRAM_FSECONDARY;
5093             if (i & BAM_FQCFAIL)       g |= CRAM_FQCFAIL;
5094             if (i & BAM_FDUP)          g |= CRAM_FDUP;
5095 
5096             fd->cram_flag_swap[i] = g;
5097         }
5098     } else {
5099         /* NOP */
5100         for (i = 0; i < 0x1000; i++)
5101             fd->bam_flag_swap[i] = i;
5102         for (i = 0; i < 0x1000; i++)
5103             fd->cram_flag_swap[i] = i;
5104     }
5105 
5106     memset(fd->cram_sub_matrix, 4, 32*32);
5107     for (i = 0; i < 32; i++) {
5108         fd->cram_sub_matrix[i]['A'&0x1f]=0;
5109         fd->cram_sub_matrix[i]['C'&0x1f]=1;
5110         fd->cram_sub_matrix[i]['G'&0x1f]=2;
5111         fd->cram_sub_matrix[i]['T'&0x1f]=3;
5112         fd->cram_sub_matrix[i]['N'&0x1f]=4;
5113     }
5114     for (i = 0; i < 20; i+=4) {
5115         int j;
5116         for (j = 0; j < 20; j++) {
5117             fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5118             fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5119             fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5120             fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5121         }
5122         fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+0]&0x1f]=0;
5123         fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+1]&0x1f]=1;
5124         fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+2]&0x1f]=2;
5125         fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+3]&0x1f]=3;
5126     }
5127 
5128     cram_init_varint(&fd->vv, CRAM_MAJOR_VERS(fd->version));
5129 }
5130 
5131 // Default version numbers for CRAM
5132 static int major_version = 3;
5133 static int minor_version = 0;
5134 
5135 /*
5136  * Opens a CRAM file for read (mode "rb") or write ("wb").
5137  * The filename may be "-" to indicate stdin or stdout.
5138  *
5139  * Returns file handle on success
5140  *         NULL on failure.
5141  */
cram_open(const char * filename,const char * mode)5142 cram_fd *cram_open(const char *filename, const char *mode) {
5143     hFILE *fp;
5144     cram_fd *fd;
5145     char fmode[3]= { mode[0], '\0', '\0' };
5146 
5147     if (strlen(mode) > 1 && (mode[1] == 'b' || mode[1] == 'c')) {
5148         fmode[1] = 'b';
5149     }
5150 
5151     fp = hopen(filename, fmode);
5152     if (!fp)
5153         return NULL;
5154 
5155     fd = cram_dopen(fp, filename, mode);
5156     if (!fd)
5157         hclose_abruptly(fp);
5158 
5159     return fd;
5160 }
5161 
5162 /* Opens an existing stream for reading or writing.
5163  *
5164  * Returns file handle on success;
5165  *         NULL on failure.
5166  */
cram_dopen(hFILE * fp,const char * filename,const char * mode)5167 cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) {
5168     int i;
5169     char *cp;
5170     cram_fd *fd = calloc(1, sizeof(*fd));
5171     if (!fd)
5172         return NULL;
5173 
5174     fd->level = CRAM_DEFAULT_LEVEL;
5175     for (i = 0; mode[i]; i++) {
5176         if (mode[i] >= '0' && mode[i] <= '9') {
5177             fd->level = mode[i] - '0';
5178             break;
5179         }
5180     }
5181 
5182     fd->fp = fp;
5183     fd->mode = *mode;
5184     fd->first_container = 0;
5185     fd->curr_position = 0;
5186 
5187     if (fd->mode == 'r') {
5188         /* Reader */
5189 
5190         if (!(fd->file_def = cram_read_file_def(fd)))
5191             goto err;
5192 
5193         fd->version = fd->file_def->major_version * 256 +
5194             fd->file_def->minor_version;
5195 
5196         cram_init_tables(fd);
5197 
5198         if (!(fd->header = cram_read_SAM_hdr(fd))) {
5199             cram_free_file_def(fd->file_def);
5200             goto err;
5201         }
5202 
5203     } else {
5204         /* Writer */
5205         cram_file_def *def = calloc(1, sizeof(*def));
5206         if (!def)
5207             return NULL;
5208 
5209         fd->file_def = def;
5210 
5211         def->magic[0] = 'C';
5212         def->magic[1] = 'R';
5213         def->magic[2] = 'A';
5214         def->magic[3] = 'M';
5215         def->major_version = 0; // Indicator to write file def later.
5216         def->minor_version = 0;
5217         memset(def->file_id, 0, 20);
5218         strncpy(def->file_id, filename, 20);
5219 
5220         fd->version = major_version * 256 + minor_version;
5221         cram_init_tables(fd);
5222 
5223         /* SAM header written later along with this file_def */
5224     }
5225 
5226     fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename);
5227     if (!fd->prefix)
5228         goto err;
5229     fd->first_base = fd->last_base = -1;
5230     fd->record_counter = 0;
5231 
5232     fd->ctr = NULL;
5233     fd->ctr_mt = NULL;
5234     fd->refs  = refs_create();
5235     if (!fd->refs)
5236         goto err;
5237     fd->ref_id = -2;
5238     fd->ref = NULL;
5239 
5240     fd->decode_md = 0;
5241     fd->seqs_per_slice = SEQS_PER_SLICE;
5242     fd->bases_per_slice = BASES_PER_SLICE;
5243     fd->slices_per_container = SLICE_PER_CNT;
5244     fd->embed_ref = 0;
5245     fd->no_ref = 0;
5246     fd->ap_delta = 0;
5247     fd->ignore_md5 = 0;
5248     fd->lossy_read_names = 0;
5249     fd->use_bz2 = 0;
5250     fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3);
5251     fd->use_tok = (CRAM_MAJOR_VERS(fd->version) >= 3) && (CRAM_MINOR_VERS(fd->version) >= 1);
5252     fd->use_lzma = 0;
5253     fd->multi_seq = -1;
5254     fd->multi_seq_user = -1;
5255     fd->unsorted   = 0;
5256     fd->shared_ref = 0;
5257     fd->store_md = 0;
5258     fd->store_nm = 0;
5259     fd->last_RI_count = 0;
5260 
5261     fd->index       = NULL;
5262     fd->own_pool    = 0;
5263     fd->pool        = NULL;
5264     fd->rqueue      = NULL;
5265     fd->job_pending = NULL;
5266     fd->ooc         = 0;
5267     fd->required_fields = INT_MAX;
5268 
5269     for (i = 0; i < DS_END; i++) {
5270         fd->m[i] = cram_new_metrics();
5271         if (!fd->m[i])
5272             goto err;
5273     }
5274 
5275     if (!(fd->tags_used = kh_init(m_metrics)))
5276         goto err;
5277 
5278     fd->range.refid = -2; // no ref.
5279     fd->eof = 1;          // See samtools issue #150
5280     fd->ref_fn = NULL;
5281 
5282     fd->bl = NULL;
5283 
5284     /* Initialise dummy refs from the @SQ headers */
5285     if (-1 == refs_from_header(fd))
5286         goto err;
5287 
5288     return fd;
5289 
5290  err:
5291     if (fd)
5292         free(fd);
5293 
5294     return NULL;
5295 }
5296 
5297 /*
5298  * Seek within a CRAM file.
5299  *
5300  * Returns 0 on success
5301  *        -1 on failure
5302  */
cram_seek(cram_fd * fd,off_t offset,int whence)5303 int cram_seek(cram_fd *fd, off_t offset, int whence) {
5304     char buf[65536];
5305 
5306     fd->ooc = 0;
5307 
5308     cram_drain_rqueue(fd);
5309 
5310     if (hseek(fd->fp, offset, whence) >= 0) {
5311         return 0;
5312     }
5313 
5314     if (!(whence == SEEK_CUR && offset >= 0))
5315         return -1;
5316 
5317     /* Couldn't fseek, but we're in SEEK_CUR mode so read instead */
5318     while (offset > 0) {
5319         int len = MIN(65536, offset);
5320         if (len != hread(fd->fp, buf, len))
5321             return -1;
5322         offset -= len;
5323     }
5324 
5325     return 0;
5326 }
5327 
5328 /*
5329  * Flushes a CRAM file.
5330  * Useful for when writing to stdout without wishing to close the stream.
5331  *
5332  * Returns 0 on success
5333  *        -1 on failure
5334  */
cram_flush(cram_fd * fd)5335 int cram_flush(cram_fd *fd) {
5336     if (!fd)
5337         return -1;
5338 
5339     if (fd->mode == 'w' && fd->ctr) {
5340         if(fd->ctr->slice)
5341             cram_update_curr_slice(fd->ctr);
5342 
5343         if (-1 == cram_flush_container_mt(fd, fd->ctr))
5344             return -1;
5345     }
5346 
5347     return 0;
5348 }
5349 
5350 /*
5351  * Writes an EOF block to a CRAM file.
5352  *
5353  * Returns 0 on success
5354  *        -1 on failure
5355  */
cram_write_eof_block(cram_fd * fd)5356 int cram_write_eof_block(cram_fd *fd) {
5357     // EOF block is a container with special values to aid detection
5358     if (CRAM_MAJOR_VERS(fd->version) >= 2) {
5359         // Empty container with
5360         //   ref_seq_id -1
5361         //   start pos 0x454f46 ("EOF")
5362         //   span 0
5363         //   nrec 0
5364         //   counter 0
5365         //   nbases 0
5366         //   1 block (landmark 0)
5367         //   (CRC32)
5368         cram_container c;
5369         memset(&c, 0, sizeof(c));
5370         c.ref_seq_id = -1;
5371         c.ref_seq_start = 0x454f46; // "EOF"
5372         c.ref_seq_span = 0;
5373         c.record_counter = 0;
5374         c.num_bases = 0;
5375         c.num_blocks = 1;
5376         int32_t land[1] = {0};
5377         c.landmark = land;
5378 
5379         // An empty compression header block with
5380         //   method raw (0)
5381         //   type comp header (1)
5382         //   content id 0
5383         //   block contents size 6
5384         //   raw size 6
5385         //     empty preservation map (01 00)
5386         //     empty data series map (01 00)
5387         //     empty tag map (01 00)
5388         //   block CRC
5389         cram_block_compression_hdr ch;
5390         memset(&ch, 0, sizeof(ch));
5391         c.comp_hdr_block = cram_encode_compression_header(fd, &c, &ch);
5392 
5393         c.length = c.comp_hdr_block->byte            // Landmark[0]
5394             + 5                                      // block struct
5395             + 4*(CRAM_MAJOR_VERS(fd->version) >= 3); // CRC
5396         if (cram_write_container(fd, &c) < 0 ||
5397             cram_write_block(fd, c.comp_hdr_block) < 0) {
5398             cram_close(fd);
5399             cram_free_block(c.comp_hdr_block);
5400             return -1;
5401         }
5402         if (ch.preservation_map)
5403             kh_destroy(map, ch.preservation_map);
5404         cram_free_block(c.comp_hdr_block);
5405 
5406         // V2.1 bytes
5407         // 0b 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id
5408         // e0 45 4f 46 00 00 00       // Cont HDR: pos, span, nrec, counter
5409         // 00 01 00                   // Cont HDR: nbase, nblk, landmark
5410         // 00 01 00 06 06             // Comp.HDR blk
5411         // 01 00 01 00 01 00          // Comp.HDR blk
5412 
5413         // V3.0 bytes:
5414         // 0f 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id
5415         // e0 45 4f 46 00 00 00       // Cont HDR: pos, span, nrec, counter
5416         // 00 01 00                   // Cont HDR: nbase, nblk, landmark
5417         // 05 bd d9 4f                // CRC32
5418         // 00 01 00 06 06             // Comp.HDR blk
5419         // 01 00 01 00 01 00          // Comp.HDR blk
5420         // ee 63 01 4b                // CRC32
5421 
5422         // V4.0 bytes:
5423         // 0f 00 00 00 8f ff ff ff    // Cont HDR: size, ref seq id
5424         // 82 95 9e 46 00 00 00       // Cont HDR: pos, span, nrec, counter
5425         // 00 01 00                   // Cont HDR: nbase, nblk, landmark
5426         // ac d6 05 bc                // CRC32
5427         // 00 01 00 06 06             // Comp.HDR blk
5428         // 01 00 01 00 01 00          // Comp.HDR blk
5429         // ee 63 01 4b                // CRC32
5430     }
5431 
5432     return 0;
5433 }
5434 /*
5435  * Closes a CRAM file.
5436  * Returns 0 on success
5437  *        -1 on failure
5438  */
cram_close(cram_fd * fd)5439 int cram_close(cram_fd *fd) {
5440     spare_bams *bl, *next;
5441     int i;
5442 
5443     if (!fd)
5444         return -1;
5445 
5446     if (fd->mode == 'w' && fd->ctr) {
5447         if(fd->ctr->slice)
5448             cram_update_curr_slice(fd->ctr);
5449 
5450         if (-1 == cram_flush_container_mt(fd, fd->ctr))
5451             return -1;
5452     }
5453 
5454     if (fd->mode != 'w')
5455         cram_drain_rqueue(fd);
5456 
5457     if (fd->pool && fd->eof >= 0 && fd->rqueue) {
5458         hts_tpool_process_flush(fd->rqueue);
5459 
5460         if (0 != cram_flush_result(fd))
5461             return -1;
5462 
5463         if (fd->mode == 'w')
5464             fd->ctr = NULL; // prevent double freeing
5465 
5466         pthread_mutex_destroy(&fd->metrics_lock);
5467         pthread_mutex_destroy(&fd->ref_lock);
5468         pthread_mutex_destroy(&fd->bam_list_lock);
5469 
5470         //fprintf(stderr, "CRAM: destroy queue %p\n", fd->rqueue);
5471 
5472         hts_tpool_process_destroy(fd->rqueue);
5473     }
5474 
5475     if (fd->mode == 'w') {
5476         /* Write EOF block */
5477         if (0 != cram_write_eof_block(fd))
5478             return -1;
5479     }
5480 
5481     for (bl = fd->bl; bl; bl = next) {
5482         int i, max_rec = fd->seqs_per_slice * fd->slices_per_container;
5483 
5484         next = bl->next;
5485         for (i = 0; i < max_rec; i++) {
5486             if (bl->bams[i])
5487                 bam_free(bl->bams[i]);
5488         }
5489         free(bl->bams);
5490         free(bl);
5491     }
5492 
5493     if (hclose(fd->fp) != 0)
5494         return -1;
5495 
5496     if (fd->file_def)
5497         cram_free_file_def(fd->file_def);
5498 
5499     if (fd->header)
5500         sam_hdr_destroy(fd->header);
5501 
5502     free(fd->prefix);
5503 
5504     if (fd->ctr)
5505         cram_free_container(fd->ctr);
5506 
5507     if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
5508         cram_free_container(fd->ctr_mt);
5509 
5510     if (fd->refs)
5511         refs_free(fd->refs);
5512     if (fd->ref_free)
5513         free(fd->ref_free);
5514 
5515     for (i = 0; i < DS_END; i++)
5516         if (fd->m[i])
5517             free(fd->m[i]);
5518 
5519     if (fd->tags_used) {
5520         khint_t k;
5521 
5522         for (k = kh_begin(fd->tags_used); k != kh_end(fd->tags_used); k++) {
5523             if (kh_exist(fd->tags_used, k))
5524                 free(kh_val(fd->tags_used, k));
5525         }
5526 
5527         kh_destroy(m_metrics, fd->tags_used);
5528     }
5529 
5530     if (fd->index)
5531         cram_index_free(fd);
5532 
5533     if (fd->own_pool && fd->pool)
5534         hts_tpool_destroy(fd->pool);
5535 
5536     if (fd->idxfp)
5537         if (bgzf_close(fd->idxfp) < 0)
5538             return -1;
5539 
5540     free(fd);
5541     return 0;
5542 }
5543 
5544 /*
5545  * Returns 1 if we hit an EOF while reading.
5546  */
cram_eof(cram_fd * fd)5547 int cram_eof(cram_fd *fd) {
5548     return fd->eof;
5549 }
5550 
5551 
5552 /*
5553  * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
5554  * Use this immediately after opening.
5555  *
5556  * Returns 0 on success
5557  *        -1 on failure
5558  */
cram_set_option(cram_fd * fd,enum hts_fmt_option opt,...)5559 int cram_set_option(cram_fd *fd, enum hts_fmt_option opt, ...) {
5560     int r;
5561     va_list args;
5562 
5563     va_start(args, opt);
5564     r = cram_set_voption(fd, opt, args);
5565     va_end(args);
5566 
5567     return r;
5568 }
5569 
5570 /*
5571  * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
5572  * Use this immediately after opening.
5573  *
5574  * Returns 0 on success
5575  *        -1 on failure
5576  */
cram_set_voption(cram_fd * fd,enum hts_fmt_option opt,va_list args)5577 int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) {
5578     refs_t *refs;
5579 
5580     if (!fd) {
5581         errno = EBADF;
5582         return -1;
5583     }
5584 
5585     switch (opt) {
5586     case CRAM_OPT_DECODE_MD:
5587         fd->decode_md = va_arg(args, int);
5588         break;
5589 
5590     case CRAM_OPT_PREFIX:
5591         if (fd->prefix)
5592             free(fd->prefix);
5593         if (!(fd->prefix = strdup(va_arg(args, char *))))
5594             return -1;
5595         break;
5596 
5597     case CRAM_OPT_VERBOSITY:
5598         break;
5599 
5600     case CRAM_OPT_SEQS_PER_SLICE:
5601         fd->seqs_per_slice = va_arg(args, int);
5602         if (fd->bases_per_slice == BASES_PER_SLICE)
5603             fd->bases_per_slice = fd->seqs_per_slice * 500;
5604         break;
5605 
5606     case CRAM_OPT_BASES_PER_SLICE:
5607         fd->bases_per_slice = va_arg(args, int);
5608         break;
5609 
5610     case CRAM_OPT_SLICES_PER_CONTAINER:
5611         fd->slices_per_container = va_arg(args, int);
5612         break;
5613 
5614     case CRAM_OPT_EMBED_REF:
5615         fd->embed_ref = va_arg(args, int);
5616         break;
5617 
5618     case CRAM_OPT_NO_REF:
5619         fd->no_ref = va_arg(args, int);
5620         break;
5621 
5622     case CRAM_OPT_POS_DELTA:
5623         fd->ap_delta = va_arg(args, int);
5624         break;
5625 
5626     case CRAM_OPT_IGNORE_MD5:
5627         fd->ignore_md5 = va_arg(args, int);
5628         break;
5629 
5630     case CRAM_OPT_LOSSY_NAMES:
5631         fd->lossy_read_names = va_arg(args, int);
5632         // Currently lossy read names required paired (attached) reads.
5633         // TLEN 0 or being 1 out causes read pairs to be detached, breaking
5634         // the lossy read name compression, so we have extra options to
5635         // slacken the exact TLEN round-trip checks.
5636         fd->tlen_approx = fd->lossy_read_names;
5637         fd->tlen_zero = fd->lossy_read_names;
5638         break;
5639 
5640     case CRAM_OPT_USE_BZIP2:
5641         fd->use_bz2 = va_arg(args, int);
5642         break;
5643 
5644     case CRAM_OPT_USE_RANS:
5645         fd->use_rans = va_arg(args, int);
5646         break;
5647 
5648     case CRAM_OPT_USE_TOK:
5649         fd->use_tok = va_arg(args, int);
5650         break;
5651 
5652     case CRAM_OPT_USE_FQZ:
5653         fd->use_fqz = va_arg(args, int);
5654         break;
5655 
5656     case CRAM_OPT_USE_ARITH:
5657         fd->use_arith = va_arg(args, int);
5658         break;
5659 
5660     case CRAM_OPT_USE_LZMA:
5661         fd->use_lzma = va_arg(args, int);
5662         break;
5663 
5664     case CRAM_OPT_SHARED_REF:
5665         fd->shared_ref = 1;
5666         refs = va_arg(args, refs_t *);
5667         if (refs != fd->refs) {
5668             if (fd->refs)
5669                 refs_free(fd->refs);
5670             fd->refs = refs;
5671             fd->refs->count++;
5672         }
5673         break;
5674 
5675     case CRAM_OPT_RANGE: {
5676         int r = cram_seek_to_refpos(fd, va_arg(args, cram_range *));
5677         pthread_mutex_lock(&fd->range_lock);
5678         if (fd->range.refid != -2)
5679             fd->required_fields |= SAM_POS;
5680         pthread_mutex_unlock(&fd->range_lock);
5681         return r;
5682     }
5683 
5684     case CRAM_OPT_RANGE_NOSEEK: {
5685         // As per CRAM_OPT_RANGE, but no seeking
5686         pthread_mutex_lock(&fd->range_lock);
5687         cram_range *r = va_arg(args, cram_range *);
5688         fd->range = *r;
5689         if (r->refid == HTS_IDX_NOCOOR) {
5690             fd->range.refid = -1;
5691             fd->range.start = 0;
5692         } else if (r->refid == HTS_IDX_START || r->refid == HTS_IDX_REST) {
5693             fd->range.refid = -2; // special case in cram_next_slice
5694         }
5695         if (fd->range.refid != -2)
5696             fd->required_fields |= SAM_POS;
5697         fd->ooc = 0;
5698         fd->eof = 0;
5699         pthread_mutex_unlock(&fd->range_lock);
5700         return 0;
5701     }
5702 
5703     case CRAM_OPT_REFERENCE:
5704         return cram_load_reference(fd, va_arg(args, char *));
5705 
5706     case CRAM_OPT_VERSION: {
5707         int major, minor;
5708         char *s = va_arg(args, char *);
5709         if (2 != sscanf(s, "%d.%d", &major, &minor)) {
5710             hts_log_error("Malformed version string %s", s);
5711             return -1;
5712         }
5713         if (!((major == 1 &&  minor == 0) ||
5714               (major == 2 && (minor == 0 || minor == 1)) ||
5715               (major == 3 && (minor == 0 || minor == 1)) ||
5716               (major == 4 &&  minor == 0))) {
5717             hts_log_error("Unknown version string; use 1.0, 2.0, 2.1, 3.0, 3.1 or 4.0");
5718             errno = EINVAL;
5719             return -1;
5720         }
5721 
5722         if (major > 3 || (major == 3 && minor > 0)) {
5723             hts_log_warning(
5724                 "CRAM version %s is still a draft and subject to change.\n"
5725                 "This is a technology demonstration that should not be "
5726                 "used for archival data.", s);
5727         }
5728 
5729         fd->version = major*256 + minor;
5730 
5731         fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3) ? 1 : 0;
5732 
5733         fd->use_tok = ((CRAM_MAJOR_VERS(fd->version) == 3 &&
5734                         CRAM_MINOR_VERS(fd->version) >= 1) ||
5735                         CRAM_MAJOR_VERS(fd->version) >= 4) ? 1 : 0;
5736         cram_init_tables(fd);
5737 
5738         break;
5739     }
5740 
5741     case CRAM_OPT_MULTI_SEQ_PER_SLICE:
5742         fd->multi_seq_user = fd->multi_seq = va_arg(args, int);
5743         break;
5744 
5745     case CRAM_OPT_NTHREADS: {
5746         int nthreads =  va_arg(args, int);
5747         if (nthreads >= 1) {
5748             if (!(fd->pool = hts_tpool_init(nthreads)))
5749                 return -1;
5750 
5751             fd->rqueue = hts_tpool_process_init(fd->pool, nthreads*2, 0);
5752             pthread_mutex_init(&fd->metrics_lock, NULL);
5753             pthread_mutex_init(&fd->ref_lock, NULL);
5754             pthread_mutex_init(&fd->range_lock, NULL);
5755             pthread_mutex_init(&fd->bam_list_lock, NULL);
5756             fd->shared_ref = 1;
5757             fd->own_pool = 1;
5758         }
5759         break;
5760     }
5761 
5762     case CRAM_OPT_THREAD_POOL: {
5763         htsThreadPool *p = va_arg(args, htsThreadPool *);
5764         fd->pool = p ? p->pool : NULL;
5765         if (fd->pool) {
5766             fd->rqueue = hts_tpool_process_init(fd->pool,
5767                                                 p->qsize ? p->qsize : hts_tpool_size(fd->pool)*2,
5768                                                 0);
5769             pthread_mutex_init(&fd->metrics_lock, NULL);
5770             pthread_mutex_init(&fd->ref_lock, NULL);
5771             pthread_mutex_init(&fd->range_lock, NULL);
5772             pthread_mutex_init(&fd->bam_list_lock, NULL);
5773         }
5774         fd->shared_ref = 1; // Needed to avoid clobbering ref between threads
5775         fd->own_pool = 0;
5776 
5777         //fd->qsize = 1;
5778         //fd->decoded = calloc(fd->qsize, sizeof(cram_container *));
5779         //hts_tpool_dispatch(fd->pool, cram_decoder_thread, fd);
5780         break;
5781     }
5782 
5783     case CRAM_OPT_REQUIRED_FIELDS:
5784         fd->required_fields = va_arg(args, int);
5785         if (fd->range.refid != -2)
5786             fd->required_fields |= SAM_POS;
5787         break;
5788 
5789     case CRAM_OPT_STORE_MD:
5790         fd->store_md = va_arg(args, int);
5791         break;
5792 
5793     case CRAM_OPT_STORE_NM:
5794         fd->store_nm = va_arg(args, int);
5795         break;
5796 
5797     case HTS_OPT_COMPRESSION_LEVEL:
5798         fd->level = va_arg(args, int);
5799         break;
5800 
5801     case HTS_OPT_PROFILE: {
5802         enum hts_profile_option prof = va_arg(args, int);
5803         switch (prof) {
5804         case HTS_PROFILE_FAST:
5805             if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 1;
5806             fd->use_tok = 0;
5807             fd->seqs_per_slice = 10000;
5808             break;
5809 
5810         case HTS_PROFILE_NORMAL:
5811             break;
5812 
5813         case HTS_PROFILE_SMALL:
5814             if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 6;
5815             fd->use_bz2 = 1;
5816             fd->use_fqz = 1;
5817             fd->seqs_per_slice = 25000;
5818             break;
5819 
5820         case HTS_PROFILE_ARCHIVE:
5821             if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 7;
5822             fd->use_bz2 = 1;
5823             fd->use_fqz = 1;
5824             fd->use_arith = 1;
5825             if (fd->level > 7)
5826                 fd->use_lzma = 1;
5827             fd->seqs_per_slice = 100000;
5828             break;
5829         }
5830 
5831         if (fd->bases_per_slice == BASES_PER_SLICE)
5832             fd->bases_per_slice = fd->seqs_per_slice * 500;
5833         break;
5834     }
5835 
5836     default:
5837         hts_log_error("Unknown CRAM option code %d", opt);
5838         errno = EINVAL;
5839         return -1;
5840     }
5841 
5842     return 0;
5843 }
5844 
cram_check_EOF(cram_fd * fd)5845 int cram_check_EOF(cram_fd *fd)
5846 {
5847     // Byte 9 in these templates is & with 0x0f to resolve differences
5848     // between ITF-8 interpretations between early Java and C
5849     // implementations of CRAM
5850     static const unsigned char TEMPLATE_2_1[30] = {
5851         0x0b, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
5852         0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
5853         0x01, 0x00, 0x06, 0x06, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00
5854     };
5855     static const unsigned char TEMPLATE_3[38] = {
5856         0x0f, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
5857         0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x05,
5858         0xbd, 0xd9, 0x4f, 0x00, 0x01, 0x00, 0x06, 0x06, 0x01, 0x00,
5859         0x01, 0x00, 0x01, 0x00, 0xee, 0x63, 0x01, 0x4b
5860     };
5861 
5862     unsigned char buf[38]; // max(sizeof TEMPLATE_*)
5863 
5864     uint8_t major = CRAM_MAJOR_VERS(fd->version);
5865     uint8_t minor = CRAM_MINOR_VERS(fd->version);
5866 
5867     const unsigned char *template;
5868     ssize_t template_len;
5869     if ((major < 2) ||
5870         (major == 2 && minor == 0)) {
5871         return 3; // No EOF support in cram versions less than 2.1
5872     } else if (major == 2 && minor == 1) {
5873         template = TEMPLATE_2_1;
5874         template_len = sizeof TEMPLATE_2_1;
5875     } else {
5876         template = TEMPLATE_3;
5877         template_len = sizeof TEMPLATE_3;
5878     }
5879 
5880     off_t offset = htell(fd->fp);
5881     if (hseek(fd->fp, -template_len, SEEK_END) < 0) {
5882         if (errno == ESPIPE) {
5883             hclearerr(fd->fp);
5884             return 2;
5885         }
5886         else {
5887             return -1;
5888         }
5889     }
5890     if (hread(fd->fp, buf, template_len) != template_len) return -1;
5891     if (hseek(fd->fp, offset, SEEK_SET) < 0) return -1;
5892     buf[8] &= 0x0f;
5893     return (memcmp(template, buf, template_len) == 0)? 1 : 0;
5894 }
5895