1 /*
2 Copyright (c) 2012-2016 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 
8    1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10 
11    2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14 
15    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18 
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30 
31 /*
32  * CRAM I/O primitives.
33  *
34  * - ITF8 encoding and decoding.
35  * - Block based I/O
36  * - Zlib inflating and deflating (memory)
37  * - CRAM basic data structure reading and writing
38  * - File opening / closing
39  * - Reference sequence handling
40  */
41 
42 /*
43  * TODO: BLOCK_GROW, BLOCK_RESIZE, BLOCK_APPEND and itf8_put_blk all need
44  * a way to return errors for when malloc fails.
45  */
46 
47 #include <config.h>
48 
49 #include <stdio.h>
50 #include <errno.h>
51 #include <assert.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include <unistd.h>
55 #include <zlib.h>
56 #ifdef HAVE_LIBBZ2
57 #include <bzlib.h>
58 #endif
59 #ifdef HAVE_LIBLZMA
60 #ifdef HAVE_LZMA_H
61 #include <lzma.h>
62 #else
63 #include "os/lzma_stub.h"
64 #endif
65 #endif
66 #include <sys/types.h>
67 #include <sys/stat.h>
68 #include <math.h>
69 #include <time.h>
70 #include <stdint.h>
71 
72 #ifdef HAVE_LIBDEFLATE
73 #include <libdeflate.h>
74 #define crc32(a,b,c) libdeflate_crc32((a),(b),(c))
75 #endif
76 
77 #include "cram/cram.h"
78 #include "cram/os.h"
79 #include "htslib/hts.h"
80 #include "cram/open_trace_file.h"
81 #include "cram/rANS_static.h"
82 
83 //#define REF_DEBUG
84 
85 #ifdef REF_DEBUG
86 #include <sys/syscall.h>
87 #define gettid() (int)syscall(SYS_gettid)
88 
89 #define RP(...) fprintf (stderr, __VA_ARGS__)
90 #else
91 #define RP(...)
92 #endif
93 
94 #include "htslib/hfile.h"
95 #include "htslib/bgzf.h"
96 #include "htslib/faidx.h"
97 #include "hts_internal.h"
98 
99 #ifndef PATH_MAX
100 #define PATH_MAX FILENAME_MAX
101 #endif
102 
103 #define TRIAL_SPAN 50
104 #define NTRIALS 3
105 
106 
107 /* ----------------------------------------------------------------------
108  * ITF8 encoding and decoding.
109  *
110  * Also see the itf8_get and itf8_put macros in cram_io.h
111  */
112 
113 /*
114  * LEGACY: consider using itf8_decode_crc.
115  *
116  * Reads an integer in ITF-8 encoding from 'cp' and stores it in
117  * *val.
118  *
119  * Returns the number of bytes read on success
120  *        -1 on failure
121  */
itf8_decode(cram_fd * fd,int32_t * val_p)122 int itf8_decode(cram_fd *fd, int32_t *val_p) {
123     static int nbytes[16] = {
124         0,0,0,0, 0,0,0,0,                               // 0000xxxx - 0111xxxx
125         1,1,1,1,                                        // 1000xxxx - 1011xxxx
126         2,2,                                            // 1100xxxx - 1101xxxx
127         3,                                              // 1110xxxx
128         4,                                              // 1111xxxx
129     };
130 
131     static int nbits[16] = {
132         0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
133         0x3f, 0x3f, 0x3f, 0x3f,                         // 1000xxxx - 1011xxxx
134         0x1f, 0x1f,                                     // 1100xxxx - 1101xxxx
135         0x0f,                                           // 1110xxxx
136         0x0f,                                           // 1111xxxx
137     };
138 
139     int32_t val = hgetc(fd->fp);
140     if (val == -1)
141         return -1;
142 
143     int i = nbytes[val>>4];
144     val &= nbits[val>>4];
145 
146     switch(i) {
147     case 0:
148         *val_p = val;
149         return 1;
150 
151     case 1:
152         val = (val<<8) | (unsigned char)hgetc(fd->fp);
153         *val_p = val;
154         return 2;
155 
156     case 2:
157         val = (val<<8) | (unsigned char)hgetc(fd->fp);
158         val = (val<<8) | (unsigned char)hgetc(fd->fp);
159         *val_p = val;
160         return 3;
161 
162     case 3:
163         val = (val<<8) | (unsigned char)hgetc(fd->fp);
164         val = (val<<8) | (unsigned char)hgetc(fd->fp);
165         val = (val<<8) | (unsigned char)hgetc(fd->fp);
166         *val_p = val;
167         return 4;
168 
169     case 4: // really 3.5 more, why make it different?
170         val = (val<<8) | (unsigned char)hgetc(fd->fp);
171         val = (val<<8) | (unsigned char)hgetc(fd->fp);
172         val = (val<<8) | (unsigned char)hgetc(fd->fp);
173         val = (val<<4) | (((unsigned char)hgetc(fd->fp)) & 0x0f);
174         *val_p = val;
175     }
176 
177     return 5;
178 }
179 
itf8_decode_crc(cram_fd * fd,int32_t * val_p,uint32_t * crc)180 int itf8_decode_crc(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
181     static int nbytes[16] = {
182         0,0,0,0, 0,0,0,0,                               // 0000xxxx - 0111xxxx
183         1,1,1,1,                                        // 1000xxxx - 1011xxxx
184         2,2,                                            // 1100xxxx - 1101xxxx
185         3,                                              // 1110xxxx
186         4,                                              // 1111xxxx
187     };
188 
189     static int nbits[16] = {
190         0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
191         0x3f, 0x3f, 0x3f, 0x3f,                         // 1000xxxx - 1011xxxx
192         0x1f, 0x1f,                                     // 1100xxxx - 1101xxxx
193         0x0f,                                           // 1110xxxx
194         0x0f,                                           // 1111xxxx
195     };
196     unsigned char c[5];
197 
198     int32_t val = hgetc(fd->fp);
199     if (val == -1)
200         return -1;
201 
202     c[0]=val;
203 
204     int i = nbytes[val>>4];
205     val &= nbits[val>>4];
206 
207     switch(i) {
208     case 0:
209         *val_p = val;
210         *crc = crc32(*crc, c, 1);
211         return 1;
212 
213     case 1:
214         val = (val<<8) | (c[1]=hgetc(fd->fp));
215         *val_p = val;
216         *crc = crc32(*crc, c, 2);
217         return 2;
218 
219     case 2:
220         val = (val<<8) | (c[1]=hgetc(fd->fp));
221         val = (val<<8) | (c[2]=hgetc(fd->fp));
222         *val_p = val;
223         *crc = crc32(*crc, c, 3);
224         return 3;
225 
226     case 3:
227         val = (val<<8) | (c[1]=hgetc(fd->fp));
228         val = (val<<8) | (c[2]=hgetc(fd->fp));
229         val = (val<<8) | (c[3]=hgetc(fd->fp));
230         *val_p = val;
231         *crc = crc32(*crc, c, 4);
232         return 4;
233 
234     case 4: // really 3.5 more, why make it different?
235         {
236             uint32_t uv = val;
237             uv = (uv<<8) |   (c[1]=hgetc(fd->fp));
238             uv = (uv<<8) |   (c[2]=hgetc(fd->fp));
239             uv = (uv<<8) |   (c[3]=hgetc(fd->fp));
240             uv = (uv<<4) | (((c[4]=hgetc(fd->fp))) & 0x0f);
241             *val_p = uv < 0x80000000UL ? uv : -((int32_t) (0xffffffffUL - uv)) - 1;
242             *crc = crc32(*crc, c, 5);
243         }
244     }
245 
246     return 5;
247 }
248 
249 /*
250  * Encodes and writes a single integer in ITF-8 format.
251  * Returns 0 on success
252  *        -1 on failure
253  */
itf8_encode(cram_fd * fd,int32_t val)254 int itf8_encode(cram_fd *fd, int32_t val) {
255     char buf[5];
256     int len = itf8_put(buf, val);
257     return hwrite(fd->fp, buf, len) == len ? 0 : -1;
258 }
259 
260 const int itf8_bytes[16] = {
261     1, 1, 1, 1,  1, 1, 1, 1,
262     2, 2, 2, 2,  3, 3, 4, 5
263 };
264 
265 const int ltf8_bytes[256] = {
266     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
267     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
268     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
269     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
270 
271     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
272     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
273     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
274     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
275 
276     2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
277     2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
278     2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
279     2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
280 
281     3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,
282     3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,
283 
284     4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
285 
286     5, 5, 5, 5,  5, 5, 5, 5,  6, 6, 6, 6,  7, 7, 8, 9
287 };
288 
289 /*
290  * LEGACY: consider using ltf8_decode_crc.
291  */
ltf8_decode(cram_fd * fd,int64_t * val_p)292 int ltf8_decode(cram_fd *fd, int64_t *val_p) {
293     int c = hgetc(fd->fp);
294     int64_t val = (unsigned char)c;
295     if (c == -1)
296         return -1;
297 
298     if (val < 0x80) {
299         *val_p =   val;
300         return 1;
301 
302     } else if (val < 0xc0) {
303         val = (val<<8) | (unsigned char)hgetc(fd->fp);
304         *val_p = val & (((1LL<<(6+8)))-1);
305         return 2;
306 
307     } else if (val < 0xe0) {
308         val = (val<<8) | (unsigned char)hgetc(fd->fp);
309         val = (val<<8) | (unsigned char)hgetc(fd->fp);
310         *val_p = val & ((1LL<<(5+2*8))-1);
311         return 3;
312 
313     } else if (val < 0xf0) {
314         val = (val<<8) | (unsigned char)hgetc(fd->fp);
315         val = (val<<8) | (unsigned char)hgetc(fd->fp);
316         val = (val<<8) | (unsigned char)hgetc(fd->fp);
317         *val_p = val & ((1LL<<(4+3*8))-1);
318         return 4;
319 
320     } else if (val < 0xf8) {
321         val = (val<<8) | (unsigned char)hgetc(fd->fp);
322         val = (val<<8) | (unsigned char)hgetc(fd->fp);
323         val = (val<<8) | (unsigned char)hgetc(fd->fp);
324         val = (val<<8) | (unsigned char)hgetc(fd->fp);
325         *val_p = val & ((1LL<<(3+4*8))-1);
326         return 5;
327 
328     } else if (val < 0xfc) {
329         val = (val<<8) | (unsigned char)hgetc(fd->fp);
330         val = (val<<8) | (unsigned char)hgetc(fd->fp);
331         val = (val<<8) | (unsigned char)hgetc(fd->fp);
332         val = (val<<8) | (unsigned char)hgetc(fd->fp);
333         val = (val<<8) | (unsigned char)hgetc(fd->fp);
334         *val_p = val & ((1LL<<(2+5*8))-1);
335         return 6;
336 
337     } else if (val < 0xfe) {
338         val = (val<<8) | (unsigned char)hgetc(fd->fp);
339         val = (val<<8) | (unsigned char)hgetc(fd->fp);
340         val = (val<<8) | (unsigned char)hgetc(fd->fp);
341         val = (val<<8) | (unsigned char)hgetc(fd->fp);
342         val = (val<<8) | (unsigned char)hgetc(fd->fp);
343         val = (val<<8) | (unsigned char)hgetc(fd->fp);
344         *val_p = val & ((1LL<<(1+6*8))-1);
345         return 7;
346 
347     } else if (val < 0xff) {
348         val = (val<<8) | (unsigned char)hgetc(fd->fp);
349         val = (val<<8) | (unsigned char)hgetc(fd->fp);
350         val = (val<<8) | (unsigned char)hgetc(fd->fp);
351         val = (val<<8) | (unsigned char)hgetc(fd->fp);
352         val = (val<<8) | (unsigned char)hgetc(fd->fp);
353         val = (val<<8) | (unsigned char)hgetc(fd->fp);
354         val = (val<<8) | (unsigned char)hgetc(fd->fp);
355         *val_p = val & ((1LL<<(7*8))-1);
356         return 8;
357 
358     } else {
359         val = (val<<8) | (unsigned char)hgetc(fd->fp);
360         val = (val<<8) | (unsigned char)hgetc(fd->fp);
361         val = (val<<8) | (unsigned char)hgetc(fd->fp);
362         val = (val<<8) | (unsigned char)hgetc(fd->fp);
363         val = (val<<8) | (unsigned char)hgetc(fd->fp);
364         val = (val<<8) | (unsigned char)hgetc(fd->fp);
365         val = (val<<8) | (unsigned char)hgetc(fd->fp);
366         val = (val<<8) | (unsigned char)hgetc(fd->fp);
367         *val_p = val;
368     }
369 
370     return 9;
371 }
372 
ltf8_decode_crc(cram_fd * fd,int64_t * val_p,uint32_t * crc)373 int ltf8_decode_crc(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
374     unsigned char c[9];
375     int64_t val = (unsigned char)hgetc(fd->fp);
376     if (val == -1)
377         return -1;
378 
379     c[0] = val;
380 
381     if (val < 0x80) {
382         *val_p =   val;
383         *crc = crc32(*crc, c, 1);
384         return 1;
385 
386     } else if (val < 0xc0) {
387         val = (val<<8) | (c[1]=hgetc(fd->fp));;
388         *val_p = val & (((1LL<<(6+8)))-1);
389         *crc = crc32(*crc, c, 2);
390         return 2;
391 
392     } else if (val < 0xe0) {
393         val = (val<<8) | (c[1]=hgetc(fd->fp));;
394         val = (val<<8) | (c[2]=hgetc(fd->fp));;
395         *val_p = val & ((1LL<<(5+2*8))-1);
396         *crc = crc32(*crc, c, 3);
397         return 3;
398 
399     } else if (val < 0xf0) {
400         val = (val<<8) | (c[1]=hgetc(fd->fp));;
401         val = (val<<8) | (c[2]=hgetc(fd->fp));;
402         val = (val<<8) | (c[3]=hgetc(fd->fp));;
403         *val_p = val & ((1LL<<(4+3*8))-1);
404         *crc = crc32(*crc, c, 4);
405         return 4;
406 
407     } else if (val < 0xf8) {
408         val = (val<<8) | (c[1]=hgetc(fd->fp));;
409         val = (val<<8) | (c[2]=hgetc(fd->fp));;
410         val = (val<<8) | (c[3]=hgetc(fd->fp));;
411         val = (val<<8) | (c[4]=hgetc(fd->fp));;
412         *val_p = val & ((1LL<<(3+4*8))-1);
413         *crc = crc32(*crc, c, 5);
414         return 5;
415 
416     } else if (val < 0xfc) {
417         val = (val<<8) | (c[1]=hgetc(fd->fp));;
418         val = (val<<8) | (c[2]=hgetc(fd->fp));;
419         val = (val<<8) | (c[3]=hgetc(fd->fp));;
420         val = (val<<8) | (c[4]=hgetc(fd->fp));;
421         val = (val<<8) | (c[5]=hgetc(fd->fp));;
422         *val_p = val & ((1LL<<(2+5*8))-1);
423         *crc = crc32(*crc, c, 6);
424         return 6;
425 
426     } else if (val < 0xfe) {
427         val = (val<<8) | (c[1]=hgetc(fd->fp));;
428         val = (val<<8) | (c[2]=hgetc(fd->fp));;
429         val = (val<<8) | (c[3]=hgetc(fd->fp));;
430         val = (val<<8) | (c[4]=hgetc(fd->fp));;
431         val = (val<<8) | (c[5]=hgetc(fd->fp));;
432         val = (val<<8) | (c[6]=hgetc(fd->fp));;
433         *val_p = val & ((1LL<<(1+6*8))-1);
434         *crc = crc32(*crc, c, 7);
435         return 7;
436 
437     } else if (val < 0xff) {
438         val = (val<<8) | (c[1]=hgetc(fd->fp));;
439         val = (val<<8) | (c[2]=hgetc(fd->fp));;
440         val = (val<<8) | (c[3]=hgetc(fd->fp));;
441         val = (val<<8) | (c[4]=hgetc(fd->fp));;
442         val = (val<<8) | (c[5]=hgetc(fd->fp));;
443         val = (val<<8) | (c[6]=hgetc(fd->fp));;
444         val = (val<<8) | (c[7]=hgetc(fd->fp));;
445         *val_p = val & ((1LL<<(7*8))-1);
446         *crc = crc32(*crc, c, 8);
447         return 8;
448 
449     } else {
450         val = (val<<8) | (c[1]=hgetc(fd->fp));;
451         val = (val<<8) | (c[2]=hgetc(fd->fp));;
452         val = (val<<8) | (c[3]=hgetc(fd->fp));;
453         val = (val<<8) | (c[4]=hgetc(fd->fp));;
454         val = (val<<8) | (c[5]=hgetc(fd->fp));;
455         val = (val<<8) | (c[6]=hgetc(fd->fp));;
456         val = (val<<8) | (c[7]=hgetc(fd->fp));;
457         val = (val<<8) | (c[8]=hgetc(fd->fp));;
458         *crc = crc32(*crc, c, 9);
459         *val_p = val;
460     }
461 
462     return 9;
463 }
464 
465 /*
466  * Pushes a value in ITF8 format onto the end of a block.
467  * This shouldn't be used for high-volume data as it is not the fastest
468  * method.
469  *
470  * Returns the number of bytes written
471  */
itf8_put_blk(cram_block * blk,int val)472 int itf8_put_blk(cram_block *blk, int val) {
473     char buf[5];
474     int sz;
475 
476     sz = itf8_put(buf, val);
477     BLOCK_APPEND(blk, buf, sz);
478     return sz;
479 }
480 
481 /*
482  * Decodes a 32-bit little endian value from fd and stores in val.
483  *
484  * Returns the number of bytes read on success
485  *         -1 on failure
486  */
int32_decode(cram_fd * fd,int32_t * val)487 int int32_decode(cram_fd *fd, int32_t *val) {
488     int32_t i;
489     if (4 != hread(fd->fp, &i, 4))
490         return -1;
491 
492     *val = le_int4(i);
493     return 4;
494 }
495 
496 /*
497  * Encodes a 32-bit little endian value 'val' and writes to fd.
498  *
499  * Returns the number of bytes written on success
500  *         -1 on failure
501  */
int32_encode(cram_fd * fd,int32_t val)502 int int32_encode(cram_fd *fd, int32_t val) {
503     val = le_int4(val);
504     if (4 != hwrite(fd->fp, &val, 4))
505         return -1;
506 
507     return 4;
508 }
509 
510 /* As int32_decoded/encode, but from/to blocks instead of cram_fd */
int32_get_blk(cram_block * b,int32_t * val)511 int int32_get_blk(cram_block *b, int32_t *val) {
512     if (b->uncomp_size - BLOCK_SIZE(b) < 4)
513         return -1;
514 
515     *val =
516          b->data[b->byte  ]        |
517         (b->data[b->byte+1] <<  8) |
518         (b->data[b->byte+2] << 16) |
519         (b->data[b->byte+3] << 24);
520     BLOCK_SIZE(b) += 4;
521     return 4;
522 }
523 
524 /* As int32_decoded/encode, but from/to blocks instead of cram_fd */
int32_put_blk(cram_block * b,int32_t val)525 int int32_put_blk(cram_block *b, int32_t val) {
526     unsigned char cp[4];
527     cp[0] = ( val      & 0xff);
528     cp[1] = ((val>>8)  & 0xff);
529     cp[2] = ((val>>16) & 0xff);
530     cp[3] = ((val>>24) & 0xff);
531 
532     BLOCK_APPEND(b, cp, 4);
533     return b->data ? 0 : -1;
534 }
535 
536 /* ----------------------------------------------------------------------
537  * zlib compression code - from Gap5's tg_iface_g.c
538  * They're static here as they're only used within the cram_compress_block
539  * and cram_uncompress_block functions, which are the external interface.
540  */
zlib_mem_inflate(char * cdata,size_t csize,size_t * size)541 char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
542     z_stream s;
543     unsigned char *data = NULL; /* Uncompressed output */
544     int data_alloc = 0;
545     int err;
546 
547     /* Starting point at uncompressed size, and scale after that */
548     data = malloc(data_alloc = csize*1.2+100);
549     if (!data)
550         return NULL;
551 
552     /* Initialise zlib stream */
553     s.zalloc = Z_NULL; /* use default allocation functions */
554     s.zfree  = Z_NULL;
555     s.opaque = Z_NULL;
556     s.next_in  = (unsigned char *)cdata;
557     s.avail_in = csize;
558     s.total_in = 0;
559     s.next_out  = data;
560     s.avail_out = data_alloc;
561     s.total_out = 0;
562 
563     //err = inflateInit(&s);
564     err = inflateInit2(&s, 15 + 32);
565     if (err != Z_OK) {
566         hts_log_error("Call to zlib inflateInit failed: %s", s.msg);
567         free(data);
568         return NULL;
569     }
570 
571     /* Decode to 'data' array */
572     for (;s.avail_in;) {
573         unsigned char *data_tmp;
574         int alloc_inc;
575 
576         s.next_out = &data[s.total_out];
577         err = inflate(&s, Z_NO_FLUSH);
578         if (err == Z_STREAM_END)
579             break;
580 
581         if (err != Z_OK) {
582             hts_log_error("Call to zlib inflate failed: %s", s.msg);
583             if (data)
584                 free(data);
585             return NULL;
586         }
587 
588         /* More to come, so realloc based on growth so far */
589         alloc_inc = (double)s.avail_in/s.total_in * s.total_out + 100;
590         data = realloc((data_tmp = data), data_alloc += alloc_inc);
591         if (!data) {
592             free(data_tmp);
593             return NULL;
594         }
595         s.avail_out += alloc_inc;
596     }
597     inflateEnd(&s);
598 
599     *size = s.total_out;
600     return (char *)data;
601 }
602 
zlib_mem_deflate(char * data,size_t size,size_t * cdata_size,int level,int strat)603 static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size,
604                               int level, int strat) {
605     z_stream s;
606     unsigned char *cdata = NULL; /* Compressed output */
607     int cdata_alloc = 0;
608     int cdata_pos = 0;
609     int err;
610 
611     cdata = malloc(cdata_alloc = size*1.05+100);
612     if (!cdata)
613         return NULL;
614     cdata_pos = 0;
615 
616     /* Initialise zlib stream */
617     s.zalloc = Z_NULL; /* use default allocation functions */
618     s.zfree  = Z_NULL;
619     s.opaque = Z_NULL;
620     s.next_in  = (unsigned char *)data;
621     s.avail_in = size;
622     s.total_in = 0;
623     s.next_out  = cdata;
624     s.avail_out = cdata_alloc;
625     s.total_out = 0;
626     s.data_type = Z_BINARY;
627 
628     err = deflateInit2(&s, level, Z_DEFLATED, 15|16, 9, strat);
629     if (err != Z_OK) {
630         hts_log_error("Call to zlib deflateInit2 failed: %s", s.msg);
631         return NULL;
632     }
633 
634     /* Encode to 'cdata' array */
635     for (;s.avail_in;) {
636         s.next_out = &cdata[cdata_pos];
637         s.avail_out = cdata_alloc - cdata_pos;
638         if (cdata_alloc - cdata_pos <= 0) {
639             hts_log_error("Deflate produced larger output than expected");
640             return NULL;
641         }
642         err = deflate(&s, Z_NO_FLUSH);
643         cdata_pos = cdata_alloc - s.avail_out;
644         if (err != Z_OK) {
645             hts_log_error("Call to zlib deflate failed: %s", s.msg);
646             break;
647         }
648     }
649     if (deflate(&s, Z_FINISH) != Z_STREAM_END) {
650         hts_log_error("Call to zlib deflate failed: %s", s.msg);
651     }
652     *cdata_size = s.total_out;
653 
654     if (deflateEnd(&s) != Z_OK) {
655         hts_log_error("Call to zlib deflate failed: %s", s.msg);
656     }
657     return (char *)cdata;
658 }
659 
660 #ifdef HAVE_LIBLZMA
661 /* ------------------------------------------------------------------------ */
662 /*
663  * Data compression routines using liblzma (xz)
664  *
665  * On a test set this shrunk the main db from 136157104 bytes to 114796168, but
666  * caused tg_index to grow from 2m43.707s to 15m3.961s. Exporting as bfastq
667  * went from 18.3s to 36.3s. So decompression suffers too, but not as bad
668  * as compression times.
669  *
670  * For now we disable this functionality. If it's to be reenabled make sure you
671  * improve the mem_inflate implementation as it's just a test hack at the
672  * moment.
673  */
674 
lzma_mem_deflate(char * data,size_t size,size_t * cdata_size,int level)675 static char *lzma_mem_deflate(char *data, size_t size, size_t *cdata_size,
676                               int level) {
677     char *out;
678     size_t out_size = lzma_stream_buffer_bound(size);
679     *cdata_size = 0;
680 
681     out = malloc(out_size);
682 
683     /* Single call compression */
684     if (LZMA_OK != lzma_easy_buffer_encode(level, LZMA_CHECK_CRC32, NULL,
685                                            (uint8_t *)data, size,
686                                            (uint8_t *)out, cdata_size,
687                                            out_size))
688         return NULL;
689 
690     return out;
691 }
692 
lzma_mem_inflate(char * cdata,size_t csize,size_t * size)693 static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) {
694     lzma_stream strm = LZMA_STREAM_INIT;
695     size_t out_size = 0, out_pos = 0;
696     char *out = NULL;
697     int r;
698 
699     /* Initiate the decoder */
700     if (LZMA_OK != lzma_stream_decoder(&strm, lzma_easy_decoder_memusage(9), 0))
701         return NULL;
702 
703     /* Decode loop */
704     strm.avail_in = csize;
705     strm.next_in = (uint8_t *)cdata;
706 
707     for (;strm.avail_in;) {
708         if (strm.avail_in > out_size - out_pos) {
709             out_size += strm.avail_in * 4 + 32768;
710             out = realloc(out, out_size);
711         }
712         strm.avail_out = out_size - out_pos;
713         strm.next_out = (uint8_t *)&out[out_pos];
714 
715         r = lzma_code(&strm, LZMA_RUN);
716         if (LZMA_OK != r && LZMA_STREAM_END != r) {
717             hts_log_error("LZMA decode failure (error %d)", r);
718             return NULL;
719         }
720 
721         out_pos = strm.total_out;
722 
723         if (r == LZMA_STREAM_END)
724             break;
725     }
726 
727     /* finish up any unflushed data; necessary? */
728     r = lzma_code(&strm, LZMA_FINISH);
729     if (r != LZMA_OK && r != LZMA_STREAM_END) {
730         hts_log_error("Call to lzma_code failed with error %d", r);
731         return NULL;
732     }
733 
734     out = realloc(out, strm.total_out);
735     *size = strm.total_out;
736 
737     lzma_end(&strm);
738 
739     return out;
740 }
741 #endif
742 
743 /* ----------------------------------------------------------------------
744  * CRAM blocks - the dynamically growable data block. We have code to
745  * create, update, (un)compress and read/write.
746  *
747  * These are derived from the deflate_interlaced.c blocks, but with the
748  * CRAM extension of content types and IDs.
749  */
750 
751 /*
752  * Allocates a new cram_block structure with a specified content_type and
753  * id.
754  *
755  * Returns block pointer on success
756  *         NULL on failure
757  */
cram_new_block(enum cram_content_type content_type,int content_id)758 cram_block *cram_new_block(enum cram_content_type content_type,
759                            int content_id) {
760     cram_block *b = malloc(sizeof(*b));
761     if (!b)
762         return NULL;
763     b->method = b->orig_method = RAW;
764     b->content_type = content_type;
765     b->content_id = content_id;
766     b->comp_size = 0;
767     b->uncomp_size = 0;
768     b->data = NULL;
769     b->alloc = 0;
770     b->byte = 0;
771     b->bit = 7; // MSB
772 
773     return b;
774 }
775 
776 /*
777  * Reads a block from a cram file.
778  * Returns cram_block pointer on success.
779  *         NULL on failure
780  */
cram_read_block(cram_fd * fd)781 cram_block *cram_read_block(cram_fd *fd) {
782     cram_block *b = malloc(sizeof(*b));
783     unsigned char c;
784     uint32_t crc = 0;
785     if (!b)
786         return NULL;
787 
788     //fprintf(stderr, "Block at %d\n", (int)ftell(fd->fp));
789 
790     if (-1 == (b->method      = hgetc(fd->fp))) { free(b); return NULL; }
791     c = b->method; crc = crc32(crc, &c, 1);
792     if (-1 == (b->content_type= hgetc(fd->fp))) { free(b); return NULL; }
793     c = b->content_type; crc = crc32(crc, &c, 1);
794     if (-1 == itf8_decode_crc(fd, &b->content_id, &crc))  { free(b); return NULL; }
795     if (-1 == itf8_decode_crc(fd, &b->comp_size, &crc))   { free(b); return NULL; }
796     if (-1 == itf8_decode_crc(fd, &b->uncomp_size, &crc)) { free(b); return NULL; }
797 
798     //fprintf(stderr, "  method %d, ctype %d, cid %d, csize %d, ucsize %d\n",
799     //      b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size);
800 
801     if (b->method == RAW) {
802         if (b->uncomp_size < 0 || b->comp_size != b->uncomp_size) {
803             free(b);
804             return NULL;
805         }
806         b->alloc = b->uncomp_size;
807         if (!(b->data = malloc(b->uncomp_size))){ free(b); return NULL; }
808         if (b->uncomp_size != hread(fd->fp, b->data, b->uncomp_size)) {
809             free(b->data);
810             free(b);
811             return NULL;
812         }
813     } else {
814         if (b->comp_size < 0) { free(b); return NULL; }
815         b->alloc = b->comp_size;
816         if (!(b->data = malloc(b->comp_size)))  { free(b); return NULL; }
817         if (b->comp_size != hread(fd->fp, b->data, b->comp_size)) {
818             free(b->data);
819             free(b);
820             return NULL;
821         }
822     }
823 
824     if (CRAM_MAJOR_VERS(fd->version) >= 3) {
825         if (-1 == int32_decode(fd, (int32_t *)&b->crc32)) {
826             free(b);
827             return NULL;
828         }
829 
830         crc = crc32(crc, b->data ? b->data : (uc *)"", b->alloc);
831         if (crc != b->crc32) {
832             hts_log_error("Block CRC32 failure");
833             free(b->data);
834             free(b);
835             return NULL;
836         }
837     }
838 
839     b->orig_method = b->method;
840     b->idx = 0;
841     b->byte = 0;
842     b->bit = 7; // MSB
843 
844     return b;
845 }
846 
847 
848 /*
849  * Computes the size of a cram block, including the block
850  * header itself.
851  */
cram_block_size(cram_block * b)852 uint32_t cram_block_size(cram_block *b) {
853     unsigned char dat[100], *cp = dat;;
854     uint32_t sz;
855 
856     *cp++ = b->method;
857     *cp++ = b->content_type;
858     cp += itf8_put((char*)cp, b->content_id);
859     cp += itf8_put((char*)cp, b->comp_size);
860     cp += itf8_put((char*)cp, b->uncomp_size);
861 
862     sz = cp-dat + 4;
863     sz += b->method == RAW ? b->uncomp_size : b->comp_size;
864 
865     return sz;
866 }
867 
868 /*
869  * Writes a CRAM block.
870  * Returns 0 on success
871  *        -1 on failure
872  */
cram_write_block(cram_fd * fd,cram_block * b)873 int cram_write_block(cram_fd *fd, cram_block *b) {
874     assert(b->method != RAW || (b->comp_size == b->uncomp_size));
875 
876     if (hputc(b->method,       fd->fp)  == EOF) return -1;
877     if (hputc(b->content_type, fd->fp)  == EOF) return -1;
878     if (itf8_encode(fd, b->content_id)  ==  -1) return -1;
879     if (itf8_encode(fd, b->comp_size)   ==  -1) return -1;
880     if (itf8_encode(fd, b->uncomp_size) ==  -1) return -1;
881 
882     if (b->data) {
883         if (b->method == RAW) {
884             if (b->uncomp_size != hwrite(fd->fp, b->data, b->uncomp_size))
885                 return -1;
886         } else {
887             if (b->comp_size != hwrite(fd->fp, b->data, b->comp_size))
888                 return -1;
889         }
890     } else {
891         // Absent blocks should be size 0
892         assert(b->method == RAW && b->uncomp_size == 0);
893     }
894 
895     if (CRAM_MAJOR_VERS(fd->version) >= 3) {
896         unsigned char dat[100], *cp = dat;;
897         uint32_t crc;
898 
899         *cp++ = b->method;
900         *cp++ = b->content_type;
901         cp += itf8_put((char*)cp, b->content_id);
902         cp += itf8_put((char*)cp, b->comp_size);
903         cp += itf8_put((char*)cp, b->uncomp_size);
904         crc = crc32(0L, dat, cp-dat);
905 
906         if (b->method == RAW) {
907             b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->uncomp_size);
908         } else {
909             b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->comp_size);
910         }
911 
912         if (-1 == int32_encode(fd, b->crc32))
913             return -1;
914     }
915 
916     return 0;
917 }
918 
919 /*
920  * Frees a CRAM block, deallocating internal data too.
921  */
cram_free_block(cram_block * b)922 void cram_free_block(cram_block *b) {
923     if (!b)
924         return;
925     if (b->data)
926         free(b->data);
927     free(b);
928 }
929 
930 /*
931  * Uncompresses a CRAM block, if compressed.
932  */
cram_uncompress_block(cram_block * b)933 int cram_uncompress_block(cram_block *b) {
934     char *uncomp;
935     size_t uncomp_size = 0;
936 
937     if (b->uncomp_size == 0) {
938         // blank block
939         b->method = RAW;
940         return 0;
941     }
942 
943     switch (b->method) {
944     case RAW:
945         return 0;
946 
947     case GZIP:
948         uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
949         if (!uncomp)
950             return -1;
951         if ((int)uncomp_size != b->uncomp_size) {
952             free(uncomp);
953             return -1;
954         }
955         free(b->data);
956         b->data = (unsigned char *)uncomp;
957         b->alloc = uncomp_size;
958         b->method = RAW;
959         break;
960 
961 #ifdef HAVE_LIBBZ2
962     case BZIP2: {
963         unsigned int usize = b->uncomp_size;
964         if (!(uncomp = malloc(usize)))
965             return -1;
966         if (BZ_OK != BZ2_bzBuffToBuffDecompress(uncomp, &usize,
967                                                 (char *)b->data, b->comp_size,
968                                                 0, 0)) {
969             free(uncomp);
970             return -1;
971         }
972         free(b->data);
973         b->data = (unsigned char *)uncomp;
974         b->alloc = usize;
975         b->method = RAW;
976         b->uncomp_size = usize; // Just incase it differs
977         break;
978     }
979 #else
980     case BZIP2:
981         hts_log_error("Bzip2 compression is not compiled into this version. Please rebuild and try again");
982         return -1;
983 #endif
984 
985 #ifdef HAVE_LIBLZMA
986     case LZMA:
987         uncomp = lzma_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
988         if (!uncomp)
989             return -1;
990         if ((int)uncomp_size != b->uncomp_size)
991             return -1;
992         free(b->data);
993         b->data = (unsigned char *)uncomp;
994         b->alloc = uncomp_size;
995         b->method = RAW;
996         break;
997 #else
998     case LZMA:
999         hts_log_error("Lzma compression is not compiled into this version. Please rebuild and try again");
1000         return -1;
1001         break;
1002 #endif
1003 
1004     case RANS: {
1005         unsigned int usize = b->uncomp_size, usize2;
1006         uncomp = (char *)rans_uncompress(b->data, b->comp_size, &usize2);
1007         if (!uncomp || usize != usize2)
1008             return -1;
1009         free(b->data);
1010         b->data = (unsigned char *)uncomp;
1011         b->alloc = usize2;
1012         b->method = RAW;
1013         b->uncomp_size = usize2; // Just incase it differs
1014         //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1015         break;
1016     }
1017 
1018     default:
1019         return -1;
1020     }
1021 
1022     return 0;
1023 }
1024 
cram_compress_by_method(char * in,size_t in_size,size_t * out_size,enum cram_block_method method,int level,int strat)1025 static char *cram_compress_by_method(char *in, size_t in_size,
1026                                      size_t *out_size,
1027                                      enum cram_block_method method,
1028                                      int level, int strat) {
1029     switch (method) {
1030     case GZIP:
1031         return zlib_mem_deflate(in, in_size, out_size, level, strat);
1032 
1033     case BZIP2: {
1034 #ifdef HAVE_LIBBZ2
1035         unsigned int comp_size = in_size*1.01 + 600;
1036         char *comp = malloc(comp_size);
1037         if (!comp)
1038             return NULL;
1039 
1040         if (BZ_OK != BZ2_bzBuffToBuffCompress(comp, &comp_size,
1041                                               in, in_size,
1042                                               level, 0, 30)) {
1043             free(comp);
1044             return NULL;
1045         }
1046         *out_size = comp_size;
1047         return comp;
1048 #else
1049         return NULL;
1050 #endif
1051     }
1052 
1053     case LZMA:
1054 #ifdef HAVE_LIBLZMA
1055         return lzma_mem_deflate(in, in_size, out_size, level);
1056 #else
1057         return NULL;
1058 #endif
1059 
1060     case RANS0: {
1061         unsigned int out_size_i;
1062         unsigned char *cp;
1063         cp = rans_compress((unsigned char *)in, in_size, &out_size_i, 0);
1064         *out_size = out_size_i;
1065         return (char *)cp;
1066     }
1067 
1068     case RANS1: {
1069         unsigned int out_size_i;
1070         unsigned char *cp;
1071 
1072         cp = rans_compress((unsigned char *)in, in_size, &out_size_i, 1);
1073         *out_size = out_size_i;
1074         return (char *)cp;
1075     }
1076 
1077     case RAW:
1078         break;
1079 
1080     default:
1081         return NULL;
1082     }
1083 
1084     return NULL;
1085 }
1086 
1087 
1088 /*
1089  * Compresses a block using one of two different zlib strategies. If we only
1090  * want one choice set strat2 to be -1.
1091  *
1092  * The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
1093  * or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
1094  * significantly faster.
1095  *
1096  * Method and level -1 implies defaults, as specified in cram_fd.
1097  */
cram_compress_block(cram_fd * fd,cram_block * b,cram_metrics * metrics,int method,int level)1098 int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
1099                         int method, int level) {
1100 
1101     char *comp = NULL;
1102     size_t comp_size = 0;
1103     int strat;
1104 
1105     if (b->method != RAW) {
1106         // Maybe already compressed if s->block[0] was compressed and
1107         // we have e.g. s->block[DS_BA] set to s->block[0] due to only
1108         // one base type present and hence using E_HUFFMAN on block 0.
1109         // A second explicit attempt to compress the same block then
1110         // occurs.
1111         return 0;
1112     }
1113 
1114     if (method == -1) {
1115         method = 1<<GZIP;
1116         if (fd->use_bz2)
1117             method |= 1<<BZIP2;
1118         if (fd->use_lzma)
1119             method |= 1<<LZMA;
1120     }
1121 
1122     if (level == -1)
1123         level = fd->level;
1124 
1125     //fprintf(stderr, "IN: block %d, sz %d\n", b->content_id, b->uncomp_size);
1126 
1127     if (method == RAW || level == 0 || b->uncomp_size == 0) {
1128         b->method = RAW;
1129         b->comp_size = b->uncomp_size;
1130         //fprintf(stderr, "Skip block id %d\n", b->content_id);
1131         return 0;
1132     }
1133 
1134     if (metrics) {
1135         pthread_mutex_lock(&fd->metrics_lock);
1136         if (metrics->trial > 0 || --metrics->next_trial <= 0) {
1137             size_t sz_best = INT_MAX;
1138             size_t sz_gz_rle = 0;
1139             size_t sz_gz_def = 0;
1140             size_t sz_rans0 = 0;
1141             size_t sz_rans1 = 0;
1142             size_t sz_bzip2 = 0;
1143             size_t sz_lzma = 0;
1144             int method_best = 0;
1145             char *c_best = NULL, *c = NULL;
1146 
1147             if (metrics->revised_method)
1148                 method = metrics->revised_method;
1149             else
1150                 metrics->revised_method = method;
1151 
1152             if (metrics->next_trial <= 0) {
1153                 metrics->next_trial = TRIAL_SPAN;
1154                 metrics->trial = NTRIALS;
1155                 metrics->sz_gz_rle /= 2;
1156                 metrics->sz_gz_def /= 2;
1157                 metrics->sz_rans0  /= 2;
1158                 metrics->sz_rans1  /= 2;
1159                 metrics->sz_bzip2  /= 2;
1160                 metrics->sz_lzma   /= 2;
1161             }
1162 
1163             pthread_mutex_unlock(&fd->metrics_lock);
1164 
1165             if (method & (1<<GZIP_RLE)) {
1166                 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1167                                             &sz_gz_rle, GZIP, 1, Z_RLE);
1168                 if (c && sz_best > sz_gz_rle) {
1169                     sz_best = sz_gz_rle;
1170                     method_best = GZIP_RLE;
1171                     if (c_best)
1172                         free(c_best);
1173                     c_best = c;
1174                 } else if (c) {
1175                     free(c);
1176                 } else {
1177                     sz_gz_rle = b->uncomp_size*2+1000;
1178                 }
1179 
1180                 //fprintf(stderr, "Block %d; %d->%d\n", b->content_id, b->uncomp_size, sz_gz_rle);
1181             }
1182 
1183             if (method & (1<<GZIP)) {
1184                 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1185                                             &sz_gz_def, GZIP, level,
1186                                             Z_FILTERED);
1187                 if (c && sz_best > sz_gz_def) {
1188                     sz_best = sz_gz_def;
1189                     method_best = GZIP;
1190                     if (c_best)
1191                         free(c_best);
1192                     c_best = c;
1193                 } else if (c) {
1194                     free(c);
1195                 } else {
1196                     sz_gz_def = b->uncomp_size*2+1000;
1197                 }
1198 
1199                 //fprintf(stderr, "Block %d; %d->%d\n", b->content_id, b->uncomp_size, sz_gz_def);
1200             }
1201 
1202             if (method & (1<<RANS0)) {
1203                 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1204                                             &sz_rans0, RANS0, 0, 0);
1205                 if (c && sz_best > sz_rans0) {
1206                     sz_best = sz_rans0;
1207                     method_best = RANS0;
1208                     if (c_best)
1209                         free(c_best);
1210                     c_best = c;
1211                 } else if (c) {
1212                     free(c);
1213                 } else {
1214                     sz_rans0 = b->uncomp_size*2+1000;
1215                 }
1216             }
1217 
1218             if (method & (1<<RANS1)) {
1219                 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1220                                             &sz_rans1, RANS1, 0, 0);
1221                 if (c && sz_best > sz_rans1) {
1222                     sz_best = sz_rans1;
1223                     method_best = RANS1;
1224                     if (c_best)
1225                         free(c_best);
1226                     c_best = c;
1227                 } else if (c) {
1228                     free(c);
1229                 } else {
1230                     sz_rans1 = b->uncomp_size*2+1000;
1231                 }
1232             }
1233 
1234             if (method & (1<<BZIP2)) {
1235                 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1236                                             &sz_bzip2, BZIP2, level, 0);
1237                 if (c && sz_best > sz_bzip2) {
1238                     sz_best = sz_bzip2;
1239                     method_best = BZIP2;
1240                     if (c_best)
1241                         free(c_best);
1242                     c_best = c;
1243                 } else if (c) {
1244                     free(c);
1245                 } else {
1246                     sz_bzip2 = b->uncomp_size*2+1000;
1247                 }
1248             }
1249 
1250             if (method & (1<<LZMA)) {
1251                 c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1252                                             &sz_lzma, LZMA, level, 0);
1253                 if (c && sz_best > sz_lzma) {
1254                     sz_best = sz_lzma;
1255                     method_best = LZMA;
1256                     if (c_best)
1257                         free(c_best);
1258                     c_best = c;
1259                 } else if (c) {
1260                     free(c);
1261                 } else {
1262                     sz_lzma = b->uncomp_size*2+1000;
1263                 }
1264             }
1265 
1266             //fprintf(stderr, "sz_best = %d\n", sz_best);
1267 
1268             free(b->data);
1269             b->data = (unsigned char *)c_best;
1270             //printf("method_best = %s\n", cram_block_method2str(method_best));
1271             b->method = method_best == GZIP_RLE ? GZIP : method_best;
1272             b->comp_size = sz_best;
1273 
1274             pthread_mutex_lock(&fd->metrics_lock);
1275             metrics->sz_gz_rle += sz_gz_rle;
1276             metrics->sz_gz_def += sz_gz_def;
1277             metrics->sz_rans0  += sz_rans0;
1278             metrics->sz_rans1  += sz_rans1;
1279             metrics->sz_bzip2  += sz_bzip2;
1280             metrics->sz_lzma   += sz_lzma;
1281             if (--metrics->trial == 0) {
1282                 int best_method = RAW;
1283                 int best_sz = INT_MAX;
1284 
1285                 // Scale methods by cost
1286                 if (fd->level <= 3) {
1287                     metrics->sz_rans1  *= 1.02;
1288                     metrics->sz_gz_def *= 1.04;
1289                     metrics->sz_bzip2  *= 1.08;
1290                     metrics->sz_lzma   *= 1.10;
1291                 } else if (fd->level <= 6) {
1292                     metrics->sz_rans1  *= 1.01;
1293                     metrics->sz_gz_def *= 1.02;
1294                     metrics->sz_bzip2  *= 1.03;
1295                     metrics->sz_lzma   *= 1.05;
1296                 }
1297 
1298                 if (method & (1<<GZIP_RLE) && best_sz > metrics->sz_gz_rle)
1299                     best_sz = metrics->sz_gz_rle, best_method = GZIP_RLE;
1300 
1301                 if (method & (1<<GZIP) && best_sz > metrics->sz_gz_def)
1302                     best_sz = metrics->sz_gz_def, best_method = GZIP;
1303 
1304                 if (method & (1<<RANS0) && best_sz > metrics->sz_rans0)
1305                     best_sz = metrics->sz_rans0, best_method = RANS0;
1306 
1307                 if (method & (1<<RANS1) && best_sz > metrics->sz_rans1)
1308                     best_sz = metrics->sz_rans1, best_method = RANS1;
1309 
1310                 if (method & (1<<BZIP2) && best_sz > metrics->sz_bzip2)
1311                     best_sz = metrics->sz_bzip2, best_method = BZIP2;
1312 
1313                 if (method & (1<<LZMA) && best_sz > metrics->sz_lzma)
1314                     best_sz = metrics->sz_lzma, best_method = LZMA;
1315 
1316                 if (best_method == GZIP_RLE) {
1317                     metrics->method = GZIP;
1318                     metrics->strat  = Z_RLE;
1319                 } else {
1320                     metrics->method = best_method;
1321                     metrics->strat  = Z_FILTERED;
1322                 }
1323 
1324                 // If we see at least MAXFAIL trials in a row for a specific
1325                 // compression method with more than MAXDELTA aggregate
1326                 // size then we drop this from the list of methods used
1327                 // for this block type.
1328 #define MAXDELTA 0.20
1329 #define MAXFAILS 4
1330                 if (best_method == GZIP_RLE) {
1331                     metrics->gz_rle_cnt = 0;
1332                     metrics->gz_rle_extra = 0;
1333                 } else if (best_sz < metrics->sz_gz_rle) {
1334                     double r = (double)metrics->sz_gz_rle / best_sz - 1;
1335                     if (++metrics->gz_rle_cnt >= MAXFAILS &&
1336                         (metrics->gz_rle_extra += r) >= MAXDELTA)
1337                         method &= ~(1<<GZIP_RLE);
1338                 }
1339 
1340                 if (best_method == GZIP) {
1341                     metrics->gz_def_cnt = 0;
1342                     metrics->gz_def_extra = 0;
1343                 } else if (best_sz < metrics->sz_gz_def) {
1344                     double r = (double)metrics->sz_gz_def / best_sz - 1;
1345                     if (++metrics->gz_def_cnt >= MAXFAILS &&
1346                         (metrics->gz_def_extra += r) >= MAXDELTA)
1347                         method &= ~(1<<GZIP);
1348                 }
1349 
1350                 if (best_method == RANS0) {
1351                     metrics->rans0_cnt = 0;
1352                     metrics->rans0_extra = 0;
1353                 } else if (best_sz < metrics->sz_rans0) {
1354                     double r = (double)metrics->sz_rans0 / best_sz - 1;
1355                     if (++metrics->rans0_cnt >= MAXFAILS &&
1356                         (metrics->rans0_extra += r) >= MAXDELTA)
1357                         method &= ~(1<<RANS0);
1358                 }
1359 
1360                 if (best_method == RANS1) {
1361                     metrics->rans1_cnt = 0;
1362                     metrics->rans1_extra = 0;
1363                 } else if (best_sz < metrics->sz_rans1) {
1364                     double r = (double)metrics->sz_rans1 / best_sz - 1;
1365                     if (++metrics->rans1_cnt >= MAXFAILS &&
1366                         (metrics->rans1_extra += r) >= MAXDELTA)
1367                         method &= ~(1<<RANS1);
1368                 }
1369 
1370                 if (best_method == BZIP2) {
1371                     metrics->bzip2_cnt = 0;
1372                     metrics->bzip2_extra = 0;
1373                 } else if (best_sz < metrics->sz_bzip2) {
1374                     double r = (double)metrics->sz_bzip2 / best_sz - 1;
1375                     if (++metrics->bzip2_cnt >= MAXFAILS &&
1376                         (metrics->bzip2_extra += r) >= MAXDELTA)
1377                         method &= ~(1<<BZIP2);
1378                 }
1379 
1380                 if (best_method == LZMA) {
1381                     metrics->lzma_cnt = 0;
1382                     metrics->lzma_extra = 0;
1383                 } else if (best_sz < metrics->sz_lzma) {
1384                     double r = (double)metrics->sz_lzma / best_sz - 1;
1385                     if (++metrics->lzma_cnt >= MAXFAILS &&
1386                         (metrics->lzma_extra += r) >= MAXDELTA)
1387                         method &= ~(1<<LZMA);
1388                 }
1389 
1390                 //if (method != metrics->revised_method)
1391                 //    fprintf(stderr, "%d: method from %x to %x\n",
1392                 //          b->content_id, metrics->revised_method, method);
1393                 metrics->revised_method = method;
1394             }
1395             pthread_mutex_unlock(&fd->metrics_lock);
1396         } else {
1397             strat = metrics->strat;
1398             method = metrics->method;
1399 
1400             pthread_mutex_unlock(&fd->metrics_lock);
1401             comp = cram_compress_by_method((char *)b->data, b->uncomp_size,
1402                                            &comp_size, method,
1403                                            level, strat);
1404             if (!comp)
1405                 return -1;
1406             free(b->data);
1407             b->data = (unsigned char *)comp;
1408             b->comp_size = comp_size;
1409             b->method = method;
1410         }
1411 
1412     } else {
1413         // no cached metrics, so just do zlib?
1414         comp = cram_compress_by_method((char *)b->data, b->uncomp_size,
1415                                        &comp_size, GZIP, level, Z_FILTERED);
1416         if (!comp) {
1417             hts_log_error("Compression failed");
1418             return -1;
1419         }
1420         free(b->data);
1421         b->data = (unsigned char *)comp;
1422         b->comp_size = comp_size;
1423         b->method = GZIP;
1424     }
1425 
1426     hts_log_info("Compressed block ID %d from %d to %d by method %s",
1427                  b->content_id, b->uncomp_size, b->comp_size,
1428                  cram_block_method2str(b->method));
1429 
1430     if (b->method == RANS1)
1431         b->method = RANS0; // Spec just has RANS (not 0/1) with auto-sensing
1432 
1433     return 0;
1434 }
1435 
cram_new_metrics(void)1436 cram_metrics *cram_new_metrics(void) {
1437     cram_metrics *m = calloc(1, sizeof(*m));
1438     if (!m)
1439         return NULL;
1440     m->trial = NTRIALS-1;
1441     m->next_trial = TRIAL_SPAN;
1442     m->method = RAW;
1443     m->strat = 0;
1444     m->revised_method = 0;
1445 
1446     return m;
1447 }
1448 
cram_block_method2str(enum cram_block_method m)1449 char *cram_block_method2str(enum cram_block_method m) {
1450     switch(m) {
1451     case RAW:      return "RAW";
1452     case GZIP:     return "GZIP";
1453     case BZIP2:    return "BZIP2";
1454     case LZMA:     return "LZMA";
1455     case RANS0:    return "RANS0";
1456     case RANS1:    return "RANS1";
1457     case GZIP_RLE: return "GZIP_RLE";
1458     case BM_ERROR: break;
1459     }
1460     return "?";
1461 }
1462 
cram_content_type2str(enum cram_content_type t)1463 char *cram_content_type2str(enum cram_content_type t) {
1464     switch (t) {
1465     case FILE_HEADER:         return "FILE_HEADER";
1466     case COMPRESSION_HEADER:  return "COMPRESSION_HEADER";
1467     case MAPPED_SLICE:        return "MAPPED_SLICE";
1468     case UNMAPPED_SLICE:      return "UNMAPPED_SLICE";
1469     case EXTERNAL:            return "EXTERNAL";
1470     case CORE:                return "CORE";
1471     case CT_ERROR:            break;
1472     }
1473     return "?";
1474 }
1475 
1476 /* ----------------------------------------------------------------------
1477  * Reference sequence handling
1478  *
1479  * These revolve around the refs_t structure, which may potentially be
1480  * shared between multiple cram_fd.
1481  *
1482  * We start with refs_create() to allocate an empty refs_t and then
1483  * populate it with @SQ line data using refs_from_header(). This is done on
1484  * cram_open().  Also at start up we can call cram_load_reference() which
1485  * is used with "scramble -r foo.fa". This replaces the fd->refs with the
1486  * new one specified. In either case refs2id() is then called which
1487  * maps ref_entry names to @SQ ids (refs_t->ref_id[]).
1488  *
1489  * Later, possibly within a thread, we will want to know the actual ref
1490  * seq itself, obtained by calling cram_get_ref().  This may use the
1491  * UR: or M5: fields or the filename specified in the original
1492  * cram_load_reference() call.
1493  *
1494  * Given the potential for multi-threaded reference usage, we have
1495  * reference counting (sorry for the confusing double use of "ref") to
1496  * track the number of callers interested in any specific reference.
1497  */
1498 
1499 /*
1500  * Frees/unmaps a reference sequence and associated file handles.
1501  */
ref_entry_free_seq(ref_entry * e)1502 static void ref_entry_free_seq(ref_entry *e) {
1503     if (e->mf)
1504         mfclose(e->mf);
1505     if (e->seq && !e->mf)
1506         free(e->seq);
1507 
1508     e->seq = NULL;
1509     e->mf = NULL;
1510 }
1511 
refs_free(refs_t * r)1512 void refs_free(refs_t *r) {
1513     RP("refs_free()\n");
1514 
1515     if (--r->count > 0)
1516         return;
1517 
1518     if (!r)
1519         return;
1520 
1521     if (r->pool)
1522         string_pool_destroy(r->pool);
1523 
1524     if (r->h_meta) {
1525         khint_t k;
1526 
1527         for (k = kh_begin(r->h_meta); k != kh_end(r->h_meta); k++) {
1528             ref_entry *e;
1529 
1530             if (!kh_exist(r->h_meta, k))
1531                 continue;
1532             if (!(e = kh_val(r->h_meta, k)))
1533                 continue;
1534             ref_entry_free_seq(e);
1535             free(e);
1536         }
1537 
1538         kh_destroy(refs, r->h_meta);
1539     }
1540 
1541     if (r->ref_id)
1542         free(r->ref_id);
1543 
1544     if (r->fp)
1545         bgzf_close(r->fp);
1546 
1547     pthread_mutex_destroy(&r->lock);
1548 
1549     free(r);
1550 }
1551 
refs_create(void)1552 static refs_t *refs_create(void) {
1553     refs_t *r = calloc(1, sizeof(*r));
1554 
1555     RP("refs_create()\n");
1556 
1557     if (!r)
1558         return NULL;
1559 
1560     if (!(r->pool = string_pool_create(8192)))
1561         goto err;
1562 
1563     r->ref_id = NULL; // see refs2id() to populate.
1564     r->count = 1;
1565     r->last = NULL;
1566     r->last_id = -1;
1567 
1568     if (!(r->h_meta = kh_init(refs)))
1569         goto err;
1570 
1571     pthread_mutex_init(&r->lock, NULL);
1572 
1573     return r;
1574 
1575  err:
1576     refs_free(r);
1577     return NULL;
1578 }
1579 
1580 /*
1581  * Opens a reference fasta file as a BGZF stream, allowing for
1582  * compressed files.  It automatically builds a .fai file if
1583  * required and if compressed a .gzi bgzf index too.
1584  *
1585  * Returns a BGZF handle on success;
1586  *         NULL on failure.
1587  */
bgzf_open_ref(char * fn,char * mode,int is_md5)1588 static BGZF *bgzf_open_ref(char *fn, char *mode, int is_md5) {
1589     BGZF *fp;
1590 
1591     if (!is_md5) {
1592         char fai_file[PATH_MAX];
1593 
1594         snprintf(fai_file, PATH_MAX, "%s.fai", fn);
1595         if (access(fai_file, R_OK) != 0)
1596             if (fai_build(fn) != 0)
1597                 return NULL;
1598     }
1599 
1600     if (!(fp = bgzf_open(fn, mode))) {
1601         perror(fn);
1602         return NULL;
1603     }
1604 
1605     if (fp->is_compressed == 1 && bgzf_index_load(fp, fn, ".gzi") < 0) {
1606         hts_log_error("Unable to load .gzi index '%s.gzi'", fn);
1607         bgzf_close(fp);
1608         return NULL;
1609     }
1610 
1611     return fp;
1612 }
1613 
1614 /*
1615  * Loads a FAI file for a reference.fasta.
1616  * "is_err" indicates whether failure to load is worthy of emitting an
1617  * error message. In some cases (eg with embedded references) we
1618  * speculatively load, just incase, and silently ignore errors.
1619  *
1620  * Returns the refs_t struct on success (maybe newly allocated);
1621  *         NULL on failure
1622  */
refs_load_fai(refs_t * r_orig,char * fn,int is_err)1623 static refs_t *refs_load_fai(refs_t *r_orig, char *fn, int is_err) {
1624     struct stat sb;
1625     FILE *fp = NULL;
1626     char fai_fn[PATH_MAX];
1627     char line[8192];
1628     refs_t *r = r_orig;
1629     size_t fn_l = strlen(fn);
1630     int id = 0, id_alloc = 0;
1631 
1632     RP("refs_load_fai %s\n", fn);
1633 
1634     if (!r)
1635         if (!(r = refs_create()))
1636             goto err;
1637 
1638     /* Open reference, for later use */
1639     if (stat(fn, &sb) != 0) {
1640         if (is_err)
1641             perror(fn);
1642         goto err;
1643     }
1644 
1645     if (r->fp)
1646         if (bgzf_close(r->fp) != 0)
1647             goto err;
1648     r->fp = NULL;
1649 
1650     if (!(r->fn = string_dup(r->pool, fn)))
1651         goto err;
1652 
1653     if (fn_l > 4 && strcmp(&fn[fn_l-4], ".fai") == 0)
1654         r->fn[fn_l-4] = 0;
1655 
1656     if (!(r->fp = bgzf_open_ref(r->fn, "r", 0)))
1657         goto err;
1658 
1659     /* Parse .fai file and load meta-data */
1660     sprintf(fai_fn, "%.*s.fai", PATH_MAX-5, r->fn);
1661 
1662     if (stat(fai_fn, &sb) != 0) {
1663         if (is_err)
1664             perror(fai_fn);
1665         goto err;
1666     }
1667     if (!(fp = fopen(fai_fn, "r"))) {
1668         if (is_err)
1669             perror(fai_fn);
1670         goto err;
1671     }
1672     while (fgets(line, 8192, fp) != NULL) {
1673         ref_entry *e = malloc(sizeof(*e));
1674         char *cp;
1675         int n;
1676         khint_t k;
1677 
1678         if (!e)
1679             return NULL;
1680 
1681         // id
1682         for (cp = line; *cp && !isspace_c(*cp); cp++)
1683             ;
1684         *cp++ = 0;
1685         e->name = string_dup(r->pool, line);
1686 
1687         // length
1688         while (*cp && isspace_c(*cp))
1689             cp++;
1690         e->length = strtoll(cp, &cp, 10);
1691 
1692         // offset
1693         while (*cp && isspace_c(*cp))
1694             cp++;
1695         e->offset = strtoll(cp, &cp, 10);
1696 
1697         // bases per line
1698         while (*cp && isspace_c(*cp))
1699             cp++;
1700         e->bases_per_line = strtol(cp, &cp, 10);
1701 
1702         // line length
1703         while (*cp && isspace_c(*cp))
1704             cp++;
1705         e->line_length = strtol(cp, &cp, 10);
1706 
1707         // filename
1708         e->fn = r->fn;
1709 
1710         e->count = 0;
1711         e->seq = NULL;
1712         e->mf = NULL;
1713         e->is_md5 = 0;
1714 
1715         k = kh_put(refs, r->h_meta, e->name, &n);
1716         if (-1 == n)  {
1717             free(e);
1718             return NULL;
1719         }
1720 
1721         if (n) {
1722             kh_val(r->h_meta, k) = e;
1723         } else {
1724             ref_entry *re = kh_val(r->h_meta, k);
1725             if (re && (re->count != 0 || re->length != 0)) {
1726                 /* Keep old */
1727                 free(e);
1728             } else {
1729                 /* Replace old */
1730                 if (re)
1731                     free(re);
1732                 kh_val(r->h_meta, k) = e;
1733             }
1734         }
1735 
1736         if (id >= id_alloc) {
1737             int x;
1738 
1739             id_alloc = id_alloc ?id_alloc*2 : 16;
1740             r->ref_id = realloc(r->ref_id, id_alloc * sizeof(*r->ref_id));
1741 
1742             for (x = id; x < id_alloc; x++)
1743                 r->ref_id[x] = NULL;
1744         }
1745         r->ref_id[id] = e;
1746         r->nref = ++id;
1747     }
1748 
1749     fclose(fp);
1750     return r;
1751 
1752  err:
1753     if (fp)
1754         fclose(fp);
1755 
1756     if (!r_orig)
1757         refs_free(r);
1758 
1759     return NULL;
1760 }
1761 
1762 /*
1763  * Verifies that the CRAM @SQ lines and .fai files match.
1764  */
sanitise_SQ_lines(cram_fd * fd)1765 static void sanitise_SQ_lines(cram_fd *fd) {
1766     int i;
1767 
1768     if (!fd->header)
1769         return;
1770 
1771     if (!fd->refs || !fd->refs->h_meta)
1772         return;
1773 
1774     for (i = 0; i < fd->header->nref; i++) {
1775         char *name = fd->header->ref[i].name;
1776         khint_t k = kh_get(refs, fd->refs->h_meta, name);
1777         ref_entry *r;
1778 
1779         // We may have @SQ lines which have no known .fai, but do not
1780         // in themselves pose a problem because they are unused in the file.
1781         if (k == kh_end(fd->refs->h_meta))
1782             continue;
1783 
1784         if (!(r = (ref_entry *)kh_val(fd->refs->h_meta, k)))
1785             continue;
1786 
1787         if (r->length && r->length != fd->header->ref[i].len) {
1788             assert(strcmp(r->name, fd->header->ref[i].name) == 0);
1789 
1790             // Should we also check MD5sums here to ensure the correct
1791             // reference was given?
1792             hts_log_warning("Header @SQ length mismatch for ref %s, %d vs %d",
1793                             r->name, fd->header->ref[i].len, (int)r->length);
1794 
1795             // Fixing the parsed @SQ header will make MD:Z: strings work
1796             // and also stop it producing N for the sequence.
1797             fd->header->ref[i].len = r->length;
1798         }
1799     }
1800 }
1801 
1802 /*
1803  * Indexes references by the order they appear in a BAM file. This may not
1804  * necessarily be the same order they appear in the fasta reference file.
1805  *
1806  * Returns 0 on success
1807  *        -1 on failure
1808  */
refs2id(refs_t * r,SAM_hdr * h)1809 int refs2id(refs_t *r, SAM_hdr *h) {
1810     int i;
1811 
1812     if (r->ref_id)
1813         free(r->ref_id);
1814     if (r->last)
1815         r->last = NULL;
1816 
1817     r->ref_id = calloc(h->nref, sizeof(*r->ref_id));
1818     if (!r->ref_id)
1819         return -1;
1820 
1821     r->nref = h->nref;
1822     for (i = 0; i < h->nref; i++) {
1823         khint_t k = kh_get(refs, r->h_meta, h->ref[i].name);
1824         if (k != kh_end(r->h_meta)) {
1825             r->ref_id[i] = kh_val(r->h_meta, k);
1826         } else {
1827             hts_log_warning("Unable to find ref name '%s'", h->ref[i].name);
1828         }
1829     }
1830 
1831     return 0;
1832 }
1833 
1834 /*
1835  * Generates refs_t entries based on @SQ lines in the header.
1836  * Returns 0 on success
1837  *         -1 on failure
1838  */
refs_from_header(refs_t * r,cram_fd * fd,SAM_hdr * h)1839 static int refs_from_header(refs_t *r, cram_fd *fd, SAM_hdr *h) {
1840     int i, j;
1841 
1842     if (!r)
1843         return -1;
1844 
1845     if (!h || h->nref == 0)
1846         return 0;
1847 
1848     //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode);
1849 
1850     /* Existing refs are fine, as long as they're compatible with the hdr. */
1851     if (!(r->ref_id = realloc(r->ref_id, (r->nref + h->nref) * sizeof(*r->ref_id))))
1852         return -1;
1853 
1854     /* Copy info from h->ref[i] over to r */
1855     for (i = 0, j = r->nref; i < h->nref; i++) {
1856         SAM_hdr_type *ty;
1857         SAM_hdr_tag *tag;
1858         khint_t k;
1859         int n;
1860 
1861         k = kh_get(refs, r->h_meta, h->ref[i].name);
1862         if (k != kh_end(r->h_meta))
1863             // Ref already known about
1864             continue;
1865 
1866         if (!(r->ref_id[j] = calloc(1, sizeof(ref_entry))))
1867             return -1;
1868 
1869         if (!h->ref[i].name)
1870             return -1;
1871 
1872         r->ref_id[j]->name = string_dup(r->pool, h->ref[i].name);
1873         r->ref_id[j]->length = 0; // marker for not yet loaded
1874 
1875         /* Initialise likely filename if known */
1876         if ((ty = sam_hdr_find(h, "SQ", "SN", h->ref[i].name))) {
1877             if ((tag = sam_hdr_find_key(h, ty, "M5", NULL))) {
1878                 r->ref_id[j]->fn = string_dup(r->pool, tag->str+3);
1879                 //fprintf(stderr, "Tagging @SQ %s / %s\n", r->ref_id[h]->name, r->ref_id[h]->fn);
1880             }
1881         }
1882 
1883         k = kh_put(refs, r->h_meta, r->ref_id[j]->name, &n);
1884         if (n <= 0) // already exists or error
1885             return -1;
1886         kh_val(r->h_meta, k) = r->ref_id[j];
1887 
1888         j++;
1889     }
1890     r->nref = j;
1891 
1892     return 0;
1893 }
1894 
1895 /*
1896  * Attaches a header to a cram_fd.
1897  *
1898  * This should be used when creating a new cram_fd for writing where
1899  * we have an SAM_hdr already constructed (eg from a file we've read
1900  * in).
1901  */
cram_set_header(cram_fd * fd,SAM_hdr * hdr)1902 int cram_set_header(cram_fd *fd, SAM_hdr *hdr) {
1903     if (fd->header)
1904         sam_hdr_free(fd->header);
1905     fd->header = hdr;
1906     return refs_from_header(fd->refs, fd, hdr);
1907 }
1908 
1909 /*
1910  * Converts a directory and a filename into an expanded path, replacing %s
1911  * in directory with the filename and %[0-9]+s with portions of the filename
1912  * Any remaining parts of filename are added to the end with /%s.
1913  */
expand_cache_path(char * path,char * dir,char * fn)1914 int expand_cache_path(char *path, char *dir, char *fn) {
1915     char *cp, *start = path;
1916     size_t len;
1917     size_t sz = PATH_MAX;
1918 
1919     while ((cp = strchr(dir, '%'))) {
1920         if (cp-dir >= sz) return -1;
1921         strncpy(path, dir, cp-dir);
1922         path += cp-dir;
1923         sz -= cp-dir;
1924 
1925         if (*++cp == 's') {
1926             len = strlen(fn);
1927             if (len >= sz) return -1;
1928             strcpy(path, fn);
1929             path += len;
1930             sz -= len;
1931             fn += len;
1932             cp++;
1933         } else if (*cp >= '0' && *cp <= '9') {
1934             char *endp;
1935             long l;
1936 
1937             l = strtol(cp, &endp, 10);
1938             l = MIN(l, strlen(fn));
1939             if (*endp == 's') {
1940                 if (l >= sz) return -1;
1941                 strncpy(path, fn, l);
1942                 path += l;
1943                 fn += l;
1944                 sz -= l;
1945                 *path = 0;
1946                 cp = endp+1;
1947             } else {
1948                 if (sz < 3) return -1;
1949                 *path++ = '%';
1950                 *path++ = *cp++;
1951             }
1952         } else {
1953             if (sz < 3) return -1;
1954             *path++ = '%';
1955             *path++ = *cp++;
1956         }
1957         dir = cp;
1958     }
1959 
1960     len = strlen(dir);
1961     if (len >= sz) return -1;
1962     strcpy(path, dir);
1963     path += len;
1964     sz -= len;
1965 
1966     len = strlen(fn) + ((*fn && path > start && path[-1] != '/') ? 1 : 0);
1967     if (len >= sz) return -1;
1968     if (*fn && path > start && path[-1] != '/')
1969         *path++ = '/';
1970     strcpy(path, fn);
1971     return 0;
1972 }
1973 
1974 /*
1975  * Make the directory containing path and any prefix directories.
1976  */
mkdir_prefix(char * path,int mode)1977 void mkdir_prefix(char *path, int mode) {
1978     char *cp = strrchr(path, '/');
1979     if (!cp)
1980         return;
1981 
1982     *cp = 0;
1983     if (is_directory(path)) {
1984         *cp = '/';
1985         return;
1986     }
1987 
1988     if (mkdir(path, mode) == 0) {
1989         chmod(path, mode);
1990         *cp = '/';
1991         return;
1992     }
1993 
1994     mkdir_prefix(path, mode);
1995     mkdir(path, mode);
1996     chmod(path, mode);
1997     *cp = '/';
1998 }
1999 
2000 /*
2001  * Return the cache directory to use, based on the first of these
2002  * environment variables to be set to a non-empty value.
2003  */
get_cache_basedir(const char ** extra)2004 static const char *get_cache_basedir(const char **extra) {
2005     char *base;
2006 
2007     *extra = "";
2008 
2009     base = getenv("XDG_CACHE_HOME");
2010     if (base && *base) return base;
2011 
2012     base = getenv("HOME");
2013     if (base && *base) { *extra = "/.cache"; return base; }
2014 
2015     base = getenv("TMPDIR");
2016     if (base && *base) return base;
2017 
2018     base = getenv("TEMP");
2019     if (base && *base) return base;
2020 
2021     return "/tmp";
2022 }
2023 
2024 /*
2025  * Return an integer representation of pthread_self().
2026  */
get_int_threadid()2027 static unsigned get_int_threadid() {
2028     pthread_t pt = pthread_self();
2029     unsigned char *s = (unsigned char *) &pt;
2030     size_t i;
2031     unsigned h = 0;
2032     for (i = 0; i < sizeof(pthread_t); i++)
2033         h = (h << 5) - h + s[i];
2034     return h;
2035 }
2036 
2037 /*
2038  * Queries the M5 string from the header and attempts to populate the
2039  * reference from this using the REF_PATH environment.
2040  *
2041  * Returns 0 on sucess
2042  *        -1 on failure
2043  */
cram_populate_ref(cram_fd * fd,int id,ref_entry * r)2044 static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
2045     char *ref_path = getenv("REF_PATH");
2046     SAM_hdr_type *ty;
2047     SAM_hdr_tag *tag;
2048     char path[PATH_MAX], path_tmp[PATH_MAX + 64];
2049     char cache[PATH_MAX], cache_root[PATH_MAX];
2050     char *local_cache = getenv("REF_CACHE");
2051     mFILE *mf;
2052     int local_path = 0;
2053 
2054     hts_log_info("Running cram_populate_ref on fd %p, id %d", (void *)fd, id);
2055 
2056     cache_root[0] = '\0';
2057 
2058     if (!ref_path || *ref_path == '\0') {
2059         /*
2060          * If we have no ref path, we use the EBI server.
2061          * However to avoid spamming it we require a local ref cache too.
2062          */
2063         ref_path = "https://www.ebi.ac.uk/ena/cram/md5/%s";
2064         if (!local_cache || *local_cache == '\0') {
2065             const char *extra;
2066             const char *base = get_cache_basedir(&extra);
2067             snprintf(cache_root, PATH_MAX, "%s%s/hts-ref", base, extra);
2068             snprintf(cache,PATH_MAX, "%s%s/hts-ref/%%2s/%%2s/%%s", base, extra);
2069             local_cache = cache;
2070             hts_log_info("Populating local cache: %s", local_cache);
2071         }
2072     }
2073 
2074     if (!r->name)
2075         return -1;
2076 
2077     if (!(ty = sam_hdr_find(fd->header, "SQ", "SN", r->name)))
2078         return -1;
2079 
2080     if (!(tag = sam_hdr_find_key(fd->header, ty, "M5", NULL)))
2081         goto no_M5;
2082 
2083     hts_log_info("Querying ref %s", tag->str+3);
2084 
2085     /* Use cache if available */
2086     if (local_cache && *local_cache) {
2087         if (expand_cache_path(path, local_cache, tag->str+3) == 0)
2088             local_path = 1;
2089     }
2090 
2091 #ifndef HAVE_MMAP
2092     char *path2;
2093     /* Search local files in REF_PATH; we can open them and return as above */
2094     if (!local_path && (path2 = find_path(tag->str+3, ref_path))) {
2095         strncpy(path, path2, PATH_MAX);
2096         free(path2);
2097         if (is_file(path)) // incase it's too long
2098             local_path = 1;
2099     }
2100 #endif
2101 
2102     /* Found via REF_CACHE or local REF_PATH file */
2103     if (local_path) {
2104         struct stat sb;
2105         BGZF *fp;
2106 
2107         if (0 == stat(path, &sb) && (fp = bgzf_open(path, "r"))) {
2108             r->length = sb.st_size;
2109             r->offset = r->line_length = r->bases_per_line = 0;
2110 
2111             r->fn = string_dup(fd->refs->pool, path);
2112 
2113             if (fd->refs->fp)
2114                 if (bgzf_close(fd->refs->fp) != 0)
2115                     return -1;
2116             fd->refs->fp = fp;
2117             fd->refs->fn = r->fn;
2118             r->is_md5 = 1;
2119 
2120             // Fall back to cram_get_ref() where it'll do the actual
2121             // reading of the file.
2122             return 0;
2123         }
2124     }
2125 
2126 
2127     /* Otherwise search full REF_PATH; slower as loads entire file */
2128     if ((mf = open_path_mfile(tag->str+3, ref_path, NULL))) {
2129         size_t sz;
2130         r->seq = mfsteal(mf, &sz);
2131         if (r->seq) {
2132             r->mf = NULL;
2133         } else {
2134             // keep mf around as we couldn't detach
2135             r->seq = mf->data;
2136             r->mf = mf;
2137         }
2138         r->length = sz;
2139         r->is_md5 = 1;
2140     } else {
2141         refs_t *refs;
2142         char *fn;
2143 
2144     no_M5:
2145         /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */
2146         if (!(tag = sam_hdr_find_key(fd->header, ty, "UR", NULL)))
2147             return -1;
2148 
2149         fn = (strncmp(tag->str+3, "file:", 5) == 0)
2150             ? tag->str+8
2151             : tag->str+3;
2152 
2153         if (fd->refs->fp) {
2154             if (bgzf_close(fd->refs->fp) != 0)
2155                 return -1;
2156             fd->refs->fp = NULL;
2157         }
2158         if (!(refs = refs_load_fai(fd->refs, fn, 0)))
2159             return -1;
2160         sanitise_SQ_lines(fd);
2161 
2162         fd->refs = refs;
2163         if (fd->refs->fp) {
2164             if (bgzf_close(fd->refs->fp) != 0)
2165                 return -1;
2166             fd->refs->fp = NULL;
2167         }
2168 
2169         if (!fd->refs->fn)
2170             return -1;
2171 
2172         if (-1 == refs2id(fd->refs, fd->header))
2173             return -1;
2174         if (!fd->refs->ref_id || !fd->refs->ref_id[id])
2175             return -1;
2176 
2177         // Local copy already, so fall back to cram_get_ref().
2178         return 0;
2179     }
2180 
2181     /* Populate the local disk cache if required */
2182     if (local_cache && *local_cache) {
2183         int pid = (int) getpid();
2184         unsigned thrid = get_int_threadid();
2185         hFILE *fp;
2186 
2187         if (*cache_root && !is_directory(cache_root)) {
2188             hts_log_warning("Creating reference cache directory %s\n"
2189                             "This may become large; see the samtools(1) manual page REF_CACHE discussion",
2190                             cache_root);
2191         }
2192 
2193         if (expand_cache_path(path, local_cache, tag->str+3) < 0) {
2194             return 0; // Not fatal - we have the data already so keep going.
2195         }
2196         hts_log_info("Writing cache file '%s'", path);
2197         mkdir_prefix(path, 01777);
2198 
2199         do {
2200             // Attempt to further uniquify the temporary filename
2201             unsigned t = ((unsigned) time(NULL)) ^ ((unsigned) clock());
2202             thrid++; // Ensure filename changes even if time/clock haven't
2203 
2204             snprintf(path_tmp, sizeof(path_tmp), "%s.tmp_%d_%u_%u",
2205                      path, pid, thrid, t);
2206             fp = hopen(path_tmp, "wx");
2207         } while (fp == NULL && errno == EEXIST);
2208         if (!fp) {
2209             perror(path_tmp);
2210 
2211             // Not fatal - we have the data already so keep going.
2212             return 0;
2213         }
2214 
2215         // Check md5sum
2216         hts_md5_context *md5;
2217         char unsigned md5_buf1[16];
2218         char md5_buf2[33];
2219 
2220         if (!(md5 = hts_md5_init())) {
2221             hclose_abruptly(fp);
2222             unlink(path_tmp);
2223             return -1;
2224         }
2225         hts_md5_update(md5, r->seq, r->length);
2226         hts_md5_final(md5_buf1, md5);
2227         hts_md5_destroy(md5);
2228         hts_md5_hex(md5_buf2, md5_buf1);
2229 
2230         if (strncmp(tag->str+3, md5_buf2, 32) != 0) {
2231             hts_log_error("Mismatching md5sum for downloaded reference");
2232             hclose_abruptly(fp);
2233             unlink(path_tmp);
2234             return -1;
2235         }
2236 
2237         if (hwrite(fp, r->seq, r->length) != r->length) {
2238             perror(path);
2239         }
2240         if (hclose(fp) < 0) {
2241             unlink(path_tmp);
2242         } else {
2243             if (0 == chmod(path_tmp, 0444))
2244                 rename(path_tmp, path);
2245             else
2246                 unlink(path_tmp);
2247         }
2248     }
2249 
2250     return 0;
2251 }
2252 
cram_ref_incr_locked(refs_t * r,int id)2253 static void cram_ref_incr_locked(refs_t *r, int id) {
2254     RP("%d INC REF %d, %d %p\n", gettid(), id, (int)(id>=0?r->ref_id[id]->count+1:-999), id>=0?r->ref_id[id]->seq:(char *)1);
2255 
2256     if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq)
2257         return;
2258 
2259     if (r->last_id == id)
2260         r->last_id = -1;
2261 
2262     ++r->ref_id[id]->count;
2263 }
2264 
cram_ref_incr(refs_t * r,int id)2265 void cram_ref_incr(refs_t *r, int id) {
2266     pthread_mutex_lock(&r->lock);
2267     cram_ref_incr_locked(r, id);
2268     pthread_mutex_unlock(&r->lock);
2269 }
2270 
cram_ref_decr_locked(refs_t * r,int id)2271 static void cram_ref_decr_locked(refs_t *r, int id) {
2272     RP("%d DEC REF %d, %d %p\n", gettid(), id, (int)(id>=0?r->ref_id[id]->count-1:-999), id>=0?r->ref_id[id]->seq:(char *)1);
2273 
2274     if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq) {
2275         assert(r->ref_id[id]->count >= 0);
2276         return;
2277     }
2278 
2279     if (--r->ref_id[id]->count <= 0) {
2280         assert(r->ref_id[id]->count == 0);
2281         if (r->last_id >= 0) {
2282             if (r->ref_id[r->last_id]->count <= 0 &&
2283                 r->ref_id[r->last_id]->seq) {
2284                 RP("%d FREE REF %d (%p)\n", gettid(),
2285                    r->last_id, r->ref_id[r->last_id]->seq);
2286                 ref_entry_free_seq(r->ref_id[r->last_id]);
2287                 if (r->ref_id[r->last_id]->is_md5) r->ref_id[r->last_id]->length = 0;
2288             }
2289         }
2290         r->last_id = id;
2291     }
2292 }
2293 
cram_ref_decr(refs_t * r,int id)2294 void cram_ref_decr(refs_t *r, int id) {
2295     pthread_mutex_lock(&r->lock);
2296     cram_ref_decr_locked(r, id);
2297     pthread_mutex_unlock(&r->lock);
2298 }
2299 
2300 /*
2301  * Used by cram_ref_load and cram_ref_get. The file handle will have
2302  * already been opened, so we can catch it. The ref_entry *e informs us
2303  * of whether this is a multi-line fasta file or a raw MD5 style file.
2304  * Either way we create a single contiguous sequence.
2305  *
2306  * Returns all or part of a reference sequence on success (malloced);
2307  *         NULL on failure.
2308  */
load_ref_portion(BGZF * fp,ref_entry * e,int start,int end)2309 static char *load_ref_portion(BGZF *fp, ref_entry *e, int start, int end) {
2310     off_t offset, len;
2311     char *seq;
2312 
2313     if (end < start)
2314         end = start;
2315 
2316     /*
2317      * Compute locations in file. This is trivial for the MD5 files, but
2318      * is still necessary for the fasta variants.
2319      */
2320     offset = e->line_length
2321         ? e->offset + (start-1)/e->bases_per_line * e->line_length +
2322           (start-1) % e->bases_per_line
2323         : start-1;
2324 
2325     len = (e->line_length
2326            ? e->offset + (end-1)/e->bases_per_line * e->line_length +
2327              (end-1) % e->bases_per_line
2328            : end-1) - offset + 1;
2329 
2330     if (bgzf_useek(fp, offset, SEEK_SET) < 0) {
2331         perror("bgzf_useek() on reference file");
2332         return NULL;
2333     }
2334 
2335     if (len == 0 || !(seq = malloc(len))) {
2336         return NULL;
2337     }
2338 
2339     if (len != bgzf_read(fp, seq, len)) {
2340         perror("bgzf_read() on reference file");
2341         free(seq);
2342         return NULL;
2343     }
2344 
2345     /* Strip white-space if required. */
2346     if (len != end-start+1) {
2347         int i, j;
2348         char *cp = seq;
2349         char *cp_to;
2350 
2351         for (i = j = 0; i < len; i++) {
2352             if (cp[i] >= '!' && cp[i] <= '~')
2353                 cp[j++] = toupper_c(cp[i]);
2354         }
2355         cp_to = cp+j;
2356 
2357         if (cp_to - seq != end-start+1) {
2358             hts_log_error("Malformed reference file");
2359             free(seq);
2360             return NULL;
2361         }
2362     } else {
2363         int i;
2364         for (i = 0; i < len; i++) {
2365             seq[i] = toupper_c(seq[i]);
2366         }
2367     }
2368 
2369     return seq;
2370 }
2371 
2372 /*
2373  * Load the entire reference 'id'.
2374  * This also increments the reference count by 1.
2375  *
2376  * Returns ref_entry on success;
2377  *         NULL on failure
2378  */
cram_ref_load(refs_t * r,int id,int is_md5)2379 ref_entry *cram_ref_load(refs_t *r, int id, int is_md5) {
2380     ref_entry *e = r->ref_id[id];
2381     int start = 1, end = e->length;
2382     char *seq;
2383 
2384     if (e->seq) {
2385         return e;
2386     }
2387 
2388     assert(e->count == 0);
2389 
2390     if (r->last) {
2391 #ifdef REF_DEBUG
2392         int idx = 0;
2393         for (idx = 0; idx < r->nref; idx++)
2394             if (r->last == r->ref_id[idx])
2395                 break;
2396         RP("%d cram_ref_load DECR %d\n", gettid(), idx);
2397 #endif
2398         assert(r->last->count > 0);
2399         if (--r->last->count <= 0) {
2400             RP("%d FREE REF %d (%p)\n", gettid(), id, r->ref_id[id]->seq);
2401             if (r->last->seq)
2402                 ref_entry_free_seq(r->last);
2403         }
2404     }
2405 
2406     /* Open file if it's not already the current open reference */
2407     if (strcmp(r->fn, e->fn) || r->fp == NULL) {
2408         if (r->fp)
2409             if (bgzf_close(r->fp) != 0)
2410                 return NULL;
2411         r->fn = e->fn;
2412         if (!(r->fp = bgzf_open_ref(r->fn, "r", is_md5)))
2413             return NULL;
2414     }
2415 
2416     RP("%d Loading ref %d (%d..%d)\n", gettid(), id, start, end);
2417 
2418     if (!(seq = load_ref_portion(r->fp, e, start, end))) {
2419         return NULL;
2420     }
2421 
2422     RP("%d Loaded ref %d (%d..%d) = %p\n", gettid(), id, start, end, seq);
2423 
2424     RP("%d INC REF %d, %d\n", gettid(), id, (int)(e->count+1));
2425     e->seq = seq;
2426     e->mf = NULL;
2427     e->count++;
2428 
2429     /*
2430      * Also keep track of last used ref so incr/decr loops on the same
2431      * sequence don't cause load/free loops.
2432      */
2433     RP("%d cram_ref_load INCR %d => %d\n", gettid(), id, e->count+1);
2434     r->last = e;
2435     e->count++;
2436 
2437     return e;
2438 }
2439 
2440 /*
2441  * Returns a portion of a reference sequence from start to end inclusive.
2442  * The returned pointer is owned by either the cram_file fd or by the
2443  * internal refs_t structure and should not be freed  by the caller.
2444  *
2445  * The difference is whether or not this refs_t is in use by just the one
2446  * cram_fd or by multiples, or whether we have multiple threads accessing
2447  * references. In either case fd->shared will be true and we start using
2448  * reference counting to track the number of users of a specific reference
2449  * sequence.
2450  *
2451  * Otherwise the ref seq returned is allocated as part of cram_fd itself
2452  * and will be freed up on the next call to cram_get_ref or cram_close.
2453  *
2454  * To return the entire reference sequence, specify start as 1 and end
2455  * as 0.
2456  *
2457  * To cease using a reference, call cram_ref_decr().
2458  *
2459  * Returns reference on success,
2460  *         NULL on failure
2461  */
cram_get_ref(cram_fd * fd,int id,int start,int end)2462 char *cram_get_ref(cram_fd *fd, int id, int start, int end) {
2463     ref_entry *r;
2464     char *seq;
2465     int ostart = start;
2466 
2467     if (id == -1)
2468         return NULL;
2469 
2470     /* FIXME: axiomatic query of r->seq being true?
2471      * Or shortcut for unsorted data where we load once and never free?
2472      */
2473 
2474     //fd->shared_ref = 1; // hard code for now to simplify things
2475 
2476     pthread_mutex_lock(&fd->ref_lock);
2477 
2478     RP("%d cram_get_ref on fd %p, id %d, range %d..%d\n", gettid(), fd, id, start, end);
2479 
2480     /*
2481      * Unsorted data implies we want to fetch an entire reference at a time.
2482      * We just deal with this at the moment by claiming we're sharing
2483      * references instead, which has the same requirement.
2484      */
2485     if (fd->unsorted)
2486         fd->shared_ref = 1;
2487 
2488 
2489     /* Sanity checking: does this ID exist? */
2490     if (id >= fd->refs->nref) {
2491         hts_log_error("No reference found for id %d", id);
2492         pthread_mutex_unlock(&fd->ref_lock);
2493         return NULL;
2494     }
2495 
2496     if (!fd->refs || !fd->refs->ref_id[id]) {
2497         hts_log_error("No reference found for id %d", id);
2498         pthread_mutex_unlock(&fd->ref_lock);
2499         return NULL;
2500     }
2501 
2502     if (!(r = fd->refs->ref_id[id])) {
2503         hts_log_error("No reference found for id %d", id);
2504         pthread_mutex_unlock(&fd->ref_lock);
2505         return NULL;
2506     }
2507 
2508 
2509     /*
2510      * It has an entry, but may not have been populated yet.
2511      * Any manually loaded .fai files have their lengths known.
2512      * A ref entry computed from @SQ lines (M5 or UR field) will have
2513      * r->length == 0 unless it's been loaded once and verified that we have
2514      * an on-disk filename for it.
2515      *
2516      * 19 Sep 2013: Moved the lock here as the cram_populate_ref code calls
2517      * open_path_mfile and libcurl, which isn't multi-thread safe unless I
2518      * rewrite my code to have one curl handle per thread.
2519      */
2520     pthread_mutex_lock(&fd->refs->lock);
2521     if (r->length == 0) {
2522         if (cram_populate_ref(fd, id, r) == -1) {
2523             hts_log_error("Failed to populate reference for id %d", id);
2524             pthread_mutex_unlock(&fd->refs->lock);
2525             pthread_mutex_unlock(&fd->ref_lock);
2526             return NULL;
2527         }
2528         r = fd->refs->ref_id[id];
2529         if (fd->unsorted)
2530             cram_ref_incr_locked(fd->refs, id);
2531     }
2532 
2533 
2534     /*
2535      * We now know that we the filename containing the reference, so check
2536      * for limits. If it's over half the reference we'll load all of it in
2537      * memory as this will speed up subsequent calls.
2538      */
2539     if (end < 1)
2540         end = r->length;
2541     if (end >= r->length)
2542         end  = r->length;
2543     if (start < 1)
2544         return NULL;
2545 
2546     if (end - start >= 0.5*r->length || fd->shared_ref) {
2547         start = 1;
2548         end = r->length;
2549     }
2550 
2551     /*
2552      * Maybe we have it cached already? If so use it.
2553      *
2554      * Alternatively if we don't have the sequence but we're sharing
2555      * references and/or are asking for the entire length of it, then
2556      * load the full reference into the refs structure and return
2557      * a pointer to that one instead.
2558      */
2559     if (fd->shared_ref || r->seq || (start == 1 && end == r->length)) {
2560         char *cp;
2561 
2562         if (id >= 0) {
2563             if (r->seq) {
2564                 cram_ref_incr_locked(fd->refs, id);
2565             } else {
2566                 ref_entry *e;
2567                 if (!(e = cram_ref_load(fd->refs, id, r->is_md5))) {
2568                     pthread_mutex_unlock(&fd->refs->lock);
2569                     pthread_mutex_unlock(&fd->ref_lock);
2570                     return NULL;
2571                 }
2572 
2573                 /* unsorted data implies cache ref indefinitely, to avoid
2574                  * continually loading and unloading.
2575                  */
2576                 if (fd->unsorted)
2577                     cram_ref_incr_locked(fd->refs, id);
2578             }
2579 
2580             fd->ref = NULL; /* We never access it directly */
2581             fd->ref_start = 1;
2582             fd->ref_end   = r->length;
2583             fd->ref_id    = id;
2584 
2585             cp = fd->refs->ref_id[id]->seq + ostart-1;
2586         } else {
2587             fd->ref = NULL;
2588             cp = NULL;
2589         }
2590 
2591         RP("%d cram_get_ref returning for id %d, count %d\n", gettid(), id, (int)r->count);
2592 
2593         pthread_mutex_unlock(&fd->refs->lock);
2594         pthread_mutex_unlock(&fd->ref_lock);
2595         return cp;
2596     }
2597 
2598     /*
2599      * Otherwise we're not sharing, we don't have a copy of it already and
2600      * we're only asking for a small portion of it.
2601      *
2602      * In this case load up just that segment ourselves, freeing any old
2603      * small segments in the process.
2604      */
2605 
2606     /* Unmapped ref ID */
2607     if (id < 0) {
2608         if (fd->ref_free) {
2609             free(fd->ref_free);
2610             fd->ref_free = NULL;
2611         }
2612         fd->ref = NULL;
2613         fd->ref_id = id;
2614         pthread_mutex_unlock(&fd->refs->lock);
2615         pthread_mutex_unlock(&fd->ref_lock);
2616         return NULL;
2617     }
2618 
2619     /* Open file if it's not already the current open reference */
2620     if (strcmp(fd->refs->fn, r->fn) || fd->refs->fp == NULL) {
2621         if (fd->refs->fp)
2622             if (bgzf_close(fd->refs->fp) != 0)
2623                 return NULL;
2624         fd->refs->fn = r->fn;
2625         if (!(fd->refs->fp = bgzf_open_ref(fd->refs->fn, "r", r->is_md5))) {
2626             pthread_mutex_unlock(&fd->refs->lock);
2627             pthread_mutex_unlock(&fd->ref_lock);
2628             return NULL;
2629         }
2630     }
2631 
2632     if (!(fd->ref = load_ref_portion(fd->refs->fp, r, start, end))) {
2633         pthread_mutex_unlock(&fd->refs->lock);
2634         pthread_mutex_unlock(&fd->ref_lock);
2635         return NULL;
2636     }
2637 
2638     if (fd->ref_free)
2639         free(fd->ref_free);
2640 
2641     fd->ref_id    = id;
2642     fd->ref_start = start;
2643     fd->ref_end   = end;
2644     fd->ref_free = fd->ref;
2645     seq = fd->ref;
2646 
2647     pthread_mutex_unlock(&fd->refs->lock);
2648     pthread_mutex_unlock(&fd->ref_lock);
2649 
2650     return seq + ostart - start;
2651 }
2652 
2653 /*
2654  * If fd has been opened for reading, it may be permitted to specify 'fn'
2655  * as NULL and let the code auto-detect the reference by parsing the
2656  * SAM header @SQ lines.
2657  */
cram_load_reference(cram_fd * fd,char * fn)2658 int cram_load_reference(cram_fd *fd, char *fn) {
2659     int ret = 0;
2660 
2661     if (fn) {
2662         fd->refs = refs_load_fai(fd->refs, fn,
2663                                  !(fd->embed_ref && fd->mode == 'r'));
2664         fn = fd->refs ? fd->refs->fn : NULL;
2665         if (!fn)
2666             ret = -1;
2667         sanitise_SQ_lines(fd);
2668     }
2669     fd->ref_fn = fn;
2670 
2671     if ((!fd->refs || (fd->refs->nref == 0 && !fn)) && fd->header) {
2672         if (fd->refs)
2673             refs_free(fd->refs);
2674         if (!(fd->refs = refs_create()))
2675             return -1;
2676         if (-1 == refs_from_header(fd->refs, fd, fd->header))
2677             return -1;
2678     }
2679 
2680     if (fd->header)
2681         if (-1 == refs2id(fd->refs, fd->header))
2682             return -1;
2683 
2684     return ret;
2685 }
2686 
2687 /* ----------------------------------------------------------------------
2688  * Containers
2689  */
2690 
2691 /*
2692  * Creates a new container, specifying the maximum number of slices
2693  * and records permitted.
2694  *
2695  * Returns cram_container ptr on success
2696  *         NULL on failure
2697  */
cram_new_container(int nrec,int nslice)2698 cram_container *cram_new_container(int nrec, int nslice) {
2699     cram_container *c = calloc(1, sizeof(*c));
2700     enum cram_DS_ID id;
2701 
2702     if (!c)
2703         return NULL;
2704 
2705     c->curr_ref = -2;
2706 
2707     c->max_c_rec = nrec * nslice;
2708     c->curr_c_rec = 0;
2709 
2710     c->max_rec = nrec;
2711     c->record_counter = 0;
2712     c->num_bases = 0;
2713     c->s_num_bases = 0;
2714 
2715     c->max_slice = nslice;
2716     c->curr_slice = 0;
2717 
2718     c->pos_sorted = 1;
2719     c->max_apos   = 0;
2720     c->multi_seq  = 0;
2721 
2722     c->bams = NULL;
2723 
2724     if (!(c->slices = (cram_slice **)calloc(nslice, sizeof(cram_slice *))))
2725         goto err;
2726     c->slice = NULL;
2727 
2728     if (!(c->comp_hdr = cram_new_compression_header()))
2729         goto err;
2730     c->comp_hdr_block = NULL;
2731 
2732     for (id = DS_RN; id < DS_TN; id++)
2733         if (!(c->stats[id] = cram_stats_create())) goto err;
2734 
2735     //c->aux_B_stats = cram_stats_create();
2736 
2737     if (!(c->tags_used = kh_init(m_tagmap)))
2738         goto err;
2739     c->refs_used = 0;
2740 
2741     return c;
2742 
2743  err:
2744     if (c) {
2745         if (c->slices)
2746             free(c->slices);
2747         free(c);
2748     }
2749     return NULL;
2750 }
2751 
cram_free_container(cram_container * c)2752 void cram_free_container(cram_container *c) {
2753     enum cram_DS_ID id;
2754     int i;
2755 
2756     if (!c)
2757         return;
2758 
2759     if (c->refs_used)
2760         free(c->refs_used);
2761 
2762     if (c->landmark)
2763         free(c->landmark);
2764 
2765     if (c->comp_hdr)
2766         cram_free_compression_header(c->comp_hdr);
2767 
2768     if (c->comp_hdr_block)
2769         cram_free_block(c->comp_hdr_block);
2770 
2771     // Free the slices; filled out by encoder only
2772     if (c->slices) {
2773         for (i = 0; i < c->max_slice; i++) {
2774             if (c->slices[i])
2775                 cram_free_slice(c->slices[i]);
2776             if (c->slices[i] == c->slice)
2777                 c->slice = NULL;
2778         }
2779         free(c->slices);
2780     }
2781 
2782     // Free the current slice; set by both encoder & decoder
2783     if (c->slice) {
2784         cram_free_slice(c->slice);
2785         c->slice = NULL;
2786     }
2787 
2788     for (id = DS_RN; id < DS_TN; id++)
2789         if (c->stats[id]) cram_stats_free(c->stats[id]);
2790 
2791     //if (c->aux_B_stats) cram_stats_free(c->aux_B_stats);
2792 
2793     if (c->tags_used) {
2794         khint_t k;
2795 
2796         for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
2797             if (!kh_exist(c->tags_used, k))
2798                 continue;
2799 
2800             cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k);
2801             cram_codec *c = tm->codec;
2802 
2803             if (c) c->free(c);
2804             free(tm);
2805         }
2806 
2807         kh_destroy(m_tagmap, c->tags_used);
2808     }
2809 
2810     free(c);
2811 }
2812 
2813 /*
2814  * Reads a container header.
2815  *
2816  * Returns cram_container on success
2817  *         NULL on failure or no container left (fd->err == 0).
2818  */
cram_read_container(cram_fd * fd)2819 cram_container *cram_read_container(cram_fd *fd) {
2820     cram_container c2, *c;
2821     int i, s;
2822     size_t rd = 0;
2823     uint32_t crc = 0;
2824 
2825     fd->err = 0;
2826     fd->eof = 0;
2827 
2828     memset(&c2, 0, sizeof(c2));
2829     if (CRAM_MAJOR_VERS(fd->version) == 1) {
2830         if ((s = itf8_decode_crc(fd, &c2.length, &crc)) == -1) {
2831             fd->eof = fd->empty_container ? 1 : 2;
2832             return NULL;
2833         } else {
2834             rd+=s;
2835         }
2836     } else {
2837         uint32_t len;
2838         if ((s = int32_decode(fd, &c2.length)) == -1) {
2839             if (CRAM_MAJOR_VERS(fd->version) == 2 &&
2840                 CRAM_MINOR_VERS(fd->version) == 0)
2841                 fd->eof = 1; // EOF blocks arrived in v2.1
2842             else
2843                 fd->eof = fd->empty_container ? 1 : 2;
2844             return NULL;
2845         } else {
2846             rd+=s;
2847         }
2848         len = le_int4(c2.length);
2849         crc = crc32(0L, (unsigned char *)&len, 4);
2850     }
2851     if ((s = itf8_decode_crc(fd, &c2.ref_seq_id, &crc))   == -1) return NULL; else rd+=s;
2852     if ((s = itf8_decode_crc(fd, &c2.ref_seq_start, &crc))== -1) return NULL; else rd+=s;
2853     if ((s = itf8_decode_crc(fd, &c2.ref_seq_span, &crc)) == -1) return NULL; else rd+=s;
2854     if ((s = itf8_decode_crc(fd, &c2.num_records, &crc))  == -1) return NULL; else rd+=s;
2855 
2856     if (CRAM_MAJOR_VERS(fd->version) == 1) {
2857         c2.record_counter = 0;
2858         c2.num_bases = 0;
2859     } else {
2860         if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2861             if ((s = ltf8_decode_crc(fd, &c2.record_counter, &crc)) == -1)
2862                 return NULL;
2863             else
2864                 rd += s;
2865         } else {
2866             int32_t i32;
2867             if ((s = itf8_decode_crc(fd, &i32, &crc)) == -1)
2868                 return NULL;
2869             else
2870                 rd += s;
2871             c2.record_counter = i32;
2872         }
2873 
2874         if ((s = ltf8_decode_crc(fd, &c2.num_bases, &crc))== -1)
2875             return NULL;
2876         else
2877             rd += s;
2878     }
2879     if ((s = itf8_decode_crc(fd, &c2.num_blocks, &crc))   == -1) return NULL; else rd+=s;
2880     if ((s = itf8_decode_crc(fd, &c2.num_landmarks, &crc))== -1) return NULL; else rd+=s;
2881 
2882     if (c2.num_landmarks < 0 || c2.num_landmarks >= SIZE_MAX / sizeof(int32_t))
2883         return NULL;
2884 
2885     if (!(c = calloc(1, sizeof(*c))))
2886         return NULL;
2887 
2888     *c = c2;
2889 
2890     if (!(c->landmark = malloc(c->num_landmarks * sizeof(int32_t))) &&
2891         c->num_landmarks) {
2892         fd->err = errno;
2893         cram_free_container(c);
2894         return NULL;
2895     }
2896     for (i = 0; i < c->num_landmarks; i++) {
2897         if ((s = itf8_decode_crc(fd, &c->landmark[i], &crc)) == -1) {
2898             cram_free_container(c);
2899             return NULL;
2900         } else {
2901             rd += s;
2902         }
2903     }
2904 
2905     if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2906         if (-1 == int32_decode(fd, (int32_t *)&c->crc32))
2907             return NULL;
2908         else
2909             rd+=4;
2910 
2911         if (crc != c->crc32) {
2912             hts_log_error("Container header CRC32 failure");
2913             cram_free_container(c);
2914             return NULL;
2915         }
2916     }
2917 
2918     c->offset = rd;
2919     c->slices = NULL;
2920     c->slice = NULL;
2921     c->curr_slice = 0;
2922     c->max_slice = c->num_landmarks;
2923     c->slice_rec = 0;
2924     c->curr_rec = 0;
2925     c->max_rec = 0;
2926 
2927     if (c->ref_seq_id == -2) {
2928         c->multi_seq = 1;
2929         fd->multi_seq = 1;
2930     }
2931 
2932     fd->empty_container =
2933         (c->num_records == 0 &&
2934          c->ref_seq_id == -1 &&
2935          c->ref_seq_start == 0x454f46 /* EOF */) ? 1 : 0;
2936 
2937     return c;
2938 }
2939 
2940 
2941 /* MAXIMUM storage size needed for the container. */
cram_container_size(cram_container * c)2942 int cram_container_size(cram_container *c) {
2943     return 55 + 5*c->num_landmarks;
2944 }
2945 
2946 
2947 /*
2948  * Stores the container structure in dat and returns *size as the
2949  * number of bytes written to dat[].  The input size of dat is also
2950  * held in *size and should be initialised to cram_container_size(c).
2951  *
2952  * Returns 0 on success;
2953  *        -1 on failure
2954  */
cram_store_container(cram_fd * fd,cram_container * c,char * dat,int * size)2955 int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size)
2956 {
2957     unsigned char *cp = (unsigned char *)dat;
2958     int i;
2959 
2960     // Check the input buffer is large enough according to our stated
2961     // requirements. (NOTE: it may actually take less.)
2962     if (cram_container_size(c) > *size)
2963         return -1;
2964 
2965     if (CRAM_MAJOR_VERS(fd->version) == 1) {
2966         cp += itf8_put((char*)cp, c->length);
2967     } else {
2968         *(int32_t *)cp = le_int4(c->length);
2969         cp += 4;
2970     }
2971     if (c->multi_seq) {
2972         cp += itf8_put((char*)cp, -2);
2973         cp += itf8_put((char*)cp, 0);
2974         cp += itf8_put((char*)cp, 0);
2975     } else {
2976         cp += itf8_put((char*)cp, c->ref_seq_id);
2977         cp += itf8_put((char*)cp, c->ref_seq_start);
2978         cp += itf8_put((char*)cp, c->ref_seq_span);
2979     }
2980     cp += itf8_put((char*)cp, c->num_records);
2981     if (CRAM_MAJOR_VERS(fd->version) == 2) {
2982         cp += itf8_put((char*)cp, c->record_counter);
2983         cp += ltf8_put((char*)cp, c->num_bases);
2984     } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2985         cp += ltf8_put((char*)cp, c->record_counter);
2986         cp += ltf8_put((char*)cp, c->num_bases);
2987     }
2988 
2989     cp += itf8_put((char*)cp, c->num_blocks);
2990     cp += itf8_put((char*)cp, c->num_landmarks);
2991     for (i = 0; i < c->num_landmarks; i++)
2992         cp += itf8_put((char*)cp, c->landmark[i]);
2993 
2994     if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2995         c->crc32 = crc32(0L, (uc *)dat, (char*)cp-dat);
2996         cp[0] =  c->crc32        & 0xff;
2997         cp[1] = (c->crc32 >>  8) & 0xff;
2998         cp[2] = (c->crc32 >> 16) & 0xff;
2999         cp[3] = (c->crc32 >> 24) & 0xff;
3000         cp += 4;
3001     }
3002 
3003     *size = (char *)cp-dat; // actual used size
3004 
3005     return 0;
3006 }
3007 
3008 
3009 /*
3010  * Writes a container structure.
3011  *
3012  * Returns 0 on success
3013  *        -1 on failure
3014  */
cram_write_container(cram_fd * fd,cram_container * c)3015 int cram_write_container(cram_fd *fd, cram_container *c) {
3016     char buf_a[1024], *buf = buf_a;
3017     unsigned char *cp;
3018     int i;
3019 
3020     if (55 + c->num_landmarks * 5 >= 1024)
3021         buf = malloc(55 + c->num_landmarks * 5);
3022     cp = (unsigned char *)buf;
3023 
3024     if (CRAM_MAJOR_VERS(fd->version) == 1) {
3025         cp += itf8_put((char*)cp, c->length);
3026     } else {
3027         *(int32_t *)cp = le_int4(c->length);
3028         cp += 4;
3029     }
3030     if (c->multi_seq) {
3031         cp += itf8_put((char*)cp, -2);
3032         cp += itf8_put((char*)cp, 0);
3033         cp += itf8_put((char*)cp, 0);
3034     } else {
3035         cp += itf8_put((char*)cp, c->ref_seq_id);
3036         cp += itf8_put((char*)cp, c->ref_seq_start);
3037         cp += itf8_put((char*)cp, c->ref_seq_span);
3038     }
3039     cp += itf8_put((char*)cp, c->num_records);
3040     if (CRAM_MAJOR_VERS(fd->version) == 2) {
3041         cp += itf8_put((char*)cp, c->record_counter);
3042         cp += ltf8_put((char*)cp, c->num_bases);
3043     } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3044         cp += ltf8_put((char*)cp, c->record_counter);
3045         cp += ltf8_put((char*)cp, c->num_bases);
3046     }
3047 
3048     cp += itf8_put((char*)cp, c->num_blocks);
3049     cp += itf8_put((char*)cp, c->num_landmarks);
3050     for (i = 0; i < c->num_landmarks; i++)
3051         cp += itf8_put((char*)cp, c->landmark[i]);
3052 
3053     if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3054         c->crc32 = crc32(0L, (uc *)buf, (char*)cp-buf);
3055         cp[0] =  c->crc32        & 0xff;
3056         cp[1] = (c->crc32 >>  8) & 0xff;
3057         cp[2] = (c->crc32 >> 16) & 0xff;
3058         cp[3] = (c->crc32 >> 24) & 0xff;
3059         cp += 4;
3060     }
3061 
3062     if ((char*)cp-buf != hwrite(fd->fp, buf, (char*)cp-buf)) {
3063         if (buf != buf_a)
3064             free(buf);
3065         return -1;
3066     }
3067 
3068     if (buf != buf_a)
3069         free(buf);
3070 
3071     return 0;
3072 }
3073 
3074 // common component shared by cram_flush_container{,_mt}
cram_flush_container2(cram_fd * fd,cram_container * c)3075 static int cram_flush_container2(cram_fd *fd, cram_container *c) {
3076     int i, j;
3077 
3078     if (c->curr_slice > 0 && !c->slices)
3079         return -1;
3080 
3081     //fprintf(stderr, "Writing container %d, sum %u\n", c->record_counter, sum);
3082 
3083     /* Write the container struct itself */
3084     if (0 != cram_write_container(fd, c))
3085         return -1;
3086 
3087     /* And the compression header */
3088     if (0 != cram_write_block(fd, c->comp_hdr_block))
3089         return -1;
3090 
3091     /* Followed by the slice blocks */
3092     for (i = 0; i < c->curr_slice; i++) {
3093         cram_slice *s = c->slices[i];
3094 
3095         if (0 != cram_write_block(fd, s->hdr_block))
3096             return -1;
3097 
3098         for (j = 0; j < s->hdr->num_blocks; j++) {
3099             if (0 != cram_write_block(fd, s->block[j]))
3100                 return -1;
3101         }
3102     }
3103 
3104     return hflush(fd->fp) == 0 ? 0 : -1;
3105 }
3106 
3107 /*
3108  * Flushes a completely or partially full container to disk, writing
3109  * container structure, header and blocks. This also calls the encoder
3110  * functions.
3111  *
3112  * Returns 0 on success
3113  *        -1 on failure
3114  */
cram_flush_container(cram_fd * fd,cram_container * c)3115 int cram_flush_container(cram_fd *fd, cram_container *c) {
3116     /* Encode the container blocks and generate compression header */
3117     if (0 != cram_encode_container(fd, c))
3118         return -1;
3119 
3120     return cram_flush_container2(fd, c);
3121 }
3122 
3123 typedef struct {
3124     cram_fd *fd;
3125     cram_container *c;
3126 } cram_job;
3127 
cram_flush_thread(void * arg)3128 void *cram_flush_thread(void *arg) {
3129     cram_job *j = (cram_job *)arg;
3130 
3131     /* Encode the container blocks and generate compression header */
3132     if (0 != cram_encode_container(j->fd, j->c)) {
3133         hts_log_error("Call to cram_encode_container failed");
3134         return NULL;
3135     }
3136 
3137     return arg;
3138 }
3139 
cram_flush_result(cram_fd * fd)3140 static int cram_flush_result(cram_fd *fd) {
3141     int i, ret = 0;
3142     hts_tpool_result *r;
3143     cram_container *lc = NULL;
3144 
3145     // NB: we can have one result per slice, not per container,
3146     // so we need to free the container only after all slices
3147     // within it have been freed.  (Automatic via reference counting.)
3148     while ((r = hts_tpool_next_result(fd->rqueue))) {
3149         cram_job *j = (cram_job *)hts_tpool_result_data(r);
3150         cram_container *c;
3151 
3152         if (!j) {
3153             hts_tpool_delete_result(r, 0);
3154             return -1;
3155         }
3156 
3157         fd = j->fd;
3158         c = j->c;
3159 
3160         if (fd->mode == 'w')
3161             if (0 != cram_flush_container2(fd, c))
3162                 return -1;
3163 
3164         // Free the slices; filled out by encoder only
3165         if (c->slices) {
3166             for (i = 0; i < c->max_slice; i++) {
3167                 if (c->slices[i])
3168                     cram_free_slice(c->slices[i]);
3169                 if (c->slices[i] == c->slice)
3170                     c->slice = NULL;
3171                 c->slices[i] = NULL;
3172             }
3173         }
3174 
3175         // Free the current slice; set by both encoder & decoder
3176         if (c->slice) {
3177             cram_free_slice(c->slice);
3178             c->slice = NULL;
3179         }
3180         c->curr_slice = 0;
3181 
3182         // Our jobs will be in order, so we free the last
3183         // container when our job has switched to a new one.
3184         if (c != lc) {
3185             if (lc) {
3186                 if (fd->ctr == lc)
3187                     fd->ctr = NULL;
3188                 if (fd->ctr_mt == lc)
3189                     fd->ctr_mt = NULL;
3190                 cram_free_container(lc);
3191             }
3192             lc = c;
3193         }
3194 
3195         if (fd->mode == 'w')
3196             ret |= hflush(fd->fp) == 0 ? 0 : -1;
3197 
3198         hts_tpool_delete_result(r, 1);
3199     }
3200     if (lc) {
3201         if (fd->ctr == lc)
3202             fd->ctr = NULL;
3203         if (fd->ctr_mt == lc)
3204             fd->ctr_mt = NULL;
3205         cram_free_container(lc);
3206     }
3207 
3208     return ret;
3209 }
3210 
cram_flush_container_mt(cram_fd * fd,cram_container * c)3211 int cram_flush_container_mt(cram_fd *fd, cram_container *c) {
3212     cram_job *j;
3213 
3214     if (!fd->pool)
3215         return cram_flush_container(fd, c);
3216 
3217     if (!(j = malloc(sizeof(*j))))
3218         return -1;
3219     j->fd = fd;
3220     j->c = c;
3221 
3222     // Flush the job.  Note our encoder queue may be full, so we
3223     // either have to keep trying in non-blocking mode (what we do) or
3224     // use a dedicated separate thread for draining the queue.
3225     for (;;) {
3226         errno = 0;
3227         hts_tpool_dispatch2(fd->pool, fd->rqueue, cram_flush_thread, j, 1);
3228         int pending = (errno == EAGAIN);
3229         if (cram_flush_result(fd) != 0)
3230             return -1;
3231         if (!pending)
3232             break;
3233 
3234         usleep(1000);
3235     }
3236 
3237     return 0;
3238 }
3239 
3240 /* ----------------------------------------------------------------------
3241  * Compression headers; the first part of the container
3242  */
3243 
3244 /*
3245  * Creates a new blank container compression header
3246  *
3247  * Returns header ptr on success
3248  *         NULL on failure
3249  */
cram_new_compression_header(void)3250 cram_block_compression_hdr *cram_new_compression_header(void) {
3251     cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
3252     if (!hdr)
3253         return NULL;
3254 
3255     if (!(hdr->TD_blk = cram_new_block(CORE, 0))) {
3256         free(hdr);
3257         return NULL;
3258     }
3259 
3260     if (!(hdr->TD_hash = kh_init(m_s2i))) {
3261         cram_free_block(hdr->TD_blk);
3262         free(hdr);
3263         return NULL;
3264     }
3265 
3266     if (!(hdr->TD_keys = string_pool_create(8192))) {
3267         kh_destroy(m_s2i, hdr->TD_hash);
3268         cram_free_block(hdr->TD_blk);
3269         free(hdr);
3270         return NULL;
3271     }
3272 
3273     return hdr;
3274 }
3275 
cram_free_compression_header(cram_block_compression_hdr * hdr)3276 void cram_free_compression_header(cram_block_compression_hdr *hdr) {
3277     int i;
3278 
3279     if (hdr->landmark)
3280         free(hdr->landmark);
3281 
3282     if (hdr->preservation_map)
3283         kh_destroy(map, hdr->preservation_map);
3284 
3285     for (i = 0; i < CRAM_MAP_HASH; i++) {
3286         cram_map *m, *m2;
3287         for (m = hdr->rec_encoding_map[i]; m; m = m2) {
3288             m2 = m->next;
3289             if (m->codec)
3290                 m->codec->free(m->codec);
3291             free(m);
3292         }
3293     }
3294 
3295     for (i = 0; i < CRAM_MAP_HASH; i++) {
3296         cram_map *m, *m2;
3297         for (m = hdr->tag_encoding_map[i]; m; m = m2) {
3298             m2 = m->next;
3299             if (m->codec)
3300                 m->codec->free(m->codec);
3301             free(m);
3302         }
3303     }
3304 
3305     for (i = 0; i < DS_END; i++) {
3306         if (hdr->codecs[i])
3307             hdr->codecs[i]->free(hdr->codecs[i]);
3308     }
3309 
3310     if (hdr->TL)
3311         free(hdr->TL);
3312     if (hdr->TD_blk)
3313         cram_free_block(hdr->TD_blk);
3314     if (hdr->TD_hash)
3315         kh_destroy(m_s2i, hdr->TD_hash);
3316     if (hdr->TD_keys)
3317         string_pool_destroy(hdr->TD_keys);
3318 
3319     free(hdr);
3320 }
3321 
3322 
3323 /* ----------------------------------------------------------------------
3324  * Slices and slice headers
3325  */
3326 
cram_free_slice_header(cram_block_slice_hdr * hdr)3327 void cram_free_slice_header(cram_block_slice_hdr *hdr) {
3328     if (!hdr)
3329         return;
3330 
3331     if (hdr->block_content_ids)
3332         free(hdr->block_content_ids);
3333 
3334     free(hdr);
3335 
3336     return;
3337 }
3338 
cram_free_slice(cram_slice * s)3339 void cram_free_slice(cram_slice *s) {
3340     if (!s)
3341         return;
3342 
3343     if (s->hdr_block)
3344         cram_free_block(s->hdr_block);
3345 
3346     if (s->block) {
3347         int i;
3348 
3349         if (s->hdr) {
3350             for (i = 0; i < s->hdr->num_blocks; i++) {
3351                 cram_free_block(s->block[i]);
3352             }
3353         }
3354         free(s->block);
3355     }
3356 
3357     if (s->block_by_id)
3358         free(s->block_by_id);
3359 
3360     if (s->hdr)
3361         cram_free_slice_header(s->hdr);
3362 
3363     if (s->seqs_blk)
3364         cram_free_block(s->seqs_blk);
3365 
3366     if (s->qual_blk)
3367         cram_free_block(s->qual_blk);
3368 
3369     if (s->name_blk)
3370         cram_free_block(s->name_blk);
3371 
3372     if (s->aux_blk)
3373         cram_free_block(s->aux_blk);
3374 
3375     if (s->base_blk)
3376         cram_free_block(s->base_blk);
3377 
3378     if (s->soft_blk)
3379         cram_free_block(s->soft_blk);
3380 
3381     if (s->cigar)
3382         free(s->cigar);
3383 
3384     if (s->crecs)
3385         free(s->crecs);
3386 
3387     if (s->features)
3388         free(s->features);
3389 
3390     if (s->TN)
3391         free(s->TN);
3392 
3393     if (s->pair_keys)
3394         string_pool_destroy(s->pair_keys);
3395 
3396     if (s->pair[0])
3397         kh_destroy(m_s2i, s->pair[0]);
3398     if (s->pair[1])
3399         kh_destroy(m_s2i, s->pair[1]);
3400 
3401     if (s->aux_block)
3402         free(s->aux_block);
3403 
3404     free(s);
3405 }
3406 
3407 /*
3408  * Creates a new empty slice in memory, for subsequent writing to
3409  * disk.
3410  *
3411  * Returns cram_slice ptr on success
3412  *         NULL on failure
3413  */
cram_new_slice(enum cram_content_type type,int nrecs)3414 cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) {
3415     cram_slice *s = calloc(1, sizeof(*s));
3416     if (!s)
3417         return NULL;
3418 
3419     if (!(s->hdr = (cram_block_slice_hdr *)calloc(1, sizeof(*s->hdr))))
3420         goto err;
3421     s->hdr->content_type = type;
3422 
3423     s->hdr_block = NULL;
3424     s->block = NULL;
3425     s->block_by_id = NULL;
3426     s->last_apos = 0;
3427     if (!(s->crecs = malloc(nrecs * sizeof(cram_record))))  goto err;
3428     s->cigar = NULL;
3429     s->cigar_alloc = 0;
3430     s->ncigar = 0;
3431 
3432     if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0)))       goto err;
3433     if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS)))   goto err;
3434     if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN)))   goto err;
3435     if (!(s->aux_blk  = cram_new_block(EXTERNAL, DS_aux)))  goto err;
3436     if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN)))   goto err;
3437     if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC)))   goto err;
3438 
3439     s->features = NULL;
3440     s->nfeatures = s->afeatures = 0;
3441 
3442 #ifndef TN_external
3443     s->TN = NULL;
3444     s->nTN = s->aTN = 0;
3445 #endif
3446 
3447     // Volatile keys as we do realloc in dstring
3448     if (!(s->pair_keys = string_pool_create(8192))) goto err;
3449     if (!(s->pair[0] = kh_init(m_s2i)))             goto err;
3450     if (!(s->pair[1] = kh_init(m_s2i)))             goto err;
3451 
3452 #ifdef BA_external
3453     s->BA_len = 0;
3454 #endif
3455 
3456     return s;
3457 
3458  err:
3459     if (s)
3460         cram_free_slice(s);
3461 
3462     return NULL;
3463 }
3464 
3465 /*
3466  * Loads an entire slice.
3467  * FIXME: In 1.0 the native unit of slices within CRAM is broken
3468  * as slices contain references to objects in other slices.
3469  * To work around this while keeping the slice oriented outer loop
3470  * we read all slices and stitch them together into a fake large
3471  * slice instead.
3472  *
3473  * Returns cram_slice ptr on success
3474  *         NULL on failure
3475  */
cram_read_slice(cram_fd * fd)3476 cram_slice *cram_read_slice(cram_fd *fd) {
3477     cram_block *b = cram_read_block(fd);
3478     cram_slice *s = calloc(1, sizeof(*s));
3479     int i, n, max_id, min_id;
3480 
3481     if (!b || !s)
3482         goto err;
3483 
3484     s->hdr_block = b;
3485     switch (b->content_type) {
3486     case MAPPED_SLICE:
3487     case UNMAPPED_SLICE:
3488         if (!(s->hdr = cram_decode_slice_header(fd, b)))
3489             goto err;
3490         break;
3491 
3492     default:
3493         hts_log_error("Unexpected block of type %s",
3494                       cram_content_type2str(b->content_type));
3495         goto err;
3496     }
3497 
3498     if (s->hdr->num_blocks < 1) {
3499         hts_log_error("Slice does not include any data blocks");
3500         goto err;
3501     }
3502 
3503     s->block = calloc(n = s->hdr->num_blocks, sizeof(*s->block));
3504     if (!s->block)
3505         goto err;
3506 
3507     for (max_id = i = 0, min_id = INT_MAX; i < n; i++) {
3508         if (!(s->block[i] = cram_read_block(fd)))
3509             goto err;
3510 
3511         if (s->block[i]->content_type == EXTERNAL) {
3512             if (max_id < s->block[i]->content_id)
3513                 max_id = s->block[i]->content_id;
3514             if (min_id > s->block[i]->content_id)
3515                 min_id = s->block[i]->content_id;
3516         }
3517     }
3518 
3519     if (!(s->block_by_id = calloc(512, sizeof(s->block[0]))))
3520         goto err;
3521 
3522     for (i = 0; i < n; i++) {
3523         if (s->block[i]->content_type != EXTERNAL)
3524             continue;
3525         int v = s->block[i]->content_id;
3526         if (v < 0 || v >= 256)
3527             v = 256 + (v > 0 ? v % 251 : (-v) % 251);
3528         s->block_by_id[v] = s->block[i];
3529     }
3530 
3531     /* Initialise encoding/decoding tables */
3532     s->cigar = NULL;
3533     s->cigar_alloc = 0;
3534     s->ncigar = 0;
3535 
3536     if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0)))      goto err;
3537     if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS)))  goto err;
3538     if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN)))  goto err;
3539     if (!(s->aux_blk  = cram_new_block(EXTERNAL, DS_aux))) goto err;
3540     if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN)))  goto err;
3541     if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC)))  goto err;
3542 
3543     s->crecs = NULL;
3544 
3545     s->last_apos = s->hdr->ref_seq_start;
3546     s->decode_md = fd->decode_md;
3547 
3548     return s;
3549 
3550  err:
3551     if (b)
3552         cram_free_block(b);
3553     if (s) {
3554         s->hdr_block = NULL;
3555         cram_free_slice(s);
3556     }
3557     return NULL;
3558 }
3559 
3560 
3561 /* ----------------------------------------------------------------------
3562  * CRAM file definition (header)
3563  */
3564 
3565 /*
3566  * Reads a CRAM file definition structure.
3567  * Returns file_def ptr on success
3568  *         NULL on failure
3569  */
cram_read_file_def(cram_fd * fd)3570 cram_file_def *cram_read_file_def(cram_fd *fd) {
3571     cram_file_def *def = malloc(sizeof(*def));
3572     if (!def)
3573         return NULL;
3574 
3575     if (26 != hread(fd->fp, &def->magic[0], 26)) {
3576         free(def);
3577         return NULL;
3578     }
3579 
3580     if (memcmp(def->magic, "CRAM", 4) != 0) {
3581         free(def);
3582         return NULL;
3583     }
3584 
3585     if (def->major_version > 3) {
3586         hts_log_error("CRAM version number mismatch. Expected 1.x, 2.x or 3.x, got %d.%d",
3587                       def->major_version, def->minor_version);
3588         free(def);
3589         return NULL;
3590     }
3591 
3592     fd->first_container += 26;
3593     fd->last_slice = 0;
3594 
3595     return def;
3596 }
3597 
3598 /*
3599  * Writes a cram_file_def structure to cram_fd.
3600  * Returns 0 on success
3601  *        -1 on failure
3602  */
cram_write_file_def(cram_fd * fd,cram_file_def * def)3603 int cram_write_file_def(cram_fd *fd, cram_file_def *def) {
3604     return (hwrite(fd->fp, &def->magic[0], 26) == 26) ? 0 : -1;
3605 }
3606 
cram_free_file_def(cram_file_def * def)3607 void cram_free_file_def(cram_file_def *def) {
3608     if (def) free(def);
3609 }
3610 
3611 /* ----------------------------------------------------------------------
3612  * SAM header I/O
3613  */
3614 
3615 
3616 /*
3617  * Reads the SAM header from the first CRAM data block.
3618  * Also performs minimal parsing to extract read-group
3619  * and sample information.
3620 
3621  * Returns SAM hdr ptr on success
3622  *         NULL on failure
3623  */
cram_read_SAM_hdr(cram_fd * fd)3624 SAM_hdr *cram_read_SAM_hdr(cram_fd *fd) {
3625     int32_t header_len;
3626     char *header;
3627     SAM_hdr *hdr;
3628 
3629     /* 1.1 onwards stores the header in the first block of a container */
3630     if (CRAM_MAJOR_VERS(fd->version) == 1) {
3631         /* Length */
3632         if (-1 == int32_decode(fd, &header_len))
3633             return NULL;
3634 
3635         /* Alloc and read */
3636         if (header_len < 0 || NULL == (header = malloc((size_t) header_len+1)))
3637             return NULL;
3638 
3639         if (header_len != hread(fd->fp, header, header_len))
3640             return NULL;
3641         header[header_len] = '\0';
3642 
3643         fd->first_container += 4 + header_len;
3644     } else {
3645         cram_container *c = cram_read_container(fd);
3646         cram_block *b;
3647         int i;
3648         int64_t len;
3649 
3650         if (!c)
3651             return NULL;
3652 
3653         fd->first_container += c->length + c->offset;
3654 
3655         if (c->num_blocks < 1) {
3656             cram_free_container(c);
3657             return NULL;
3658         }
3659 
3660         if (!(b = cram_read_block(fd))) {
3661             cram_free_container(c);
3662             return NULL;
3663         }
3664         if (cram_uncompress_block(b) != 0) {
3665             cram_free_container(c);
3666             cram_free_block(b);
3667             return NULL;
3668         }
3669 
3670         len = b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
3671             itf8_size(b->content_id) +
3672             itf8_size(b->uncomp_size) +
3673             itf8_size(b->comp_size);
3674 
3675         /* Extract header from 1st block */
3676         if (-1 == int32_get_blk(b, &header_len) ||
3677             header_len < 0 || /* Spec. says signed...  why? */
3678             b->uncomp_size - 4 < header_len) {
3679             cram_free_container(c);
3680             cram_free_block(b);
3681             return NULL;
3682         }
3683         if (NULL == (header = malloc((size_t) header_len+1))) {
3684             cram_free_container(c);
3685             cram_free_block(b);
3686             return NULL;
3687         }
3688         memcpy(header, BLOCK_END(b), header_len);
3689         header[header_len] = '\0';
3690         cram_free_block(b);
3691 
3692         /* Consume any remaining blocks */
3693         for (i = 1; i < c->num_blocks; i++) {
3694             if (!(b = cram_read_block(fd))) {
3695                 cram_free_container(c);
3696                 return NULL;
3697             }
3698             len += b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
3699                 itf8_size(b->content_id) +
3700                 itf8_size(b->uncomp_size) +
3701                 itf8_size(b->comp_size);
3702             cram_free_block(b);
3703         }
3704 
3705         if (c->length > 0 && len > 0 && c->length > len) {
3706             // Consume padding
3707             char *pads = malloc(c->length - len);
3708             if (!pads) {
3709                 cram_free_container(c);
3710                 return NULL;
3711             }
3712 
3713             if (c->length - len != hread(fd->fp, pads, c->length - len)) {
3714                 cram_free_container(c);
3715                 return NULL;
3716             }
3717             free(pads);
3718         }
3719 
3720         cram_free_container(c);
3721     }
3722 
3723     /* Parse */
3724     hdr = sam_hdr_parse_(header, header_len);
3725     free(header);
3726 
3727     return hdr;
3728 }
3729 
3730 /*
3731  * Converts 'in' to a full pathname to store in out.
3732  * Out must be at least PATH_MAX bytes long.
3733  */
full_path(char * out,char * in)3734 static void full_path(char *out, char *in) {
3735     size_t in_l = strlen(in);
3736     if (*in == '/' ||
3737         // Windows paths
3738         (in_l > 3 && toupper_c(*in) >= 'A'  && toupper_c(*in) <= 'Z' &&
3739          in[1] == ':' && (in[2] == '/' || in[2] == '\\'))) {
3740         strncpy(out, in, PATH_MAX);
3741         out[PATH_MAX-1] = 0;
3742     } else {
3743         int len;
3744 
3745         // unable to get dir or out+in is too long
3746         if (!getcwd(out, PATH_MAX) ||
3747             (len = strlen(out))+1+strlen(in) >= PATH_MAX) {
3748             strncpy(out, in, PATH_MAX);
3749             out[PATH_MAX-1] = 0;
3750             return;
3751         }
3752 
3753         sprintf(out+len, "/%.*s", PATH_MAX - len, in);
3754 
3755         // FIXME: cope with `pwd`/../../../foo.fa ?
3756     }
3757 }
3758 
3759 /*
3760  * Writes a CRAM SAM header.
3761  * Returns 0 on success
3762  *        -1 on failure
3763  */
cram_write_SAM_hdr(cram_fd * fd,SAM_hdr * hdr)3764 int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr) {
3765     int header_len;
3766     int blank_block = (CRAM_MAJOR_VERS(fd->version) >= 3);
3767 
3768     /* Write CRAM MAGIC if not yet written. */
3769     if (fd->file_def->major_version == 0) {
3770         fd->file_def->major_version = CRAM_MAJOR_VERS(fd->version);
3771         fd->file_def->minor_version = CRAM_MINOR_VERS(fd->version);
3772         if (0 != cram_write_file_def(fd, fd->file_def))
3773             return -1;
3774     }
3775 
3776     /* 1.0 requires an UNKNOWN read-group */
3777     if (CRAM_MAJOR_VERS(fd->version) == 1) {
3778         if (!sam_hdr_find_rg(hdr, "UNKNOWN"))
3779             if (sam_hdr_add(hdr, "RG",
3780                             "ID", "UNKNOWN", "SM", "UNKNOWN", NULL))
3781                 return -1;
3782     }
3783 
3784     /* Fix M5 strings */
3785     if (fd->refs && !fd->no_ref) {
3786         int i;
3787         for (i = 0; i < hdr->nref; i++) {
3788             SAM_hdr_type *ty;
3789             char *ref;
3790 
3791             if (!(ty = sam_hdr_find(hdr, "SQ", "SN", hdr->ref[i].name)))
3792                 return -1;
3793 
3794             if (!sam_hdr_find_key(hdr, ty, "M5", NULL)) {
3795                 char unsigned buf[16];
3796                 char buf2[33];
3797                 int rlen;
3798                 hts_md5_context *md5;
3799 
3800                 if (!fd->refs ||
3801                     !fd->refs->ref_id ||
3802                     !fd->refs->ref_id[i]) {
3803                     return -1;
3804                 }
3805                 rlen = fd->refs->ref_id[i]->length;
3806                 if (!(md5 = hts_md5_init()))
3807                     return -1;
3808                 ref = cram_get_ref(fd, i, 1, rlen);
3809                 if (NULL == ref) return -1;
3810                 rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */
3811                 hts_md5_update(md5, ref, rlen);
3812                 hts_md5_final(buf, md5);
3813                 hts_md5_destroy(md5);
3814                 cram_ref_decr(fd->refs, i);
3815 
3816                 hts_md5_hex(buf2, buf);
3817                 if (sam_hdr_update(hdr, ty, "M5", buf2, NULL))
3818                     return -1;
3819             }
3820 
3821             if (fd->ref_fn) {
3822                 char ref_fn[PATH_MAX];
3823                 full_path(ref_fn, fd->ref_fn);
3824                 if (sam_hdr_update(hdr, ty, "UR", ref_fn, NULL))
3825                     return -1;
3826             }
3827         }
3828     }
3829 
3830     if (sam_hdr_rebuild(hdr))
3831         return -1;
3832 
3833     /* Length */
3834     header_len = sam_hdr_length(hdr);
3835     if (CRAM_MAJOR_VERS(fd->version) == 1) {
3836         if (-1 == int32_encode(fd, header_len))
3837             return -1;
3838 
3839         /* Text data */
3840         if (header_len != hwrite(fd->fp, sam_hdr_str(hdr), header_len))
3841             return -1;
3842     } else {
3843         /* Create block(s) inside a container */
3844         cram_block *b = cram_new_block(FILE_HEADER, 0);
3845         cram_container *c = cram_new_container(0, 0);
3846         int padded_length;
3847         char *pads;
3848         int is_cram_3 = (CRAM_MAJOR_VERS(fd->version) >= 3);
3849 
3850         if (!b || !c) {
3851             if (b) cram_free_block(b);
3852             if (c) cram_free_container(c);
3853             return -1;
3854         }
3855 
3856         int32_put_blk(b, header_len);
3857         if (header_len)
3858             BLOCK_APPEND(b, sam_hdr_str(hdr), header_len);
3859         BLOCK_UPLEN(b);
3860 
3861         // Compress header block if V3.0 and above
3862         if (CRAM_MAJOR_VERS(fd->version) >= 3)
3863             cram_compress_block(fd, b, NULL, -1, -1);
3864 
3865         if (blank_block) {
3866             c->length = b->comp_size + 2 + 4*is_cram_3 +
3867                 itf8_size(b->content_id)   +
3868                 itf8_size(b->uncomp_size)  +
3869                 itf8_size(b->comp_size);
3870 
3871             c->num_blocks = 2;
3872             c->num_landmarks = 2;
3873             if (!(c->landmark = malloc(2*sizeof(*c->landmark)))) {
3874                 cram_free_block(b);
3875                 cram_free_container(c);
3876                 return -1;
3877             }
3878             c->landmark[0] = 0;
3879             c->landmark[1] = c->length;
3880 
3881             // Plus extra storage for uncompressed secondary blank block
3882             padded_length = MIN(c->length*.5, 10000);
3883             c->length += padded_length + 2 + 4*is_cram_3 +
3884                 itf8_size(b->content_id) +
3885                 itf8_size(padded_length)*2;
3886         } else {
3887             // Pad the block instead.
3888             c->num_blocks = 1;
3889             c->num_landmarks = 1;
3890             if (!(c->landmark = malloc(sizeof(*c->landmark))))
3891                 return -1;
3892             c->landmark[0] = 0;
3893 
3894             padded_length = MAX(c->length*1.5, 10000) - c->length;
3895 
3896             c->length = b->comp_size + padded_length +
3897                 2 + 4*is_cram_3 +
3898                 itf8_size(b->content_id)   +
3899                 itf8_size(b->uncomp_size)  +
3900                 itf8_size(b->comp_size);
3901 
3902             if (NULL == (pads = calloc(1, padded_length))) {
3903                 cram_free_block(b);
3904                 cram_free_container(c);
3905                 return -1;
3906             }
3907             BLOCK_APPEND(b, pads, padded_length);
3908             BLOCK_UPLEN(b);
3909             free(pads);
3910         }
3911 
3912         if (-1 == cram_write_container(fd, c)) {
3913             cram_free_block(b);
3914             cram_free_container(c);
3915             return -1;
3916         }
3917 
3918         if (-1 == cram_write_block(fd, b)) {
3919             cram_free_block(b);
3920             cram_free_container(c);
3921             return -1;
3922         }
3923 
3924         if (blank_block) {
3925             BLOCK_RESIZE(b, padded_length);
3926             memset(BLOCK_DATA(b), 0, padded_length);
3927             BLOCK_SIZE(b) = padded_length;
3928             BLOCK_UPLEN(b);
3929             b->method = RAW;
3930             if (-1 == cram_write_block(fd, b)) {
3931                 cram_free_block(b);
3932                 cram_free_container(c);
3933                 return -1;
3934             }
3935         }
3936 
3937         cram_free_block(b);
3938         cram_free_container(c);
3939     }
3940 
3941     if (-1 == refs_from_header(fd->refs, fd, fd->header))
3942         return -1;
3943     if (-1 == refs2id(fd->refs, fd->header))
3944         return -1;
3945 
3946     if (0 != hflush(fd->fp))
3947         return -1;
3948 
3949     RP("=== Finishing saving header ===\n");
3950 
3951     return 0;
3952 }
3953 
3954 /* ----------------------------------------------------------------------
3955  * The top-level cram opening, closing and option handling
3956  */
3957 
3958 /*
3959  * Initialises the lookup tables. These could be global statics, but they're
3960  * clumsy to setup in a multi-threaded environment unless we generate
3961  * verbatim code and include that.
3962  */
cram_init_tables(cram_fd * fd)3963 static void cram_init_tables(cram_fd *fd) {
3964     int i;
3965 
3966     memset(fd->L1, 4, 256);
3967     fd->L1['A'] = 0; fd->L1['a'] = 0;
3968     fd->L1['C'] = 1; fd->L1['c'] = 1;
3969     fd->L1['G'] = 2; fd->L1['g'] = 2;
3970     fd->L1['T'] = 3; fd->L1['t'] = 3;
3971 
3972     memset(fd->L2, 5, 256);
3973     fd->L2['A'] = 0; fd->L2['a'] = 0;
3974     fd->L2['C'] = 1; fd->L2['c'] = 1;
3975     fd->L2['G'] = 2; fd->L2['g'] = 2;
3976     fd->L2['T'] = 3; fd->L2['t'] = 3;
3977     fd->L2['N'] = 4; fd->L2['n'] = 4;
3978 
3979     if (CRAM_MAJOR_VERS(fd->version) == 1) {
3980         for (i = 0; i < 0x200; i++) {
3981             int f = 0;
3982 
3983             if (i & CRAM_FPAIRED)      f |= BAM_FPAIRED;
3984             if (i & CRAM_FPROPER_PAIR) f |= BAM_FPROPER_PAIR;
3985             if (i & CRAM_FUNMAP)       f |= BAM_FUNMAP;
3986             if (i & CRAM_FREVERSE)     f |= BAM_FREVERSE;
3987             if (i & CRAM_FREAD1)       f |= BAM_FREAD1;
3988             if (i & CRAM_FREAD2)       f |= BAM_FREAD2;
3989             if (i & CRAM_FSECONDARY)   f |= BAM_FSECONDARY;
3990             if (i & CRAM_FQCFAIL)      f |= BAM_FQCFAIL;
3991             if (i & CRAM_FDUP)         f |= BAM_FDUP;
3992 
3993             fd->bam_flag_swap[i]  = f;
3994         }
3995 
3996         for (i = 0; i < 0x1000; i++) {
3997             int g = 0;
3998 
3999             if (i & BAM_FPAIRED)           g |= CRAM_FPAIRED;
4000             if (i & BAM_FPROPER_PAIR)  g |= CRAM_FPROPER_PAIR;
4001             if (i & BAM_FUNMAP)        g |= CRAM_FUNMAP;
4002             if (i & BAM_FREVERSE)      g |= CRAM_FREVERSE;
4003             if (i & BAM_FREAD1)        g |= CRAM_FREAD1;
4004             if (i & BAM_FREAD2)        g |= CRAM_FREAD2;
4005             if (i & BAM_FSECONDARY)    g |= CRAM_FSECONDARY;
4006             if (i & BAM_FQCFAIL)       g |= CRAM_FQCFAIL;
4007             if (i & BAM_FDUP)          g |= CRAM_FDUP;
4008 
4009             fd->cram_flag_swap[i] = g;
4010         }
4011     } else {
4012         /* NOP */
4013         for (i = 0; i < 0x1000; i++)
4014             fd->bam_flag_swap[i] = i;
4015         for (i = 0; i < 0x1000; i++)
4016             fd->cram_flag_swap[i] = i;
4017     }
4018 
4019     memset(fd->cram_sub_matrix, 4, 32*32);
4020     for (i = 0; i < 32; i++) {
4021         fd->cram_sub_matrix[i]['A'&0x1f]=0;
4022         fd->cram_sub_matrix[i]['C'&0x1f]=1;
4023         fd->cram_sub_matrix[i]['G'&0x1f]=2;
4024         fd->cram_sub_matrix[i]['T'&0x1f]=3;
4025         fd->cram_sub_matrix[i]['N'&0x1f]=4;
4026     }
4027     for (i = 0; i < 20; i+=4) {
4028         int j;
4029         for (j = 0; j < 20; j++) {
4030             fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
4031             fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
4032             fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
4033             fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
4034         }
4035         fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+0]&0x1f]=0;
4036         fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+1]&0x1f]=1;
4037         fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+2]&0x1f]=2;
4038         fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+3]&0x1f]=3;
4039     }
4040 }
4041 
4042 // Default version numbers for CRAM
4043 static int major_version = 3;
4044 static int minor_version = 0;
4045 
4046 /*
4047  * Opens a CRAM file for read (mode "rb") or write ("wb").
4048  * The filename may be "-" to indicate stdin or stdout.
4049  *
4050  * Returns file handle on success
4051  *         NULL on failure.
4052  */
cram_open(const char * filename,const char * mode)4053 cram_fd *cram_open(const char *filename, const char *mode) {
4054     hFILE *fp;
4055     cram_fd *fd;
4056     char fmode[3]= { mode[0], '\0', '\0' };
4057 
4058     if (strlen(mode) > 1 && (mode[1] == 'b' || mode[1] == 'c')) {
4059         fmode[1] = 'b';
4060     }
4061 
4062     fp = hopen(filename, fmode);
4063     if (!fp)
4064         return NULL;
4065 
4066     fd = cram_dopen(fp, filename, mode);
4067     if (!fd)
4068         hclose_abruptly(fp);
4069 
4070     return fd;
4071 }
4072 
4073 /* Opens an existing stream for reading or writing.
4074  *
4075  * Returns file handle on success;
4076  *         NULL on failure.
4077  */
cram_dopen(hFILE * fp,const char * filename,const char * mode)4078 cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) {
4079     int i;
4080     char *cp;
4081     cram_fd *fd = calloc(1, sizeof(*fd));
4082     if (!fd)
4083         return NULL;
4084 
4085     fd->level = 5;
4086     for (i = 0; mode[i]; i++) {
4087         if (mode[i] >= '0' && mode[i] <= '9') {
4088             fd->level = mode[i] - '0';
4089             break;
4090         }
4091     }
4092 
4093     fd->fp = fp;
4094     fd->mode = *mode;
4095     fd->first_container = 0;
4096 
4097     if (fd->mode == 'r') {
4098         /* Reader */
4099 
4100         if (!(fd->file_def = cram_read_file_def(fd)))
4101             goto err;
4102 
4103         fd->version = fd->file_def->major_version * 256 +
4104             fd->file_def->minor_version;
4105 
4106         if (!(fd->header = cram_read_SAM_hdr(fd))) {
4107             cram_free_file_def(fd->file_def);
4108             goto err;
4109         }
4110 
4111     } else {
4112         /* Writer */
4113         cram_file_def *def = calloc(1, sizeof(*def));
4114         if (!def)
4115             return NULL;
4116 
4117         fd->file_def = def;
4118 
4119         def->magic[0] = 'C';
4120         def->magic[1] = 'R';
4121         def->magic[2] = 'A';
4122         def->magic[3] = 'M';
4123         def->major_version = 0; // Indicator to write file def later.
4124         def->minor_version = 0;
4125         memset(def->file_id, 0, 20);
4126         strncpy(def->file_id, filename, 20);
4127 
4128         fd->version = major_version * 256 + minor_version;
4129 
4130         /* SAM header written later along with this file_def */
4131     }
4132 
4133     cram_init_tables(fd);
4134 
4135     fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename);
4136     if (!fd->prefix)
4137         goto err;
4138     fd->first_base = fd->last_base = -1;
4139     fd->record_counter = 0;
4140 
4141     fd->ctr = NULL;
4142     fd->ctr_mt = NULL;
4143     fd->refs  = refs_create();
4144     if (!fd->refs)
4145         goto err;
4146     fd->ref_id = -2;
4147     fd->ref = NULL;
4148 
4149     fd->decode_md = 0;
4150     fd->seqs_per_slice = SEQS_PER_SLICE;
4151     fd->bases_per_slice = BASES_PER_SLICE;
4152     fd->slices_per_container = SLICE_PER_CNT;
4153     fd->embed_ref = 0;
4154     fd->no_ref = 0;
4155     fd->ignore_md5 = 0;
4156     fd->lossy_read_names = 0;
4157     fd->use_bz2 = 0;
4158     fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3);
4159     fd->use_lzma = 0;
4160     fd->multi_seq = -1;
4161     fd->unsorted   = 0;
4162     fd->shared_ref = 0;
4163     fd->store_md = 0;
4164     fd->store_nm = 0;
4165 
4166     fd->index       = NULL;
4167     fd->own_pool    = 0;
4168     fd->pool        = NULL;
4169     fd->rqueue      = NULL;
4170     fd->job_pending = NULL;
4171     fd->ooc         = 0;
4172     fd->required_fields = INT_MAX;
4173 
4174     for (i = 0; i < DS_END; i++)
4175         fd->m[i] = cram_new_metrics();
4176 
4177     if (!(fd->tags_used = kh_init(m_metrics)))
4178         goto err;
4179 
4180     fd->range.refid = -2; // no ref.
4181     fd->eof = 1;          // See samtools issue #150
4182     fd->ref_fn = NULL;
4183 
4184     fd->bl = NULL;
4185 
4186     /* Initialise dummy refs from the @SQ headers */
4187     if (-1 == refs_from_header(fd->refs, fd, fd->header))
4188         goto err;
4189 
4190     return fd;
4191 
4192  err:
4193     if (fd)
4194         free(fd);
4195 
4196     return NULL;
4197 }
4198 
4199 /*
4200  * Seek within a CRAM file.
4201  *
4202  * Returns 0 on success
4203  *        -1 on failure
4204  */
cram_seek(cram_fd * fd,off_t offset,int whence)4205 int cram_seek(cram_fd *fd, off_t offset, int whence) {
4206     char buf[65536];
4207 
4208     fd->ooc = 0;
4209 
4210     cram_drain_rqueue(fd);
4211 
4212     if (hseek(fd->fp, offset, whence) >= 0) {
4213         return 0;
4214     }
4215 
4216     if (!(whence == SEEK_CUR && offset >= 0))
4217         return -1;
4218 
4219     /* Couldn't fseek, but we're in SEEK_CUR mode so read instead */
4220     while (offset > 0) {
4221         int len = MIN(65536, offset);
4222         if (len != hread(fd->fp, buf, len))
4223             return -1;
4224         offset -= len;
4225     }
4226 
4227     return 0;
4228 }
4229 
4230 /*
4231  * Flushes a CRAM file.
4232  * Useful for when writing to stdout without wishing to close the stream.
4233  *
4234  * Returns 0 on success
4235  *        -1 on failure
4236  */
cram_flush(cram_fd * fd)4237 int cram_flush(cram_fd *fd) {
4238     if (!fd)
4239         return -1;
4240 
4241     if (fd->mode == 'w' && fd->ctr) {
4242         if(fd->ctr->slice)
4243             cram_update_curr_slice(fd->ctr);
4244 
4245         if (-1 == cram_flush_container_mt(fd, fd->ctr))
4246             return -1;
4247     }
4248 
4249     return 0;
4250 }
4251 
4252 /*
4253  * Closes a CRAM file.
4254  * Returns 0 on success
4255  *        -1 on failure
4256  */
cram_close(cram_fd * fd)4257 int cram_close(cram_fd *fd) {
4258     spare_bams *bl, *next;
4259     int i;
4260 
4261     if (!fd)
4262         return -1;
4263 
4264     if (fd->mode == 'w' && fd->ctr) {
4265         if(fd->ctr->slice)
4266             cram_update_curr_slice(fd->ctr);
4267 
4268         if (-1 == cram_flush_container_mt(fd, fd->ctr))
4269             return -1;
4270     }
4271 
4272     if (fd->mode != 'w')
4273         cram_drain_rqueue(fd);
4274 
4275     if (fd->pool && fd->eof >= 0) {
4276         hts_tpool_process_flush(fd->rqueue);
4277 
4278         if (0 != cram_flush_result(fd))
4279             return -1;
4280 
4281         if (fd->mode == 'w')
4282             fd->ctr = NULL; // prevent double freeing
4283 
4284         pthread_mutex_destroy(&fd->metrics_lock);
4285         pthread_mutex_destroy(&fd->ref_lock);
4286         pthread_mutex_destroy(&fd->bam_list_lock);
4287 
4288         //fprintf(stderr, "CRAM: destroy queue %p\n", fd->rqueue);
4289 
4290         hts_tpool_process_destroy(fd->rqueue);
4291     }
4292 
4293     if (fd->mode == 'w') {
4294         /* Write EOF block */
4295         if (CRAM_MAJOR_VERS(fd->version) == 3) {
4296             if (38 != hwrite(fd->fp,
4297                              "\x0f\x00\x00\x00\xff\xff\xff\xff" // Cont HDR
4298                              "\x0f\xe0\x45\x4f\x46\x00\x00\x00" // Cont HDR
4299                              "\x00\x01\x00"                     // Cont HDR
4300                              "\x05\xbd\xd9\x4f"                 // CRC32
4301                              "\x00\x01\x00\x06\x06"             // Comp.HDR blk
4302                              "\x01\x00\x01\x00\x01\x00"         // Comp.HDR blk
4303                              "\xee\x63\x01\x4b",                // CRC32
4304                              38))
4305                 return -1;
4306         } else {
4307             if (30 != hwrite(fd->fp,
4308                              "\x0b\x00\x00\x00\xff\xff\xff\xff"
4309                              "\x0f\xe0\x45\x4f\x46\x00\x00\x00"
4310                              "\x00\x01\x00\x00\x01\x00\x06\x06"
4311                              "\x01\x00\x01\x00\x01\x00", 30))
4312                 return -1;
4313         }
4314     }
4315 
4316     for (bl = fd->bl; bl; bl = next) {
4317         int i, max_rec = fd->seqs_per_slice * fd->slices_per_container;
4318 
4319         next = bl->next;
4320         for (i = 0; i < max_rec; i++) {
4321             if (bl->bams[i])
4322                 bam_free(bl->bams[i]);
4323         }
4324         free(bl->bams);
4325         free(bl);
4326     }
4327 
4328     if (hclose(fd->fp) != 0)
4329         return -1;
4330 
4331     if (fd->file_def)
4332         cram_free_file_def(fd->file_def);
4333 
4334     if (fd->header)
4335         sam_hdr_free(fd->header);
4336 
4337     free(fd->prefix);
4338 
4339     if (fd->ctr)
4340         cram_free_container(fd->ctr);
4341 
4342     if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
4343         cram_free_container(fd->ctr_mt);
4344 
4345     if (fd->refs)
4346         refs_free(fd->refs);
4347     if (fd->ref_free)
4348         free(fd->ref_free);
4349 
4350     for (i = 0; i < DS_END; i++)
4351         if (fd->m[i])
4352             free(fd->m[i]);
4353 
4354     if (fd->tags_used) {
4355         khint_t k;
4356 
4357         for (k = kh_begin(fd->tags_used); k != kh_end(fd->tags_used); k++) {
4358             if (kh_exist(fd->tags_used, k))
4359                 free(kh_val(fd->tags_used, k));
4360         }
4361 
4362         kh_destroy(m_metrics, fd->tags_used);
4363     }
4364 
4365     if (fd->index)
4366         cram_index_free(fd);
4367 
4368     if (fd->own_pool && fd->pool)
4369         hts_tpool_destroy(fd->pool);
4370 
4371     free(fd);
4372     return 0;
4373 }
4374 
4375 /*
4376  * Returns 1 if we hit an EOF while reading.
4377  */
cram_eof(cram_fd * fd)4378 int cram_eof(cram_fd *fd) {
4379     return fd->eof;
4380 }
4381 
4382 
4383 /*
4384  * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
4385  * Use this immediately after opening.
4386  *
4387  * Returns 0 on success
4388  *        -1 on failure
4389  */
cram_set_option(cram_fd * fd,enum hts_fmt_option opt,...)4390 int cram_set_option(cram_fd *fd, enum hts_fmt_option opt, ...) {
4391     int r;
4392     va_list args;
4393 
4394     va_start(args, opt);
4395     r = cram_set_voption(fd, opt, args);
4396     va_end(args);
4397 
4398     return r;
4399 }
4400 
4401 /*
4402  * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
4403  * Use this immediately after opening.
4404  *
4405  * Returns 0 on success
4406  *        -1 on failure
4407  */
cram_set_voption(cram_fd * fd,enum hts_fmt_option opt,va_list args)4408 int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) {
4409     refs_t *refs;
4410 
4411     if (!fd) {
4412         errno = EBADF;
4413         return -1;
4414     }
4415 
4416     switch (opt) {
4417     case CRAM_OPT_DECODE_MD:
4418         fd->decode_md = va_arg(args, int);
4419         break;
4420 
4421     case CRAM_OPT_PREFIX:
4422         if (fd->prefix)
4423             free(fd->prefix);
4424         if (!(fd->prefix = strdup(va_arg(args, char *))))
4425             return -1;
4426         break;
4427 
4428     case CRAM_OPT_VERBOSITY:
4429         break;
4430 
4431     case CRAM_OPT_SEQS_PER_SLICE:
4432         fd->seqs_per_slice = va_arg(args, int);
4433         break;
4434 
4435     case CRAM_OPT_BASES_PER_SLICE:
4436         fd->bases_per_slice = va_arg(args, int);
4437         break;
4438 
4439     case CRAM_OPT_SLICES_PER_CONTAINER:
4440         fd->slices_per_container = va_arg(args, int);
4441         break;
4442 
4443     case CRAM_OPT_EMBED_REF:
4444         fd->embed_ref = va_arg(args, int);
4445         break;
4446 
4447     case CRAM_OPT_NO_REF:
4448         fd->no_ref = va_arg(args, int);
4449         break;
4450 
4451     case CRAM_OPT_IGNORE_MD5:
4452         fd->ignore_md5 = va_arg(args, int);
4453         break;
4454 
4455     case CRAM_OPT_LOSSY_NAMES:
4456         fd->lossy_read_names = va_arg(args, int);
4457         // Currently lossy read names required paired (attached) reads.
4458         // TLEN 0 or being 1 out causes read pairs to be detached, breaking
4459         // the lossy read name compression, so we have extra options to
4460         // slacken the exact TLEN round-trip checks.
4461         fd->tlen_approx = fd->lossy_read_names;
4462         fd->tlen_zero = fd->lossy_read_names;
4463         break;
4464 
4465     case CRAM_OPT_USE_BZIP2:
4466         fd->use_bz2 = va_arg(args, int);
4467         break;
4468 
4469     case CRAM_OPT_USE_RANS:
4470         fd->use_rans = va_arg(args, int);
4471         break;
4472 
4473     case CRAM_OPT_USE_LZMA:
4474         fd->use_lzma = va_arg(args, int);
4475         break;
4476 
4477     case CRAM_OPT_SHARED_REF:
4478         fd->shared_ref = 1;
4479         refs = va_arg(args, refs_t *);
4480         if (refs != fd->refs) {
4481             if (fd->refs)
4482                 refs_free(fd->refs);
4483             fd->refs = refs;
4484             fd->refs->count++;
4485         }
4486         break;
4487 
4488     case CRAM_OPT_RANGE: {
4489         int r = cram_seek_to_refpos(fd, va_arg(args, cram_range *));
4490         pthread_mutex_lock(&fd->range_lock);
4491         if (fd->range.refid != -2)
4492             fd->required_fields |= SAM_POS;
4493         pthread_mutex_unlock(&fd->range_lock);
4494         return r;
4495     }
4496 
4497     case CRAM_OPT_REFERENCE:
4498         return cram_load_reference(fd, va_arg(args, char *));
4499 
4500     case CRAM_OPT_VERSION: {
4501         int major, minor;
4502         char *s = va_arg(args, char *);
4503         if (2 != sscanf(s, "%d.%d", &major, &minor)) {
4504             hts_log_error("Malformed version string %s", s);
4505             return -1;
4506         }
4507         if (!((major == 1 &&  minor == 0) ||
4508               (major == 2 && (minor == 0 || minor == 1)) ||
4509               (major == 3 &&  minor == 0))) {
4510             hts_log_error("Unknown version string; use 1.0, 2.0, 2.1 or 3.0");
4511             errno = EINVAL;
4512             return -1;
4513         }
4514         fd->version = major*256 + minor;
4515 
4516         if (CRAM_MAJOR_VERS(fd->version) >= 3)
4517             fd->use_rans = 1;
4518         break;
4519     }
4520 
4521     case CRAM_OPT_MULTI_SEQ_PER_SLICE:
4522         fd->multi_seq = va_arg(args, int);
4523         break;
4524 
4525     case CRAM_OPT_NTHREADS: {
4526         int nthreads =  va_arg(args, int);
4527         if (nthreads >= 1) {
4528             if (!(fd->pool = hts_tpool_init(nthreads)))
4529                 return -1;
4530 
4531             fd->rqueue = hts_tpool_process_init(fd->pool, nthreads*2, 0);
4532             pthread_mutex_init(&fd->metrics_lock, NULL);
4533             pthread_mutex_init(&fd->ref_lock, NULL);
4534             pthread_mutex_init(&fd->range_lock, NULL);
4535             pthread_mutex_init(&fd->bam_list_lock, NULL);
4536             fd->shared_ref = 1;
4537             fd->own_pool = 1;
4538         }
4539         break;
4540     }
4541 
4542     case CRAM_OPT_THREAD_POOL: {
4543         htsThreadPool *p = va_arg(args, htsThreadPool *);
4544         fd->pool = p ? p->pool : NULL;
4545         if (fd->pool) {
4546             fd->rqueue = hts_tpool_process_init(fd->pool,
4547                                                 p->qsize ? p->qsize : hts_tpool_size(fd->pool)*2,
4548                                                 0);
4549             pthread_mutex_init(&fd->metrics_lock, NULL);
4550             pthread_mutex_init(&fd->ref_lock, NULL);
4551             pthread_mutex_init(&fd->range_lock, NULL);
4552             pthread_mutex_init(&fd->bam_list_lock, NULL);
4553         }
4554         fd->shared_ref = 1; // Needed to avoid clobbering ref between threads
4555         fd->own_pool = 0;
4556 
4557         //fd->qsize = 1;
4558         //fd->decoded = calloc(fd->qsize, sizeof(cram_container *));
4559         //hts_tpool_dispatch(fd->pool, cram_decoder_thread, fd);
4560         break;
4561     }
4562 
4563     case CRAM_OPT_REQUIRED_FIELDS:
4564         fd->required_fields = va_arg(args, int);
4565         if (fd->range.refid != -2)
4566             fd->required_fields |= SAM_POS;
4567         break;
4568 
4569     case CRAM_OPT_STORE_MD:
4570         fd->store_md = va_arg(args, int);
4571         break;
4572 
4573     case CRAM_OPT_STORE_NM:
4574         fd->store_nm = va_arg(args, int);
4575         break;
4576 
4577     case HTS_OPT_COMPRESSION_LEVEL:
4578         fd->level = va_arg(args, int);
4579         break;
4580 
4581     default:
4582         hts_log_error("Unknown CRAM option code %d", opt);
4583         errno = EINVAL;
4584         return -1;
4585     }
4586 
4587     return 0;
4588 }
4589 
cram_check_EOF(cram_fd * fd)4590 int cram_check_EOF(cram_fd *fd)
4591 {
4592     // Byte 9 in these templates is & with 0x0f to resolve differences
4593     // between ITF-8 interpretations between early Java and C
4594     // implementations of CRAM
4595     static const unsigned char TEMPLATE_2_1[30] = {
4596         0x0b, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
4597         0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
4598         0x01, 0x00, 0x06, 0x06, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00
4599     };
4600     static const unsigned char TEMPLATE_3[38] = {
4601         0x0f, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
4602         0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x05,
4603         0xbd, 0xd9, 0x4f, 0x00, 0x01, 0x00, 0x06, 0x06, 0x01, 0x00,
4604         0x01, 0x00, 0x01, 0x00, 0xee, 0x63, 0x01, 0x4b
4605     };
4606 
4607     unsigned char buf[38]; // max(sizeof TEMPLATE_*)
4608 
4609     uint8_t major = CRAM_MAJOR_VERS(fd->version);
4610     uint8_t minor = CRAM_MINOR_VERS(fd->version);
4611 
4612     const unsigned char *template;
4613     ssize_t template_len;
4614     if ((major < 2) ||
4615         (major == 2 && minor == 0)) {
4616         return 3; // No EOF support in cram versions less than 2.1
4617     } else if (major == 2 && minor == 1) {
4618         template = TEMPLATE_2_1;
4619         template_len = sizeof TEMPLATE_2_1;
4620     } else {
4621         template = TEMPLATE_3;
4622         template_len = sizeof TEMPLATE_3;
4623     }
4624 
4625     off_t offset = htell(fd->fp);
4626     if (hseek(fd->fp, -template_len, SEEK_END) < 0) {
4627         if (errno == ESPIPE) {
4628             hclearerr(fd->fp);
4629             return 2;
4630         }
4631         else {
4632             return -1;
4633         }
4634     }
4635     if (hread(fd->fp, buf, template_len) != template_len) return -1;
4636     if (hseek(fd->fp, offset, SEEK_SET) < 0) return -1;
4637     buf[8] &= 0x0f;
4638     return (memcmp(template, buf, template_len) == 0)? 1 : 0;
4639 }
4640