1 /*
2 Copyright (c) 2012-2016 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 
8    1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10 
11    2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14 
15    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18 
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30 
31 /*
32  * CRAM I/O primitives.
33  *
34  * - ITF8 encoding and decoding.
35  * - Block based I/O
36  * - Zlib inflating and deflating (memory)
37  * - CRAM basic data structure reading and writing
38  * - File opening / closing
39  * - Reference sequence handling
40  */
41 
42 /*
43  * TODO: BLOCK_GROW, BLOCK_RESIZE, BLOCK_APPEND and itf8_put_blk all need
44  * a way to return errors for when malloc fails.
45  */
46 
47 #include <config.h>
48 
49 #include <stdio.h>
50 #include <errno.h>
51 #include <assert.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include <unistd.h>
55 #include <zlib.h>
56 #ifdef HAVE_LIBBZ2
57 #include <bzlib.h>
58 #endif
59 #ifdef HAVE_LIBLZMA
60 #ifdef HAVE_LZMA_H
61 #include <lzma.h>
62 #else
63 #include "os/lzma_stub.h"
64 #endif
65 #endif
66 #include <sys/types.h>
67 #include <sys/stat.h>
68 #include <math.h>
69 #include <time.h>
70 #include <stdint.h>
71 
72 #include "cram/cram.h"
73 #include "cram/os.h"
74 #include "htslib/hts.h"
75 #include "cram/open_trace_file.h"
76 #include "cram/rANS_static.h"
77 
78 //#define REF_DEBUG
79 
80 #ifdef REF_DEBUG
81 #include <sys/syscall.h>
82 #define gettid() (int)syscall(SYS_gettid)
83 
84 #define RP(...) fprintf (stderr, __VA_ARGS__)
85 #else
86 #define RP(...)
87 #endif
88 
89 #include "htslib/hfile.h"
90 #include "htslib/bgzf.h"
91 #include "htslib/faidx.h"
92 #include "hts_internal.h"
93 
94 #ifndef PATH_MAX
95 #define PATH_MAX FILENAME_MAX
96 #endif
97 
98 #define TRIAL_SPAN 50
99 #define NTRIALS 3
100 
101 
102 /* ----------------------------------------------------------------------
103  * ITF8 encoding and decoding.
104  *
105 * Also see the itf8_get and itf8_put macros in cram_io.h
106  */
107 
108 /*
109  * LEGACY: consider using itf8_decode_crc.
110  *
111  * Reads an integer in ITF-8 encoding from 'cp' and stores it in
112  * *val.
113  *
114  * Returns the number of bytes read on success
115  *        -1 on failure
116  */
itf8_decode(cram_fd * fd,int32_t * val_p)117 int itf8_decode(cram_fd *fd, int32_t *val_p) {
118     static int nbytes[16] = {
119 	0,0,0,0, 0,0,0,0,                               // 0000xxxx - 0111xxxx
120 	1,1,1,1,                                        // 1000xxxx - 1011xxxx
121 	2,2,                                            // 1100xxxx - 1101xxxx
122 	3,                                              // 1110xxxx
123 	4,                                              // 1111xxxx
124     };
125 
126     static int nbits[16] = {
127 	0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
128 	0x3f, 0x3f, 0x3f, 0x3f,                         // 1000xxxx - 1011xxxx
129 	0x1f, 0x1f,                                     // 1100xxxx - 1101xxxx
130 	0x0f,                                           // 1110xxxx
131 	0x0f,                                           // 1111xxxx
132     };
133 
134     int32_t val = hgetc(fd->fp);
135     if (val == -1)
136 	return -1;
137 
138     int i = nbytes[val>>4];
139     val &= nbits[val>>4];
140 
141     switch(i) {
142     case 0:
143 	*val_p = val;
144 	return 1;
145 
146     case 1:
147 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
148 	*val_p = val;
149 	return 2;
150 
151     case 2:
152 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
153 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
154 	*val_p = val;
155 	return 3;
156 
157     case 3:
158 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
159 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
160 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
161 	*val_p = val;
162 	return 4;
163 
164     case 4: // really 3.5 more, why make it different?
165 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
166 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
167 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
168 	val = (val<<4) | (((unsigned char)hgetc(fd->fp)) & 0x0f);
169 	*val_p = val;
170     }
171 
172     return 5;
173 }
174 
itf8_decode_crc(cram_fd * fd,int32_t * val_p,uint32_t * crc)175 int itf8_decode_crc(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
176     static int nbytes[16] = {
177 	0,0,0,0, 0,0,0,0,                               // 0000xxxx - 0111xxxx
178 	1,1,1,1,                                        // 1000xxxx - 1011xxxx
179 	2,2,                                            // 1100xxxx - 1101xxxx
180 	3,                                              // 1110xxxx
181 	4,                                              // 1111xxxx
182     };
183 
184     static int nbits[16] = {
185 	0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
186 	0x3f, 0x3f, 0x3f, 0x3f,                         // 1000xxxx - 1011xxxx
187 	0x1f, 0x1f,                                     // 1100xxxx - 1101xxxx
188 	0x0f,                                           // 1110xxxx
189 	0x0f,                                           // 1111xxxx
190     };
191     unsigned char c[5];
192 
193     int32_t val = hgetc(fd->fp);
194     if (val == -1)
195 	return -1;
196 
197     c[0]=val;
198 
199     int i = nbytes[val>>4];
200     val &= nbits[val>>4];
201 
202     switch(i) {
203     case 0:
204 	*val_p = val;
205 	*crc = crc32(*crc, c, 1);
206 	return 1;
207 
208     case 1:
209 	val = (val<<8) | (c[1]=hgetc(fd->fp));
210 	*val_p = val;
211 	*crc = crc32(*crc, c, 2);
212 	return 2;
213 
214     case 2:
215 	val = (val<<8) | (c[1]=hgetc(fd->fp));
216 	val = (val<<8) | (c[2]=hgetc(fd->fp));
217 	*val_p = val;
218 	*crc = crc32(*crc, c, 3);
219 	return 3;
220 
221     case 3:
222 	val = (val<<8) | (c[1]=hgetc(fd->fp));
223 	val = (val<<8) | (c[2]=hgetc(fd->fp));
224 	val = (val<<8) | (c[3]=hgetc(fd->fp));
225 	*val_p = val;
226 	*crc = crc32(*crc, c, 4);
227 	return 4;
228 
229     case 4: // really 3.5 more, why make it different?
230         {
231             uint32_t uv = val;
232             uv = (uv<<8) |   (c[1]=hgetc(fd->fp));
233             uv = (uv<<8) |   (c[2]=hgetc(fd->fp));
234             uv = (uv<<8) |   (c[3]=hgetc(fd->fp));
235             uv = (uv<<4) | (((c[4]=hgetc(fd->fp))) & 0x0f);
236             *val_p = uv < 0x80000000UL ? uv : -((int32_t) (0xffffffffUL - uv)) - 1;
237             *crc = crc32(*crc, c, 5);
238         }
239     }
240 
241     return 5;
242 }
243 
244 /*
245  * Encodes and writes a single integer in ITF-8 format.
246  * Returns 0 on success
247  *        -1 on failure
248  */
itf8_encode(cram_fd * fd,int32_t val)249 int itf8_encode(cram_fd *fd, int32_t val) {
250     char buf[5];
251     int len = itf8_put(buf, val);
252     return hwrite(fd->fp, buf, len) == len ? 0 : -1;
253 }
254 
255 const int itf8_bytes[16] = {
256     1, 1, 1, 1,  1, 1, 1, 1,
257     2, 2, 2, 2,  3, 3, 4, 5
258 };
259 
260 const int ltf8_bytes[256] = {
261     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
262     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
263     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
264     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
265 
266     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
267     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
268     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
269     1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
270 
271     2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
272     2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
273     2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
274     2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
275 
276     3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,
277     3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,
278 
279     4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
280 
281     5, 5, 5, 5,  5, 5, 5, 5,  6, 6, 6, 6,  7, 7, 8, 9
282 };
283 
284 /*
285  * LEGACY: consider using ltf8_decode_crc.
286  */
ltf8_decode(cram_fd * fd,int64_t * val_p)287 int ltf8_decode(cram_fd *fd, int64_t *val_p) {
288     int c = hgetc(fd->fp);
289     int64_t val = (unsigned char)c;
290     if (c == -1)
291 	return -1;
292 
293     if (val < 0x80) {
294 	*val_p =   val;
295 	return 1;
296 
297     } else if (val < 0xc0) {
298 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
299 	*val_p = val & (((1LL<<(6+8)))-1);
300 	return 2;
301 
302     } else if (val < 0xe0) {
303 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
304 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
305 	*val_p = val & ((1LL<<(5+2*8))-1);
306 	return 3;
307 
308     } else if (val < 0xf0) {
309 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
310 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
311 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
312 	*val_p = val & ((1LL<<(4+3*8))-1);
313 	return 4;
314 
315     } else if (val < 0xf8) {
316 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
317 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
318 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
319 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
320 	*val_p = val & ((1LL<<(3+4*8))-1);
321 	return 5;
322 
323     } else if (val < 0xfc) {
324 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
325 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
326 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
327 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
328 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
329 	*val_p = val & ((1LL<<(2+5*8))-1);
330 	return 6;
331 
332     } else if (val < 0xfe) {
333 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
334 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
335 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
336 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
337 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
338 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
339 	*val_p = val & ((1LL<<(1+6*8))-1);
340 	return 7;
341 
342     } else if (val < 0xff) {
343 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
344 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
345 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
346 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
347 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
348 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
349 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
350 	*val_p = val & ((1LL<<(7*8))-1);
351 	return 8;
352 
353     } else {
354 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
355 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
356 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
357 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
358 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
359 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
360 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
361 	val = (val<<8) | (unsigned char)hgetc(fd->fp);
362 	*val_p = val;
363     }
364 
365     return 9;
366 }
367 
ltf8_decode_crc(cram_fd * fd,int64_t * val_p,uint32_t * crc)368 int ltf8_decode_crc(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
369     unsigned char c[9];
370     int64_t val = (unsigned char)hgetc(fd->fp);
371     if (val == -1)
372 	return -1;
373 
374     c[0] = val;
375 
376     if (val < 0x80) {
377 	*val_p =   val;
378 	*crc = crc32(*crc, c, 1);
379 	return 1;
380 
381     } else if (val < 0xc0) {
382 	val = (val<<8) | (c[1]=hgetc(fd->fp));;
383 	*val_p = val & (((1LL<<(6+8)))-1);
384 	*crc = crc32(*crc, c, 2);
385 	return 2;
386 
387     } else if (val < 0xe0) {
388 	val = (val<<8) | (c[1]=hgetc(fd->fp));;
389 	val = (val<<8) | (c[2]=hgetc(fd->fp));;
390 	*val_p = val & ((1LL<<(5+2*8))-1);
391 	*crc = crc32(*crc, c, 3);
392 	return 3;
393 
394     } else if (val < 0xf0) {
395 	val = (val<<8) | (c[1]=hgetc(fd->fp));;
396 	val = (val<<8) | (c[2]=hgetc(fd->fp));;
397 	val = (val<<8) | (c[3]=hgetc(fd->fp));;
398 	*val_p = val & ((1LL<<(4+3*8))-1);
399 	*crc = crc32(*crc, c, 4);
400 	return 4;
401 
402     } else if (val < 0xf8) {
403 	val = (val<<8) | (c[1]=hgetc(fd->fp));;
404 	val = (val<<8) | (c[2]=hgetc(fd->fp));;
405 	val = (val<<8) | (c[3]=hgetc(fd->fp));;
406 	val = (val<<8) | (c[4]=hgetc(fd->fp));;
407 	*val_p = val & ((1LL<<(3+4*8))-1);
408 	*crc = crc32(*crc, c, 5);
409 	return 5;
410 
411     } else if (val < 0xfc) {
412 	val = (val<<8) | (c[1]=hgetc(fd->fp));;
413 	val = (val<<8) | (c[2]=hgetc(fd->fp));;
414 	val = (val<<8) | (c[3]=hgetc(fd->fp));;
415 	val = (val<<8) | (c[4]=hgetc(fd->fp));;
416 	val = (val<<8) | (c[5]=hgetc(fd->fp));;
417 	*val_p = val & ((1LL<<(2+5*8))-1);
418 	*crc = crc32(*crc, c, 6);
419 	return 6;
420 
421     } else if (val < 0xfe) {
422 	val = (val<<8) | (c[1]=hgetc(fd->fp));;
423 	val = (val<<8) | (c[2]=hgetc(fd->fp));;
424 	val = (val<<8) | (c[3]=hgetc(fd->fp));;
425 	val = (val<<8) | (c[4]=hgetc(fd->fp));;
426 	val = (val<<8) | (c[5]=hgetc(fd->fp));;
427 	val = (val<<8) | (c[6]=hgetc(fd->fp));;
428 	*val_p = val & ((1LL<<(1+6*8))-1);
429 	*crc = crc32(*crc, c, 7);
430 	return 7;
431 
432     } else if (val < 0xff) {
433 	val = (val<<8) | (c[1]=hgetc(fd->fp));;
434 	val = (val<<8) | (c[2]=hgetc(fd->fp));;
435 	val = (val<<8) | (c[3]=hgetc(fd->fp));;
436 	val = (val<<8) | (c[4]=hgetc(fd->fp));;
437 	val = (val<<8) | (c[5]=hgetc(fd->fp));;
438 	val = (val<<8) | (c[6]=hgetc(fd->fp));;
439 	val = (val<<8) | (c[7]=hgetc(fd->fp));;
440 	*val_p = val & ((1LL<<(7*8))-1);
441 	*crc = crc32(*crc, c, 8);
442 	return 8;
443 
444     } else {
445 	val = (val<<8) | (c[1]=hgetc(fd->fp));;
446 	val = (val<<8) | (c[2]=hgetc(fd->fp));;
447 	val = (val<<8) | (c[3]=hgetc(fd->fp));;
448 	val = (val<<8) | (c[4]=hgetc(fd->fp));;
449 	val = (val<<8) | (c[5]=hgetc(fd->fp));;
450 	val = (val<<8) | (c[6]=hgetc(fd->fp));;
451 	val = (val<<8) | (c[7]=hgetc(fd->fp));;
452 	val = (val<<8) | (c[8]=hgetc(fd->fp));;
453 	*crc = crc32(*crc, c, 9);
454 	*val_p = val;
455     }
456 
457     return 9;
458 }
459 
460 /*
461  * Pushes a value in ITF8 format onto the end of a block.
462  * This shouldn't be used for high-volume data as it is not the fastest
463  * method.
464  *
465  * Returns the number of bytes written
466  */
itf8_put_blk(cram_block * blk,int val)467 int itf8_put_blk(cram_block *blk, int val) {
468     char buf[5];
469     int sz;
470 
471     sz = itf8_put(buf, val);
472     BLOCK_APPEND(blk, buf, sz);
473     return sz;
474 }
475 
476 /*
477  * Decodes a 32-bit little endian value from fd and stores in val.
478  *
479  * Returns the number of bytes read on success
480  *         -1 on failure
481  */
int32_decode(cram_fd * fd,int32_t * val)482 int int32_decode(cram_fd *fd, int32_t *val) {
483     int32_t i;
484     if (4 != hread(fd->fp, &i, 4))
485 	return -1;
486 
487     *val = le_int4(i);
488     return 4;
489 }
490 
491 /*
492  * Encodes a 32-bit little endian value 'val' and writes to fd.
493  *
494  * Returns the number of bytes written on success
495  *         -1 on failure
496  */
int32_encode(cram_fd * fd,int32_t val)497 int int32_encode(cram_fd *fd, int32_t val) {
498     val = le_int4(val);
499     if (4 != hwrite(fd->fp, &val, 4))
500 	return -1;
501 
502     return 4;
503 }
504 
505 /* As int32_decoded/encode, but from/to blocks instead of cram_fd */
int32_get_blk(cram_block * b,int32_t * val)506 int int32_get_blk(cram_block *b, int32_t *val) {
507     if (b->uncomp_size - BLOCK_SIZE(b) < 4)
508 	return -1;
509 
510     *val =
511 	 b->data[b->byte  ]        |
512 	(b->data[b->byte+1] <<  8) |
513 	(b->data[b->byte+2] << 16) |
514 	(b->data[b->byte+3] << 24);
515     BLOCK_SIZE(b) += 4;
516     return 4;
517 }
518 
519 /* As int32_decoded/encode, but from/to blocks instead of cram_fd */
int32_put_blk(cram_block * b,int32_t val)520 int int32_put_blk(cram_block *b, int32_t val) {
521     unsigned char cp[4];
522     cp[0] = ( val      & 0xff);
523     cp[1] = ((val>>8)  & 0xff);
524     cp[2] = ((val>>16) & 0xff);
525     cp[3] = ((val>>24) & 0xff);
526 
527     BLOCK_APPEND(b, cp, 4);
528     return b->data ? 0 : -1;
529 }
530 
531 /* ----------------------------------------------------------------------
532  * zlib compression code - from Gap5's tg_iface_g.c
533  * They're static here as they're only used within the cram_compress_block
534  * and cram_uncompress_block functions, which are the external interface.
535  */
zlib_mem_inflate(char * cdata,size_t csize,size_t * size)536 char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
537     z_stream s;
538     unsigned char *data = NULL; /* Uncompressed output */
539     int data_alloc = 0;
540     int err;
541 
542     /* Starting point at uncompressed size, and scale after that */
543     data = malloc(data_alloc = csize*1.2+100);
544     if (!data)
545 	return NULL;
546 
547     /* Initialise zlib stream */
548     s.zalloc = Z_NULL; /* use default allocation functions */
549     s.zfree  = Z_NULL;
550     s.opaque = Z_NULL;
551     s.next_in  = (unsigned char *)cdata;
552     s.avail_in = csize;
553     s.total_in = 0;
554     s.next_out  = data;
555     s.avail_out = data_alloc;
556     s.total_out = 0;
557 
558     //err = inflateInit(&s);
559     err = inflateInit2(&s, 15 + 32);
560     if (err != Z_OK) {
561 	hts_log_error("Call to zlib inflateInit failed: %s", s.msg);
562 	free(data);
563 	return NULL;
564     }
565 
566     /* Decode to 'data' array */
567     for (;s.avail_in;) {
568 	unsigned char *data_tmp;
569 	int alloc_inc;
570 
571 	s.next_out = &data[s.total_out];
572 	err = inflate(&s, Z_NO_FLUSH);
573 	if (err == Z_STREAM_END)
574 	    break;
575 
576 	if (err != Z_OK) {
577 	    hts_log_error("Call to zlib inflate failed: %s", s.msg);
578 	    if (data)
579 		free(data);
580 	    return NULL;
581 	}
582 
583 	/* More to come, so realloc based on growth so far */
584 	alloc_inc = (double)s.avail_in/s.total_in * s.total_out + 100;
585 	data = realloc((data_tmp = data), data_alloc += alloc_inc);
586 	if (!data) {
587 	    free(data_tmp);
588 	    return NULL;
589 	}
590 	s.avail_out += alloc_inc;
591     }
592     inflateEnd(&s);
593 
594     *size = s.total_out;
595     return (char *)data;
596 }
597 
zlib_mem_deflate(char * data,size_t size,size_t * cdata_size,int level,int strat)598 static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size,
599 			      int level, int strat) {
600     z_stream s;
601     unsigned char *cdata = NULL; /* Compressed output */
602     int cdata_alloc = 0;
603     int cdata_pos = 0;
604     int err;
605 
606     cdata = malloc(cdata_alloc = size*1.05+100);
607     if (!cdata)
608 	return NULL;
609     cdata_pos = 0;
610 
611     /* Initialise zlib stream */
612     s.zalloc = Z_NULL; /* use default allocation functions */
613     s.zfree  = Z_NULL;
614     s.opaque = Z_NULL;
615     s.next_in  = (unsigned char *)data;
616     s.avail_in = size;
617     s.total_in = 0;
618     s.next_out  = cdata;
619     s.avail_out = cdata_alloc;
620     s.total_out = 0;
621     s.data_type = Z_BINARY;
622 
623     err = deflateInit2(&s, level, Z_DEFLATED, 15|16, 9, strat);
624     if (err != Z_OK) {
625 	hts_log_error("Call to zlib deflateInit2 failed: %s", s.msg);
626 	return NULL;
627     }
628 
629     /* Encode to 'cdata' array */
630     for (;s.avail_in;) {
631 	s.next_out = &cdata[cdata_pos];
632 	s.avail_out = cdata_alloc - cdata_pos;
633 	if (cdata_alloc - cdata_pos <= 0) {
634 	    hts_log_error("Deflate produced larger output than expected");
635 	    return NULL;
636 	}
637 	err = deflate(&s, Z_NO_FLUSH);
638 	cdata_pos = cdata_alloc - s.avail_out;
639 	if (err != Z_OK) {
640 	    hts_log_error("Call to zlib deflate failed: %s", s.msg);
641 	    break;
642 	}
643     }
644     if (deflate(&s, Z_FINISH) != Z_STREAM_END) {
645 	hts_log_error("Call to zlib deflate failed: %s", s.msg);
646     }
647     *cdata_size = s.total_out;
648 
649     if (deflateEnd(&s) != Z_OK) {
650 	hts_log_error("Call to zlib deflate failed: %s", s.msg);
651     }
652     return (char *)cdata;
653 }
654 
655 #ifdef HAVE_LIBLZMA
656 /* ------------------------------------------------------------------------ */
657 /*
658  * Data compression routines using liblzma (xz)
659  *
660  * On a test set this shrunk the main db from 136157104 bytes to 114796168, but
661  * caused tg_index to grow from 2m43.707s to 15m3.961s. Exporting as bfastq
662  * went from 18.3s to 36.3s. So decompression suffers too, but not as bad
663  * as compression times.
664  *
665  * For now we disable this functionality. If it's to be reenabled make sure you
666  * improve the mem_inflate implementation as it's just a test hack at the
667  * moment.
668  */
669 
lzma_mem_deflate(char * data,size_t size,size_t * cdata_size,int level)670 static char *lzma_mem_deflate(char *data, size_t size, size_t *cdata_size,
671 			      int level) {
672     char *out;
673     size_t out_size = lzma_stream_buffer_bound(size);
674     *cdata_size = 0;
675 
676     out = malloc(out_size);
677 
678     /* Single call compression */
679     if (LZMA_OK != lzma_easy_buffer_encode(level, LZMA_CHECK_CRC32, NULL,
680 					   (uint8_t *)data, size,
681 					   (uint8_t *)out, cdata_size,
682 					   out_size))
683     	return NULL;
684 
685     return out;
686 }
687 
lzma_mem_inflate(char * cdata,size_t csize,size_t * size)688 static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) {
689     lzma_stream strm = LZMA_STREAM_INIT;
690     size_t out_size = 0, out_pos = 0;
691     char *out = NULL;
692     int r;
693 
694     /* Initiate the decoder */
695     if (LZMA_OK != lzma_stream_decoder(&strm, lzma_easy_decoder_memusage(9), 0))
696 	return NULL;
697 
698     /* Decode loop */
699     strm.avail_in = csize;
700     strm.next_in = (uint8_t *)cdata;
701 
702     for (;strm.avail_in;) {
703 	if (strm.avail_in > out_size - out_pos) {
704 	    out_size += strm.avail_in * 4 + 32768;
705 	    out = realloc(out, out_size);
706 	}
707 	strm.avail_out = out_size - out_pos;
708 	strm.next_out = (uint8_t *)&out[out_pos];
709 
710 	r = lzma_code(&strm, LZMA_RUN);
711 	if (LZMA_OK != r && LZMA_STREAM_END != r) {
712 	    hts_log_error("LZMA decode failure (error %d)", r);
713 	    return NULL;
714 	}
715 
716 	out_pos = strm.total_out;
717 
718 	if (r == LZMA_STREAM_END)
719 	    break;
720     }
721 
722     /* finish up any unflushed data; necessary? */
723     r = lzma_code(&strm, LZMA_FINISH);
724     if (r != LZMA_OK && r != LZMA_STREAM_END) {
725 	hts_log_error("Call to lzma_code failed with error %d", r);
726 	return NULL;
727     }
728 
729     out = realloc(out, strm.total_out);
730     *size = strm.total_out;
731 
732     lzma_end(&strm);
733 
734     return out;
735 }
736 #endif
737 
738 /* ----------------------------------------------------------------------
739  * CRAM blocks - the dynamically growable data block. We have code to
740  * create, update, (un)compress and read/write.
741  *
742  * These are derived from the deflate_interlaced.c blocks, but with the
743  * CRAM extension of content types and IDs.
744  */
745 
746 /*
747  * Allocates a new cram_block structure with a specified content_type and
748  * id.
749  *
750  * Returns block pointer on success
751  *         NULL on failure
752  */
cram_new_block(enum cram_content_type content_type,int content_id)753 cram_block *cram_new_block(enum cram_content_type content_type,
754 			   int content_id) {
755     cram_block *b = malloc(sizeof(*b));
756     if (!b)
757 	return NULL;
758     b->method = b->orig_method = RAW;
759     b->content_type = content_type;
760     b->content_id = content_id;
761     b->comp_size = 0;
762     b->uncomp_size = 0;
763     b->data = NULL;
764     b->alloc = 0;
765     b->byte = 0;
766     b->bit = 7; // MSB
767 
768     return b;
769 }
770 
771 /*
772  * Reads a block from a cram file.
773  * Returns cram_block pointer on success.
774  *         NULL on failure
775  */
cram_read_block(cram_fd * fd)776 cram_block *cram_read_block(cram_fd *fd) {
777     cram_block *b = malloc(sizeof(*b));
778     unsigned char c;
779     uint32_t crc = 0;
780     if (!b)
781 	return NULL;
782 
783     //fprintf(stderr, "Block at %d\n", (int)ftell(fd->fp));
784 
785     if (-1 == (b->method      = hgetc(fd->fp))) { free(b); return NULL; }
786     c = b->method; crc = crc32(crc, &c, 1);
787     if (-1 == (b->content_type= hgetc(fd->fp))) { free(b); return NULL; }
788     c = b->content_type; crc = crc32(crc, &c, 1);
789     if (-1 == itf8_decode_crc(fd, &b->content_id, &crc))  { free(b); return NULL; }
790     if (-1 == itf8_decode_crc(fd, &b->comp_size, &crc))   { free(b); return NULL; }
791     if (-1 == itf8_decode_crc(fd, &b->uncomp_size, &crc)) { free(b); return NULL; }
792 
793     //fprintf(stderr, "  method %d, ctype %d, cid %d, csize %d, ucsize %d\n",
794     //	    b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size);
795 
796     if (b->method == RAW) {
797         if (b->uncomp_size < 0 || b->comp_size != b->uncomp_size) {
798             free(b);
799             return NULL;
800         }
801 	b->alloc = b->uncomp_size;
802 	if (!(b->data = malloc(b->uncomp_size))){ free(b); return NULL; }
803 	if (b->uncomp_size != hread(fd->fp, b->data, b->uncomp_size)) {
804 	    free(b->data);
805 	    free(b);
806 	    return NULL;
807 	}
808     } else {
809         if (b->comp_size < 0) { free(b); return NULL; }
810 	b->alloc = b->comp_size;
811 	if (!(b->data = malloc(b->comp_size)))  { free(b); return NULL; }
812 	if (b->comp_size != hread(fd->fp, b->data, b->comp_size)) {
813 	    free(b->data);
814 	    free(b);
815 	    return NULL;
816 	}
817     }
818 
819     if (CRAM_MAJOR_VERS(fd->version) >= 3) {
820 	if (-1 == int32_decode(fd, (int32_t *)&b->crc32)) {
821 	    free(b);
822 	    return NULL;
823 	}
824 
825 	crc = crc32(crc, b->data ? b->data : (uc *)"", b->alloc);
826 	if (crc != b->crc32) {
827 	    hts_log_error("Block CRC32 failure");
828 	    free(b->data);
829 	    free(b);
830 	    return NULL;
831 	}
832     }
833 
834     b->orig_method = b->method;
835     b->idx = 0;
836     b->byte = 0;
837     b->bit = 7; // MSB
838 
839     return b;
840 }
841 
842 
843 /*
844  * Computes the size of a cram block, including the block
845  * header itself.
846  */
cram_block_size(cram_block * b)847 uint32_t cram_block_size(cram_block *b) {
848     unsigned char dat[100], *cp = dat;;
849     uint32_t sz;
850 
851     *cp++ = b->method;
852     *cp++ = b->content_type;
853     cp += itf8_put((char*)cp, b->content_id);
854     cp += itf8_put((char*)cp, b->comp_size);
855     cp += itf8_put((char*)cp, b->uncomp_size);
856 
857     sz = cp-dat + 4;
858     sz += b->method == RAW ? b->uncomp_size : b->comp_size;
859 
860     return sz;
861 }
862 
863 /*
864  * Writes a CRAM block.
865  * Returns 0 on success
866  *        -1 on failure
867  */
cram_write_block(cram_fd * fd,cram_block * b)868 int cram_write_block(cram_fd *fd, cram_block *b) {
869     assert(b->method != RAW || (b->comp_size == b->uncomp_size));
870 
871     if (hputc(b->method,       fd->fp)  == EOF) return -1;
872     if (hputc(b->content_type, fd->fp)  == EOF) return -1;
873     if (itf8_encode(fd, b->content_id)  ==  -1) return -1;
874     if (itf8_encode(fd, b->comp_size)   ==  -1) return -1;
875     if (itf8_encode(fd, b->uncomp_size) ==  -1) return -1;
876 
877     if (b->data) {
878         if (b->method == RAW) {
879             if (b->uncomp_size != hwrite(fd->fp, b->data, b->uncomp_size))
880                 return -1;
881         } else {
882             if (b->comp_size != hwrite(fd->fp, b->data, b->comp_size))
883                 return -1;
884         }
885     } else {
886         // Absent blocks should be size 0
887         assert(b->method == RAW && b->uncomp_size == 0);
888     }
889 
890     if (CRAM_MAJOR_VERS(fd->version) >= 3) {
891 	unsigned char dat[100], *cp = dat;;
892 	uint32_t crc;
893 
894 	*cp++ = b->method;
895 	*cp++ = b->content_type;
896 	cp += itf8_put((char*)cp, b->content_id);
897 	cp += itf8_put((char*)cp, b->comp_size);
898 	cp += itf8_put((char*)cp, b->uncomp_size);
899 	crc = crc32(0L, dat, cp-dat);
900 
901 	if (b->method == RAW) {
902 	    b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->uncomp_size);
903 	} else {
904 	    b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->comp_size);
905 	}
906 
907 	if (-1 == int32_encode(fd, b->crc32))
908 	    return -1;
909     }
910 
911     return 0;
912 }
913 
914 /*
915  * Frees a CRAM block, deallocating internal data too.
916  */
cram_free_block(cram_block * b)917 void cram_free_block(cram_block *b) {
918     if (!b)
919 	return;
920     if (b->data)
921 	free(b->data);
922     free(b);
923 }
924 
925 /*
926  * Uncompresses a CRAM block, if compressed.
927  */
cram_uncompress_block(cram_block * b)928 int cram_uncompress_block(cram_block *b) {
929     char *uncomp;
930     size_t uncomp_size = 0;
931 
932     if (b->uncomp_size == 0) {
933 	// blank block
934 	b->method = RAW;
935 	return 0;
936     }
937 
938     switch (b->method) {
939     case RAW:
940 	return 0;
941 
942     case GZIP:
943 	uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
944 	if (!uncomp)
945 	    return -1;
946 	if ((int)uncomp_size != b->uncomp_size) {
947 	    free(uncomp);
948 	    return -1;
949 	}
950 	free(b->data);
951 	b->data = (unsigned char *)uncomp;
952 	b->alloc = uncomp_size;
953 	b->method = RAW;
954 	break;
955 
956 #ifdef HAVE_LIBBZ2
957     case BZIP2: {
958 	unsigned int usize = b->uncomp_size;
959 	if (!(uncomp = malloc(usize)))
960 	    return -1;
961 	if (BZ_OK != BZ2_bzBuffToBuffDecompress(uncomp, &usize,
962 						(char *)b->data, b->comp_size,
963 						0, 0)) {
964 	    free(uncomp);
965 	    return -1;
966 	}
967 	free(b->data);
968 	b->data = (unsigned char *)uncomp;
969 	b->alloc = usize;
970 	b->method = RAW;
971 	b->uncomp_size = usize; // Just incase it differs
972 	break;
973     }
974 #else
975     case BZIP2:
976 	hts_log_error("Bzip2 compression is not compiled into this version. Please rebuild and try again");
977 	return -1;
978 #endif
979 
980 #ifdef HAVE_LIBLZMA
981     case LZMA:
982 	uncomp = lzma_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
983 	if (!uncomp)
984 	    return -1;
985 	if ((int)uncomp_size != b->uncomp_size)
986 	    return -1;
987 	free(b->data);
988 	b->data = (unsigned char *)uncomp;
989 	b->alloc = uncomp_size;
990 	b->method = RAW;
991 	break;
992 #else
993     case LZMA:
994 	hts_log_error("Lzma compression is not compiled into this version. Please rebuild and try again");
995 	return -1;
996 	break;
997 #endif
998 
999     case RANS: {
1000 	unsigned int usize = b->uncomp_size, usize2;
1001 	uncomp = (char *)rans_uncompress(b->data, b->comp_size, &usize2);
1002 	if (!uncomp || usize != usize2)
1003 	    return -1;
1004 	free(b->data);
1005 	b->data = (unsigned char *)uncomp;
1006 	b->alloc = usize2;
1007 	b->method = RAW;
1008 	b->uncomp_size = usize2; // Just incase it differs
1009 	//fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1010 	break;
1011     }
1012 
1013     default:
1014 	return -1;
1015     }
1016 
1017     return 0;
1018 }
1019 
cram_compress_by_method(char * in,size_t in_size,size_t * out_size,enum cram_block_method method,int level,int strat)1020 static char *cram_compress_by_method(char *in, size_t in_size,
1021 				     size_t *out_size,
1022 				     enum cram_block_method method,
1023 				     int level, int strat) {
1024     switch (method) {
1025     case GZIP:
1026 	return zlib_mem_deflate(in, in_size, out_size, level, strat);
1027 
1028     case BZIP2: {
1029 #ifdef HAVE_LIBBZ2
1030 	unsigned int comp_size = in_size*1.01 + 600;
1031 	char *comp = malloc(comp_size);
1032 	if (!comp)
1033 	    return NULL;
1034 
1035 	if (BZ_OK != BZ2_bzBuffToBuffCompress(comp, &comp_size,
1036 					      in, in_size,
1037 					      level, 0, 30)) {
1038 	    free(comp);
1039 	    return NULL;
1040 	}
1041 	*out_size = comp_size;
1042 	return comp;
1043 #else
1044 	return NULL;
1045 #endif
1046     }
1047 
1048     case LZMA:
1049 #ifdef HAVE_LIBLZMA
1050 	return lzma_mem_deflate(in, in_size, out_size, level);
1051 #else
1052 	return NULL;
1053 #endif
1054 
1055     case RANS0: {
1056 	unsigned int out_size_i;
1057 	unsigned char *cp;
1058 	cp = rans_compress((unsigned char *)in, in_size, &out_size_i, 0);
1059 	*out_size = out_size_i;
1060 	return (char *)cp;
1061     }
1062 
1063     case RANS1: {
1064 	unsigned int out_size_i;
1065 	unsigned char *cp;
1066 
1067 	cp = rans_compress((unsigned char *)in, in_size, &out_size_i, 1);
1068 	*out_size = out_size_i;
1069 	return (char *)cp;
1070     }
1071 
1072     case RAW:
1073 	break;
1074 
1075     default:
1076 	return NULL;
1077     }
1078 
1079     return NULL;
1080 }
1081 
1082 
1083 /*
1084  * Compresses a block using one of two different zlib strategies. If we only
1085  * want one choice set strat2 to be -1.
1086  *
1087  * The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
1088  * or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
1089  * significantly faster.
1090  *
1091  * Method and level -1 implies defaults, as specified in cram_fd.
1092  */
cram_compress_block(cram_fd * fd,cram_block * b,cram_metrics * metrics,int method,int level)1093 int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
1094 			int method, int level) {
1095 
1096     char *comp = NULL;
1097     size_t comp_size = 0;
1098     int strat;
1099 
1100     if (b->method != RAW) {
1101         // Maybe already compressed if s->block[0] was compressed and
1102         // we have e.g. s->block[DS_BA] set to s->block[0] due to only
1103         // one base type present and hence using E_HUFFMAN on block 0.
1104         // A second explicit attempt to compress the same block then
1105         // occurs.
1106         return 0;
1107     }
1108 
1109     if (method == -1) {
1110 	method = 1<<GZIP;
1111 	if (fd->use_bz2)
1112 	    method |= 1<<BZIP2;
1113 	if (fd->use_lzma)
1114 	    method |= 1<<LZMA;
1115     }
1116 
1117     if (level == -1)
1118 	level = fd->level;
1119 
1120     //fprintf(stderr, "IN: block %d, sz %d\n", b->content_id, b->uncomp_size);
1121 
1122     if (method == RAW || level == 0 || b->uncomp_size == 0) {
1123 	b->method = RAW;
1124 	b->comp_size = b->uncomp_size;
1125 	//fprintf(stderr, "Skip block id %d\n", b->content_id);
1126 	return 0;
1127     }
1128 
1129     if (metrics) {
1130 	pthread_mutex_lock(&fd->metrics_lock);
1131 	if (metrics->trial > 0 || --metrics->next_trial <= 0) {
1132 	    size_t sz_best = INT_MAX;
1133 	    size_t sz_gz_rle = 0;
1134 	    size_t sz_gz_def = 0;
1135 	    size_t sz_rans0 = 0;
1136 	    size_t sz_rans1 = 0;
1137 	    size_t sz_bzip2 = 0;
1138 	    size_t sz_lzma = 0;
1139 	    int method_best = 0;
1140 	    char *c_best = NULL, *c = NULL;
1141 
1142 	    if (metrics->revised_method)
1143 		method = metrics->revised_method;
1144 	    else
1145 		metrics->revised_method = method;
1146 
1147 	    if (metrics->next_trial <= 0) {
1148 		metrics->next_trial = TRIAL_SPAN;
1149 		metrics->trial = NTRIALS;
1150 		metrics->sz_gz_rle /= 2;
1151 		metrics->sz_gz_def /= 2;
1152 		metrics->sz_rans0  /= 2;
1153 		metrics->sz_rans1  /= 2;
1154 		metrics->sz_bzip2  /= 2;
1155 		metrics->sz_lzma   /= 2;
1156 	    }
1157 
1158 	    pthread_mutex_unlock(&fd->metrics_lock);
1159 
1160 	    if (method & (1<<GZIP_RLE)) {
1161 		c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1162 					    &sz_gz_rle, GZIP, 1, Z_RLE);
1163 		if (c && sz_best > sz_gz_rle) {
1164 		    sz_best = sz_gz_rle;
1165 		    method_best = GZIP_RLE;
1166 		    if (c_best)
1167 			free(c_best);
1168 		    c_best = c;
1169 		} else if (c) {
1170 		    free(c);
1171 		} else {
1172 		    sz_gz_rle = b->uncomp_size*2+1000;
1173 		}
1174 
1175 		//fprintf(stderr, "Block %d; %d->%d\n", b->content_id, b->uncomp_size, sz_gz_rle);
1176 	    }
1177 
1178 	    if (method & (1<<GZIP)) {
1179 		c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1180 					    &sz_gz_def, GZIP, level,
1181 					    Z_FILTERED);
1182 		if (c && sz_best > sz_gz_def) {
1183 		    sz_best = sz_gz_def;
1184 		    method_best = GZIP;
1185 		    if (c_best)
1186 			free(c_best);
1187 		    c_best = c;
1188 		} else if (c) {
1189 		    free(c);
1190 		} else {
1191 		    sz_gz_def = b->uncomp_size*2+1000;
1192 		}
1193 
1194 		//fprintf(stderr, "Block %d; %d->%d\n", b->content_id, b->uncomp_size, sz_gz_def);
1195 	    }
1196 
1197 	    if (method & (1<<RANS0)) {
1198 		c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1199 					    &sz_rans0, RANS0, 0, 0);
1200 		if (c && sz_best > sz_rans0) {
1201 		    sz_best = sz_rans0;
1202 		    method_best = RANS0;
1203 		    if (c_best)
1204 			free(c_best);
1205 		    c_best = c;
1206 		} else if (c) {
1207 		    free(c);
1208 		} else {
1209 		    sz_rans0 = b->uncomp_size*2+1000;
1210 		}
1211 	    }
1212 
1213 	    if (method & (1<<RANS1)) {
1214 		c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1215 					    &sz_rans1, RANS1, 0, 0);
1216 		if (c && sz_best > sz_rans1) {
1217 		    sz_best = sz_rans1;
1218 		    method_best = RANS1;
1219 		    if (c_best)
1220 			free(c_best);
1221 		    c_best = c;
1222 		} else if (c) {
1223 		    free(c);
1224 		} else {
1225 		    sz_rans1 = b->uncomp_size*2+1000;
1226 		}
1227 	    }
1228 
1229 	    if (method & (1<<BZIP2)) {
1230 		c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1231 					    &sz_bzip2, BZIP2, level, 0);
1232 		if (c && sz_best > sz_bzip2) {
1233 		    sz_best = sz_bzip2;
1234 		    method_best = BZIP2;
1235 		    if (c_best)
1236 			free(c_best);
1237 		    c_best = c;
1238 		} else if (c) {
1239 		    free(c);
1240 		} else {
1241 		    sz_bzip2 = b->uncomp_size*2+1000;
1242 		}
1243 	    }
1244 
1245 	    if (method & (1<<LZMA)) {
1246 		c = cram_compress_by_method((char *)b->data, b->uncomp_size,
1247 					    &sz_lzma, LZMA, level, 0);
1248 		if (c && sz_best > sz_lzma) {
1249 		    sz_best = sz_lzma;
1250 		    method_best = LZMA;
1251 		    if (c_best)
1252 			free(c_best);
1253 		    c_best = c;
1254 		} else if (c) {
1255 		    free(c);
1256 		} else {
1257 		    sz_lzma = b->uncomp_size*2+1000;
1258 		}
1259 	    }
1260 
1261 	    //fprintf(stderr, "sz_best = %d\n", sz_best);
1262 
1263 	    free(b->data);
1264 	    b->data = (unsigned char *)c_best;
1265 	    //printf("method_best = %s\n", cram_block_method2str(method_best));
1266 	    b->method = method_best == GZIP_RLE ? GZIP : method_best;
1267 	    b->comp_size = sz_best;
1268 
1269 	    pthread_mutex_lock(&fd->metrics_lock);
1270 	    metrics->sz_gz_rle += sz_gz_rle;
1271 	    metrics->sz_gz_def += sz_gz_def;
1272 	    metrics->sz_rans0  += sz_rans0;
1273 	    metrics->sz_rans1  += sz_rans1;
1274 	    metrics->sz_bzip2  += sz_bzip2;
1275 	    metrics->sz_lzma   += sz_lzma;
1276 	    if (--metrics->trial == 0) {
1277 		int best_method = RAW;
1278 		int best_sz = INT_MAX;
1279 
1280 		// Scale methods by cost
1281 		if (fd->level <= 3) {
1282 		    metrics->sz_rans1  *= 1.02;
1283 		    metrics->sz_gz_def *= 1.04;
1284 		    metrics->sz_bzip2  *= 1.08;
1285 		    metrics->sz_lzma   *= 1.10;
1286 		} else if (fd->level <= 6) {
1287 		    metrics->sz_rans1  *= 1.01;
1288 		    metrics->sz_gz_def *= 1.02;
1289 		    metrics->sz_bzip2  *= 1.03;
1290 		    metrics->sz_lzma   *= 1.05;
1291 		}
1292 
1293 		if (method & (1<<GZIP_RLE) && best_sz > metrics->sz_gz_rle)
1294 		    best_sz = metrics->sz_gz_rle, best_method = GZIP_RLE;
1295 
1296 		if (method & (1<<GZIP) && best_sz > metrics->sz_gz_def)
1297 		    best_sz = metrics->sz_gz_def, best_method = GZIP;
1298 
1299 		if (method & (1<<RANS0) && best_sz > metrics->sz_rans0)
1300 		    best_sz = metrics->sz_rans0, best_method = RANS0;
1301 
1302 		if (method & (1<<RANS1) && best_sz > metrics->sz_rans1)
1303 		    best_sz = metrics->sz_rans1, best_method = RANS1;
1304 
1305 		if (method & (1<<BZIP2) && best_sz > metrics->sz_bzip2)
1306 		    best_sz = metrics->sz_bzip2, best_method = BZIP2;
1307 
1308 		if (method & (1<<LZMA) && best_sz > metrics->sz_lzma)
1309 		    best_sz = metrics->sz_lzma, best_method = LZMA;
1310 
1311 		if (best_method == GZIP_RLE) {
1312 		    metrics->method = GZIP;
1313 		    metrics->strat  = Z_RLE;
1314 		} else {
1315 		    metrics->method = best_method;
1316 		    metrics->strat  = Z_FILTERED;
1317 		}
1318 
1319 		// If we see at least MAXFAIL trials in a row for a specific
1320 		// compression method with more than MAXDELTA aggregate
1321 		// size then we drop this from the list of methods used
1322 		// for this block type.
1323 #define MAXDELTA 0.20
1324 #define MAXFAILS 4
1325 		if (best_method == GZIP_RLE) {
1326 		    metrics->gz_rle_cnt = 0;
1327 		    metrics->gz_rle_extra = 0;
1328 		} else if (best_sz < metrics->sz_gz_rle) {
1329 		    double r = (double)metrics->sz_gz_rle / best_sz - 1;
1330 		    if (++metrics->gz_rle_cnt >= MAXFAILS &&
1331 			(metrics->gz_rle_extra += r) >= MAXDELTA)
1332 			method &= ~(1<<GZIP_RLE);
1333 		}
1334 
1335 		if (best_method == GZIP) {
1336 		    metrics->gz_def_cnt = 0;
1337 		    metrics->gz_def_extra = 0;
1338 		} else if (best_sz < metrics->sz_gz_def) {
1339 		    double r = (double)metrics->sz_gz_def / best_sz - 1;
1340 		    if (++metrics->gz_def_cnt >= MAXFAILS &&
1341 			(metrics->gz_def_extra += r) >= MAXDELTA)
1342 			method &= ~(1<<GZIP);
1343 		}
1344 
1345 		if (best_method == RANS0) {
1346 		    metrics->rans0_cnt = 0;
1347 		    metrics->rans0_extra = 0;
1348 		} else if (best_sz < metrics->sz_rans0) {
1349 		    double r = (double)metrics->sz_rans0 / best_sz - 1;
1350 		    if (++metrics->rans0_cnt >= MAXFAILS &&
1351 			(metrics->rans0_extra += r) >= MAXDELTA)
1352 			method &= ~(1<<RANS0);
1353 		}
1354 
1355 		if (best_method == RANS1) {
1356 		    metrics->rans1_cnt = 0;
1357 		    metrics->rans1_extra = 0;
1358 		} else if (best_sz < metrics->sz_rans1) {
1359 		    double r = (double)metrics->sz_rans1 / best_sz - 1;
1360 		    if (++metrics->rans1_cnt >= MAXFAILS &&
1361 			(metrics->rans1_extra += r) >= MAXDELTA)
1362 			method &= ~(1<<RANS1);
1363 		}
1364 
1365 		if (best_method == BZIP2) {
1366 		    metrics->bzip2_cnt = 0;
1367 		    metrics->bzip2_extra = 0;
1368 		} else if (best_sz < metrics->sz_bzip2) {
1369 		    double r = (double)metrics->sz_bzip2 / best_sz - 1;
1370 		    if (++metrics->bzip2_cnt >= MAXFAILS &&
1371 			(metrics->bzip2_extra += r) >= MAXDELTA)
1372 			method &= ~(1<<BZIP2);
1373 		}
1374 
1375 		if (best_method == LZMA) {
1376 		    metrics->lzma_cnt = 0;
1377 		    metrics->lzma_extra = 0;
1378 		} else if (best_sz < metrics->sz_lzma) {
1379 		    double r = (double)metrics->sz_lzma / best_sz - 1;
1380 		    if (++metrics->lzma_cnt >= MAXFAILS &&
1381 			(metrics->lzma_extra += r) >= MAXDELTA)
1382 			method &= ~(1<<LZMA);
1383 		}
1384 
1385 		//if (method != metrics->revised_method)
1386 		//    fprintf(stderr, "%d: method from %x to %x\n",
1387 		//	    b->content_id, metrics->revised_method, method);
1388 		metrics->revised_method = method;
1389 	    }
1390 	    pthread_mutex_unlock(&fd->metrics_lock);
1391 	} else {
1392 	    strat = metrics->strat;
1393 	    method = metrics->method;
1394 
1395 	    pthread_mutex_unlock(&fd->metrics_lock);
1396 	    comp = cram_compress_by_method((char *)b->data, b->uncomp_size,
1397 					   &comp_size, method,
1398 					   level, strat);
1399 	    if (!comp)
1400 		return -1;
1401 	    free(b->data);
1402 	    b->data = (unsigned char *)comp;
1403 	    b->comp_size = comp_size;
1404 	    b->method = method;
1405 	}
1406 
1407     } else {
1408 	// no cached metrics, so just do zlib?
1409 	comp = cram_compress_by_method((char *)b->data, b->uncomp_size,
1410 				       &comp_size, GZIP, level, Z_FILTERED);
1411 	if (!comp) {
1412 	    hts_log_error("Compression failed");
1413 	    return -1;
1414 	}
1415 	free(b->data);
1416 	b->data = (unsigned char *)comp;
1417 	b->comp_size = comp_size;
1418 	b->method = GZIP;
1419     }
1420 
1421     hts_log_info("Compressed block ID %d from %d to %d by method %s",
1422 	    b->content_id, b->uncomp_size, b->comp_size,
1423 	    cram_block_method2str(b->method));
1424 
1425     if (b->method == RANS1)
1426 	b->method = RANS0; // Spec just has RANS (not 0/1) with auto-sensing
1427 
1428     return 0;
1429 }
1430 
cram_new_metrics(void)1431 cram_metrics *cram_new_metrics(void) {
1432     cram_metrics *m = calloc(1, sizeof(*m));
1433     if (!m)
1434 	return NULL;
1435     m->trial = NTRIALS-1;
1436     m->next_trial = TRIAL_SPAN;
1437     m->method = RAW;
1438     m->strat = 0;
1439     m->revised_method = 0;
1440 
1441     return m;
1442 }
1443 
cram_block_method2str(enum cram_block_method m)1444 char *cram_block_method2str(enum cram_block_method m) {
1445     switch(m) {
1446     case RAW:	   return "RAW";
1447     case GZIP:	   return "GZIP";
1448     case BZIP2:	   return "BZIP2";
1449     case LZMA:     return "LZMA";
1450     case RANS0:    return "RANS0";
1451     case RANS1:    return "RANS1";
1452     case GZIP_RLE: return "GZIP_RLE";
1453     case BM_ERROR: break;
1454     }
1455     return "?";
1456 }
1457 
cram_content_type2str(enum cram_content_type t)1458 char *cram_content_type2str(enum cram_content_type t) {
1459     switch (t) {
1460     case FILE_HEADER:         return "FILE_HEADER";
1461     case COMPRESSION_HEADER:  return "COMPRESSION_HEADER";
1462     case MAPPED_SLICE:        return "MAPPED_SLICE";
1463     case UNMAPPED_SLICE:      return "UNMAPPED_SLICE";
1464     case EXTERNAL:            return "EXTERNAL";
1465     case CORE:                return "CORE";
1466     case CT_ERROR:            break;
1467     }
1468     return "?";
1469 }
1470 
1471 /* ----------------------------------------------------------------------
1472  * Reference sequence handling
1473  *
1474  * These revolve around the refs_t structure, which may potentially be
1475  * shared between multiple cram_fd.
1476  *
1477  * We start with refs_create() to allocate an empty refs_t and then
1478  * populate it with @SQ line data using refs_from_header(). This is done on
1479  * cram_open().  Also at start up we can call cram_load_reference() which
1480  * is used with "scramble -r foo.fa". This replaces the fd->refs with the
1481  * new one specified. In either case refs2id() is then called which
1482  * maps ref_entry names to @SQ ids (refs_t->ref_id[]).
1483  *
1484  * Later, possibly within a thread, we will want to know the actual ref
1485  * seq itself, obtained by calling cram_get_ref().  This may use the
1486  * UR: or M5: fields or the filename specified in the original
1487  * cram_load_reference() call.
1488  *
1489  * Given the potential for multi-threaded reference usage, we have
1490  * reference counting (sorry for the confusing double use of "ref") to
1491  * track the number of callers interested in any specific reference.
1492  */
1493 
1494 /*
1495  * Frees/unmaps a reference sequence and associated file handles.
1496  */
ref_entry_free_seq(ref_entry * e)1497 static void ref_entry_free_seq(ref_entry *e) {
1498     if (e->mf)
1499 	mfclose(e->mf);
1500     if (e->seq && !e->mf)
1501 	free(e->seq);
1502 
1503     e->seq = NULL;
1504     e->mf = NULL;
1505 }
1506 
refs_free(refs_t * r)1507 void refs_free(refs_t *r) {
1508     RP("refs_free()\n");
1509 
1510     if (--r->count > 0)
1511 	return;
1512 
1513     if (!r)
1514 	return;
1515 
1516     if (r->pool)
1517 	string_pool_destroy(r->pool);
1518 
1519     if (r->h_meta) {
1520 	khint_t k;
1521 
1522 	for (k = kh_begin(r->h_meta); k != kh_end(r->h_meta); k++) {
1523 	    ref_entry *e;
1524 
1525 	    if (!kh_exist(r->h_meta, k))
1526 		continue;
1527 	    if (!(e = kh_val(r->h_meta, k)))
1528 		continue;
1529 	    ref_entry_free_seq(e);
1530 	    free(e);
1531 	}
1532 
1533 	kh_destroy(refs, r->h_meta);
1534     }
1535 
1536     if (r->ref_id)
1537 	free(r->ref_id);
1538 
1539     if (r->fp)
1540 	bgzf_close(r->fp);
1541 
1542     pthread_mutex_destroy(&r->lock);
1543 
1544     free(r);
1545 }
1546 
refs_create(void)1547 static refs_t *refs_create(void) {
1548     refs_t *r = calloc(1, sizeof(*r));
1549 
1550     RP("refs_create()\n");
1551 
1552     if (!r)
1553 	return NULL;
1554 
1555     if (!(r->pool = string_pool_create(8192)))
1556 	goto err;
1557 
1558     r->ref_id = NULL; // see refs2id() to populate.
1559     r->count = 1;
1560     r->last = NULL;
1561     r->last_id = -1;
1562 
1563     if (!(r->h_meta = kh_init(refs)))
1564 	goto err;
1565 
1566     pthread_mutex_init(&r->lock, NULL);
1567 
1568     return r;
1569 
1570  err:
1571     refs_free(r);
1572     return NULL;
1573 }
1574 
1575 /*
1576  * Opens a reference fasta file as a BGZF stream, allowing for
1577  * compressed files.  It automatically builds a .fai file if
1578  * required and if compressed a .gzi bgzf index too.
1579  *
1580  * Returns a BGZF handle on success;
1581  *         NULL on failure.
1582  */
bgzf_open_ref(char * fn,char * mode,int is_md5)1583 static BGZF *bgzf_open_ref(char *fn, char *mode, int is_md5) {
1584     BGZF *fp;
1585 
1586     if (!is_md5) {
1587 	char fai_file[PATH_MAX];
1588 
1589 	snprintf(fai_file, PATH_MAX, "%s.fai", fn);
1590 	if (access(fai_file, R_OK) != 0)
1591 	    if (fai_build(fn) != 0)
1592 		return NULL;
1593     }
1594 
1595     if (!(fp = bgzf_open(fn, mode))) {
1596 	perror(fn);
1597 	return NULL;
1598     }
1599 
1600     if (fp->is_compressed == 1 && bgzf_index_load(fp, fn, ".gzi") < 0) {
1601 	hts_log_error("Unable to load .gzi index '%s.gzi'", fn);
1602 	bgzf_close(fp);
1603 	return NULL;
1604     }
1605 
1606     return fp;
1607 }
1608 
1609 /*
1610  * Loads a FAI file for a reference.fasta.
1611  * "is_err" indicates whether failure to load is worthy of emitting an
1612  * error message. In some cases (eg with embedded references) we
1613  * speculatively load, just incase, and silently ignore errors.
1614  *
1615  * Returns the refs_t struct on success (maybe newly allocated);
1616  *         NULL on failure
1617  */
refs_load_fai(refs_t * r_orig,char * fn,int is_err)1618 static refs_t *refs_load_fai(refs_t *r_orig, char *fn, int is_err) {
1619     struct stat sb;
1620     FILE *fp = NULL;
1621     char fai_fn[PATH_MAX];
1622     char line[8192];
1623     refs_t *r = r_orig;
1624     size_t fn_l = strlen(fn);
1625     int id = 0, id_alloc = 0;
1626 
1627     RP("refs_load_fai %s\n", fn);
1628 
1629     if (!r)
1630 	if (!(r = refs_create()))
1631 	    goto err;
1632 
1633     /* Open reference, for later use */
1634     if (stat(fn, &sb) != 0) {
1635 	if (is_err)
1636 	    perror(fn);
1637 	goto err;
1638     }
1639 
1640     if (r->fp)
1641 	if (bgzf_close(r->fp) != 0)
1642 	    goto err;
1643     r->fp = NULL;
1644 
1645     if (!(r->fn = string_dup(r->pool, fn)))
1646 	goto err;
1647 
1648     if (fn_l > 4 && strcmp(&fn[fn_l-4], ".fai") == 0)
1649 	r->fn[fn_l-4] = 0;
1650 
1651     if (!(r->fp = bgzf_open_ref(r->fn, "r", 0)))
1652 	goto err;
1653 
1654     /* Parse .fai file and load meta-data */
1655     sprintf(fai_fn, "%.*s.fai", PATH_MAX-5, r->fn);
1656 
1657     if (stat(fai_fn, &sb) != 0) {
1658 	if (is_err)
1659 	    perror(fai_fn);
1660 	goto err;
1661     }
1662     if (!(fp = fopen(fai_fn, "r"))) {
1663 	if (is_err)
1664 	    perror(fai_fn);
1665 	goto err;
1666     }
1667     while (fgets(line, 8192, fp) != NULL) {
1668 	ref_entry *e = malloc(sizeof(*e));
1669 	char *cp;
1670 	int n;
1671 	khint_t k;
1672 
1673 	if (!e)
1674 	    return NULL;
1675 
1676 	// id
1677 	for (cp = line; *cp && !isspace_c(*cp); cp++)
1678 	    ;
1679 	*cp++ = 0;
1680 	e->name = string_dup(r->pool, line);
1681 
1682 	// length
1683 	while (*cp && isspace_c(*cp))
1684 	    cp++;
1685 	e->length = strtoll(cp, &cp, 10);
1686 
1687 	// offset
1688 	while (*cp && isspace_c(*cp))
1689 	    cp++;
1690 	e->offset = strtoll(cp, &cp, 10);
1691 
1692 	// bases per line
1693 	while (*cp && isspace_c(*cp))
1694 	    cp++;
1695 	e->bases_per_line = strtol(cp, &cp, 10);
1696 
1697 	// line length
1698 	while (*cp && isspace_c(*cp))
1699 	    cp++;
1700 	e->line_length = strtol(cp, &cp, 10);
1701 
1702 	// filename
1703 	e->fn = r->fn;
1704 
1705 	e->count = 0;
1706 	e->seq = NULL;
1707 	e->mf = NULL;
1708 	e->is_md5 = 0;
1709 
1710 	k = kh_put(refs, r->h_meta, e->name, &n);
1711 	if (-1 == n)  {
1712 	    free(e);
1713 	    return NULL;
1714 	}
1715 
1716 	if (n) {
1717 	    kh_val(r->h_meta, k) = e;
1718 	} else {
1719 	    ref_entry *re = kh_val(r->h_meta, k);
1720 	    if (re && (re->count != 0 || re->length != 0)) {
1721 		/* Keep old */
1722 		free(e);
1723 	    } else {
1724 		/* Replace old */
1725 		if (re)
1726 		    free(re);
1727 		kh_val(r->h_meta, k) = e;
1728 	    }
1729 	}
1730 
1731 	if (id >= id_alloc) {
1732 	    int x;
1733 
1734 	    id_alloc = id_alloc ?id_alloc*2 : 16;
1735 	    r->ref_id = realloc(r->ref_id, id_alloc * sizeof(*r->ref_id));
1736 
1737 	    for (x = id; x < id_alloc; x++)
1738 		r->ref_id[x] = NULL;
1739 	}
1740 	r->ref_id[id] = e;
1741 	r->nref = ++id;
1742     }
1743 
1744     fclose(fp);
1745     return r;
1746 
1747  err:
1748     if (fp)
1749 	fclose(fp);
1750 
1751     if (!r_orig)
1752 	refs_free(r);
1753 
1754     return NULL;
1755 }
1756 
1757 /*
1758  * Verifies that the CRAM @SQ lines and .fai files match.
1759  */
sanitise_SQ_lines(cram_fd * fd)1760 static void sanitise_SQ_lines(cram_fd *fd) {
1761     int i;
1762 
1763     if (!fd->header)
1764 	return;
1765 
1766     if (!fd->refs || !fd->refs->h_meta)
1767 	return;
1768 
1769     for (i = 0; i < fd->header->nref; i++) {
1770 	char *name = fd->header->ref[i].name;
1771 	khint_t k = kh_get(refs, fd->refs->h_meta, name);
1772 	ref_entry *r;
1773 
1774 	// We may have @SQ lines which have no known .fai, but do not
1775 	// in themselves pose a problem because they are unused in the file.
1776 	if (k == kh_end(fd->refs->h_meta))
1777 	    continue;
1778 
1779 	if (!(r = (ref_entry *)kh_val(fd->refs->h_meta, k)))
1780 	    continue;
1781 
1782 	if (r->length && r->length != fd->header->ref[i].len) {
1783 	    assert(strcmp(r->name, fd->header->ref[i].name) == 0);
1784 
1785 	    // Should we also check MD5sums here to ensure the correct
1786 	    // reference was given?
1787 	    hts_log_warning("Header @SQ length mismatch for ref %s, %d vs %d",
1788 		    r->name, fd->header->ref[i].len, (int)r->length);
1789 
1790 	    // Fixing the parsed @SQ header will make MD:Z: strings work
1791 	    // and also stop it producing N for the sequence.
1792 	    fd->header->ref[i].len = r->length;
1793 	}
1794     }
1795 }
1796 
1797 /*
1798  * Indexes references by the order they appear in a BAM file. This may not
1799  * necessarily be the same order they appear in the fasta reference file.
1800  *
1801  * Returns 0 on success
1802  *        -1 on failure
1803  */
refs2id(refs_t * r,SAM_hdr * h)1804 int refs2id(refs_t *r, SAM_hdr *h) {
1805     int i;
1806 
1807     if (r->ref_id)
1808 	free(r->ref_id);
1809     if (r->last)
1810 	r->last = NULL;
1811 
1812     r->ref_id = calloc(h->nref, sizeof(*r->ref_id));
1813     if (!r->ref_id)
1814 	return -1;
1815 
1816     r->nref = h->nref;
1817     for (i = 0; i < h->nref; i++) {
1818 	khint_t k = kh_get(refs, r->h_meta, h->ref[i].name);
1819 	if (k != kh_end(r->h_meta)) {
1820 	    r->ref_id[i] = kh_val(r->h_meta, k);
1821 	} else {
1822 	    hts_log_warning("Unable to find ref name '%s'", h->ref[i].name);
1823 	}
1824     }
1825 
1826     return 0;
1827 }
1828 
1829 /*
1830  * Generates refs_t entries based on @SQ lines in the header.
1831  * Returns 0 on success
1832  *         -1 on failure
1833  */
refs_from_header(refs_t * r,cram_fd * fd,SAM_hdr * h)1834 static int refs_from_header(refs_t *r, cram_fd *fd, SAM_hdr *h) {
1835     int i, j;
1836 
1837     if (!r)
1838 	return -1;
1839 
1840     if (!h || h->nref == 0)
1841 	return 0;
1842 
1843     //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode);
1844 
1845     /* Existing refs are fine, as long as they're compatible with the hdr. */
1846     if (!(r->ref_id = realloc(r->ref_id, (r->nref + h->nref) * sizeof(*r->ref_id))))
1847 	return -1;
1848 
1849     /* Copy info from h->ref[i] over to r */
1850     for (i = 0, j = r->nref; i < h->nref; i++) {
1851 	SAM_hdr_type *ty;
1852 	SAM_hdr_tag *tag;
1853 	khint_t k;
1854 	int n;
1855 
1856 	k = kh_get(refs, r->h_meta, h->ref[i].name);
1857 	if (k != kh_end(r->h_meta))
1858 	    // Ref already known about
1859 	    continue;
1860 
1861 	if (!(r->ref_id[j] = calloc(1, sizeof(ref_entry))))
1862 	    return -1;
1863 
1864 	if (!h->ref[i].name)
1865 	    return -1;
1866 
1867 	r->ref_id[j]->name = string_dup(r->pool, h->ref[i].name);
1868 	r->ref_id[j]->length = 0; // marker for not yet loaded
1869 
1870 	/* Initialise likely filename if known */
1871 	if ((ty = sam_hdr_find(h, "SQ", "SN", h->ref[i].name))) {
1872 	    if ((tag = sam_hdr_find_key(h, ty, "M5", NULL))) {
1873 		r->ref_id[j]->fn = string_dup(r->pool, tag->str+3);
1874 		//fprintf(stderr, "Tagging @SQ %s / %s\n", r->ref_id[h]->name, r->ref_id[h]->fn);
1875 	    }
1876 	}
1877 
1878 	k = kh_put(refs, r->h_meta, r->ref_id[j]->name, &n);
1879 	if (n <= 0) // already exists or error
1880 	    return -1;
1881 	kh_val(r->h_meta, k) = r->ref_id[j];
1882 
1883 	j++;
1884     }
1885     r->nref = j;
1886 
1887     return 0;
1888 }
1889 
1890 /*
1891  * Attaches a header to a cram_fd.
1892  *
1893  * This should be used when creating a new cram_fd for writing where
1894  * we have an SAM_hdr already constructed (eg from a file we've read
1895  * in).
1896  */
cram_set_header(cram_fd * fd,SAM_hdr * hdr)1897 int cram_set_header(cram_fd *fd, SAM_hdr *hdr) {
1898     if (fd->header)
1899 	sam_hdr_free(fd->header);
1900     fd->header = hdr;
1901     return refs_from_header(fd->refs, fd, hdr);
1902 }
1903 
1904 /*
1905  * Converts a directory and a filename into an expanded path, replacing %s
1906  * in directory with the filename and %[0-9]+s with portions of the filename
1907  * Any remaining parts of filename are added to the end with /%s.
1908  */
expand_cache_path(char * path,char * dir,char * fn)1909 void expand_cache_path(char *path, char *dir, char *fn) {
1910     char *cp;
1911 
1912     while ((cp = strchr(dir, '%'))) {
1913 	strncpy(path, dir, cp-dir);
1914 	path += cp-dir;
1915 
1916 	if (*++cp == 's') {
1917 	    strcpy(path, fn);
1918 	    path += strlen(fn);
1919 	    fn += strlen(fn);
1920 	    cp++;
1921 	} else if (*cp >= '0' && *cp <= '9') {
1922 	    char *endp;
1923 	    long l;
1924 
1925 	    l = strtol(cp, &endp, 10);
1926 	    l = MIN(l, strlen(fn));
1927 	    if (*endp == 's') {
1928 		strncpy(path, fn, l);
1929 		path += l;
1930 		fn += l;
1931 		*path = 0;
1932 		cp = endp+1;
1933 	    } else {
1934 		*path++ = '%';
1935 		*path++ = *cp++;
1936 	    }
1937 	} else {
1938 	    *path++ = '%';
1939 	    *path++ = *cp++;
1940 	}
1941 	dir = cp;
1942     }
1943     strcpy(path, dir);
1944     path += strlen(dir);
1945     if (*fn && path[-1] != '/')
1946 	*path++ = '/';
1947     strcpy(path, fn);
1948 }
1949 
1950 /*
1951  * Make the directory containing path and any prefix directories.
1952  */
mkdir_prefix(char * path,int mode)1953 void mkdir_prefix(char *path, int mode) {
1954     char *cp = strrchr(path, '/');
1955     if (!cp)
1956 	return;
1957 
1958     *cp = 0;
1959     if (is_directory(path)) {
1960 	*cp = '/';
1961 	return;
1962     }
1963 
1964     if (mkdir(path, mode) == 0) {
1965 	chmod(path, mode);
1966 	*cp = '/';
1967 	return;
1968     }
1969 
1970     mkdir_prefix(path, mode);
1971     mkdir(path, mode);
1972     chmod(path, mode);
1973     *cp = '/';
1974 }
1975 
1976 /*
1977  * Return the cache directory to use, based on the first of these
1978  * environment variables to be set to a non-empty value.
1979  */
get_cache_basedir(const char ** extra)1980 static const char *get_cache_basedir(const char **extra) {
1981     char *base;
1982 
1983     *extra = "";
1984 
1985     base = getenv("XDG_CACHE_HOME");
1986     if (base && *base) return base;
1987 
1988     base = getenv("HOME");
1989     if (base && *base) { *extra = "/.cache"; return base; }
1990 
1991     base = getenv("TMPDIR");
1992     if (base && *base) return base;
1993 
1994     base = getenv("TEMP");
1995     if (base && *base) return base;
1996 
1997     return "/tmp";
1998 }
1999 
2000 /*
2001  * Return an integer representation of pthread_self().
2002  */
get_int_threadid()2003 static unsigned get_int_threadid() {
2004     pthread_t pt = pthread_self();
2005     unsigned char *s = (unsigned char *) &pt;
2006     size_t i;
2007     unsigned h = 0;
2008     for (i = 0; i < sizeof(pthread_t); i++)
2009 	h = (h << 5) - h + s[i];
2010     return h;
2011 }
2012 
2013 /*
2014  * Queries the M5 string from the header and attempts to populate the
2015  * reference from this using the REF_PATH environment.
2016  *
2017  * Returns 0 on sucess
2018  *        -1 on failure
2019  */
cram_populate_ref(cram_fd * fd,int id,ref_entry * r)2020 static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
2021     char *ref_path = getenv("REF_PATH");
2022     SAM_hdr_type *ty;
2023     SAM_hdr_tag *tag;
2024     char path[PATH_MAX], path_tmp[PATH_MAX];
2025     char cache[PATH_MAX], cache_root[PATH_MAX];
2026     char *local_cache = getenv("REF_CACHE");
2027     mFILE *mf;
2028     int local_path = 0;
2029 
2030     hts_log_info("Running cram_populate_ref on fd %p, id %d", (void *)fd, id);
2031 
2032     cache_root[0] = '\0';
2033 
2034     if (!ref_path || *ref_path == '\0') {
2035 	/*
2036 	 * If we have no ref path, we use the EBI server.
2037 	 * However to avoid spamming it we require a local ref cache too.
2038 	 */
2039 	ref_path = "http://www.ebi.ac.uk:80/ena/cram/md5/%s";
2040 	if (!local_cache || *local_cache == '\0') {
2041 	    const char *extra;
2042 	    const char *base = get_cache_basedir(&extra);
2043 	    snprintf(cache_root, PATH_MAX, "%s%s/hts-ref", base, extra);
2044 	    snprintf(cache,PATH_MAX, "%s%s/hts-ref/%%2s/%%2s/%%s", base, extra);
2045 	    local_cache = cache;
2046 	    hts_log_info("Populating local cache: %s", local_cache);
2047 	}
2048     }
2049 
2050     if (!r->name)
2051 	return -1;
2052 
2053     if (!(ty = sam_hdr_find(fd->header, "SQ", "SN", r->name)))
2054 	return -1;
2055 
2056     if (!(tag = sam_hdr_find_key(fd->header, ty, "M5", NULL)))
2057 	goto no_M5;
2058 
2059     hts_log_info("Querying ref %s", tag->str+3);
2060 
2061     /* Use cache if available */
2062     if (local_cache && *local_cache) {
2063 	expand_cache_path(path, local_cache, tag->str+3);
2064 	local_path = 1;
2065     }
2066 
2067 #ifndef HAVE_MMAP
2068     char *path2;
2069     /* Search local files in REF_PATH; we can open them and return as above */
2070     if (!local_path && (path2 = find_path(tag->str+3, ref_path))) {
2071 	strncpy(path, path2, PATH_MAX);
2072 	free(path2);
2073 	if (is_file(path)) // incase it's too long
2074 	    local_path = 1;
2075     }
2076 #endif
2077 
2078     /* Found via REF_CACHE or local REF_PATH file */
2079     if (local_path) {
2080 	struct stat sb;
2081 	BGZF *fp;
2082 
2083 	if (0 == stat(path, &sb) && (fp = bgzf_open(path, "r"))) {
2084 	    r->length = sb.st_size;
2085 	    r->offset = r->line_length = r->bases_per_line = 0;
2086 
2087 	    r->fn = string_dup(fd->refs->pool, path);
2088 
2089 	    if (fd->refs->fp)
2090 		if (bgzf_close(fd->refs->fp) != 0)
2091 		    return -1;
2092 	    fd->refs->fp = fp;
2093 	    fd->refs->fn = r->fn;
2094 	    r->is_md5 = 1;
2095 
2096 	    // Fall back to cram_get_ref() where it'll do the actual
2097 	    // reading of the file.
2098 	    return 0;
2099 	}
2100     }
2101 
2102 
2103     /* Otherwise search full REF_PATH; slower as loads entire file */
2104     if ((mf = open_path_mfile(tag->str+3, ref_path, NULL))) {
2105 	size_t sz;
2106 	r->seq = mfsteal(mf, &sz);
2107 	if (r->seq) {
2108 	    r->mf = NULL;
2109 	} else {
2110 	    // keep mf around as we couldn't detach
2111 	    r->seq = mf->data;
2112 	    r->mf = mf;
2113 	}
2114 	r->length = sz;
2115 	r->is_md5 = 1;
2116     } else {
2117 	refs_t *refs;
2118 	char *fn;
2119 
2120     no_M5:
2121 	/* Failed to find in search path or M5 cache, see if @SQ UR: tag? */
2122 	if (!(tag = sam_hdr_find_key(fd->header, ty, "UR", NULL)))
2123 	    return -1;
2124 
2125 	fn = (strncmp(tag->str+3, "file:", 5) == 0)
2126 	    ? tag->str+8
2127 	    : tag->str+3;
2128 
2129 	if (fd->refs->fp) {
2130 	    if (bgzf_close(fd->refs->fp) != 0)
2131 		return -1;
2132 	    fd->refs->fp = NULL;
2133 	}
2134 	if (!(refs = refs_load_fai(fd->refs, fn, 0)))
2135 	    return -1;
2136 	sanitise_SQ_lines(fd);
2137 
2138 	fd->refs = refs;
2139 	if (fd->refs->fp) {
2140 	    if (bgzf_close(fd->refs->fp) != 0)
2141 		return -1;
2142 	    fd->refs->fp = NULL;
2143 	}
2144 
2145 	if (!fd->refs->fn)
2146 	    return -1;
2147 
2148 	if (-1 == refs2id(fd->refs, fd->header))
2149 	    return -1;
2150 	if (!fd->refs->ref_id || !fd->refs->ref_id[id])
2151 	    return -1;
2152 
2153 	// Local copy already, so fall back to cram_get_ref().
2154 	return 0;
2155     }
2156 
2157     /* Populate the local disk cache if required */
2158     if (local_cache && *local_cache) {
2159         int pid = (int) getpid();
2160         unsigned thrid = get_int_threadid();
2161         hFILE *fp;
2162 
2163         if (*cache_root && !is_directory(cache_root)) {
2164             hts_log_warning("Creating reference cache directory %s\n"
2165                 "This may become large; see the samtools(1) manual page REF_CACHE discussion",
2166                 cache_root);
2167         }
2168 
2169 	expand_cache_path(path, local_cache, tag->str+3);
2170 	hts_log_info("Writing cache file '%s'", path);
2171 	mkdir_prefix(path, 01777);
2172 
2173 	do {
2174 	    // Attempt to further uniquify the temporary filename
2175 	    unsigned t = ((unsigned) time(NULL)) ^ ((unsigned) clock());
2176 	    thrid++; // Ensure filename changes even if time/clock haven't
2177 
2178 	    sprintf(path_tmp, "%s.tmp_%d_%u_%u", path, pid, thrid, t);
2179 	    fp = hopen(path_tmp, "wx");
2180 	} while (fp == NULL && errno == EEXIST);
2181 	if (!fp) {
2182 	    perror(path_tmp);
2183 
2184 	    // Not fatal - we have the data already so keep going.
2185 	    return 0;
2186 	}
2187 
2188 	// Check md5sum
2189 	hts_md5_context *md5;
2190 	char unsigned md5_buf1[16];
2191 	char md5_buf2[33];
2192 
2193 	if (!(md5 = hts_md5_init())) {
2194 	    hclose_abruptly(fp);
2195 	    unlink(path_tmp);
2196 	    return -1;
2197 	}
2198 	hts_md5_update(md5, r->seq, r->length);
2199 	hts_md5_final(md5_buf1, md5);
2200 	hts_md5_destroy(md5);
2201 	hts_md5_hex(md5_buf2, md5_buf1);
2202 
2203 	if (strncmp(tag->str+3, md5_buf2, 32) != 0) {
2204 	    hts_log_error("Mismatching md5sum for downloaded reference");
2205 	    hclose_abruptly(fp);
2206 	    unlink(path_tmp);
2207 	    return -1;
2208 	}
2209 
2210 	if (hwrite(fp, r->seq, r->length) != r->length) {
2211 	    perror(path);
2212 	}
2213 	if (hclose(fp) < 0) {
2214 	    unlink(path_tmp);
2215 	} else {
2216 	    if (0 == chmod(path_tmp, 0444))
2217 		rename(path_tmp, path);
2218 	    else
2219 		unlink(path_tmp);
2220 	}
2221     }
2222 
2223     return 0;
2224 }
2225 
cram_ref_incr_locked(refs_t * r,int id)2226 static void cram_ref_incr_locked(refs_t *r, int id) {
2227     RP("%d INC REF %d, %d %p\n", gettid(), id, (int)(id>=0?r->ref_id[id]->count+1:-999), id>=0?r->ref_id[id]->seq:(char *)1);
2228 
2229     if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq)
2230 	return;
2231 
2232     if (r->last_id == id)
2233 	r->last_id = -1;
2234 
2235     ++r->ref_id[id]->count;
2236 }
2237 
cram_ref_incr(refs_t * r,int id)2238 void cram_ref_incr(refs_t *r, int id) {
2239     pthread_mutex_lock(&r->lock);
2240     cram_ref_incr_locked(r, id);
2241     pthread_mutex_unlock(&r->lock);
2242 }
2243 
cram_ref_decr_locked(refs_t * r,int id)2244 static void cram_ref_decr_locked(refs_t *r, int id) {
2245     RP("%d DEC REF %d, %d %p\n", gettid(), id, (int)(id>=0?r->ref_id[id]->count-1:-999), id>=0?r->ref_id[id]->seq:(char *)1);
2246 
2247     if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq) {
2248 	assert(r->ref_id[id]->count >= 0);
2249 	return;
2250     }
2251 
2252     if (--r->ref_id[id]->count <= 0) {
2253 	assert(r->ref_id[id]->count == 0);
2254 	if (r->last_id >= 0) {
2255 	    if (r->ref_id[r->last_id]->count <= 0 &&
2256 		r->ref_id[r->last_id]->seq) {
2257 		RP("%d FREE REF %d (%p)\n", gettid(),
2258 		   r->last_id, r->ref_id[r->last_id]->seq);
2259 		ref_entry_free_seq(r->ref_id[r->last_id]);
2260 		r->ref_id[r->last_id]->length = 0;
2261 	    }
2262 	}
2263 	r->last_id = id;
2264     }
2265 }
2266 
cram_ref_decr(refs_t * r,int id)2267 void cram_ref_decr(refs_t *r, int id) {
2268     pthread_mutex_lock(&r->lock);
2269     cram_ref_decr_locked(r, id);
2270     pthread_mutex_unlock(&r->lock);
2271 }
2272 
2273 /*
2274  * Used by cram_ref_load and cram_ref_get. The file handle will have
2275  * already been opened, so we can catch it. The ref_entry *e informs us
2276  * of whether this is a multi-line fasta file or a raw MD5 style file.
2277  * Either way we create a single contiguous sequence.
2278  *
2279  * Returns all or part of a reference sequence on success (malloced);
2280  *         NULL on failure.
2281  */
load_ref_portion(BGZF * fp,ref_entry * e,int start,int end)2282 static char *load_ref_portion(BGZF *fp, ref_entry *e, int start, int end) {
2283     off_t offset, len;
2284     char *seq;
2285 
2286     if (end < start)
2287 	end = start;
2288 
2289     /*
2290      * Compute locations in file. This is trivial for the MD5 files, but
2291      * is still necessary for the fasta variants.
2292      */
2293     offset = e->line_length
2294 	? e->offset + (start-1)/e->bases_per_line * e->line_length +
2295 	  (start-1) % e->bases_per_line
2296 	: start-1;
2297 
2298     len = (e->line_length
2299 	   ? e->offset + (end-1)/e->bases_per_line * e->line_length +
2300 	     (end-1) % e->bases_per_line
2301 	   : end-1) - offset + 1;
2302 
2303     if (bgzf_useek(fp, offset, SEEK_SET) < 0) {
2304 	perror("bgzf_useek() on reference file");
2305 	return NULL;
2306     }
2307 
2308     if (len == 0 || !(seq = malloc(len))) {
2309 	return NULL;
2310     }
2311 
2312     if (len != bgzf_read(fp, seq, len)) {
2313 	perror("bgzf_read() on reference file");
2314 	free(seq);
2315 	return NULL;
2316     }
2317 
2318     /* Strip white-space if required. */
2319     if (len != end-start+1) {
2320 	int i, j;
2321 	char *cp = seq;
2322 	char *cp_to;
2323 
2324 	for (i = j = 0; i < len; i++) {
2325 	    if (cp[i] >= '!' && cp[i] <= '~')
2326 		cp[j++] = toupper_c(cp[i]);
2327 	}
2328 	cp_to = cp+j;
2329 
2330 	if (cp_to - seq != end-start+1) {
2331 	    hts_log_error("Malformed reference file");
2332 	    free(seq);
2333 	    return NULL;
2334 	}
2335     } else {
2336 	int i;
2337 	for (i = 0; i < len; i++) {
2338 	    seq[i] = toupper_c(seq[i]);
2339 	}
2340     }
2341 
2342     return seq;
2343 }
2344 
2345 /*
2346  * Load the entire reference 'id'.
2347  * This also increments the reference count by 1.
2348  *
2349  * Returns ref_entry on success;
2350  *         NULL on failure
2351  */
cram_ref_load(refs_t * r,int id,int is_md5)2352 ref_entry *cram_ref_load(refs_t *r, int id, int is_md5) {
2353     ref_entry *e = r->ref_id[id];
2354     int start = 1, end = e->length;
2355     char *seq;
2356 
2357     if (e->seq) {
2358 	return e;
2359     }
2360 
2361     assert(e->count == 0);
2362 
2363     if (r->last) {
2364 #ifdef REF_DEBUG
2365 	int idx = 0;
2366 	for (idx = 0; idx < r->nref; idx++)
2367 	    if (r->last == r->ref_id[idx])
2368 		break;
2369 	RP("%d cram_ref_load DECR %d\n", gettid(), idx);
2370 #endif
2371 	assert(r->last->count > 0);
2372 	if (--r->last->count <= 0) {
2373 	    RP("%d FREE REF %d (%p)\n", gettid(), id, r->ref_id[id]->seq);
2374 	    if (r->last->seq)
2375 		ref_entry_free_seq(r->last);
2376 	}
2377     }
2378 
2379     /* Open file if it's not already the current open reference */
2380     if (strcmp(r->fn, e->fn) || r->fp == NULL) {
2381 	if (r->fp)
2382 	    if (bgzf_close(r->fp) != 0)
2383 		return NULL;
2384 	r->fn = e->fn;
2385 	if (!(r->fp = bgzf_open_ref(r->fn, "r", is_md5)))
2386 	    return NULL;
2387     }
2388 
2389     RP("%d Loading ref %d (%d..%d)\n", gettid(), id, start, end);
2390 
2391     if (!(seq = load_ref_portion(r->fp, e, start, end))) {
2392 	return NULL;
2393     }
2394 
2395     RP("%d Loaded ref %d (%d..%d) = %p\n", gettid(), id, start, end, seq);
2396 
2397     RP("%d INC REF %d, %d\n", gettid(), id, (int)(e->count+1));
2398     e->seq = seq;
2399     e->mf = NULL;
2400     e->count++;
2401 
2402     /*
2403      * Also keep track of last used ref so incr/decr loops on the same
2404      * sequence don't cause load/free loops.
2405      */
2406     RP("%d cram_ref_load INCR %d => %d\n", gettid(), id, e->count+1);
2407     r->last = e;
2408     e->count++;
2409 
2410     return e;
2411 }
2412 
2413 /*
2414  * Returns a portion of a reference sequence from start to end inclusive.
2415  * The returned pointer is owned by either the cram_file fd or by the
2416  * internal refs_t structure and should not be freed  by the caller.
2417  *
2418  * The difference is whether or not this refs_t is in use by just the one
2419  * cram_fd or by multiples, or whether we have multiple threads accessing
2420  * references. In either case fd->shared will be true and we start using
2421  * reference counting to track the number of users of a specific reference
2422  * sequence.
2423  *
2424  * Otherwise the ref seq returned is allocated as part of cram_fd itself
2425  * and will be freed up on the next call to cram_get_ref or cram_close.
2426  *
2427  * To return the entire reference sequence, specify start as 1 and end
2428  * as 0.
2429  *
2430  * To cease using a reference, call cram_ref_decr().
2431  *
2432  * Returns reference on success,
2433  *         NULL on failure
2434  */
cram_get_ref(cram_fd * fd,int id,int start,int end)2435 char *cram_get_ref(cram_fd *fd, int id, int start, int end) {
2436     ref_entry *r;
2437     char *seq;
2438     int ostart = start;
2439 
2440     if (id == -1)
2441 	return NULL;
2442 
2443     /* FIXME: axiomatic query of r->seq being true?
2444      * Or shortcut for unsorted data where we load once and never free?
2445      */
2446 
2447     //fd->shared_ref = 1; // hard code for now to simplify things
2448 
2449     pthread_mutex_lock(&fd->ref_lock);
2450 
2451     RP("%d cram_get_ref on fd %p, id %d, range %d..%d\n", gettid(), fd, id, start, end);
2452 
2453     /*
2454      * Unsorted data implies we want to fetch an entire reference at a time.
2455      * We just deal with this at the moment by claiming we're sharing
2456      * references instead, which has the same requirement.
2457      */
2458     if (fd->unsorted)
2459 	fd->shared_ref = 1;
2460 
2461 
2462     /* Sanity checking: does this ID exist? */
2463     if (id >= fd->refs->nref) {
2464 	hts_log_error("No reference found for id %d", id);
2465 	pthread_mutex_unlock(&fd->ref_lock);
2466 	return NULL;
2467     }
2468 
2469     if (!fd->refs || !fd->refs->ref_id[id]) {
2470 	hts_log_error("No reference found for id %d", id);
2471 	pthread_mutex_unlock(&fd->ref_lock);
2472 	return NULL;
2473     }
2474 
2475     if (!(r = fd->refs->ref_id[id])) {
2476 	hts_log_error("No reference found for id %d", id);
2477 	pthread_mutex_unlock(&fd->ref_lock);
2478 	return NULL;
2479     }
2480 
2481 
2482     /*
2483      * It has an entry, but may not have been populated yet.
2484      * Any manually loaded .fai files have their lengths known.
2485      * A ref entry computed from @SQ lines (M5 or UR field) will have
2486      * r->length == 0 unless it's been loaded once and verified that we have
2487      * an on-disk filename for it.
2488      *
2489      * 19 Sep 2013: Moved the lock here as the cram_populate_ref code calls
2490      * open_path_mfile and libcurl, which isn't multi-thread safe unless I
2491      * rewrite my code to have one curl handle per thread.
2492      */
2493     pthread_mutex_lock(&fd->refs->lock);
2494     if (r->length == 0) {
2495 	if (cram_populate_ref(fd, id, r) == -1) {
2496 	    hts_log_error("Failed to populate reference for id %d", id);
2497 	    pthread_mutex_unlock(&fd->refs->lock);
2498 	    pthread_mutex_unlock(&fd->ref_lock);
2499 	    return NULL;
2500 	}
2501 	r = fd->refs->ref_id[id];
2502 	if (fd->unsorted)
2503 	    cram_ref_incr_locked(fd->refs, id);
2504     }
2505 
2506 
2507     /*
2508      * We now know that we the filename containing the reference, so check
2509      * for limits. If it's over half the reference we'll load all of it in
2510      * memory as this will speed up subsequent calls.
2511      */
2512     if (end < 1)
2513 	end = r->length;
2514     if (end >= r->length)
2515 	end  = r->length;
2516     if (start < 1)
2517 	return NULL;
2518 
2519     if (end - start >= 0.5*r->length || fd->shared_ref) {
2520 	start = 1;
2521 	end = r->length;
2522     }
2523 
2524     /*
2525      * Maybe we have it cached already? If so use it.
2526      *
2527      * Alternatively if we don't have the sequence but we're sharing
2528      * references and/or are asking for the entire length of it, then
2529      * load the full reference into the refs structure and return
2530      * a pointer to that one instead.
2531      */
2532     if (fd->shared_ref || r->seq || (start == 1 && end == r->length)) {
2533 	char *cp;
2534 
2535 	if (id >= 0) {
2536 	    if (r->seq) {
2537 		cram_ref_incr_locked(fd->refs, id);
2538 	    } else {
2539 		ref_entry *e;
2540 		if (!(e = cram_ref_load(fd->refs, id, r->is_md5))) {
2541 		    pthread_mutex_unlock(&fd->refs->lock);
2542 		    pthread_mutex_unlock(&fd->ref_lock);
2543 		    return NULL;
2544 		}
2545 
2546 		/* unsorted data implies cache ref indefinitely, to avoid
2547 		 * continually loading and unloading.
2548 		 */
2549 		if (fd->unsorted)
2550 		    cram_ref_incr_locked(fd->refs, id);
2551 	    }
2552 
2553 	    fd->ref = NULL; /* We never access it directly */
2554 	    fd->ref_start = 1;
2555 	    fd->ref_end   = r->length;
2556 	    fd->ref_id    = id;
2557 
2558 	    cp = fd->refs->ref_id[id]->seq + ostart-1;
2559 	} else {
2560 	    fd->ref = NULL;
2561 	    cp = NULL;
2562 	}
2563 
2564 	RP("%d cram_get_ref returning for id %d, count %d\n", gettid(), id, (int)r->count);
2565 
2566 	pthread_mutex_unlock(&fd->refs->lock);
2567 	pthread_mutex_unlock(&fd->ref_lock);
2568 	return cp;
2569     }
2570 
2571     /*
2572      * Otherwise we're not sharing, we don't have a copy of it already and
2573      * we're only asking for a small portion of it.
2574      *
2575      * In this case load up just that segment ourselves, freeing any old
2576      * small segments in the process.
2577      */
2578 
2579     /* Unmapped ref ID */
2580     if (id < 0) {
2581 	if (fd->ref_free) {
2582 	    free(fd->ref_free);
2583 	    fd->ref_free = NULL;
2584 	}
2585 	fd->ref = NULL;
2586 	fd->ref_id = id;
2587 	pthread_mutex_unlock(&fd->refs->lock);
2588 	pthread_mutex_unlock(&fd->ref_lock);
2589 	return NULL;
2590     }
2591 
2592     /* Open file if it's not already the current open reference */
2593     if (strcmp(fd->refs->fn, r->fn) || fd->refs->fp == NULL) {
2594 	if (fd->refs->fp)
2595 	    if (bgzf_close(fd->refs->fp) != 0)
2596 		return NULL;
2597 	fd->refs->fn = r->fn;
2598 	if (!(fd->refs->fp = bgzf_open_ref(fd->refs->fn, "r", r->is_md5))) {
2599 	    pthread_mutex_unlock(&fd->refs->lock);
2600 	    pthread_mutex_unlock(&fd->ref_lock);
2601 	    return NULL;
2602 	}
2603     }
2604 
2605     if (!(fd->ref = load_ref_portion(fd->refs->fp, r, start, end))) {
2606 	pthread_mutex_unlock(&fd->refs->lock);
2607 	pthread_mutex_unlock(&fd->ref_lock);
2608 	return NULL;
2609     }
2610 
2611     if (fd->ref_free)
2612 	free(fd->ref_free);
2613 
2614     fd->ref_id    = id;
2615     fd->ref_start = start;
2616     fd->ref_end   = end;
2617     fd->ref_free = fd->ref;
2618     seq = fd->ref;
2619 
2620     pthread_mutex_unlock(&fd->refs->lock);
2621     pthread_mutex_unlock(&fd->ref_lock);
2622 
2623     return seq + ostart - start;
2624 }
2625 
2626 /*
2627  * If fd has been opened for reading, it may be permitted to specify 'fn'
2628  * as NULL and let the code auto-detect the reference by parsing the
2629  * SAM header @SQ lines.
2630  */
cram_load_reference(cram_fd * fd,char * fn)2631 int cram_load_reference(cram_fd *fd, char *fn) {
2632     int ret = 0;
2633 
2634     if (fn) {
2635 	fd->refs = refs_load_fai(fd->refs, fn,
2636 				 !(fd->embed_ref && fd->mode == 'r'));
2637 	fn = fd->refs ? fd->refs->fn : NULL;
2638 	if (!fn)
2639 	    ret = -1;
2640 	sanitise_SQ_lines(fd);
2641     }
2642     fd->ref_fn = fn;
2643 
2644     if ((!fd->refs || (fd->refs->nref == 0 && !fn)) && fd->header) {
2645 	if (fd->refs)
2646 	    refs_free(fd->refs);
2647 	if (!(fd->refs = refs_create()))
2648 	    return -1;
2649 	if (-1 == refs_from_header(fd->refs, fd, fd->header))
2650 	    return -1;
2651     }
2652 
2653     if (fd->header)
2654 	if (-1 == refs2id(fd->refs, fd->header))
2655 	    return -1;
2656 
2657     return ret;
2658 }
2659 
2660 /* ----------------------------------------------------------------------
2661  * Containers
2662  */
2663 
2664 /*
2665  * Creates a new container, specifying the maximum number of slices
2666  * and records permitted.
2667  *
2668  * Returns cram_container ptr on success
2669  *         NULL on failure
2670  */
cram_new_container(int nrec,int nslice)2671 cram_container *cram_new_container(int nrec, int nslice) {
2672     cram_container *c = calloc(1, sizeof(*c));
2673     enum cram_DS_ID id;
2674 
2675     if (!c)
2676 	return NULL;
2677 
2678     c->curr_ref = -2;
2679 
2680     c->max_c_rec = nrec * nslice;
2681     c->curr_c_rec = 0;
2682 
2683     c->max_rec = nrec;
2684     c->record_counter = 0;
2685     c->num_bases = 0;
2686     c->s_num_bases = 0;
2687 
2688     c->max_slice = nslice;
2689     c->curr_slice = 0;
2690 
2691     c->pos_sorted = 1;
2692     c->max_apos   = 0;
2693     c->multi_seq  = 0;
2694 
2695     c->bams = NULL;
2696 
2697     if (!(c->slices = (cram_slice **)calloc(nslice, sizeof(cram_slice *))))
2698 	goto err;
2699     c->slice = NULL;
2700 
2701     if (!(c->comp_hdr = cram_new_compression_header()))
2702 	goto err;
2703     c->comp_hdr_block = NULL;
2704 
2705     for (id = DS_RN; id < DS_TN; id++)
2706 	if (!(c->stats[id] = cram_stats_create())) goto err;
2707 
2708     //c->aux_B_stats = cram_stats_create();
2709 
2710     if (!(c->tags_used = kh_init(m_tagmap)))
2711 	goto err;
2712     c->refs_used = 0;
2713 
2714     return c;
2715 
2716  err:
2717     if (c) {
2718 	if (c->slices)
2719 	    free(c->slices);
2720 	free(c);
2721     }
2722     return NULL;
2723 }
2724 
cram_free_container(cram_container * c)2725 void cram_free_container(cram_container *c) {
2726     enum cram_DS_ID id;
2727     int i;
2728 
2729     if (!c)
2730 	return;
2731 
2732     if (c->refs_used)
2733 	free(c->refs_used);
2734 
2735     if (c->landmark)
2736 	free(c->landmark);
2737 
2738     if (c->comp_hdr)
2739 	cram_free_compression_header(c->comp_hdr);
2740 
2741     if (c->comp_hdr_block)
2742 	cram_free_block(c->comp_hdr_block);
2743 
2744     if (c->slices) {
2745 	for (i = 0; i < c->max_slice; i++)
2746 	    if (c->slices[i])
2747 		cram_free_slice(c->slices[i]);
2748 	free(c->slices);
2749     }
2750 
2751     for (id = DS_RN; id < DS_TN; id++)
2752 	if (c->stats[id]) cram_stats_free(c->stats[id]);
2753 
2754     //if (c->aux_B_stats) cram_stats_free(c->aux_B_stats);
2755 
2756     if (c->tags_used) {
2757 	khint_t k;
2758 
2759 	for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
2760 	    if (!kh_exist(c->tags_used, k))
2761 		continue;
2762 
2763 	    cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k);
2764 	    cram_codec *c = tm->codec;
2765 
2766 	    if (c) c->free(c);
2767 	    free(tm);
2768 	}
2769 
2770 	kh_destroy(m_tagmap, c->tags_used);
2771     }
2772 
2773     free(c);
2774 }
2775 
2776 /*
2777  * Reads a container header.
2778  *
2779  * Returns cram_container on success
2780  *         NULL on failure or no container left (fd->err == 0).
2781  */
cram_read_container(cram_fd * fd)2782 cram_container *cram_read_container(cram_fd *fd) {
2783     cram_container c2, *c;
2784     int i, s;
2785     size_t rd = 0;
2786     uint32_t crc = 0;
2787 
2788     fd->err = 0;
2789     fd->eof = 0;
2790 
2791     memset(&c2, 0, sizeof(c2));
2792     if (CRAM_MAJOR_VERS(fd->version) == 1) {
2793 	if ((s = itf8_decode_crc(fd, &c2.length, &crc)) == -1) {
2794 	    fd->eof = fd->empty_container ? 1 : 2;
2795 	    return NULL;
2796 	} else {
2797 	    rd+=s;
2798 	}
2799     } else {
2800 	uint32_t len;
2801 	if ((s = int32_decode(fd, &c2.length)) == -1) {
2802 	    if (CRAM_MAJOR_VERS(fd->version) == 2 &&
2803 		CRAM_MINOR_VERS(fd->version) == 0)
2804 		fd->eof = 1; // EOF blocks arrived in v2.1
2805 	    else
2806 		fd->eof = fd->empty_container ? 1 : 2;
2807 	    return NULL;
2808 	} else {
2809 	    rd+=s;
2810 	}
2811 	len = le_int4(c2.length);
2812 	crc = crc32(0L, (unsigned char *)&len, 4);
2813     }
2814     if ((s = itf8_decode_crc(fd, &c2.ref_seq_id, &crc))   == -1) return NULL; else rd+=s;
2815     if ((s = itf8_decode_crc(fd, &c2.ref_seq_start, &crc))== -1) return NULL; else rd+=s;
2816     if ((s = itf8_decode_crc(fd, &c2.ref_seq_span, &crc)) == -1) return NULL; else rd+=s;
2817     if ((s = itf8_decode_crc(fd, &c2.num_records, &crc))  == -1) return NULL; else rd+=s;
2818 
2819     if (CRAM_MAJOR_VERS(fd->version) == 1) {
2820 	c2.record_counter = 0;
2821 	c2.num_bases = 0;
2822     } else {
2823 	if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2824 	    if ((s = ltf8_decode_crc(fd, &c2.record_counter, &crc)) == -1)
2825 		return NULL;
2826 	    else
2827 		rd += s;
2828 	} else {
2829 	    int32_t i32;
2830 	    if ((s = itf8_decode_crc(fd, &i32, &crc)) == -1)
2831 		return NULL;
2832 	    else
2833 		rd += s;
2834 	    c2.record_counter = i32;
2835 	}
2836 
2837 	if ((s = ltf8_decode_crc(fd, &c2.num_bases, &crc))== -1)
2838 	    return NULL;
2839 	else
2840 	    rd += s;
2841     }
2842     if ((s = itf8_decode_crc(fd, &c2.num_blocks, &crc))   == -1) return NULL; else rd+=s;
2843     if ((s = itf8_decode_crc(fd, &c2.num_landmarks, &crc))== -1) return NULL; else rd+=s;
2844 
2845     if (c2.num_landmarks < 0 || c2.num_landmarks >= SIZE_MAX / sizeof(int32_t))
2846         return NULL;
2847 
2848     if (!(c = calloc(1, sizeof(*c))))
2849 	return NULL;
2850 
2851     *c = c2;
2852 
2853     if (!(c->landmark = malloc(c->num_landmarks * sizeof(int32_t))) &&
2854 	c->num_landmarks) {
2855 	fd->err = errno;
2856 	cram_free_container(c);
2857 	return NULL;
2858     }
2859     for (i = 0; i < c->num_landmarks; i++) {
2860 	if ((s = itf8_decode_crc(fd, &c->landmark[i], &crc)) == -1) {
2861 	    cram_free_container(c);
2862 	    return NULL;
2863 	} else {
2864 	    rd += s;
2865 	}
2866     }
2867 
2868     if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2869 	if (-1 == int32_decode(fd, (int32_t *)&c->crc32))
2870 	    return NULL;
2871 	else
2872 	    rd+=4;
2873 
2874 	if (crc != c->crc32) {
2875 	    hts_log_error("Container header CRC32 failure");
2876 	    cram_free_container(c);
2877 	    return NULL;
2878 	}
2879     }
2880 
2881     c->offset = rd;
2882     c->slices = NULL;
2883     c->curr_slice = 0;
2884     c->max_slice = c->num_landmarks;
2885     c->slice_rec = 0;
2886     c->curr_rec = 0;
2887     c->max_rec = 0;
2888 
2889     if (c->ref_seq_id == -2) {
2890 	c->multi_seq = 1;
2891 	fd->multi_seq = 1;
2892     }
2893 
2894     fd->empty_container =
2895 	(c->num_records == 0 &&
2896 	 c->ref_seq_id == -1 &&
2897 	 c->ref_seq_start == 0x454f46 /* EOF */) ? 1 : 0;
2898 
2899     return c;
2900 }
2901 
2902 
2903 /* MAXIMUM storage size needed for the container. */
cram_container_size(cram_container * c)2904 int cram_container_size(cram_container *c) {
2905     return 55 + 5*c->num_landmarks;
2906 }
2907 
2908 
2909 /*
2910  * Stores the container structure in dat and returns *size as the
2911  * number of bytes written to dat[].  The input size of dat is also
2912  * held in *size and should be initialised to cram_container_size(c).
2913  *
2914  * Returns 0 on success;
2915  *        -1 on failure
2916  */
cram_store_container(cram_fd * fd,cram_container * c,char * dat,int * size)2917 int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size)
2918 {
2919     unsigned char *cp = (unsigned char *)dat;
2920     int i;
2921 
2922     // Check the input buffer is large enough according to our stated
2923     // requirements. (NOTE: it may actually take less.)
2924     if (cram_container_size(c) > *size)
2925         return -1;
2926 
2927     if (CRAM_MAJOR_VERS(fd->version) == 1) {
2928 	cp += itf8_put((char*)cp, c->length);
2929     } else {
2930 	*(int32_t *)cp = le_int4(c->length);
2931 	cp += 4;
2932     }
2933     if (c->multi_seq) {
2934 	cp += itf8_put((char*)cp, -2);
2935 	cp += itf8_put((char*)cp, 0);
2936 	cp += itf8_put((char*)cp, 0);
2937     } else {
2938 	cp += itf8_put((char*)cp, c->ref_seq_id);
2939 	cp += itf8_put((char*)cp, c->ref_seq_start);
2940 	cp += itf8_put((char*)cp, c->ref_seq_span);
2941     }
2942     cp += itf8_put((char*)cp, c->num_records);
2943     if (CRAM_MAJOR_VERS(fd->version) == 2) {
2944 	cp += itf8_put((char*)cp, c->record_counter);
2945 	cp += ltf8_put((char*)cp, c->num_bases);
2946     } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2947 	cp += ltf8_put((char*)cp, c->record_counter);
2948 	cp += ltf8_put((char*)cp, c->num_bases);
2949     }
2950 
2951     cp += itf8_put((char*)cp, c->num_blocks);
2952     cp += itf8_put((char*)cp, c->num_landmarks);
2953     for (i = 0; i < c->num_landmarks; i++)
2954 	cp += itf8_put((char*)cp, c->landmark[i]);
2955 
2956     if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2957 	c->crc32 = crc32(0L, (uc *)dat, (char*)cp-dat);
2958 	cp[0] =  c->crc32        & 0xff;
2959 	cp[1] = (c->crc32 >>  8) & 0xff;
2960 	cp[2] = (c->crc32 >> 16) & 0xff;
2961 	cp[3] = (c->crc32 >> 24) & 0xff;
2962 	cp += 4;
2963     }
2964 
2965     *size = (char *)cp-dat; // actual used size
2966 
2967     return 0;
2968 }
2969 
2970 
2971 /*
2972  * Writes a container structure.
2973  *
2974  * Returns 0 on success
2975  *        -1 on failure
2976  */
cram_write_container(cram_fd * fd,cram_container * c)2977 int cram_write_container(cram_fd *fd, cram_container *c) {
2978     char buf_a[1024], *buf = buf_a;
2979     unsigned char *cp;
2980     int i;
2981 
2982     if (55 + c->num_landmarks * 5 >= 1024)
2983 	buf = malloc(55 + c->num_landmarks * 5);
2984     cp = (unsigned char *)buf;
2985 
2986     if (CRAM_MAJOR_VERS(fd->version) == 1) {
2987 	cp += itf8_put((char*)cp, c->length);
2988     } else {
2989 	*(int32_t *)cp = le_int4(c->length);
2990 	cp += 4;
2991     }
2992     if (c->multi_seq) {
2993 	cp += itf8_put((char*)cp, -2);
2994 	cp += itf8_put((char*)cp, 0);
2995 	cp += itf8_put((char*)cp, 0);
2996     } else {
2997 	cp += itf8_put((char*)cp, c->ref_seq_id);
2998 	cp += itf8_put((char*)cp, c->ref_seq_start);
2999 	cp += itf8_put((char*)cp, c->ref_seq_span);
3000     }
3001     cp += itf8_put((char*)cp, c->num_records);
3002     if (CRAM_MAJOR_VERS(fd->version) == 2) {
3003 	cp += itf8_put((char*)cp, c->record_counter);
3004 	cp += ltf8_put((char*)cp, c->num_bases);
3005     } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3006 	cp += ltf8_put((char*)cp, c->record_counter);
3007 	cp += ltf8_put((char*)cp, c->num_bases);
3008     }
3009 
3010     cp += itf8_put((char*)cp, c->num_blocks);
3011     cp += itf8_put((char*)cp, c->num_landmarks);
3012     for (i = 0; i < c->num_landmarks; i++)
3013 	cp += itf8_put((char*)cp, c->landmark[i]);
3014 
3015     if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3016 	c->crc32 = crc32(0L, (uc *)buf, (char*)cp-buf);
3017 	cp[0] =  c->crc32        & 0xff;
3018 	cp[1] = (c->crc32 >>  8) & 0xff;
3019 	cp[2] = (c->crc32 >> 16) & 0xff;
3020 	cp[3] = (c->crc32 >> 24) & 0xff;
3021 	cp += 4;
3022     }
3023 
3024     if ((char*)cp-buf != hwrite(fd->fp, buf, (char*)cp-buf)) {
3025 	if (buf != buf_a)
3026 	    free(buf);
3027 	return -1;
3028     }
3029 
3030     if (buf != buf_a)
3031 	free(buf);
3032 
3033     return 0;
3034 }
3035 
3036 // common component shared by cram_flush_container{,_mt}
cram_flush_container2(cram_fd * fd,cram_container * c)3037 static int cram_flush_container2(cram_fd *fd, cram_container *c) {
3038     int i, j;
3039 
3040     if (c->curr_slice > 0 && !c->slices)
3041 	return -1;
3042 
3043     //fprintf(stderr, "Writing container %d, sum %u\n", c->record_counter, sum);
3044 
3045     /* Write the container struct itself */
3046     if (0 != cram_write_container(fd, c))
3047 	return -1;
3048 
3049     /* And the compression header */
3050     if (0 != cram_write_block(fd, c->comp_hdr_block))
3051 	return -1;
3052 
3053     /* Followed by the slice blocks */
3054     for (i = 0; i < c->curr_slice; i++) {
3055 	cram_slice *s = c->slices[i];
3056 
3057 	if (0 != cram_write_block(fd, s->hdr_block))
3058 	    return -1;
3059 
3060 	for (j = 0; j < s->hdr->num_blocks; j++) {
3061 	    if (0 != cram_write_block(fd, s->block[j]))
3062 		return -1;
3063 	}
3064     }
3065 
3066     return hflush(fd->fp) == 0 ? 0 : -1;
3067 }
3068 
3069 /*
3070  * Flushes a completely or partially full container to disk, writing
3071  * container structure, header and blocks. This also calls the encoder
3072  * functions.
3073  *
3074  * Returns 0 on success
3075  *        -1 on failure
3076  */
cram_flush_container(cram_fd * fd,cram_container * c)3077 int cram_flush_container(cram_fd *fd, cram_container *c) {
3078     /* Encode the container blocks and generate compression header */
3079     if (0 != cram_encode_container(fd, c))
3080 	return -1;
3081 
3082     return cram_flush_container2(fd, c);
3083 }
3084 
3085 typedef struct {
3086     cram_fd *fd;
3087     cram_container *c;
3088 } cram_job;
3089 
cram_flush_thread(void * arg)3090 void *cram_flush_thread(void *arg) {
3091     cram_job *j = (cram_job *)arg;
3092 
3093     /* Encode the container blocks and generate compression header */
3094     if (0 != cram_encode_container(j->fd, j->c)) {
3095 	hts_log_error("Call to cram_encode_container failed");
3096 	return NULL;
3097     }
3098 
3099     return arg;
3100 }
3101 
cram_flush_result(cram_fd * fd)3102 static int cram_flush_result(cram_fd *fd) {
3103     int i, ret = 0;
3104     hts_tpool_result *r;
3105 
3106     while ((r = hts_tpool_next_result(fd->rqueue))) {
3107 	cram_job *j = (cram_job *)hts_tpool_result_data(r);
3108 	cram_container *c;
3109 
3110 	if (!j) {
3111 	    hts_tpool_delete_result(r, 0);
3112 	    return -1;
3113 	}
3114 
3115 	fd = j->fd;
3116 	c = j->c;
3117 
3118 	if (fd->mode == 'w')
3119 	    if (0 != cram_flush_container2(fd, c))
3120 		return -1;
3121 
3122 	/* Free the container */
3123 	for (i = 0; i < c->max_slice; i++) {
3124 	    if (c->slices && c->slices[i]) {
3125 		cram_free_slice(c->slices[i]);
3126 		c->slices[i] = NULL;
3127 	    }
3128 	}
3129 
3130 	c->slice = NULL;
3131 	c->curr_slice = 0;
3132 
3133 	cram_free_container(c);
3134 
3135 	ret |= hflush(fd->fp) == 0 ? 0 : -1;
3136 
3137 	hts_tpool_delete_result(r, 1);
3138     }
3139 
3140     return ret;
3141 }
3142 
cram_flush_container_mt(cram_fd * fd,cram_container * c)3143 int cram_flush_container_mt(cram_fd *fd, cram_container *c) {
3144     cram_job *j;
3145 
3146     if (!fd->pool)
3147 	return cram_flush_container(fd, c);
3148 
3149     if (!(j = malloc(sizeof(*j))))
3150 	return -1;
3151     j->fd = fd;
3152     j->c = c;
3153 
3154     // Flush the job.  Note our encoder queue may be full, so we
3155     // either have to keep trying in non-blocking mode (what we do) or
3156     // use a dedicated separate thread for draining the queue.
3157     for (;;) {
3158 	errno = 0;
3159 	hts_tpool_dispatch2(fd->pool, fd->rqueue, cram_flush_thread, j, 1);
3160 	int pending = (errno == EAGAIN);
3161 	if (cram_flush_result(fd) != 0)
3162 	    return -1;
3163 	if (!pending)
3164 	    break;
3165 
3166 	usleep(1000);
3167     }
3168 
3169     return 0;
3170 }
3171 
3172 /* ----------------------------------------------------------------------
3173  * Compression headers; the first part of the container
3174  */
3175 
3176 /*
3177  * Creates a new blank container compression header
3178  *
3179  * Returns header ptr on success
3180  *         NULL on failure
3181  */
cram_new_compression_header(void)3182 cram_block_compression_hdr *cram_new_compression_header(void) {
3183     cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
3184     if (!hdr)
3185 	return NULL;
3186 
3187     if (!(hdr->TD_blk = cram_new_block(CORE, 0))) {
3188 	free(hdr);
3189 	return NULL;
3190     }
3191 
3192     if (!(hdr->TD_hash = kh_init(m_s2i))) {
3193 	cram_free_block(hdr->TD_blk);
3194 	free(hdr);
3195 	return NULL;
3196     }
3197 
3198     if (!(hdr->TD_keys = string_pool_create(8192))) {
3199 	kh_destroy(m_s2i, hdr->TD_hash);
3200 	cram_free_block(hdr->TD_blk);
3201 	free(hdr);
3202 	return NULL;
3203     }
3204 
3205     return hdr;
3206 }
3207 
cram_free_compression_header(cram_block_compression_hdr * hdr)3208 void cram_free_compression_header(cram_block_compression_hdr *hdr) {
3209     int i;
3210 
3211     if (hdr->landmark)
3212 	free(hdr->landmark);
3213 
3214     if (hdr->preservation_map)
3215 	kh_destroy(map, hdr->preservation_map);
3216 
3217     for (i = 0; i < CRAM_MAP_HASH; i++) {
3218 	cram_map *m, *m2;
3219 	for (m = hdr->rec_encoding_map[i]; m; m = m2) {
3220 	    m2 = m->next;
3221 	    if (m->codec)
3222 		m->codec->free(m->codec);
3223 	    free(m);
3224 	}
3225     }
3226 
3227     for (i = 0; i < CRAM_MAP_HASH; i++) {
3228 	cram_map *m, *m2;
3229 	for (m = hdr->tag_encoding_map[i]; m; m = m2) {
3230 	    m2 = m->next;
3231 	    if (m->codec)
3232 		m->codec->free(m->codec);
3233 	    free(m);
3234 	}
3235     }
3236 
3237     for (i = 0; i < DS_END; i++) {
3238 	if (hdr->codecs[i])
3239 	    hdr->codecs[i]->free(hdr->codecs[i]);
3240     }
3241 
3242     if (hdr->TL)
3243 	free(hdr->TL);
3244     if (hdr->TD_blk)
3245 	cram_free_block(hdr->TD_blk);
3246     if (hdr->TD_hash)
3247 	kh_destroy(m_s2i, hdr->TD_hash);
3248     if (hdr->TD_keys)
3249 	string_pool_destroy(hdr->TD_keys);
3250 
3251     free(hdr);
3252 }
3253 
3254 
3255 /* ----------------------------------------------------------------------
3256  * Slices and slice headers
3257  */
3258 
cram_free_slice_header(cram_block_slice_hdr * hdr)3259 void cram_free_slice_header(cram_block_slice_hdr *hdr) {
3260     if (!hdr)
3261 	return;
3262 
3263     if (hdr->block_content_ids)
3264 	free(hdr->block_content_ids);
3265 
3266     free(hdr);
3267 
3268     return;
3269 }
3270 
cram_free_slice(cram_slice * s)3271 void cram_free_slice(cram_slice *s) {
3272     if (!s)
3273 	return;
3274 
3275     if (s->hdr_block)
3276 	cram_free_block(s->hdr_block);
3277 
3278     if (s->block) {
3279 	int i;
3280 
3281 	if (s->hdr) {
3282 	    for (i = 0; i < s->hdr->num_blocks; i++) {
3283 		cram_free_block(s->block[i]);
3284 	    }
3285 	}
3286 	free(s->block);
3287     }
3288 
3289     if (s->block_by_id)
3290 	free(s->block_by_id);
3291 
3292     if (s->hdr)
3293 	cram_free_slice_header(s->hdr);
3294 
3295     if (s->seqs_blk)
3296 	cram_free_block(s->seqs_blk);
3297 
3298     if (s->qual_blk)
3299 	cram_free_block(s->qual_blk);
3300 
3301     if (s->name_blk)
3302 	cram_free_block(s->name_blk);
3303 
3304     if (s->aux_blk)
3305 	cram_free_block(s->aux_blk);
3306 
3307     if (s->base_blk)
3308 	cram_free_block(s->base_blk);
3309 
3310     if (s->soft_blk)
3311 	cram_free_block(s->soft_blk);
3312 
3313     if (s->cigar)
3314 	free(s->cigar);
3315 
3316     if (s->crecs)
3317 	free(s->crecs);
3318 
3319     if (s->features)
3320 	free(s->features);
3321 
3322     if (s->TN)
3323 	free(s->TN);
3324 
3325     if (s->pair_keys)
3326 	string_pool_destroy(s->pair_keys);
3327 
3328     if (s->pair[0])
3329 	kh_destroy(m_s2i, s->pair[0]);
3330     if (s->pair[1])
3331 	kh_destroy(m_s2i, s->pair[1]);
3332 
3333     if (s->aux_block)
3334 	free(s->aux_block);
3335 
3336     free(s);
3337 }
3338 
3339 /*
3340  * Creates a new empty slice in memory, for subsequent writing to
3341  * disk.
3342  *
3343  * Returns cram_slice ptr on success
3344  *         NULL on failure
3345  */
cram_new_slice(enum cram_content_type type,int nrecs)3346 cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) {
3347     cram_slice *s = calloc(1, sizeof(*s));
3348     if (!s)
3349 	return NULL;
3350 
3351     if (!(s->hdr = (cram_block_slice_hdr *)calloc(1, sizeof(*s->hdr))))
3352 	goto err;
3353     s->hdr->content_type = type;
3354 
3355     s->hdr_block = NULL;
3356     s->block = NULL;
3357     s->block_by_id = NULL;
3358     s->last_apos = 0;
3359     if (!(s->crecs = malloc(nrecs * sizeof(cram_record))))  goto err;
3360     s->cigar = NULL;
3361     s->cigar_alloc = 0;
3362     s->ncigar = 0;
3363 
3364     if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0)))       goto err;
3365     if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS)))   goto err;
3366     if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN)))   goto err;
3367     if (!(s->aux_blk  = cram_new_block(EXTERNAL, DS_aux)))  goto err;
3368     if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN)))   goto err;
3369     if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC)))   goto err;
3370 
3371     s->features = NULL;
3372     s->nfeatures = s->afeatures = 0;
3373 
3374 #ifndef TN_external
3375     s->TN = NULL;
3376     s->nTN = s->aTN = 0;
3377 #endif
3378 
3379     // Volatile keys as we do realloc in dstring
3380     if (!(s->pair_keys = string_pool_create(8192))) goto err;
3381     if (!(s->pair[0] = kh_init(m_s2i)))             goto err;
3382     if (!(s->pair[1] = kh_init(m_s2i)))             goto err;
3383 
3384 #ifdef BA_external
3385     s->BA_len = 0;
3386 #endif
3387 
3388     return s;
3389 
3390  err:
3391     if (s)
3392 	cram_free_slice(s);
3393 
3394     return NULL;
3395 }
3396 
3397 /*
3398  * Loads an entire slice.
3399  * FIXME: In 1.0 the native unit of slices within CRAM is broken
3400  * as slices contain references to objects in other slices.
3401  * To work around this while keeping the slice oriented outer loop
3402  * we read all slices and stitch them together into a fake large
3403  * slice instead.
3404  *
3405  * Returns cram_slice ptr on success
3406  *         NULL on failure
3407  */
cram_read_slice(cram_fd * fd)3408 cram_slice *cram_read_slice(cram_fd *fd) {
3409     cram_block *b = cram_read_block(fd);
3410     cram_slice *s = calloc(1, sizeof(*s));
3411     int i, n, max_id, min_id;
3412 
3413     if (!b || !s)
3414 	goto err;
3415 
3416     s->hdr_block = b;
3417     switch (b->content_type) {
3418     case MAPPED_SLICE:
3419     case UNMAPPED_SLICE:
3420 	if (!(s->hdr = cram_decode_slice_header(fd, b)))
3421 	    goto err;
3422 	break;
3423 
3424     default:
3425 	hts_log_error("Unexpected block of type %s",
3426 		cram_content_type2str(b->content_type));
3427 	goto err;
3428     }
3429 
3430     if (s->hdr->num_blocks < 1) {
3431 	hts_log_error("Slice does not include any data blocks");
3432 	goto err;
3433     }
3434 
3435     s->block = calloc(n = s->hdr->num_blocks, sizeof(*s->block));
3436     if (!s->block)
3437 	goto err;
3438 
3439     for (max_id = i = 0, min_id = INT_MAX; i < n; i++) {
3440 	if (!(s->block[i] = cram_read_block(fd)))
3441 	    goto err;
3442 
3443 	if (s->block[i]->content_type == EXTERNAL) {
3444 	    if (max_id < s->block[i]->content_id)
3445 		max_id = s->block[i]->content_id;
3446 	    if (min_id > s->block[i]->content_id)
3447 		min_id = s->block[i]->content_id;
3448 	}
3449     }
3450     if (min_id >= 0 && max_id < 1024) {
3451 	if (!(s->block_by_id = calloc(1024, sizeof(s->block[0]))))
3452 	    goto err;
3453 
3454 	for (i = 0; i < n; i++) {
3455 	    if (s->block[i]->content_type != EXTERNAL)
3456 		continue;
3457 	    s->block_by_id[s->block[i]->content_id] = s->block[i];
3458 	}
3459     }
3460 
3461     /* Initialise encoding/decoding tables */
3462     s->cigar = NULL;
3463     s->cigar_alloc = 0;
3464     s->ncigar = 0;
3465 
3466     if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0)))      goto err;
3467     if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS)))  goto err;
3468     if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN)))  goto err;
3469     if (!(s->aux_blk  = cram_new_block(EXTERNAL, DS_aux))) goto err;
3470     if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN)))  goto err;
3471     if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC)))  goto err;
3472 
3473     s->crecs = NULL;
3474 
3475     s->last_apos = s->hdr->ref_seq_start;
3476 
3477     return s;
3478 
3479  err:
3480     if (b)
3481 	cram_free_block(b);
3482     if (s) {
3483 	s->hdr_block = NULL;
3484 	cram_free_slice(s);
3485     }
3486     return NULL;
3487 }
3488 
3489 
3490 /* ----------------------------------------------------------------------
3491  * CRAM file definition (header)
3492  */
3493 
3494 /*
3495  * Reads a CRAM file definition structure.
3496  * Returns file_def ptr on success
3497  *         NULL on failure
3498  */
cram_read_file_def(cram_fd * fd)3499 cram_file_def *cram_read_file_def(cram_fd *fd) {
3500     cram_file_def *def = malloc(sizeof(*def));
3501     if (!def)
3502 	return NULL;
3503 
3504     if (26 != hread(fd->fp, &def->magic[0], 26)) {
3505 	free(def);
3506 	return NULL;
3507     }
3508 
3509     if (memcmp(def->magic, "CRAM", 4) != 0) {
3510 	free(def);
3511 	return NULL;
3512     }
3513 
3514     if (def->major_version > 3) {
3515 	hts_log_error("CRAM version number mismatch. Expected 1.x, 2.x or 3.x, got %d.%d",
3516 		def->major_version, def->minor_version);
3517 	free(def);
3518 	return NULL;
3519     }
3520 
3521     fd->first_container += 26;
3522     fd->last_slice = 0;
3523 
3524     return def;
3525 }
3526 
3527 /*
3528  * Writes a cram_file_def structure to cram_fd.
3529  * Returns 0 on success
3530  *        -1 on failure
3531  */
cram_write_file_def(cram_fd * fd,cram_file_def * def)3532 int cram_write_file_def(cram_fd *fd, cram_file_def *def) {
3533     return (hwrite(fd->fp, &def->magic[0], 26) == 26) ? 0 : -1;
3534 }
3535 
cram_free_file_def(cram_file_def * def)3536 void cram_free_file_def(cram_file_def *def) {
3537     if (def) free(def);
3538 }
3539 
3540 /* ----------------------------------------------------------------------
3541  * SAM header I/O
3542  */
3543 
3544 
3545 /*
3546  * Reads the SAM header from the first CRAM data block.
3547  * Also performs minimal parsing to extract read-group
3548  * and sample information.
3549 
3550  * Returns SAM hdr ptr on success
3551  *         NULL on failure
3552  */
cram_read_SAM_hdr(cram_fd * fd)3553 SAM_hdr *cram_read_SAM_hdr(cram_fd *fd) {
3554     int32_t header_len;
3555     char *header;
3556     SAM_hdr *hdr;
3557 
3558     /* 1.1 onwards stores the header in the first block of a container */
3559     if (CRAM_MAJOR_VERS(fd->version) == 1) {
3560 	/* Length */
3561 	if (-1 == int32_decode(fd, &header_len))
3562 	    return NULL;
3563 
3564 	/* Alloc and read */
3565 	if (header_len < 0 || NULL == (header = malloc((size_t) header_len+1)))
3566 	    return NULL;
3567 
3568 	if (header_len != hread(fd->fp, header, header_len))
3569 	    return NULL;
3570 	header[header_len] = '\0';
3571 
3572 	fd->first_container += 4 + header_len;
3573     } else {
3574 	cram_container *c = cram_read_container(fd);
3575 	cram_block *b;
3576 	int i;
3577         int64_t len;
3578 
3579 	if (!c)
3580 	    return NULL;
3581 
3582 	fd->first_container += c->length + c->offset;
3583 
3584 	if (c->num_blocks < 1) {
3585 	    cram_free_container(c);
3586 	    return NULL;
3587 	}
3588 
3589 	if (!(b = cram_read_block(fd))) {
3590 	    cram_free_container(c);
3591 	    return NULL;
3592 	}
3593 	if (cram_uncompress_block(b) != 0) {
3594 	    cram_free_container(c);
3595 	    cram_free_block(b);
3596 	    return NULL;
3597 	}
3598 
3599 	len = b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
3600 	    itf8_size(b->content_id) +
3601 	    itf8_size(b->uncomp_size) +
3602 	    itf8_size(b->comp_size);
3603 
3604 	/* Extract header from 1st block */
3605 	if (-1 == int32_get_blk(b, &header_len) ||
3606             header_len < 0 || /* Spec. says signed...  why? */
3607 	    b->uncomp_size - 4 < header_len) {
3608 	    cram_free_container(c);
3609 	    cram_free_block(b);
3610 	    return NULL;
3611 	}
3612 	if (NULL == (header = malloc((size_t) header_len+1))) {
3613 	    cram_free_container(c);
3614 	    cram_free_block(b);
3615 	    return NULL;
3616 	}
3617 	memcpy(header, BLOCK_END(b), header_len);
3618 	header[header_len] = '\0';
3619 	cram_free_block(b);
3620 
3621 	/* Consume any remaining blocks */
3622 	for (i = 1; i < c->num_blocks; i++) {
3623 	    if (!(b = cram_read_block(fd))) {
3624 		cram_free_container(c);
3625 		return NULL;
3626 	    }
3627 	    len += b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
3628 		itf8_size(b->content_id) +
3629 		itf8_size(b->uncomp_size) +
3630 		itf8_size(b->comp_size);
3631 	    cram_free_block(b);
3632 	}
3633 
3634 	if (c->length > 0 && len > 0 && c->length > len) {
3635 	    // Consume padding
3636 	    char *pads = malloc(c->length - len);
3637 	    if (!pads) {
3638 		cram_free_container(c);
3639 		return NULL;
3640 	    }
3641 
3642 	    if (c->length - len != hread(fd->fp, pads, c->length - len)) {
3643 		cram_free_container(c);
3644 		return NULL;
3645 	    }
3646 	    free(pads);
3647 	}
3648 
3649 	cram_free_container(c);
3650     }
3651 
3652     /* Parse */
3653     hdr = sam_hdr_parse_(header, header_len);
3654     free(header);
3655 
3656     return hdr;
3657 }
3658 
3659 /*
3660  * Converts 'in' to a full pathname to store in out.
3661  * Out must be at least PATH_MAX bytes long.
3662  */
full_path(char * out,char * in)3663 static void full_path(char *out, char *in) {
3664     size_t in_l = strlen(in);
3665     if (*in == '/' ||
3666 	// Windows paths
3667 	(in_l > 3 && toupper(*in) >= 'A'  && toupper(*in) <= 'Z' &&
3668 	 in[1] == ':' && (in[2] == '/' || in[2] == '\\'))) {
3669 	strncpy(out, in, PATH_MAX);
3670 	out[PATH_MAX-1] = 0;
3671     } else {
3672 	int len;
3673 
3674 	// unable to get dir or out+in is too long
3675 	if (!getcwd(out, PATH_MAX) ||
3676 	    (len = strlen(out))+1+strlen(in) >= PATH_MAX) {
3677 	    strncpy(out, in, PATH_MAX);
3678 	    out[PATH_MAX-1] = 0;
3679 	    return;
3680 	}
3681 
3682 	sprintf(out+len, "/%.*s", PATH_MAX - len, in);
3683 
3684 	// FIXME: cope with `pwd`/../../../foo.fa ?
3685     }
3686 }
3687 
3688 /*
3689  * Writes a CRAM SAM header.
3690  * Returns 0 on success
3691  *        -1 on failure
3692  */
cram_write_SAM_hdr(cram_fd * fd,SAM_hdr * hdr)3693 int cram_write_SAM_hdr(cram_fd *fd, SAM_hdr *hdr) {
3694     int header_len;
3695     int blank_block = (CRAM_MAJOR_VERS(fd->version) >= 3);
3696 
3697     /* Write CRAM MAGIC if not yet written. */
3698     if (fd->file_def->major_version == 0) {
3699 	fd->file_def->major_version = CRAM_MAJOR_VERS(fd->version);
3700 	fd->file_def->minor_version = CRAM_MINOR_VERS(fd->version);
3701 	if (0 != cram_write_file_def(fd, fd->file_def))
3702 	    return -1;
3703     }
3704 
3705     /* 1.0 requires an UNKNOWN read-group */
3706     if (CRAM_MAJOR_VERS(fd->version) == 1) {
3707 	if (!sam_hdr_find_rg(hdr, "UNKNOWN"))
3708 	    if (sam_hdr_add(hdr, "RG",
3709 			    "ID", "UNKNOWN", "SM", "UNKNOWN", NULL))
3710 		return -1;
3711     }
3712 
3713     /* Fix M5 strings */
3714     if (fd->refs && !fd->no_ref) {
3715 	int i;
3716 	for (i = 0; i < hdr->nref; i++) {
3717 	    SAM_hdr_type *ty;
3718 	    char *ref;
3719 
3720 	    if (!(ty = sam_hdr_find(hdr, "SQ", "SN", hdr->ref[i].name)))
3721 		return -1;
3722 
3723 	    if (!sam_hdr_find_key(hdr, ty, "M5", NULL)) {
3724 		char unsigned buf[16];
3725 		char buf2[33];
3726 		int rlen;
3727 		hts_md5_context *md5;
3728 
3729 		if (!fd->refs ||
3730 		    !fd->refs->ref_id ||
3731 		    !fd->refs->ref_id[i]) {
3732 		    return -1;
3733 		}
3734 		rlen = fd->refs->ref_id[i]->length;
3735 		if (!(md5 = hts_md5_init()))
3736 		    return -1;
3737 		ref = cram_get_ref(fd, i, 1, rlen);
3738 		if (NULL == ref) return -1;
3739 		rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */
3740 		hts_md5_update(md5, ref, rlen);
3741 		hts_md5_final(buf, md5);
3742 		hts_md5_destroy(md5);
3743 		cram_ref_decr(fd->refs, i);
3744 
3745 		hts_md5_hex(buf2, buf);
3746 		if (sam_hdr_update(hdr, ty, "M5", buf2, NULL))
3747 		    return -1;
3748 	    }
3749 
3750 	    if (fd->ref_fn) {
3751 		char ref_fn[PATH_MAX];
3752 		full_path(ref_fn, fd->ref_fn);
3753 		if (sam_hdr_update(hdr, ty, "UR", ref_fn, NULL))
3754 		    return -1;
3755 	    }
3756 	}
3757     }
3758 
3759     if (sam_hdr_rebuild(hdr))
3760 	return -1;
3761 
3762     /* Length */
3763     header_len = sam_hdr_length(hdr);
3764     if (CRAM_MAJOR_VERS(fd->version) == 1) {
3765 	if (-1 == int32_encode(fd, header_len))
3766 	    return -1;
3767 
3768 	/* Text data */
3769 	if (header_len != hwrite(fd->fp, sam_hdr_str(hdr), header_len))
3770 	    return -1;
3771     } else {
3772 	/* Create block(s) inside a container */
3773 	cram_block *b = cram_new_block(FILE_HEADER, 0);
3774 	cram_container *c = cram_new_container(0, 0);
3775 	int padded_length;
3776 	char *pads;
3777 	int is_cram_3 = (CRAM_MAJOR_VERS(fd->version) >= 3);
3778 
3779 	if (!b || !c) {
3780 	    if (b) cram_free_block(b);
3781 	    if (c) cram_free_container(c);
3782 	    return -1;
3783 	}
3784 
3785 	int32_put_blk(b, header_len);
3786 	if (header_len)
3787             BLOCK_APPEND(b, sam_hdr_str(hdr), header_len);
3788 	BLOCK_UPLEN(b);
3789 
3790 	// Compress header block if V3.0 and above
3791 	if (CRAM_MAJOR_VERS(fd->version) >= 3)
3792 	    cram_compress_block(fd, b, NULL, -1, -1);
3793 
3794 	if (blank_block) {
3795 	    c->length = b->comp_size + 2 + 4*is_cram_3 +
3796 		itf8_size(b->content_id)   +
3797 		itf8_size(b->uncomp_size)  +
3798 		itf8_size(b->comp_size);
3799 
3800 	    c->num_blocks = 2;
3801 	    c->num_landmarks = 2;
3802 	    if (!(c->landmark = malloc(2*sizeof(*c->landmark)))) {
3803 		cram_free_block(b);
3804 		cram_free_container(c);
3805 		return -1;
3806 	    }
3807 	    c->landmark[0] = 0;
3808 	    c->landmark[1] = c->length;
3809 
3810 	    // Plus extra storage for uncompressed secondary blank block
3811 	    padded_length = MIN(c->length*.5, 10000);
3812 	    c->length += padded_length + 2 + 4*is_cram_3 +
3813 		itf8_size(b->content_id) +
3814 		itf8_size(padded_length)*2;
3815 	} else {
3816 	    // Pad the block instead.
3817 	    c->num_blocks = 1;
3818 	    c->num_landmarks = 1;
3819 	    if (!(c->landmark = malloc(sizeof(*c->landmark))))
3820 		return -1;
3821 	    c->landmark[0] = 0;
3822 
3823 	    padded_length = MAX(c->length*1.5, 10000) - c->length;
3824 
3825 	    c->length = b->comp_size + padded_length +
3826 		2 + 4*is_cram_3 +
3827 		itf8_size(b->content_id)   +
3828 		itf8_size(b->uncomp_size)  +
3829 		itf8_size(b->comp_size);
3830 
3831 	    if (NULL == (pads = calloc(1, padded_length))) {
3832 		cram_free_block(b);
3833 		cram_free_container(c);
3834 		return -1;
3835 	    }
3836 	    BLOCK_APPEND(b, pads, padded_length);
3837 	    BLOCK_UPLEN(b);
3838 	    free(pads);
3839 	}
3840 
3841 	if (-1 == cram_write_container(fd, c)) {
3842 	    cram_free_block(b);
3843 	    cram_free_container(c);
3844 	    return -1;
3845 	}
3846 
3847 	if (-1 == cram_write_block(fd, b)) {
3848 	    cram_free_block(b);
3849 	    cram_free_container(c);
3850 	    return -1;
3851 	}
3852 
3853 	if (blank_block) {
3854 	    BLOCK_RESIZE(b, padded_length);
3855 	    memset(BLOCK_DATA(b), 0, padded_length);
3856 	    BLOCK_SIZE(b) = padded_length;
3857 	    BLOCK_UPLEN(b);
3858 	    b->method = RAW;
3859 	    if (-1 == cram_write_block(fd, b)) {
3860 		cram_free_block(b);
3861 		cram_free_container(c);
3862 		return -1;
3863 	    }
3864 	}
3865 
3866 	cram_free_block(b);
3867 	cram_free_container(c);
3868     }
3869 
3870     if (-1 == refs_from_header(fd->refs, fd, fd->header))
3871 	return -1;
3872     if (-1 == refs2id(fd->refs, fd->header))
3873 	return -1;
3874 
3875     if (0 != hflush(fd->fp))
3876 	return -1;
3877 
3878     RP("=== Finishing saving header ===\n");
3879 
3880     return 0;
3881 }
3882 
3883 /* ----------------------------------------------------------------------
3884  * The top-level cram opening, closing and option handling
3885  */
3886 
3887 /*
3888  * Initialises the lookup tables. These could be global statics, but they're
3889  * clumsy to setup in a multi-threaded environment unless we generate
3890  * verbatim code and include that.
3891  */
cram_init_tables(cram_fd * fd)3892 static void cram_init_tables(cram_fd *fd) {
3893     int i;
3894 
3895     memset(fd->L1, 4, 256);
3896     fd->L1['A'] = 0; fd->L1['a'] = 0;
3897     fd->L1['C'] = 1; fd->L1['c'] = 1;
3898     fd->L1['G'] = 2; fd->L1['g'] = 2;
3899     fd->L1['T'] = 3; fd->L1['t'] = 3;
3900 
3901     memset(fd->L2, 5, 256);
3902     fd->L2['A'] = 0; fd->L2['a'] = 0;
3903     fd->L2['C'] = 1; fd->L2['c'] = 1;
3904     fd->L2['G'] = 2; fd->L2['g'] = 2;
3905     fd->L2['T'] = 3; fd->L2['t'] = 3;
3906     fd->L2['N'] = 4; fd->L2['n'] = 4;
3907 
3908     if (CRAM_MAJOR_VERS(fd->version) == 1) {
3909 	for (i = 0; i < 0x200; i++) {
3910 	    int f = 0;
3911 
3912 	    if (i & CRAM_FPAIRED)      f |= BAM_FPAIRED;
3913 	    if (i & CRAM_FPROPER_PAIR) f |= BAM_FPROPER_PAIR;
3914 	    if (i & CRAM_FUNMAP)       f |= BAM_FUNMAP;
3915 	    if (i & CRAM_FREVERSE)     f |= BAM_FREVERSE;
3916 	    if (i & CRAM_FREAD1)       f |= BAM_FREAD1;
3917 	    if (i & CRAM_FREAD2)       f |= BAM_FREAD2;
3918 	    if (i & CRAM_FSECONDARY)   f |= BAM_FSECONDARY;
3919 	    if (i & CRAM_FQCFAIL)      f |= BAM_FQCFAIL;
3920 	    if (i & CRAM_FDUP)         f |= BAM_FDUP;
3921 
3922 	    fd->bam_flag_swap[i]  = f;
3923 	}
3924 
3925 	for (i = 0; i < 0x1000; i++) {
3926 	    int g = 0;
3927 
3928 	    if (i & BAM_FPAIRED)	   g |= CRAM_FPAIRED;
3929 	    if (i & BAM_FPROPER_PAIR)  g |= CRAM_FPROPER_PAIR;
3930 	    if (i & BAM_FUNMAP)        g |= CRAM_FUNMAP;
3931 	    if (i & BAM_FREVERSE)      g |= CRAM_FREVERSE;
3932 	    if (i & BAM_FREAD1)        g |= CRAM_FREAD1;
3933 	    if (i & BAM_FREAD2)        g |= CRAM_FREAD2;
3934 	    if (i & BAM_FSECONDARY)    g |= CRAM_FSECONDARY;
3935 	    if (i & BAM_FQCFAIL)       g |= CRAM_FQCFAIL;
3936 	    if (i & BAM_FDUP)          g |= CRAM_FDUP;
3937 
3938 	    fd->cram_flag_swap[i] = g;
3939 	}
3940     } else {
3941 	/* NOP */
3942 	for (i = 0; i < 0x1000; i++)
3943 	    fd->bam_flag_swap[i] = i;
3944 	for (i = 0; i < 0x1000; i++)
3945 	    fd->cram_flag_swap[i] = i;
3946     }
3947 
3948     memset(fd->cram_sub_matrix, 4, 32*32);
3949     for (i = 0; i < 32; i++) {
3950 	fd->cram_sub_matrix[i]['A'&0x1f]=0;
3951 	fd->cram_sub_matrix[i]['C'&0x1f]=1;
3952 	fd->cram_sub_matrix[i]['G'&0x1f]=2;
3953 	fd->cram_sub_matrix[i]['T'&0x1f]=3;
3954 	fd->cram_sub_matrix[i]['N'&0x1f]=4;
3955     }
3956     for (i = 0; i < 20; i+=4) {
3957 	int j;
3958 	for (j = 0; j < 20; j++) {
3959 	    fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
3960 	    fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
3961 	    fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
3962 	    fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
3963 	}
3964 	fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+0]&0x1f]=0;
3965 	fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+1]&0x1f]=1;
3966 	fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+2]&0x1f]=2;
3967 	fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+3]&0x1f]=3;
3968     }
3969 }
3970 
3971 // Default version numbers for CRAM
3972 static int major_version = 3;
3973 static int minor_version = 0;
3974 
3975 /*
3976  * Opens a CRAM file for read (mode "rb") or write ("wb").
3977  * The filename may be "-" to indicate stdin or stdout.
3978  *
3979  * Returns file handle on success
3980  *         NULL on failure.
3981  */
cram_open(const char * filename,const char * mode)3982 cram_fd *cram_open(const char *filename, const char *mode) {
3983     hFILE *fp;
3984     cram_fd *fd;
3985     char fmode[3]= { mode[0], '\0', '\0' };
3986 
3987     if (strlen(mode) > 1 && (mode[1] == 'b' || mode[1] == 'c')) {
3988 	fmode[1] = 'b';
3989     }
3990 
3991     fp = hopen(filename, fmode);
3992     if (!fp)
3993 	return NULL;
3994 
3995     fd = cram_dopen(fp, filename, mode);
3996     if (!fd)
3997 	hclose_abruptly(fp);
3998 
3999     return fd;
4000 }
4001 
4002 /* Opens an existing stream for reading or writing.
4003  *
4004  * Returns file handle on success;
4005  *         NULL on failure.
4006  */
cram_dopen(hFILE * fp,const char * filename,const char * mode)4007 cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) {
4008     int i;
4009     char *cp;
4010     cram_fd *fd = calloc(1, sizeof(*fd));
4011     if (!fd)
4012 	return NULL;
4013 
4014     fd->level = 5;
4015     for (i = 0; mode[i]; i++) {
4016 	if (mode[i] >= '0' && mode[i] <= '9') {
4017 	    fd->level = mode[i] - '0';
4018 	    break;
4019 	}
4020     }
4021 
4022     fd->fp = fp;
4023     fd->mode = *mode;
4024     fd->first_container = 0;
4025 
4026     if (fd->mode == 'r') {
4027 	/* Reader */
4028 
4029 	if (!(fd->file_def = cram_read_file_def(fd)))
4030 	    goto err;
4031 
4032 	fd->version = fd->file_def->major_version * 256 +
4033 	    fd->file_def->minor_version;
4034 
4035 	if (!(fd->header = cram_read_SAM_hdr(fd))) {
4036 	    cram_free_file_def(fd->file_def);
4037 	    goto err;
4038 	}
4039 
4040     } else {
4041 	/* Writer */
4042 	cram_file_def *def = calloc(1, sizeof(*def));
4043 	if (!def)
4044 	    return NULL;
4045 
4046 	fd->file_def = def;
4047 
4048 	def->magic[0] = 'C';
4049 	def->magic[1] = 'R';
4050 	def->magic[2] = 'A';
4051 	def->magic[3] = 'M';
4052 	def->major_version = 0; // Indicator to write file def later.
4053 	def->minor_version = 0;
4054 	memset(def->file_id, 0, 20);
4055 	strncpy(def->file_id, filename, 20);
4056 
4057 	fd->version = major_version * 256 + minor_version;
4058 
4059 	/* SAM header written later along with this file_def */
4060     }
4061 
4062     cram_init_tables(fd);
4063 
4064     fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename);
4065     if (!fd->prefix)
4066 	goto err;
4067     fd->first_base = fd->last_base = -1;
4068     fd->record_counter = 0;
4069 
4070     fd->ctr = NULL;
4071     fd->refs  = refs_create();
4072     if (!fd->refs)
4073 	goto err;
4074     fd->ref_id = -2;
4075     fd->ref = NULL;
4076 
4077     fd->decode_md = 0;
4078     fd->seqs_per_slice = SEQS_PER_SLICE;
4079     fd->bases_per_slice = BASES_PER_SLICE;
4080     fd->slices_per_container = SLICE_PER_CNT;
4081     fd->embed_ref = 0;
4082     fd->no_ref = 0;
4083     fd->ignore_md5 = 0;
4084     fd->lossy_read_names = 0;
4085     fd->use_bz2 = 0;
4086     fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3);
4087     fd->use_lzma = 0;
4088     fd->multi_seq = -1;
4089     fd->unsorted   = 0;
4090     fd->shared_ref = 0;
4091 
4092     fd->index       = NULL;
4093     fd->own_pool    = 0;
4094     fd->pool        = NULL;
4095     fd->rqueue      = NULL;
4096     fd->job_pending = NULL;
4097     fd->ooc         = 0;
4098     fd->required_fields = INT_MAX;
4099 
4100     for (i = 0; i < DS_END; i++)
4101 	fd->m[i] = cram_new_metrics();
4102 
4103     if (!(fd->tags_used = kh_init(m_metrics)))
4104 	goto err;
4105 
4106     fd->range.refid = -2; // no ref.
4107     fd->eof = 1;          // See samtools issue #150
4108     fd->ref_fn = NULL;
4109 
4110     fd->bl = NULL;
4111 
4112     /* Initialise dummy refs from the @SQ headers */
4113     if (-1 == refs_from_header(fd->refs, fd, fd->header))
4114 	goto err;
4115 
4116     return fd;
4117 
4118  err:
4119     if (fd)
4120 	free(fd);
4121 
4122     return NULL;
4123 }
4124 
4125 /*
4126  * Seek within a CRAM file.
4127  *
4128  * Returns 0 on success
4129  *        -1 on failure
4130  */
cram_seek(cram_fd * fd,off_t offset,int whence)4131 int cram_seek(cram_fd *fd, off_t offset, int whence) {
4132     char buf[65536];
4133 
4134     fd->ooc = 0;
4135 
4136     if (hseek(fd->fp, offset, whence) >= 0) {
4137         return 0;
4138     }
4139 
4140     if (!(whence == SEEK_CUR && offset >= 0))
4141 	return -1;
4142 
4143     /* Couldn't fseek, but we're in SEEK_CUR mode so read instead */
4144     while (offset > 0) {
4145 	int len = MIN(65536, offset);
4146 	if (len != hread(fd->fp, buf, len))
4147 	    return -1;
4148 	offset -= len;
4149     }
4150 
4151     return 0;
4152 }
4153 
4154 /*
4155  * Flushes a CRAM file.
4156  * Useful for when writing to stdout without wishing to close the stream.
4157  *
4158  * Returns 0 on success
4159  *        -1 on failure
4160  */
cram_flush(cram_fd * fd)4161 int cram_flush(cram_fd *fd) {
4162     if (!fd)
4163 	return -1;
4164 
4165     if (fd->mode == 'w' && fd->ctr) {
4166 	if(fd->ctr->slice)
4167 	    cram_update_curr_slice(fd->ctr);
4168 
4169 	if (-1 == cram_flush_container_mt(fd, fd->ctr))
4170 	    return -1;
4171     }
4172 
4173     return 0;
4174 }
4175 
4176 /*
4177  * Closes a CRAM file.
4178  * Returns 0 on success
4179  *        -1 on failure
4180  */
cram_close(cram_fd * fd)4181 int cram_close(cram_fd *fd) {
4182     spare_bams *bl, *next;
4183     int i;
4184 
4185     if (!fd)
4186 	return -1;
4187 
4188     if (fd->mode == 'w' && fd->ctr) {
4189 	if(fd->ctr->slice)
4190 	    cram_update_curr_slice(fd->ctr);
4191 
4192 	if (-1 == cram_flush_container_mt(fd, fd->ctr))
4193 	    return -1;
4194     }
4195 
4196     if (fd->pool && fd->eof >= 0) {
4197 	hts_tpool_process_flush(fd->rqueue);
4198 
4199 	if (0 != cram_flush_result(fd))
4200 	    return -1;
4201 
4202 	pthread_mutex_destroy(&fd->metrics_lock);
4203 	pthread_mutex_destroy(&fd->ref_lock);
4204 	pthread_mutex_destroy(&fd->bam_list_lock);
4205 
4206 	fd->ctr = NULL; // prevent double freeing
4207 
4208 	//fprintf(stderr, "CRAM: destroy queue %p\n", fd->rqueue);
4209 
4210 	hts_tpool_process_destroy(fd->rqueue);
4211     }
4212 
4213     if (fd->mode == 'w') {
4214 	/* Write EOF block */
4215 	if (CRAM_MAJOR_VERS(fd->version) == 3) {
4216 	    if (38 != hwrite(fd->fp,
4217 			     "\x0f\x00\x00\x00\xff\xff\xff\xff" // Cont HDR
4218 			     "\x0f\xe0\x45\x4f\x46\x00\x00\x00" // Cont HDR
4219 			     "\x00\x01\x00"                     // Cont HDR
4220 			     "\x05\xbd\xd9\x4f"                 // CRC32
4221 			     "\x00\x01\x00\x06\x06"             // Comp.HDR blk
4222 			     "\x01\x00\x01\x00\x01\x00"         // Comp.HDR blk
4223 			     "\xee\x63\x01\x4b",                // CRC32
4224 			     38))
4225 		return -1;
4226 	} else {
4227 	    if (30 != hwrite(fd->fp,
4228 			     "\x0b\x00\x00\x00\xff\xff\xff\xff"
4229 			     "\x0f\xe0\x45\x4f\x46\x00\x00\x00"
4230 			     "\x00\x01\x00\x00\x01\x00\x06\x06"
4231 			     "\x01\x00\x01\x00\x01\x00", 30))
4232 		return -1;
4233 	}
4234     }
4235 
4236     for (bl = fd->bl; bl; bl = next) {
4237 	int i, max_rec = fd->seqs_per_slice * fd->slices_per_container;
4238 
4239 	next = bl->next;
4240 	for (i = 0; i < max_rec; i++) {
4241 	    if (bl->bams[i])
4242 		bam_free(bl->bams[i]);
4243 	}
4244 	free(bl->bams);
4245 	free(bl);
4246     }
4247 
4248     if (hclose(fd->fp) != 0)
4249 	return -1;
4250 
4251     if (fd->file_def)
4252 	cram_free_file_def(fd->file_def);
4253 
4254     if (fd->header)
4255 	sam_hdr_free(fd->header);
4256 
4257     free(fd->prefix);
4258 
4259     if (fd->ctr)
4260 	cram_free_container(fd->ctr);
4261 
4262     if (fd->refs)
4263 	refs_free(fd->refs);
4264     if (fd->ref_free)
4265         free(fd->ref_free);
4266 
4267     for (i = 0; i < DS_END; i++)
4268 	if (fd->m[i])
4269 	    free(fd->m[i]);
4270 
4271     if (fd->tags_used) {
4272 	khint_t k;
4273 
4274 	for (k = kh_begin(fd->tags_used); k != kh_end(fd->tags_used); k++) {
4275 	    if (kh_exist(fd->tags_used, k))
4276 		free(kh_val(fd->tags_used, k));
4277 	}
4278 
4279 	kh_destroy(m_metrics, fd->tags_used);
4280     }
4281 
4282     if (fd->index)
4283 	cram_index_free(fd);
4284 
4285     if (fd->own_pool && fd->pool)
4286 	hts_tpool_destroy(fd->pool);
4287 
4288     free(fd);
4289     return 0;
4290 }
4291 
4292 /*
4293  * Returns 1 if we hit an EOF while reading.
4294  */
cram_eof(cram_fd * fd)4295 int cram_eof(cram_fd *fd) {
4296     return fd->eof;
4297 }
4298 
4299 
4300 /*
4301  * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
4302  * Use this immediately after opening.
4303  *
4304  * Returns 0 on success
4305  *        -1 on failure
4306  */
cram_set_option(cram_fd * fd,enum hts_fmt_option opt,...)4307 int cram_set_option(cram_fd *fd, enum hts_fmt_option opt, ...) {
4308     int r;
4309     va_list args;
4310 
4311     va_start(args, opt);
4312     r = cram_set_voption(fd, opt, args);
4313     va_end(args);
4314 
4315     return r;
4316 }
4317 
4318 /*
4319  * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
4320  * Use this immediately after opening.
4321  *
4322  * Returns 0 on success
4323  *        -1 on failure
4324  */
cram_set_voption(cram_fd * fd,enum hts_fmt_option opt,va_list args)4325 int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) {
4326     refs_t *refs;
4327 
4328     if (!fd) {
4329 	errno = EBADF;
4330 	return -1;
4331     }
4332 
4333     switch (opt) {
4334     case CRAM_OPT_DECODE_MD:
4335 	fd->decode_md = va_arg(args, int);
4336 	break;
4337 
4338     case CRAM_OPT_PREFIX:
4339 	if (fd->prefix)
4340 	    free(fd->prefix);
4341 	if (!(fd->prefix = strdup(va_arg(args, char *))))
4342 	    return -1;
4343 	break;
4344 
4345     case CRAM_OPT_VERBOSITY:
4346 	break;
4347 
4348     case CRAM_OPT_SEQS_PER_SLICE:
4349 	fd->seqs_per_slice = va_arg(args, int);
4350 	break;
4351 
4352     case CRAM_OPT_BASES_PER_SLICE:
4353 	fd->bases_per_slice = va_arg(args, int);
4354 	break;
4355 
4356     case CRAM_OPT_SLICES_PER_CONTAINER:
4357 	fd->slices_per_container = va_arg(args, int);
4358 	break;
4359 
4360     case CRAM_OPT_EMBED_REF:
4361 	fd->embed_ref = va_arg(args, int);
4362 	break;
4363 
4364     case CRAM_OPT_NO_REF:
4365 	fd->no_ref = va_arg(args, int);
4366 	break;
4367 
4368     case CRAM_OPT_IGNORE_MD5:
4369 	fd->ignore_md5 = va_arg(args, int);
4370 	break;
4371 
4372     case CRAM_OPT_LOSSY_NAMES:
4373 	fd->lossy_read_names = va_arg(args, int);
4374 	// Currently lossy read names required paired (attached) reads.
4375 	// TLEN 0 or being 1 out causes read pairs to be detached, breaking
4376 	// the lossy read name compression, so we have extra options to
4377 	// slacken the exact TLEN round-trip checks.
4378 	fd->tlen_approx = fd->lossy_read_names;
4379 	fd->tlen_zero = fd->lossy_read_names;
4380 	break;
4381 
4382     case CRAM_OPT_USE_BZIP2:
4383 	fd->use_bz2 = va_arg(args, int);
4384 	break;
4385 
4386     case CRAM_OPT_USE_RANS:
4387 	fd->use_rans = va_arg(args, int);
4388 	break;
4389 
4390     case CRAM_OPT_USE_LZMA:
4391 	fd->use_lzma = va_arg(args, int);
4392 	break;
4393 
4394     case CRAM_OPT_SHARED_REF:
4395 	fd->shared_ref = 1;
4396 	refs = va_arg(args, refs_t *);
4397 	if (refs != fd->refs) {
4398 	    if (fd->refs)
4399 		refs_free(fd->refs);
4400 	    fd->refs = refs;
4401 	    fd->refs->count++;
4402 	}
4403 	break;
4404 
4405     case CRAM_OPT_RANGE:
4406 	fd->range = *va_arg(args, cram_range *);
4407 	return cram_seek_to_refpos(fd, &fd->range);
4408 
4409     case CRAM_OPT_REFERENCE:
4410 	return cram_load_reference(fd, va_arg(args, char *));
4411 
4412     case CRAM_OPT_VERSION: {
4413 	int major, minor;
4414 	char *s = va_arg(args, char *);
4415 	if (2 != sscanf(s, "%d.%d", &major, &minor)) {
4416 	    hts_log_error("Malformed version string %s", s);
4417 	    return -1;
4418 	}
4419 	if (!((major == 1 &&  minor == 0) ||
4420 	      (major == 2 && (minor == 0 || minor == 1)) ||
4421 	      (major == 3 &&  minor == 0))) {
4422 	    hts_log_error("Unknown version string; use 1.0, 2.0, 2.1 or 3.0");
4423 	    errno = EINVAL;
4424 	    return -1;
4425 	}
4426 	fd->version = major*256 + minor;
4427 
4428 	if (CRAM_MAJOR_VERS(fd->version) >= 3)
4429 	    fd->use_rans = 1;
4430 	break;
4431     }
4432 
4433     case CRAM_OPT_MULTI_SEQ_PER_SLICE:
4434 	fd->multi_seq = va_arg(args, int);
4435 	break;
4436 
4437     case CRAM_OPT_NTHREADS: {
4438 	int nthreads =  va_arg(args, int);
4439         if (nthreads >= 1) {
4440             if (!(fd->pool = hts_tpool_init(nthreads)))
4441                 return -1;
4442 
4443 	    fd->rqueue = hts_tpool_process_init(fd->pool, nthreads*2, 0);
4444 	    pthread_mutex_init(&fd->metrics_lock, NULL);
4445 	    pthread_mutex_init(&fd->ref_lock, NULL);
4446 	    pthread_mutex_init(&fd->bam_list_lock, NULL);
4447 	    fd->shared_ref = 1;
4448 	    fd->own_pool = 1;
4449         }
4450 	break;
4451     }
4452 
4453     case CRAM_OPT_THREAD_POOL: {
4454 	htsThreadPool *p = va_arg(args, htsThreadPool *);
4455 	fd->pool = p ? p->pool : NULL;
4456 	if (fd->pool) {
4457 	    fd->rqueue = hts_tpool_process_init(fd->pool,
4458 						p->qsize ? p->qsize : hts_tpool_size(fd->pool)*2,
4459 						0);
4460 	    pthread_mutex_init(&fd->metrics_lock, NULL);
4461 	    pthread_mutex_init(&fd->ref_lock, NULL);
4462 	    pthread_mutex_init(&fd->bam_list_lock, NULL);
4463 	}
4464 	fd->shared_ref = 1; // Needed to avoid clobbering ref between threads
4465 	fd->own_pool = 0;
4466 
4467 	//fd->qsize = 1;
4468 	//fd->decoded = calloc(fd->qsize, sizeof(cram_container *));
4469 	//hts_tpool_dispatch(fd->pool, cram_decoder_thread, fd);
4470 	break;
4471     }
4472 
4473     case CRAM_OPT_REQUIRED_FIELDS:
4474 	fd->required_fields = va_arg(args, int);
4475 	break;
4476 
4477     case HTS_OPT_COMPRESSION_LEVEL:
4478 	fd->level = va_arg(args, int);
4479 	break;
4480 
4481     default:
4482 	hts_log_error("Unknown CRAM option code %d", opt);
4483 	errno = EINVAL;
4484 	return -1;
4485     }
4486 
4487     return 0;
4488 }
4489 
cram_check_EOF(cram_fd * fd)4490 int cram_check_EOF(cram_fd *fd)
4491 {
4492     // Byte 9 in these templates is & with 0x0f to resolve differences
4493     // between ITF-8 interpretations between early Java and C
4494     // implementations of CRAM
4495     static const unsigned char TEMPLATE_2_1[30] = {
4496         0x0b, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
4497         0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
4498         0x01, 0x00, 0x06, 0x06, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00
4499     };
4500     static const unsigned char TEMPLATE_3[38] = {
4501         0x0f, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
4502         0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x05,
4503         0xbd, 0xd9, 0x4f, 0x00, 0x01, 0x00, 0x06, 0x06, 0x01, 0x00,
4504         0x01, 0x00, 0x01, 0x00, 0xee, 0x63, 0x01, 0x4b
4505     };
4506 
4507     unsigned char buf[38]; // max(sizeof TEMPLATE_*)
4508 
4509     uint8_t major = CRAM_MAJOR_VERS(fd->version);
4510     uint8_t minor = CRAM_MINOR_VERS(fd->version);
4511 
4512     const unsigned char *template;
4513     ssize_t template_len;
4514     if ((major < 2) ||
4515         (major == 2 && minor == 0)) {
4516         return 3; // No EOF support in cram versions less than 2.1
4517     } else if (major == 2 && minor == 1) {
4518         template = TEMPLATE_2_1;
4519         template_len = sizeof TEMPLATE_2_1;
4520     } else {
4521         template = TEMPLATE_3;
4522         template_len = sizeof TEMPLATE_3;
4523     }
4524 
4525     off_t offset = htell(fd->fp);
4526     if (hseek(fd->fp, -template_len, SEEK_END) < 0) {
4527         if (errno == ESPIPE) {
4528             hclearerr(fd->fp);
4529             return 2;
4530         }
4531         else {
4532             return -1;
4533         }
4534     }
4535     if (hread(fd->fp, buf, template_len) != template_len) return -1;
4536     if (hseek(fd->fp, offset, SEEK_SET) < 0) return -1;
4537     buf[8] &= 0x0f;
4538     return (memcmp(template, buf, template_len) == 0)? 1 : 0;
4539 }
4540