1 /*
2 Copyright (c) 2012-2021 Genome Research Ltd.
3 Author: James Bonfield <jkb@sanger.ac.uk>
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7
8 1. Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10
11 2. Redistributions in binary form must reproduce the above copyright notice,
12 this list of conditions and the following disclaimer in the documentation
13 and/or other materials provided with the distribution.
14
15 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16 Institute nor the names of its contributors may be used to endorse or promote
17 products derived from this software without specific prior written permission.
18
19 THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 /*
32 * CRAM I/O primitives.
33 *
34 * - ITF8 encoding and decoding.
35 * - Block based I/O
36 * - Zlib inflating and deflating (memory)
37 * - CRAM basic data structure reading and writing
38 * - File opening / closing
39 * - Reference sequence handling
40 */
41
42 #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
43 #include <config.h>
44
45 #include <stdio.h>
46 #include <errno.h>
47 #include <assert.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #include <unistd.h>
51 #include <zlib.h>
52 #ifdef HAVE_LIBBZ2
53 #include <bzlib.h>
54 #endif
55 #ifdef HAVE_LIBLZMA
56 #ifdef HAVE_LZMA_H
57 #include <lzma.h>
58 #else
59 #include "../os/lzma_stub.h"
60 #endif
61 #endif
62 #include <sys/types.h>
63 #include <sys/stat.h>
64 #include <math.h>
65 #include <stdint.h>
66
67 #ifdef HAVE_LIBDEFLATE
68 #include <libdeflate.h>
69 #define crc32(a,b,c) libdeflate_crc32((a),(b),(c))
70 #endif
71
72 #include "cram.h"
73 #include "os.h"
74 #include "../htslib/hts.h"
75 #include "open_trace_file.h"
76
77 #if defined(HAVE_EXTERNAL_LIBHTSCODECS)
78 #include <htscodecs/rANS_static.h>
79 #include <htscodecs/rANS_static4x16.h>
80 #include <htscodecs/arith_dynamic.h>
81 #include <htscodecs/tokenise_name3.h>
82 #include <htscodecs/fqzcomp_qual.h>
83 #include <htscodecs/varint.h> // CRAM v4.0 variable-size integers
84 #else
85 #include "../htscodecs/htscodecs/rANS_static.h"
86 #include "../htscodecs/htscodecs/rANS_static4x16.h"
87 #include "../htscodecs/htscodecs/arith_dynamic.h"
88 #include "../htscodecs/htscodecs/tokenise_name3.h"
89 #include "../htscodecs/htscodecs/fqzcomp_qual.h"
90 #include "../htscodecs/htscodecs/varint.h"
91 #endif
92
93 //#define REF_DEBUG
94
95 #ifdef REF_DEBUG
96 #include <sys/syscall.h>
97 #define gettid() (int)syscall(SYS_gettid)
98
99 #define RP(...) fprintf (stderr, __VA_ARGS__)
100 #else
101 #define RP(...)
102 #endif
103
104 #include "../htslib/hfile.h"
105 #include "../htslib/bgzf.h"
106 #include "../htslib/faidx.h"
107 #include "../hts_internal.h"
108
109 #ifndef PATH_MAX
110 #define PATH_MAX FILENAME_MAX
111 #endif
112
113 #define TRIAL_SPAN 70
114 #define NTRIALS 3
115
116 #define CRAM_DEFAULT_LEVEL 5
117
118 /* ----------------------------------------------------------------------
119 * ITF8 encoding and decoding.
120 *
121 * Also see the itf8_get and itf8_put macros in cram_io.h
122 */
123
124 /*
125 * LEGACY: consider using itf8_decode_crc.
126 *
127 * Reads an integer in ITF-8 encoding from 'cp' and stores it in
128 * *val.
129 *
130 * Returns the number of bytes read on success
131 * -1 on failure
132 */
itf8_decode(cram_fd * fd,int32_t * val_p)133 int itf8_decode(cram_fd *fd, int32_t *val_p) {
134 static int nbytes[16] = {
135 0,0,0,0, 0,0,0,0, // 0000xxxx - 0111xxxx
136 1,1,1,1, // 1000xxxx - 1011xxxx
137 2,2, // 1100xxxx - 1101xxxx
138 3, // 1110xxxx
139 4, // 1111xxxx
140 };
141
142 static int nbits[16] = {
143 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
144 0x3f, 0x3f, 0x3f, 0x3f, // 1000xxxx - 1011xxxx
145 0x1f, 0x1f, // 1100xxxx - 1101xxxx
146 0x0f, // 1110xxxx
147 0x0f, // 1111xxxx
148 };
149
150 int32_t val = hgetc(fd->fp);
151 if (val == -1)
152 return -1;
153
154 int i = nbytes[val>>4];
155 val &= nbits[val>>4];
156
157 switch(i) {
158 case 0:
159 *val_p = val;
160 return 1;
161
162 case 1:
163 val = (val<<8) | (unsigned char)hgetc(fd->fp);
164 *val_p = val;
165 return 2;
166
167 case 2:
168 val = (val<<8) | (unsigned char)hgetc(fd->fp);
169 val = (val<<8) | (unsigned char)hgetc(fd->fp);
170 *val_p = val;
171 return 3;
172
173 case 3:
174 val = (val<<8) | (unsigned char)hgetc(fd->fp);
175 val = (val<<8) | (unsigned char)hgetc(fd->fp);
176 val = (val<<8) | (unsigned char)hgetc(fd->fp);
177 *val_p = val;
178 return 4;
179
180 case 4: // really 3.5 more, why make it different?
181 val = (val<<8) | (unsigned char)hgetc(fd->fp);
182 val = (val<<8) | (unsigned char)hgetc(fd->fp);
183 val = (val<<8) | (unsigned char)hgetc(fd->fp);
184 val = (val<<4) | (((unsigned char)hgetc(fd->fp)) & 0x0f);
185 *val_p = val;
186 }
187
188 return 5;
189 }
190
itf8_decode_crc(cram_fd * fd,int32_t * val_p,uint32_t * crc)191 int itf8_decode_crc(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
192 static int nbytes[16] = {
193 0,0,0,0, 0,0,0,0, // 0000xxxx - 0111xxxx
194 1,1,1,1, // 1000xxxx - 1011xxxx
195 2,2, // 1100xxxx - 1101xxxx
196 3, // 1110xxxx
197 4, // 1111xxxx
198 };
199
200 static int nbits[16] = {
201 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
202 0x3f, 0x3f, 0x3f, 0x3f, // 1000xxxx - 1011xxxx
203 0x1f, 0x1f, // 1100xxxx - 1101xxxx
204 0x0f, // 1110xxxx
205 0x0f, // 1111xxxx
206 };
207 unsigned char c[5];
208
209 int32_t val = hgetc(fd->fp);
210 if (val == -1)
211 return -1;
212
213 c[0]=val;
214
215 int i = nbytes[val>>4];
216 val &= nbits[val>>4];
217
218 if (i > 0) {
219 if (hread(fd->fp, &c[1], i) < i)
220 return -1;
221 }
222
223 switch(i) {
224 case 0:
225 *val_p = val;
226 *crc = crc32(*crc, c, 1);
227 return 1;
228
229 case 1:
230 val = (val<<8) | c[1];
231 *val_p = val;
232 *crc = crc32(*crc, c, 2);
233 return 2;
234
235 case 2:
236 val = (val<<8) | c[1];
237 val = (val<<8) | c[2];
238 *val_p = val;
239 *crc = crc32(*crc, c, 3);
240 return 3;
241
242 case 3:
243 val = (val<<8) | c[1];
244 val = (val<<8) | c[2];
245 val = (val<<8) | c[3];
246 *val_p = val;
247 *crc = crc32(*crc, c, 4);
248 return 4;
249
250 case 4: // really 3.5 more, why make it different?
251 {
252 uint32_t uv = val;
253 uv = (uv<<8) | c[1];
254 uv = (uv<<8) | c[2];
255 uv = (uv<<8) | c[3];
256 uv = (uv<<4) | (c[4] & 0x0f);
257 // Avoid implementation-defined behaviour on negative values
258 *val_p = uv < 0x80000000UL ? (int32_t) uv : -((int32_t) (0xffffffffUL - uv)) - 1;
259 *crc = crc32(*crc, c, 5);
260 }
261 }
262
263 return 5;
264 }
265
266 /*
267 * Stores a value to memory in ITF-8 format.
268 *
269 * Returns the number of bytes required to store the number.
270 * This is a maximum of 5 bytes.
271 */
itf8_put(char * cp,int32_t val)272 static inline int itf8_put(char *cp, int32_t val) {
273 unsigned char *up = (unsigned char *)cp;
274 if (!(val & ~0x00000007f)) { // 1 byte
275 *up = val;
276 return 1;
277 } else if (!(val & ~0x00003fff)) { // 2 byte
278 *up++ = (val >> 8 ) | 0x80;
279 *up = val & 0xff;
280 return 2;
281 } else if (!(val & ~0x01fffff)) { // 3 byte
282 *up++ = (val >> 16) | 0xc0;
283 *up++ = (val >> 8 ) & 0xff;
284 *up = val & 0xff;
285 return 3;
286 } else if (!(val & ~0x0fffffff)) { // 4 byte
287 *up++ = (val >> 24) | 0xe0;
288 *up++ = (val >> 16) & 0xff;
289 *up++ = (val >> 8 ) & 0xff;
290 *up = val & 0xff;
291 return 4;
292 } else { // 5 byte
293 *up++ = 0xf0 | ((val>>28) & 0xff);
294 *up++ = (val >> 20) & 0xff;
295 *up++ = (val >> 12) & 0xff;
296 *up++ = (val >> 4 ) & 0xff;
297 *up = val & 0x0f;
298 return 5;
299 }
300 }
301
302
303 /* 64-bit itf8 variant */
ltf8_put(char * cp,int64_t val)304 static inline int ltf8_put(char *cp, int64_t val) {
305 unsigned char *up = (unsigned char *)cp;
306 if (!(val & ~((1LL<<7)-1))) {
307 *up = val;
308 return 1;
309 } else if (!(val & ~((1LL<<(6+8))-1))) {
310 *up++ = (val >> 8 ) | 0x80;
311 *up = val & 0xff;
312 return 2;
313 } else if (!(val & ~((1LL<<(5+2*8))-1))) {
314 *up++ = (val >> 16) | 0xc0;
315 *up++ = (val >> 8 ) & 0xff;
316 *up = val & 0xff;
317 return 3;
318 } else if (!(val & ~((1LL<<(4+3*8))-1))) {
319 *up++ = (val >> 24) | 0xe0;
320 *up++ = (val >> 16) & 0xff;
321 *up++ = (val >> 8 ) & 0xff;
322 *up = val & 0xff;
323 return 4;
324 } else if (!(val & ~((1LL<<(3+4*8))-1))) {
325 *up++ = (val >> 32) | 0xf0;
326 *up++ = (val >> 24) & 0xff;
327 *up++ = (val >> 16) & 0xff;
328 *up++ = (val >> 8 ) & 0xff;
329 *up = val & 0xff;
330 return 5;
331 } else if (!(val & ~((1LL<<(2+5*8))-1))) {
332 *up++ = (val >> 40) | 0xf8;
333 *up++ = (val >> 32) & 0xff;
334 *up++ = (val >> 24) & 0xff;
335 *up++ = (val >> 16) & 0xff;
336 *up++ = (val >> 8 ) & 0xff;
337 *up = val & 0xff;
338 return 6;
339 } else if (!(val & ~((1LL<<(1+6*8))-1))) {
340 *up++ = (val >> 48) | 0xfc;
341 *up++ = (val >> 40) & 0xff;
342 *up++ = (val >> 32) & 0xff;
343 *up++ = (val >> 24) & 0xff;
344 *up++ = (val >> 16) & 0xff;
345 *up++ = (val >> 8 ) & 0xff;
346 *up = val & 0xff;
347 return 7;
348 } else if (!(val & ~((1LL<<(7*8))-1))) {
349 *up++ = (val >> 56) | 0xfe;
350 *up++ = (val >> 48) & 0xff;
351 *up++ = (val >> 40) & 0xff;
352 *up++ = (val >> 32) & 0xff;
353 *up++ = (val >> 24) & 0xff;
354 *up++ = (val >> 16) & 0xff;
355 *up++ = (val >> 8 ) & 0xff;
356 *up = val & 0xff;
357 return 8;
358 } else {
359 *up++ = 0xff;
360 *up++ = (val >> 56) & 0xff;
361 *up++ = (val >> 48) & 0xff;
362 *up++ = (val >> 40) & 0xff;
363 *up++ = (val >> 32) & 0xff;
364 *up++ = (val >> 24) & 0xff;
365 *up++ = (val >> 16) & 0xff;
366 *up++ = (val >> 8 ) & 0xff;
367 *up = val & 0xff;
368 return 9;
369 }
370 }
371
372 /*
373 * Encodes and writes a single integer in ITF-8 format.
374 * Returns 0 on success
375 * -1 on failure
376 */
itf8_encode(cram_fd * fd,int32_t val)377 int itf8_encode(cram_fd *fd, int32_t val) {
378 char buf[5];
379 int len = itf8_put(buf, val);
380 return hwrite(fd->fp, buf, len) == len ? 0 : -1;
381 }
382
383 const int itf8_bytes[16] = {
384 1, 1, 1, 1, 1, 1, 1, 1,
385 2, 2, 2, 2, 3, 3, 4, 5
386 };
387
388 const int ltf8_bytes[256] = {
389 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
390 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
391 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
392 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
393
394 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
395 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
396 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
397 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
398
399 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
400 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
401 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
402 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
403
404 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
405 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
406
407 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
408
409 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 8, 9
410 };
411
412 /*
413 * LEGACY: consider using ltf8_decode_crc.
414 */
ltf8_decode(cram_fd * fd,int64_t * val_p)415 int ltf8_decode(cram_fd *fd, int64_t *val_p) {
416 int c = hgetc(fd->fp);
417 int64_t val = (unsigned char)c;
418 if (c == -1)
419 return -1;
420
421 if (val < 0x80) {
422 *val_p = val;
423 return 1;
424
425 } else if (val < 0xc0) {
426 val = (val<<8) | (unsigned char)hgetc(fd->fp);
427 *val_p = val & (((1LL<<(6+8)))-1);
428 return 2;
429
430 } else if (val < 0xe0) {
431 val = (val<<8) | (unsigned char)hgetc(fd->fp);
432 val = (val<<8) | (unsigned char)hgetc(fd->fp);
433 *val_p = val & ((1LL<<(5+2*8))-1);
434 return 3;
435
436 } else if (val < 0xf0) {
437 val = (val<<8) | (unsigned char)hgetc(fd->fp);
438 val = (val<<8) | (unsigned char)hgetc(fd->fp);
439 val = (val<<8) | (unsigned char)hgetc(fd->fp);
440 *val_p = val & ((1LL<<(4+3*8))-1);
441 return 4;
442
443 } else if (val < 0xf8) {
444 val = (val<<8) | (unsigned char)hgetc(fd->fp);
445 val = (val<<8) | (unsigned char)hgetc(fd->fp);
446 val = (val<<8) | (unsigned char)hgetc(fd->fp);
447 val = (val<<8) | (unsigned char)hgetc(fd->fp);
448 *val_p = val & ((1LL<<(3+4*8))-1);
449 return 5;
450
451 } else if (val < 0xfc) {
452 val = (val<<8) | (unsigned char)hgetc(fd->fp);
453 val = (val<<8) | (unsigned char)hgetc(fd->fp);
454 val = (val<<8) | (unsigned char)hgetc(fd->fp);
455 val = (val<<8) | (unsigned char)hgetc(fd->fp);
456 val = (val<<8) | (unsigned char)hgetc(fd->fp);
457 *val_p = val & ((1LL<<(2+5*8))-1);
458 return 6;
459
460 } else if (val < 0xfe) {
461 val = (val<<8) | (unsigned char)hgetc(fd->fp);
462 val = (val<<8) | (unsigned char)hgetc(fd->fp);
463 val = (val<<8) | (unsigned char)hgetc(fd->fp);
464 val = (val<<8) | (unsigned char)hgetc(fd->fp);
465 val = (val<<8) | (unsigned char)hgetc(fd->fp);
466 val = (val<<8) | (unsigned char)hgetc(fd->fp);
467 *val_p = val & ((1LL<<(1+6*8))-1);
468 return 7;
469
470 } else if (val < 0xff) {
471 val = (val<<8) | (unsigned char)hgetc(fd->fp);
472 val = (val<<8) | (unsigned char)hgetc(fd->fp);
473 val = (val<<8) | (unsigned char)hgetc(fd->fp);
474 val = (val<<8) | (unsigned char)hgetc(fd->fp);
475 val = (val<<8) | (unsigned char)hgetc(fd->fp);
476 val = (val<<8) | (unsigned char)hgetc(fd->fp);
477 val = (val<<8) | (unsigned char)hgetc(fd->fp);
478 *val_p = val & ((1LL<<(7*8))-1);
479 return 8;
480
481 } else {
482 val = (val<<8) | (unsigned char)hgetc(fd->fp);
483 val = (val<<8) | (unsigned char)hgetc(fd->fp);
484 val = (val<<8) | (unsigned char)hgetc(fd->fp);
485 val = (val<<8) | (unsigned char)hgetc(fd->fp);
486 val = (val<<8) | (unsigned char)hgetc(fd->fp);
487 val = (val<<8) | (unsigned char)hgetc(fd->fp);
488 val = (val<<8) | (unsigned char)hgetc(fd->fp);
489 val = (val<<8) | (unsigned char)hgetc(fd->fp);
490 *val_p = val;
491 }
492
493 return 9;
494 }
495
ltf8_decode_crc(cram_fd * fd,int64_t * val_p,uint32_t * crc)496 int ltf8_decode_crc(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
497 unsigned char c[9];
498 int64_t val = hgetc(fd->fp);
499 if (val < 0)
500 return -1;
501
502 c[0] = val;
503
504 if (val < 0x80) {
505 *val_p = val;
506 *crc = crc32(*crc, c, 1);
507 return 1;
508
509 } else if (val < 0xc0) {
510 int v = hgetc(fd->fp);
511 if (v < 0)
512 return -1;
513 val = (val<<8) | (c[1]=v);
514 *val_p = val & (((1LL<<(6+8)))-1);
515 *crc = crc32(*crc, c, 2);
516 return 2;
517
518 } else if (val < 0xe0) {
519 if (hread(fd->fp, &c[1], 2) < 2)
520 return -1;
521 val = (val<<8) | c[1];
522 val = (val<<8) | c[2];
523 *val_p = val & ((1LL<<(5+2*8))-1);
524 *crc = crc32(*crc, c, 3);
525 return 3;
526
527 } else if (val < 0xf0) {
528 if (hread(fd->fp, &c[1], 3) < 3)
529 return -1;
530 val = (val<<8) | c[1];
531 val = (val<<8) | c[2];
532 val = (val<<8) | c[3];
533 *val_p = val & ((1LL<<(4+3*8))-1);
534 *crc = crc32(*crc, c, 4);
535 return 4;
536
537 } else if (val < 0xf8) {
538 if (hread(fd->fp, &c[1], 4) < 4)
539 return -1;
540 val = (val<<8) | c[1];
541 val = (val<<8) | c[2];
542 val = (val<<8) | c[3];
543 val = (val<<8) | c[4];
544 *val_p = val & ((1LL<<(3+4*8))-1);
545 *crc = crc32(*crc, c, 5);
546 return 5;
547
548 } else if (val < 0xfc) {
549 if (hread(fd->fp, &c[1], 5) < 5)
550 return -1;
551 val = (val<<8) | c[1];
552 val = (val<<8) | c[2];
553 val = (val<<8) | c[3];
554 val = (val<<8) | c[4];
555 val = (val<<8) | c[5];
556 *val_p = val & ((1LL<<(2+5*8))-1);
557 *crc = crc32(*crc, c, 6);
558 return 6;
559
560 } else if (val < 0xfe) {
561 if (hread(fd->fp, &c[1], 6) < 6)
562 return -1;
563 val = (val<<8) | c[1];
564 val = (val<<8) | c[2];
565 val = (val<<8) | c[3];
566 val = (val<<8) | c[4];
567 val = (val<<8) | c[5];
568 val = (val<<8) | c[6];
569 *val_p = val & ((1LL<<(1+6*8))-1);
570 *crc = crc32(*crc, c, 7);
571 return 7;
572
573 } else if (val < 0xff) {
574 uint64_t uval = val;
575 if (hread(fd->fp, &c[1], 7) < 7)
576 return -1;
577 uval = (uval<<8) | c[1];
578 uval = (uval<<8) | c[2];
579 uval = (uval<<8) | c[3];
580 uval = (uval<<8) | c[4];
581 uval = (uval<<8) | c[5];
582 uval = (uval<<8) | c[6];
583 uval = (uval<<8) | c[7];
584 *val_p = uval & ((1ULL<<(7*8))-1);
585 *crc = crc32(*crc, c, 8);
586 return 8;
587
588 } else {
589 uint64_t uval;
590 if (hread(fd->fp, &c[1], 8) < 8)
591 return -1;
592 uval = c[1];
593 uval = (uval<<8) | c[2];
594 uval = (uval<<8) | c[3];
595 uval = (uval<<8) | c[4];
596 uval = (uval<<8) | c[5];
597 uval = (uval<<8) | c[6];
598 uval = (uval<<8) | c[7];
599 uval = (uval<<8) | c[8];
600 *crc = crc32(*crc, c, 9);
601 // Avoid implementation-defined behaviour on negative values
602 *val_p = c[1] < 0x80 ? (int64_t) uval : -((int64_t) (0xffffffffffffffffULL - uval)) - 1;
603 }
604
605 return 9;
606 }
607
608 /*
609 * Pushes a value in ITF8 format onto the end of a block.
610 * This shouldn't be used for high-volume data as it is not the fastest
611 * method.
612 *
613 * Returns the number of bytes written
614 */
itf8_put_blk(cram_block * blk,int32_t val)615 int itf8_put_blk(cram_block *blk, int32_t val) {
616 char buf[5];
617 int sz;
618
619 sz = itf8_put(buf, val);
620 BLOCK_APPEND(blk, buf, sz);
621 return sz;
622
623 block_err:
624 return -1;
625 }
626
ltf8_put_blk(cram_block * blk,int64_t val)627 int ltf8_put_blk(cram_block *blk, int64_t val) {
628 char buf[9];
629 int sz;
630
631 sz = ltf8_put(buf, val);
632 BLOCK_APPEND(blk, buf, sz);
633 return sz;
634
635 block_err:
636 return -1;
637 }
638
safe_itf8_get(char ** cp,const char * endp,int * err)639 static int64_t safe_itf8_get(char **cp, const char *endp, int *err) {
640 const unsigned char *up = (unsigned char *)*cp;
641
642 if (endp && endp - *cp < 5 &&
643 (*cp >= endp || endp - *cp < itf8_bytes[up[0]>>4])) {
644 if (err) *err = 1;
645 return 0;
646 }
647
648 if (up[0] < 0x80) {
649 (*cp)++;
650 return up[0];
651 } else if (up[0] < 0xc0) {
652 (*cp)+=2;
653 return ((up[0] <<8) | up[1]) & 0x3fff;
654 } else if (up[0] < 0xe0) {
655 (*cp)+=3;
656 return ((up[0]<<16) | (up[1]<< 8) | up[2]) & 0x1fffff;
657 } else if (up[0] < 0xf0) {
658 (*cp)+=4;
659 uint32_t uv = (((uint32_t)up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff;
660 return (int32_t)uv;
661 } else {
662 (*cp)+=5;
663 uint32_t uv = (((uint32_t)up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f);
664 return (int32_t)uv;
665 }
666 }
667
safe_ltf8_get(char ** cp,const char * endp,int * err)668 static int64_t safe_ltf8_get(char **cp, const char *endp, int *err) {
669 unsigned char *up = (unsigned char *)*cp;
670
671 if (endp && endp - *cp < 9 &&
672 (*cp >= endp || endp - *cp < ltf8_bytes[up[0]])) {
673 if (err) *err = 1;
674 return 0;
675 }
676
677 if (up[0] < 0x80) {
678 (*cp)++;
679 return up[0];
680 } else if (up[0] < 0xc0) {
681 (*cp)+=2;
682 return (((uint64_t)up[0]<< 8) |
683 (uint64_t)up[1]) & (((1LL<<(6+8)))-1);
684 } else if (up[0] < 0xe0) {
685 (*cp)+=3;
686 return (((uint64_t)up[0]<<16) |
687 ((uint64_t)up[1]<< 8) |
688 (uint64_t)up[2]) & ((1LL<<(5+2*8))-1);
689 } else if (up[0] < 0xf0) {
690 (*cp)+=4;
691 return (((uint64_t)up[0]<<24) |
692 ((uint64_t)up[1]<<16) |
693 ((uint64_t)up[2]<< 8) |
694 (uint64_t)up[3]) & ((1LL<<(4+3*8))-1);
695 } else if (up[0] < 0xf8) {
696 (*cp)+=5;
697 return (((uint64_t)up[0]<<32) |
698 ((uint64_t)up[1]<<24) |
699 ((uint64_t)up[2]<<16) |
700 ((uint64_t)up[3]<< 8) |
701 (uint64_t)up[4]) & ((1LL<<(3+4*8))-1);
702 } else if (up[0] < 0xfc) {
703 (*cp)+=6;
704 return (((uint64_t)up[0]<<40) |
705 ((uint64_t)up[1]<<32) |
706 ((uint64_t)up[2]<<24) |
707 ((uint64_t)up[3]<<16) |
708 ((uint64_t)up[4]<< 8) |
709 (uint64_t)up[5]) & ((1LL<<(2+5*8))-1);
710 } else if (up[0] < 0xfe) {
711 (*cp)+=7;
712 return (((uint64_t)up[0]<<48) |
713 ((uint64_t)up[1]<<40) |
714 ((uint64_t)up[2]<<32) |
715 ((uint64_t)up[3]<<24) |
716 ((uint64_t)up[4]<<16) |
717 ((uint64_t)up[5]<< 8) |
718 (uint64_t)up[6]) & ((1LL<<(1+6*8))-1);
719 } else if (up[0] < 0xff) {
720 (*cp)+=8;
721 return (((uint64_t)up[1]<<48) |
722 ((uint64_t)up[2]<<40) |
723 ((uint64_t)up[3]<<32) |
724 ((uint64_t)up[4]<<24) |
725 ((uint64_t)up[5]<<16) |
726 ((uint64_t)up[6]<< 8) |
727 (uint64_t)up[7]) & ((1LL<<(7*8))-1);
728 } else {
729 (*cp)+=9;
730 return (((uint64_t)up[1]<<56) |
731 ((uint64_t)up[2]<<48) |
732 ((uint64_t)up[3]<<40) |
733 ((uint64_t)up[4]<<32) |
734 ((uint64_t)up[5]<<24) |
735 ((uint64_t)up[6]<<16) |
736 ((uint64_t)up[7]<< 8) |
737 (uint64_t)up[8]);
738 }
739 }
740
741 // Wrapper for now
safe_itf8_put(char * cp,char * cp_end,int32_t val)742 static int safe_itf8_put(char *cp, char *cp_end, int32_t val) {
743 return itf8_put(cp, val);
744 }
745
safe_ltf8_put(char * cp,char * cp_end,int64_t val)746 static int safe_ltf8_put(char *cp, char *cp_end, int64_t val) {
747 return ltf8_put(cp, val);
748 }
749
itf8_size(int64_t v)750 static int itf8_size(int64_t v) {
751 return ((!((v)&~0x7f))?1:(!((v)&~0x3fff))?2:(!((v)&~0x1fffff))?3:(!((v)&~0xfffffff))?4:5);
752 }
753
754 //-----------------------------------------------------------------------------
755
756 // CRAM v4.0 onwards uses a different variable sized integer encoding
757 // that is size agnostic.
758
759 // Local interface to varint.h inline version, so we can use in func ptr.
760 // Note a lot of these use the unsigned interface but take signed int64_t.
761 // This is because the old CRAM ITF8 inteface had signed -1 as unsigned
762 // 0xffffffff.
uint7_size(int64_t v)763 static int uint7_size(int64_t v) {
764 return var_size_u64(v);
765 }
766
uint7_get_32(char ** cp,const char * endp,int * err)767 static int64_t uint7_get_32(char **cp, const char *endp, int *err) {
768 uint32_t val;
769 int nb = var_get_u32((uint8_t *)(*cp), (const uint8_t *)endp, &val);
770 (*cp) += nb;
771 if (!nb && err) *err = 1;
772 return val;
773 }
774
sint7_get_32(char ** cp,const char * endp,int * err)775 static int64_t sint7_get_32(char **cp, const char *endp, int *err) {
776 int32_t val;
777 int nb = var_get_s32((uint8_t *)(*cp), (const uint8_t *)endp, &val);
778 (*cp) += nb;
779 if (!nb && err) *err = 1;
780 return val;
781 }
782
uint7_get_64(char ** cp,const char * endp,int * err)783 static int64_t uint7_get_64(char **cp, const char *endp, int *err) {
784 uint64_t val;
785 int nb = var_get_u64((uint8_t *)(*cp), (const uint8_t *)endp, &val);
786 (*cp) += nb;
787 if (!nb && err) *err = 1;
788 return val;
789 }
790
sint7_get_64(char ** cp,const char * endp,int * err)791 static int64_t sint7_get_64(char **cp, const char *endp, int *err) {
792 int64_t val;
793 int nb = var_get_s64((uint8_t *)(*cp), (const uint8_t *)endp, &val);
794 (*cp) += nb;
795 if (!nb && err) *err = 1;
796 return val;
797 }
798
uint7_put_32(char * cp,char * endp,int32_t val)799 static int uint7_put_32(char *cp, char *endp, int32_t val) {
800 return var_put_u32((uint8_t *)cp, (uint8_t *)endp, val);
801 }
802
sint7_put_32(char * cp,char * endp,int32_t val)803 static int sint7_put_32(char *cp, char *endp, int32_t val) {
804 return var_put_s32((uint8_t *)cp, (uint8_t *)endp, val);
805 }
806
uint7_put_64(char * cp,char * endp,int64_t val)807 static int uint7_put_64(char *cp, char *endp, int64_t val) {
808 return var_put_u64((uint8_t *)cp, (uint8_t *)endp, val);
809 }
810
sint7_put_64(char * cp,char * endp,int64_t val)811 static int sint7_put_64(char *cp, char *endp, int64_t val) {
812 return var_put_s64((uint8_t *)cp, (uint8_t *)endp, val);
813 }
814
815 // Put direct to to cram_block
uint7_put_blk_32(cram_block * blk,int32_t v)816 static int uint7_put_blk_32(cram_block *blk, int32_t v) {
817 uint8_t buf[10];
818 int sz = var_put_u32(buf, buf+10, v);
819 BLOCK_APPEND(blk, buf, sz);
820 return sz;
821
822 block_err:
823 return -1;
824 }
825
sint7_put_blk_32(cram_block * blk,int32_t v)826 static int sint7_put_blk_32(cram_block *blk, int32_t v) {
827 uint8_t buf[10];
828 int sz = var_put_s32(buf, buf+10, v);
829 BLOCK_APPEND(blk, buf, sz);
830 return sz;
831
832 block_err:
833 return -1;
834 }
835
uint7_put_blk_64(cram_block * blk,int64_t v)836 static int uint7_put_blk_64(cram_block *blk, int64_t v) {
837 uint8_t buf[10];
838 int sz = var_put_u64(buf, buf+10, v);
839 BLOCK_APPEND(blk, buf, sz);
840 return sz;
841
842 block_err:
843 return -1;
844 }
845
sint7_put_blk_64(cram_block * blk,int64_t v)846 static int sint7_put_blk_64(cram_block *blk, int64_t v) {
847 uint8_t buf[10];
848 int sz = var_put_s64(buf, buf+10, v);
849 BLOCK_APPEND(blk, buf, sz);
850 return sz;
851
852 block_err:
853 return -1;
854 }
855
856 // Decode 32-bits with CRC update from cram_fd
uint7_decode_crc32(cram_fd * fd,int32_t * val_p,uint32_t * crc)857 static int uint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
858 uint8_t b[5], i = 0;
859 int c;
860 uint32_t v = 0;
861
862 #ifdef VARINT2
863 b[0] = hgetc(fd->fp);
864 if (b[0] < 177) {
865 } else if (b[0] < 241) {
866 b[1] = hgetc(fd->fp);
867 } else if (b[0] < 249) {
868 b[1] = hgetc(fd->fp);
869 b[2] = hgetc(fd->fp);
870 } else {
871 int n = b[0]+2, z = 1;
872 while (n-- >= 249)
873 b[z++] = hgetc(fd->fp);
874 }
875 i = var_get_u32(b, NULL, &v);
876 #else
877 // // Little endian
878 // int s = 0;
879 // do {
880 // b[i++] = c = hgetc(fd->fp);
881 // if (c < 0)
882 // return -1;
883 // v |= (c & 0x7f) << s;
884 // s += 7;
885 // } while (i < 5 && (c & 0x80));
886
887 // Big endian, see also htscodecs/varint.h
888 do {
889 b[i++] = c = hgetc(fd->fp);
890 if (c < 0)
891 return -1;
892 v = (v<<7) | (c & 0x7f);
893 } while (i < 5 && (c & 0x80));
894 #endif
895 *crc = crc32(*crc, b, i);
896
897 *val_p = v;
898 return i;
899 }
900
901 // Decode 32-bits with CRC update from cram_fd
sint7_decode_crc32(cram_fd * fd,int32_t * val_p,uint32_t * crc)902 static int sint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
903 uint8_t b[5], i = 0;
904 int c;
905 uint32_t v = 0;
906
907 #ifdef VARINT2
908 b[0] = hgetc(fd->fp);
909 if (b[0] < 177) {
910 } else if (b[0] < 241) {
911 b[1] = hgetc(fd->fp);
912 } else if (b[0] < 249) {
913 b[1] = hgetc(fd->fp);
914 b[2] = hgetc(fd->fp);
915 } else {
916 int n = b[0]+2, z = 1;
917 while (n-- >= 249)
918 b[z++] = hgetc(fd->fp);
919 }
920 i = var_get_u32(b, NULL, &v);
921 #else
922 // // Little endian
923 // int s = 0;
924 // do {
925 // b[i++] = c = hgetc(fd->fp);
926 // if (c < 0)
927 // return -1;
928 // v |= (c & 0x7f) << s;
929 // s += 7;
930 // } while (i < 5 && (c & 0x80));
931
932 // Big endian, see also htscodecs/varint.h
933 do {
934 b[i++] = c = hgetc(fd->fp);
935 if (c < 0)
936 return -1;
937 v = (v<<7) | (c & 0x7f);
938 } while (i < 5 && (c & 0x80));
939 #endif
940 *crc = crc32(*crc, b, i);
941
942 *val_p = (v>>1) ^ -(v&1);
943 return i;
944 }
945
946
947 // Decode 64-bits with CRC update from cram_fd
uint7_decode_crc64(cram_fd * fd,int64_t * val_p,uint32_t * crc)948 static int uint7_decode_crc64(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
949 uint8_t b[10], i = 0;
950 int c;
951 uint64_t v = 0;
952
953 #ifdef VARINT2
954 b[0] = hgetc(fd->fp);
955 if (b[0] < 177) {
956 } else if (b[0] < 241) {
957 b[1] = hgetc(fd->fp);
958 } else if (b[0] < 249) {
959 b[1] = hgetc(fd->fp);
960 b[2] = hgetc(fd->fp);
961 } else {
962 int n = b[0]+2, z = 1;
963 while (n-- >= 249)
964 b[z++] = hgetc(fd->fp);
965 }
966 i = var_get_u64(b, NULL, &v);
967 #else
968 // // Little endian
969 // int s = 0;
970 // do {
971 // b[i++] = c = hgetc(fd->fp);
972 // if (c < 0)
973 // return -1;
974 // v |= (c & 0x7f) << s;
975 // s += 7;
976 // } while (i < 10 && (c & 0x80));
977
978 // Big endian, see also htscodecs/varint.h
979 do {
980 b[i++] = c = hgetc(fd->fp);
981 if (c < 0)
982 return -1;
983 v = (v<<7) | (c & 0x7f);
984 } while (i < 5 && (c & 0x80));
985 #endif
986 *crc = crc32(*crc, b, i);
987
988 *val_p = v;
989 return i;
990 }
991
992 //-----------------------------------------------------------------------------
993
994 /*
995 * Decodes a 32-bit little endian value from fd and stores in val.
996 *
997 * Returns the number of bytes read on success
998 * -1 on failure
999 */
int32_decode(cram_fd * fd,int32_t * val)1000 static int int32_decode(cram_fd *fd, int32_t *val) {
1001 int32_t i;
1002 if (4 != hread(fd->fp, &i, 4))
1003 return -1;
1004
1005 *val = le_int4(i);
1006 return 4;
1007 }
1008
1009 /*
1010 * Encodes a 32-bit little endian value 'val' and writes to fd.
1011 *
1012 * Returns the number of bytes written on success
1013 * -1 on failure
1014 */
int32_encode(cram_fd * fd,int32_t val)1015 static int int32_encode(cram_fd *fd, int32_t val) {
1016 uint32_t v = le_int4(val);
1017 if (4 != hwrite(fd->fp, &v, 4))
1018 return -1;
1019
1020 return 4;
1021 }
1022
1023 /* As int32_decoded/encode, but from/to blocks instead of cram_fd */
int32_get_blk(cram_block * b,int32_t * val)1024 int int32_get_blk(cram_block *b, int32_t *val) {
1025 if (b->uncomp_size - BLOCK_SIZE(b) < 4)
1026 return -1;
1027
1028 uint32_t v =
1029 ((uint32_t) b->data[b->byte ]) |
1030 (((uint32_t) b->data[b->byte+1]) << 8) |
1031 (((uint32_t) b->data[b->byte+2]) << 16) |
1032 (((uint32_t) b->data[b->byte+3]) << 24);
1033 // Avoid implementation-defined behaviour on negative values
1034 *val = v < 0x80000000U ? (int32_t) v : -((int32_t) (0xffffffffU - v)) - 1;
1035 BLOCK_SIZE(b) += 4;
1036 return 4;
1037 }
1038
1039 /* As int32_decoded/encode, but from/to blocks instead of cram_fd */
int32_put_blk(cram_block * b,int32_t val)1040 int int32_put_blk(cram_block *b, int32_t val) {
1041 unsigned char cp[4];
1042 uint32_t v = val;
1043 cp[0] = ( v & 0xff);
1044 cp[1] = ((v>>8) & 0xff);
1045 cp[2] = ((v>>16) & 0xff);
1046 cp[3] = ((v>>24) & 0xff);
1047
1048 BLOCK_APPEND(b, cp, 4);
1049 return 0;
1050
1051 block_err:
1052 return -1;
1053 }
1054
1055 #ifdef HAVE_LIBDEFLATE
1056 /* ----------------------------------------------------------------------
1057 * libdeflate compression code, with interface to match
1058 * zlib_mem_{in,de}flate for simplicity elsewhere.
1059 */
1060
1061 // Named the same as the version that uses zlib as we always use libdeflate for
1062 // decompression when available.
zlib_mem_inflate(char * cdata,size_t csize,size_t * size)1063 char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
1064 struct libdeflate_decompressor *z = libdeflate_alloc_decompressor();
1065 if (!z) {
1066 hts_log_error("Call to libdeflate_alloc_decompressor failed");
1067 return NULL;
1068 }
1069
1070 uint8_t *data = NULL, *new_data;
1071 if (!*size)
1072 *size = csize*2;
1073 for(;;) {
1074 new_data = realloc(data, *size);
1075 if (!new_data) {
1076 hts_log_error("Memory allocation failure");
1077 goto fail;
1078 }
1079 data = new_data;
1080
1081 int ret = libdeflate_gzip_decompress(z, cdata, csize, data, *size, size);
1082
1083 // Auto grow output buffer size if needed and try again.
1084 // Fortunately for all bar one call of this we know the size already.
1085 if (ret == LIBDEFLATE_INSUFFICIENT_SPACE) {
1086 (*size) *= 1.5;
1087 continue;
1088 }
1089
1090 if (ret != LIBDEFLATE_SUCCESS) {
1091 hts_log_error("Inflate operation failed: %d", ret);
1092 goto fail;
1093 } else {
1094 break;
1095 }
1096 }
1097
1098 libdeflate_free_decompressor(z);
1099 return (char *)data;
1100
1101 fail:
1102 libdeflate_free_decompressor(z);
1103 free(data);
1104 return NULL;
1105 }
1106
1107 // Named differently as we use both zlib/libdeflate for compression.
libdeflate_deflate(char * data,size_t size,size_t * cdata_size,int level,int strat)1108 static char *libdeflate_deflate(char *data, size_t size, size_t *cdata_size,
1109 int level, int strat) {
1110 level = level > 0 ? level : 6; // libdeflate doesn't honour -1 as default
1111 level *= 1.2; // NB levels go up to 12 here; 5 onwards is +1
1112 if (level >= 8) level += level/8; // 8->10, 9->12
1113 if (level > 12) level = 12;
1114
1115 if (strat == Z_RLE) // not supported by libdeflate
1116 level = 1;
1117
1118 struct libdeflate_compressor *z = libdeflate_alloc_compressor(level);
1119 if (!z) {
1120 hts_log_error("Call to libdeflate_alloc_compressor failed");
1121 return NULL;
1122 }
1123
1124 unsigned char *cdata = NULL; /* Compressed output */
1125 size_t cdata_alloc;
1126 cdata = malloc(cdata_alloc = size*1.05+100);
1127 if (!cdata) {
1128 hts_log_error("Memory allocation failure");
1129 libdeflate_free_compressor(z);
1130 return NULL;
1131 }
1132
1133 *cdata_size = libdeflate_gzip_compress(z, data, size, cdata, cdata_alloc);
1134 libdeflate_free_compressor(z);
1135
1136 if (*cdata_size == 0) {
1137 hts_log_error("Call to libdeflate_gzip_compress failed");
1138 free(cdata);
1139 return NULL;
1140 }
1141
1142 return (char *)cdata;
1143 }
1144
1145 #else
1146
1147 /* ----------------------------------------------------------------------
1148 * zlib compression code - from Gap5's tg_iface_g.c
1149 * They're static here as they're only used within the cram_compress_block
1150 * and cram_uncompress_block functions, which are the external interface.
1151 */
zlib_mem_inflate(char * cdata,size_t csize,size_t * size)1152 char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
1153 z_stream s;
1154 unsigned char *data = NULL; /* Uncompressed output */
1155 int data_alloc = 0;
1156 int err;
1157
1158 /* Starting point at uncompressed size, and scale after that */
1159 data = malloc(data_alloc = csize*1.2+100);
1160 if (!data)
1161 return NULL;
1162
1163 /* Initialise zlib stream */
1164 s.zalloc = Z_NULL; /* use default allocation functions */
1165 s.zfree = Z_NULL;
1166 s.opaque = Z_NULL;
1167 s.next_in = (unsigned char *)cdata;
1168 s.avail_in = csize;
1169 s.total_in = 0;
1170 s.next_out = data;
1171 s.avail_out = data_alloc;
1172 s.total_out = 0;
1173
1174 //err = inflateInit(&s);
1175 err = inflateInit2(&s, 15 + 32);
1176 if (err != Z_OK) {
1177 hts_log_error("Call to zlib inflateInit failed: %s", s.msg);
1178 free(data);
1179 return NULL;
1180 }
1181
1182 /* Decode to 'data' array */
1183 for (;s.avail_in;) {
1184 unsigned char *data_tmp;
1185 int alloc_inc;
1186
1187 s.next_out = &data[s.total_out];
1188 err = inflate(&s, Z_NO_FLUSH);
1189 if (err == Z_STREAM_END)
1190 break;
1191
1192 if (err != Z_OK) {
1193 hts_log_error("Call to zlib inflate failed: %s", s.msg);
1194 free(data);
1195 inflateEnd(&s);
1196 return NULL;
1197 }
1198
1199 /* More to come, so realloc based on growth so far */
1200 alloc_inc = (double)s.avail_in/s.total_in * s.total_out + 100;
1201 data = realloc((data_tmp = data), data_alloc += alloc_inc);
1202 if (!data) {
1203 free(data_tmp);
1204 inflateEnd(&s);
1205 return NULL;
1206 }
1207 s.avail_out += alloc_inc;
1208 }
1209 inflateEnd(&s);
1210
1211 *size = s.total_out;
1212 return (char *)data;
1213 }
1214 #endif
1215
zlib_mem_deflate(char * data,size_t size,size_t * cdata_size,int level,int strat)1216 static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size,
1217 int level, int strat) {
1218 z_stream s;
1219 unsigned char *cdata = NULL; /* Compressed output */
1220 int cdata_alloc = 0;
1221 int cdata_pos = 0;
1222 int err;
1223
1224 cdata = malloc(cdata_alloc = size*1.05+100);
1225 if (!cdata)
1226 return NULL;
1227 cdata_pos = 0;
1228
1229 /* Initialise zlib stream */
1230 s.zalloc = Z_NULL; /* use default allocation functions */
1231 s.zfree = Z_NULL;
1232 s.opaque = Z_NULL;
1233 s.next_in = (unsigned char *)data;
1234 s.avail_in = size;
1235 s.total_in = 0;
1236 s.next_out = cdata;
1237 s.avail_out = cdata_alloc;
1238 s.total_out = 0;
1239 s.data_type = Z_BINARY;
1240
1241 err = deflateInit2(&s, level, Z_DEFLATED, 15|16, 9, strat);
1242 if (err != Z_OK) {
1243 hts_log_error("Call to zlib deflateInit2 failed: %s", s.msg);
1244 return NULL;
1245 }
1246
1247 /* Encode to 'cdata' array */
1248 for (;s.avail_in;) {
1249 s.next_out = &cdata[cdata_pos];
1250 s.avail_out = cdata_alloc - cdata_pos;
1251 if (cdata_alloc - cdata_pos <= 0) {
1252 hts_log_error("Deflate produced larger output than expected");
1253 return NULL;
1254 }
1255 err = deflate(&s, Z_NO_FLUSH);
1256 cdata_pos = cdata_alloc - s.avail_out;
1257 if (err != Z_OK) {
1258 hts_log_error("Call to zlib deflate failed: %s", s.msg);
1259 break;
1260 }
1261 }
1262 if (deflate(&s, Z_FINISH) != Z_STREAM_END) {
1263 hts_log_error("Call to zlib deflate failed: %s", s.msg);
1264 }
1265 *cdata_size = s.total_out;
1266
1267 if (deflateEnd(&s) != Z_OK) {
1268 hts_log_error("Call to zlib deflate failed: %s", s.msg);
1269 }
1270 return (char *)cdata;
1271 }
1272
1273 #ifdef HAVE_LIBLZMA
1274 /* ------------------------------------------------------------------------ */
1275 /*
1276 * Data compression routines using liblzma (xz)
1277 *
1278 * On a test set this shrunk the main db from 136157104 bytes to 114796168, but
1279 * caused tg_index to grow from 2m43.707s to 15m3.961s. Exporting as bfastq
1280 * went from 18.3s to 36.3s. So decompression suffers too, but not as bad
1281 * as compression times.
1282 *
1283 * For now we disable this functionality. If it's to be reenabled make sure you
1284 * improve the mem_inflate implementation as it's just a test hack at the
1285 * moment.
1286 */
1287
lzma_mem_deflate(char * data,size_t size,size_t * cdata_size,int level)1288 static char *lzma_mem_deflate(char *data, size_t size, size_t *cdata_size,
1289 int level) {
1290 char *out;
1291 size_t out_size = lzma_stream_buffer_bound(size);
1292 *cdata_size = 0;
1293
1294 out = malloc(out_size);
1295
1296 /* Single call compression */
1297 if (LZMA_OK != lzma_easy_buffer_encode(level, LZMA_CHECK_CRC32, NULL,
1298 (uint8_t *)data, size,
1299 (uint8_t *)out, cdata_size,
1300 out_size))
1301 return NULL;
1302
1303 return out;
1304 }
1305
lzma_mem_inflate(char * cdata,size_t csize,size_t * size)1306 static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) {
1307 lzma_stream strm = LZMA_STREAM_INIT;
1308 size_t out_size = 0, out_pos = 0;
1309 char *out = NULL, *new_out;
1310 int r;
1311
1312 /* Initiate the decoder */
1313 if (LZMA_OK != lzma_stream_decoder(&strm, lzma_easy_decoder_memusage(9), 0))
1314 return NULL;
1315
1316 /* Decode loop */
1317 strm.avail_in = csize;
1318 strm.next_in = (uint8_t *)cdata;
1319
1320 for (;strm.avail_in;) {
1321 if (strm.avail_in > out_size - out_pos) {
1322 out_size += strm.avail_in * 4 + 32768;
1323 new_out = realloc(out, out_size);
1324 if (!new_out)
1325 goto fail;
1326 out = new_out;
1327 }
1328 strm.avail_out = out_size - out_pos;
1329 strm.next_out = (uint8_t *)&out[out_pos];
1330
1331 r = lzma_code(&strm, LZMA_RUN);
1332 if (LZMA_OK != r && LZMA_STREAM_END != r) {
1333 hts_log_error("LZMA decode failure (error %d)", r);
1334 goto fail;
1335 }
1336
1337 out_pos = strm.total_out;
1338
1339 if (r == LZMA_STREAM_END)
1340 break;
1341 }
1342
1343 /* finish up any unflushed data; necessary? */
1344 r = lzma_code(&strm, LZMA_FINISH);
1345 if (r != LZMA_OK && r != LZMA_STREAM_END) {
1346 hts_log_error("Call to lzma_code failed with error %d", r);
1347 goto fail;
1348 }
1349
1350 new_out = realloc(out, strm.total_out > 0 ? strm.total_out : 1);
1351 if (new_out)
1352 out = new_out;
1353 *size = strm.total_out;
1354
1355 lzma_end(&strm);
1356
1357 return out;
1358
1359 fail:
1360 lzma_end(&strm);
1361 free(out);
1362 return NULL;
1363 }
1364 #endif
1365
1366 /* ----------------------------------------------------------------------
1367 * CRAM blocks - the dynamically growable data block. We have code to
1368 * create, update, (un)compress and read/write.
1369 *
1370 * These are derived from the deflate_interlaced.c blocks, but with the
1371 * CRAM extension of content types and IDs.
1372 */
1373
1374 /*
1375 * Allocates a new cram_block structure with a specified content_type and
1376 * id.
1377 *
1378 * Returns block pointer on success
1379 * NULL on failure
1380 */
cram_new_block(enum cram_content_type content_type,int content_id)1381 cram_block *cram_new_block(enum cram_content_type content_type,
1382 int content_id) {
1383 cram_block *b = malloc(sizeof(*b));
1384 if (!b)
1385 return NULL;
1386 b->method = b->orig_method = RAW;
1387 b->content_type = content_type;
1388 b->content_id = content_id;
1389 b->comp_size = 0;
1390 b->uncomp_size = 0;
1391 b->data = NULL;
1392 b->alloc = 0;
1393 b->byte = 0;
1394 b->bit = 7; // MSB
1395 b->crc32 = 0;
1396 b->idx = 0;
1397 b->m = NULL;
1398
1399 return b;
1400 }
1401
1402 /*
1403 * Reads a block from a cram file.
1404 * Returns cram_block pointer on success.
1405 * NULL on failure
1406 */
cram_read_block(cram_fd * fd)1407 cram_block *cram_read_block(cram_fd *fd) {
1408 cram_block *b = malloc(sizeof(*b));
1409 unsigned char c;
1410 uint32_t crc = 0;
1411 if (!b)
1412 return NULL;
1413
1414 //fprintf(stderr, "Block at %d\n", (int)ftell(fd->fp));
1415
1416 if (-1 == (b->method = hgetc(fd->fp))) { free(b); return NULL; }
1417 c = b->method; crc = crc32(crc, &c, 1);
1418 if (-1 == (b->content_type= hgetc(fd->fp))) { free(b); return NULL; }
1419 c = b->content_type; crc = crc32(crc, &c, 1);
1420 if (-1 == fd->vv.varint_decode32_crc(fd, &b->content_id, &crc)) { free(b); return NULL; }
1421 if (-1 == fd->vv.varint_decode32_crc(fd, &b->comp_size, &crc)) { free(b); return NULL; }
1422 if (-1 == fd->vv.varint_decode32_crc(fd, &b->uncomp_size, &crc)) { free(b); return NULL; }
1423
1424 //fprintf(stderr, " method %d, ctype %d, cid %d, csize %d, ucsize %d\n",
1425 // b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size);
1426
1427 if (b->method == RAW) {
1428 if (b->uncomp_size < 0 || b->comp_size != b->uncomp_size) {
1429 free(b);
1430 return NULL;
1431 }
1432 b->alloc = b->uncomp_size;
1433 if (!(b->data = malloc(b->uncomp_size))){ free(b); return NULL; }
1434 if (b->uncomp_size != hread(fd->fp, b->data, b->uncomp_size)) {
1435 free(b->data);
1436 free(b);
1437 return NULL;
1438 }
1439 } else {
1440 if (b->comp_size < 0 || b->uncomp_size < 0) {
1441 free(b);
1442 return NULL;
1443 }
1444 b->alloc = b->comp_size;
1445 if (!(b->data = malloc(b->comp_size))) { free(b); return NULL; }
1446 if (b->comp_size != hread(fd->fp, b->data, b->comp_size)) {
1447 free(b->data);
1448 free(b);
1449 return NULL;
1450 }
1451 }
1452
1453 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1454 if (-1 == int32_decode(fd, (int32_t *)&b->crc32)) {
1455 free(b->data);
1456 free(b);
1457 return NULL;
1458 }
1459
1460 b->crc32_checked = fd->ignore_md5;
1461 b->crc_part = crc;
1462 } else {
1463 b->crc32_checked = 1; // CRC not present
1464 }
1465
1466 b->orig_method = b->method;
1467 b->idx = 0;
1468 b->byte = 0;
1469 b->bit = 7; // MSB
1470
1471 return b;
1472 }
1473
1474
1475 /*
1476 * Computes the size of a cram block, including the block
1477 * header itself.
1478 */
cram_block_size(cram_block * b)1479 uint32_t cram_block_size(cram_block *b) {
1480 unsigned char dat[100], *cp = dat;;
1481 uint32_t sz;
1482
1483 *cp++ = b->method;
1484 *cp++ = b->content_type;
1485 cp += itf8_put((char*)cp, b->content_id);
1486 cp += itf8_put((char*)cp, b->comp_size);
1487 cp += itf8_put((char*)cp, b->uncomp_size);
1488
1489 sz = cp-dat + 4;
1490 sz += b->method == RAW ? b->uncomp_size : b->comp_size;
1491
1492 return sz;
1493 }
1494
1495 /*
1496 * Writes a CRAM block.
1497 * Returns 0 on success
1498 * -1 on failure
1499 */
cram_write_block(cram_fd * fd,cram_block * b)1500 int cram_write_block(cram_fd *fd, cram_block *b) {
1501 char vardata[100];
1502 int vardata_o = 0;
1503
1504 assert(b->method != RAW || (b->comp_size == b->uncomp_size));
1505
1506 if (hputc(b->method, fd->fp) == EOF) return -1;
1507 if (hputc(b->content_type, fd->fp) == EOF) return -1;
1508 vardata_o += fd->vv.varint_put32(vardata , vardata+100, b->content_id);
1509 vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->comp_size);
1510 vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->uncomp_size);
1511 if (vardata_o != hwrite(fd->fp, vardata, vardata_o))
1512 return -1;
1513
1514 if (b->data) {
1515 if (b->method == RAW) {
1516 if (b->uncomp_size != hwrite(fd->fp, b->data, b->uncomp_size))
1517 return -1;
1518 } else {
1519 if (b->comp_size != hwrite(fd->fp, b->data, b->comp_size))
1520 return -1;
1521 }
1522 } else {
1523 // Absent blocks should be size 0
1524 assert(b->method == RAW && b->uncomp_size == 0);
1525 }
1526
1527 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1528 char dat[100], *cp = (char *)dat;
1529 uint32_t crc;
1530
1531 *cp++ = b->method;
1532 *cp++ = b->content_type;
1533 cp += fd->vv.varint_put32(cp, dat+100, b->content_id);
1534 cp += fd->vv.varint_put32(cp, dat+100, b->comp_size);
1535 cp += fd->vv.varint_put32(cp, dat+100, b->uncomp_size);
1536 crc = crc32(0L, (uc *)dat, cp-dat);
1537
1538 if (b->method == RAW) {
1539 b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->uncomp_size);
1540 } else {
1541 b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->comp_size);
1542 }
1543
1544 if (-1 == int32_encode(fd, b->crc32))
1545 return -1;
1546 }
1547
1548 return 0;
1549 }
1550
1551 /*
1552 * Frees a CRAM block, deallocating internal data too.
1553 */
cram_free_block(cram_block * b)1554 void cram_free_block(cram_block *b) {
1555 if (!b)
1556 return;
1557 if (b->data)
1558 free(b->data);
1559 free(b);
1560 }
1561
1562 /*
1563 * Uncompresses a CRAM block, if compressed.
1564 */
cram_uncompress_block(cram_block * b)1565 int cram_uncompress_block(cram_block *b) {
1566 char *uncomp;
1567 size_t uncomp_size = 0;
1568
1569 if (b->crc32_checked == 0) {
1570 uint32_t crc = crc32(b->crc_part, b->data ? b->data : (uc *)"", b->alloc);
1571 b->crc32_checked = 1;
1572 if (crc != b->crc32) {
1573 hts_log_error("Block CRC32 failure");
1574 return -1;
1575 }
1576 }
1577
1578 if (b->uncomp_size == 0) {
1579 // blank block
1580 b->method = RAW;
1581 return 0;
1582 }
1583 assert(b->uncomp_size >= 0); // cram_read_block should ensure this
1584
1585 switch (b->method) {
1586 case RAW:
1587 return 0;
1588
1589 case GZIP:
1590 uncomp_size = b->uncomp_size;
1591 uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
1592
1593 if (!uncomp)
1594 return -1;
1595 if (uncomp_size != b->uncomp_size) {
1596 free(uncomp);
1597 return -1;
1598 }
1599 free(b->data);
1600 b->data = (unsigned char *)uncomp;
1601 b->alloc = uncomp_size;
1602 b->method = RAW;
1603 break;
1604
1605 #ifdef HAVE_LIBBZ2
1606 case BZIP2: {
1607 unsigned int usize = b->uncomp_size;
1608 if (!(uncomp = malloc(usize)))
1609 return -1;
1610 if (BZ_OK != BZ2_bzBuffToBuffDecompress(uncomp, &usize,
1611 (char *)b->data, b->comp_size,
1612 0, 0)) {
1613 free(uncomp);
1614 return -1;
1615 }
1616 free(b->data);
1617 b->data = (unsigned char *)uncomp;
1618 b->alloc = usize;
1619 b->method = RAW;
1620 b->uncomp_size = usize; // Just in case it differs
1621 break;
1622 }
1623 #else
1624 case BZIP2:
1625 hts_log_error("Bzip2 compression is not compiled into this version. Please rebuild and try again");
1626 return -1;
1627 #endif
1628
1629 #ifdef HAVE_LIBLZMA
1630 case LZMA:
1631 uncomp = lzma_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
1632 if (!uncomp)
1633 return -1;
1634 if (uncomp_size != b->uncomp_size) {
1635 free(uncomp);
1636 return -1;
1637 }
1638 free(b->data);
1639 b->data = (unsigned char *)uncomp;
1640 b->alloc = uncomp_size;
1641 b->method = RAW;
1642 break;
1643 #else
1644 case LZMA:
1645 hts_log_error("Lzma compression is not compiled into this version. Please rebuild and try again");
1646 return -1;
1647 break;
1648 #endif
1649
1650 case RANS: {
1651 unsigned int usize = b->uncomp_size, usize2;
1652 uncomp = (char *)rans_uncompress(b->data, b->comp_size, &usize2);
1653 if (!uncomp)
1654 return -1;
1655 if (usize != usize2) {
1656 free(uncomp);
1657 return -1;
1658 }
1659 free(b->data);
1660 b->data = (unsigned char *)uncomp;
1661 b->alloc = usize2;
1662 b->method = RAW;
1663 b->uncomp_size = usize2; // Just in case it differs
1664 //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1665 break;
1666 }
1667
1668 case FQZ: {
1669 uncomp_size = b->uncomp_size;
1670 uncomp = fqz_decompress((char *)b->data, b->comp_size, &uncomp_size, NULL, 0);
1671 if (!uncomp)
1672 return -1;
1673 free(b->data);
1674 b->data = (unsigned char *)uncomp;
1675 b->alloc = uncomp_size;
1676 b->method = RAW;
1677 b->uncomp_size = uncomp_size;
1678 break;
1679 }
1680
1681 case RANS_PR0: {
1682 unsigned int usize = b->uncomp_size, usize2;
1683 uncomp = (char *)rans_uncompress_4x16(b->data, b->comp_size, &usize2);
1684 if (!uncomp)
1685 return -1;
1686 if (usize != usize2) {
1687 free(uncomp);
1688 return -1;
1689 }
1690 b->orig_method = RANS_PR0 + (b->data[0]&1)
1691 + 2*((b->data[0]&0x40)>0) + 4*((b->data[0]&0x80)>0);
1692 free(b->data);
1693 b->data = (unsigned char *)uncomp;
1694 b->alloc = usize2;
1695 b->method = RAW;
1696 b->uncomp_size = usize2; // Just incase it differs
1697 //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1698 break;
1699 }
1700
1701 case ARITH_PR0: {
1702 unsigned int usize = b->uncomp_size, usize2;
1703 uncomp = (char *)arith_uncompress_to(b->data, b->comp_size, NULL, &usize2);
1704 if (!uncomp)
1705 return -1;
1706 if (usize != usize2) {
1707 free(uncomp);
1708 return -1;
1709 }
1710 b->orig_method = ARITH_PR0 + (b->data[0]&1)
1711 + 2*((b->data[0]&0x40)>0) + 4*((b->data[0]&0x80)>0);
1712 free(b->data);
1713 b->data = (unsigned char *)uncomp;
1714 b->alloc = usize2;
1715 b->method = RAW;
1716 b->uncomp_size = usize2; // Just incase it differs
1717 //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1718 break;
1719 }
1720
1721 case TOK3: {
1722 uint32_t out_len;
1723 uint8_t *cp = decode_names(b->data, b->comp_size, &out_len);
1724 if (!cp)
1725 return -1;
1726 b->orig_method = TOK3;
1727 b->method = RAW;
1728 free(b->data);
1729 b->data = cp;
1730 b->alloc = out_len;
1731 b->uncomp_size = out_len;
1732 break;
1733 }
1734
1735 default:
1736 return -1;
1737 }
1738
1739 return 0;
1740 }
1741
cram_compress_by_method(cram_slice * s,char * in,size_t in_size,int content_id,size_t * out_size,enum cram_block_method_int method,int level,int strat)1742 static char *cram_compress_by_method(cram_slice *s, char *in, size_t in_size,
1743 int content_id, size_t *out_size,
1744 enum cram_block_method_int method,
1745 int level, int strat) {
1746 switch (method) {
1747 case GZIP:
1748 case GZIP_RLE:
1749 case GZIP_1:
1750 // Read names bizarrely benefit from zlib over libdeflate for
1751 // mid-range compression levels. Focusing purely of ratio or
1752 // speed, libdeflate still wins. It also seems to win for
1753 // other data series too.
1754 //
1755 // Eg RN at level 5; libdeflate=55.9MB zlib=51.6MB
1756 #ifdef HAVE_LIBDEFLATE
1757 if (content_id == DS_RN && level >= 4 && level <= 7)
1758 return zlib_mem_deflate(in, in_size, out_size, level, strat);
1759 else
1760 return libdeflate_deflate(in, in_size, out_size, level, strat);
1761 #else
1762 return zlib_mem_deflate(in, in_size, out_size, level, strat);
1763 #endif
1764
1765 case BZIP2: {
1766 #ifdef HAVE_LIBBZ2
1767 unsigned int comp_size = in_size*1.01 + 600;
1768 char *comp = malloc(comp_size);
1769 if (!comp)
1770 return NULL;
1771
1772 if (BZ_OK != BZ2_bzBuffToBuffCompress(comp, &comp_size,
1773 in, in_size,
1774 level, 0, 30)) {
1775 free(comp);
1776 return NULL;
1777 }
1778 *out_size = comp_size;
1779 return comp;
1780 #else
1781 return NULL;
1782 #endif
1783 }
1784
1785 case FQZ:
1786 case FQZ_b:
1787 case FQZ_c:
1788 case FQZ_d: {
1789 // Extract the necessary portion of the slice into an fqz_slice struct.
1790 // These previously were the same thing, but this permits us to detach
1791 // the codec from the rest of this CRAM implementation.
1792 fqz_slice *f = malloc(2*s->hdr->num_records * sizeof(uint32_t) + sizeof(fqz_slice));
1793 if (!f)
1794 return NULL;
1795 f->num_records = s->hdr->num_records;
1796 f->len = (uint32_t *)(((char *)f) + sizeof(fqz_slice));
1797 f->flags = f->len + s->hdr->num_records;
1798 int i;
1799 for (i = 0; i < s->hdr->num_records; i++) {
1800 f->flags[i] = s->crecs[i].flags;
1801 f->len[i] = (i+1 < s->hdr->num_records
1802 ? s->crecs[i+1].qual - s->crecs[i].qual
1803 : s->block[DS_QS]->uncomp_size - s->crecs[i].qual);
1804 }
1805 char *comp = fqz_compress(strat & 0xff /* cram vers */, f,
1806 in, in_size, out_size, strat >> 8, NULL);
1807 free(f);
1808 return comp;
1809 }
1810
1811 case LZMA:
1812 #ifdef HAVE_LIBLZMA
1813 return lzma_mem_deflate(in, in_size, out_size, level);
1814 #else
1815 return NULL;
1816 #endif
1817
1818 case RANS0:
1819 case RANS1: {
1820 unsigned int out_size_i;
1821 unsigned char *cp;
1822 cp = rans_compress((unsigned char *)in, in_size, &out_size_i,
1823 method == RANS0 ? 0 : 1);
1824 *out_size = out_size_i;
1825 return (char *)cp;
1826 }
1827
1828 case RANS_PR0:
1829 case RANS_PR1:
1830 case RANS_PR64:
1831 case RANS_PR9:
1832 case RANS_PR128:
1833 case RANS_PR129:
1834 case RANS_PR192:
1835 case RANS_PR193: {
1836 unsigned int out_size_i;
1837 unsigned char *cp;
1838
1839 // see enum cram_block. We map RANS_* methods to order bit-fields
1840 static int methmap[] = { 1, 64,9, 128,129, 192,193 };
1841
1842 cp = rans_compress_4x16((unsigned char *)in, in_size, &out_size_i,
1843 method == RANS_PR0 ? 0 : methmap[method - RANS_PR1]);
1844 *out_size = out_size_i;
1845 return (char *)cp;
1846 }
1847
1848 case ARITH_PR0:
1849 case ARITH_PR1:
1850 case ARITH_PR64:
1851 case ARITH_PR9:
1852 case ARITH_PR128:
1853 case ARITH_PR129:
1854 case ARITH_PR192:
1855 case ARITH_PR193: {
1856 unsigned int out_size_i;
1857 unsigned char *cp;
1858
1859 // see enum cram_block. We map ARITH_* methods to order bit-fields
1860 static int methmap[] = { 1, 64,9, 128,129, 192,193 };
1861
1862 cp = arith_compress_to((unsigned char *)in, in_size, NULL, &out_size_i,
1863 method == ARITH_PR0 ? 0 : methmap[method - ARITH_PR1]);
1864 *out_size = out_size_i;
1865 return (char *)cp;
1866 }
1867
1868 case TOK3:
1869 case TOKA: {
1870 int out_len;
1871 int lev = level;
1872 if (method == TOK3 && lev > 3)
1873 lev = 3;
1874 uint8_t *cp = encode_names(in, in_size, lev, strat, &out_len, NULL);
1875 *out_size = out_len;
1876 return (char *)cp;
1877 }
1878
1879 case RAW:
1880 break;
1881
1882 default:
1883 return NULL;
1884 }
1885
1886 return NULL;
1887 }
1888
1889
1890 /*
1891 * Compresses a block using one of two different zlib strategies. If we only
1892 * want one choice set strat2 to be -1.
1893 *
1894 * The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
1895 * or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
1896 * significantly faster.
1897 *
1898 * Method and level -1 implies defaults, as specified in cram_fd.
1899 */
cram_compress_block2(cram_fd * fd,cram_slice * s,cram_block * b,cram_metrics * metrics,int method,int level)1900 int cram_compress_block2(cram_fd *fd, cram_slice *s,
1901 cram_block *b, cram_metrics *metrics,
1902 int method, int level) {
1903
1904 if (!b)
1905 return 0;
1906
1907 char *comp = NULL;
1908 size_t comp_size = 0;
1909 int strat;
1910
1911 // Internally we have parameterised methods that externally map
1912 // to the same CRAM method value.
1913 // See enum_cram_block_method_int in cram_structs.h.
1914 int methmap[] = {
1915 // Externally defined values
1916 RAW, GZIP, BZIP2, LZMA, RANS, RANSPR, ARITH, FQZ, TOK3,
1917
1918 // Reserved for possible expansion
1919 0, 0,
1920
1921 // Internally parameterised versions matching back to above
1922 // external values
1923 GZIP, GZIP,
1924 FQZ, FQZ, FQZ,
1925 RANS,
1926 RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR,
1927 TOK3,
1928 ARITH, ARITH, ARITH, ARITH, ARITH, ARITH, ARITH,
1929 };
1930
1931 if (b->method != RAW) {
1932 // Maybe already compressed if s->block[0] was compressed and
1933 // we have e.g. s->block[DS_BA] set to s->block[0] due to only
1934 // one base type present and hence using E_HUFFMAN on block 0.
1935 // A second explicit attempt to compress the same block then
1936 // occurs.
1937 return 0;
1938 }
1939
1940 if (method == -1) {
1941 method = 1<<GZIP;
1942 if (fd->use_bz2)
1943 method |= 1<<BZIP2;
1944 if (fd->use_lzma)
1945 method |= 1<<LZMA;
1946 }
1947
1948 if (level == -1)
1949 level = fd->level;
1950
1951 //fprintf(stderr, "IN: block %d, sz %d\n", b->content_id, b->uncomp_size);
1952
1953 if (method == RAW || level == 0 || b->uncomp_size == 0) {
1954 b->method = RAW;
1955 b->comp_size = b->uncomp_size;
1956 //fprintf(stderr, "Skip block id %d\n", b->content_id);
1957 return 0;
1958 }
1959
1960 #ifndef ABS
1961 # define ABS(a) ((a)>=0?(a):-(a))
1962 #endif
1963
1964 if (metrics) {
1965 pthread_mutex_lock(&fd->metrics_lock);
1966 // Sudden changes in size trigger a retrial. These are mainly
1967 // triggered when switching to sorted / unsorted, where the number
1968 // of elements in a slice radically changes.
1969 //
1970 // We also get large fluctuations based on genome coordinate for
1971 // e.g. SA:Z and SC series, but we consider the typical scale of
1972 // delta between blocks and use this to look for abnormality.
1973 if (metrics->input_avg_sz &&
1974 (b->uncomp_size + 1000 > 4*(metrics->input_avg_sz+1000) ||
1975 b->uncomp_size + 1000 < (metrics->input_avg_sz+1000)/4) &&
1976 ABS(b->uncomp_size-metrics->input_avg_sz)
1977 > 10*metrics->input_avg_delta) {
1978 metrics->next_trial = 0;
1979 }
1980
1981 if (metrics->trial > 0 || --metrics->next_trial <= 0) {
1982 int m, unpackable = metrics->unpackable;
1983 size_t sz_best = b->uncomp_size;
1984 size_t sz[CRAM_MAX_METHOD] = {0};
1985 int method_best = 0; // RAW
1986 char *c_best = NULL, *c = NULL;
1987
1988 metrics->input_avg_delta =
1989 0.9 * (metrics->input_avg_delta +
1990 ABS(b->uncomp_size - metrics->input_avg_sz));
1991
1992 metrics->input_avg_sz += b->uncomp_size*.2;
1993 metrics->input_avg_sz *= 0.8;
1994
1995 if (metrics->revised_method)
1996 method = metrics->revised_method;
1997 else
1998 metrics->revised_method = method;
1999
2000 if (metrics->next_trial <= 0) {
2001 metrics->next_trial = TRIAL_SPAN;
2002 metrics->trial = NTRIALS;
2003 for (m = 0; m < CRAM_MAX_METHOD; m++)
2004 metrics->sz[m] /= 2;
2005 metrics->unpackable = 0;
2006 }
2007
2008 // Compress this block using the best method
2009 if (unpackable && CRAM_MAJOR_VERS(fd->version) > 3) {
2010 // No point trying bit-pack if 17+ symbols.
2011 if (method & (1<<RANS_PR128))
2012 method = (method|(1<<RANS_PR0))&~(1<<RANS_PR128);
2013 if (method & (1<<RANS_PR129))
2014 method = (method|(1<<RANS_PR1))&~(1<<RANS_PR129);
2015 if (method & (1<<RANS_PR192))
2016 method = (method|(1<<RANS_PR64))&~(1<<RANS_PR192);
2017 if (method & (1<<RANS_PR193))
2018 method = (method|(1<<RANS_PR64)|(1<<RANS_PR1))&~(1<<RANS_PR193);
2019
2020 if (method & (1<<ARITH_PR128))
2021 method = (method|(1<<ARITH_PR0))&~(1<<ARITH_PR128);
2022 if (method & (1<<ARITH_PR129))
2023 method = (method|(1<<ARITH_PR1))&~(1<<ARITH_PR129);
2024 if (method & (1<<ARITH_PR192))
2025 method = (method|(1<<ARITH_PR64))&~(1<<ARITH_PR192);
2026 if (method & (1u<<ARITH_PR193))
2027 method = (method|(1<<ARITH_PR64)|(1<<ARITH_PR1))&~(1u<<ARITH_PR193);
2028 }
2029
2030 // Libdeflate doesn't have a Z_RLE strategy.
2031 // We treat it as level 1, but iff we haven't also
2032 // explicitly listed that in the method list.
2033 #ifdef HAVE_LIBDEFLATE
2034 if ((method & (1<<GZIP_RLE)) && (method & (1<<GZIP_1)))
2035 method &= ~(1<<GZIP_RLE);
2036 #endif
2037
2038 pthread_mutex_unlock(&fd->metrics_lock);
2039
2040 for (m = 0; m < CRAM_MAX_METHOD; m++) {
2041 if (method & (1u<<m)) {
2042 int lvl = level;
2043 switch (m) {
2044 case GZIP: strat = Z_FILTERED; break;
2045 case GZIP_1: strat = Z_DEFAULT_STRATEGY; lvl = 1; break;
2046 case GZIP_RLE: strat = Z_RLE; break;
2047 case FQZ: strat = CRAM_MAJOR_VERS(fd->version); break;
2048 case FQZ_b: strat = CRAM_MAJOR_VERS(fd->version)+256; break;
2049 case FQZ_c: strat = CRAM_MAJOR_VERS(fd->version)+2*256; break;
2050 case FQZ_d: strat = CRAM_MAJOR_VERS(fd->version)+3*256; break;
2051 case TOK3: strat = 0; break;
2052 case TOKA: strat = 1; break;
2053 default: strat = 0;
2054 }
2055
2056 c = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2057 b->content_id, &sz[m], m, lvl, strat);
2058
2059 if (c && sz_best > sz[m]) {
2060 sz_best = sz[m];
2061 method_best = m;
2062 if (c_best)
2063 free(c_best);
2064 c_best = c;
2065 } else if (c) {
2066 free(c);
2067 } else {
2068 sz[m] = b->uncomp_size*2+1000; // arbitrarily worse than raw
2069 }
2070 } else {
2071 sz[m] = b->uncomp_size*2+1000; // arbitrarily worse than raw
2072 }
2073 }
2074
2075 if (c_best) {
2076 free(b->data);
2077 b->data = (unsigned char *)c_best;
2078 b->method = method_best; // adjusted to methmap[method_best] later
2079 b->comp_size = sz_best;
2080 }
2081
2082 // Accumulate stats for all methods tried
2083 pthread_mutex_lock(&fd->metrics_lock);
2084 for (m = 0; m < CRAM_MAX_METHOD; m++)
2085 // don't be overly sure on small blocks.
2086 // +2000 means eg bzip2 vs gzip (1.07 to 1.04) or gz vs rans1
2087 // needs to be at least 60 bytes smaller to overcome the
2088 // fixed size addition.
2089 metrics->sz[m] += sz[m]+2000;
2090
2091 // When enough trials performed, find the best on average
2092 if (--metrics->trial == 0) {
2093 int best_method = RAW;
2094 int best_sz = INT_MAX;
2095
2096 // Relative costs of methods. See enum_cram_block_method_int
2097 // and methmap
2098 double meth_cost[32] = {
2099 // Externally defined methods
2100 1, // 0 raw
2101 1.04, // 1 gzip (Z_FILTERED)
2102 1.07, // 2 bzip2
2103 1.08, // 3 lzma
2104 1.00, // 4 rans (O0)
2105 1.00, // 5 ranspr (O0)
2106 1.04, // 6 arithpr (O0)
2107 1.05, // 7 fqz
2108 1.05, // 8 tok3 (rans)
2109 1.00, 1.00, // 9,10 reserved
2110
2111 // Paramterised versions of above
2112 1.01, // gzip rle
2113 1.01, // gzip -1
2114
2115 1.05, 1.05, 1.05, // FQZ_b,c,d
2116
2117 1.01, // rans O1
2118
2119 1.01, // rans_pr1
2120 1.00, // rans_pr64; if smaller, usually fast
2121 1.03, // rans_pr65/9
2122 1.00, // rans_pr128
2123 1.01, // rans_pr129
2124 1.00, // rans_pr192
2125 1.01, // rans_pr193
2126
2127 1.07, // tok3 arith
2128
2129 1.04, // arith_pr1
2130 1.04, // arith_pr64
2131 1.04, // arith_pr9
2132 1.03, // arith_pr128
2133 1.04, // arith_pr129
2134 1.04, // arith_pr192
2135 1.04, // arith_pr193
2136 };
2137
2138 // Scale methods by cost based on compression level
2139 if (fd->level <= 1) {
2140 for (m = 0; m < CRAM_MAX_METHOD; m++)
2141 metrics->sz[m] *= 1+(meth_cost[m]-1)*4;
2142 } else if (fd->level <= 3) {
2143 for (m = 0; m < CRAM_MAX_METHOD; m++)
2144 metrics->sz[m] *= 1+(meth_cost[m]-1);
2145 } else if (fd->level <= 6) {
2146 for (m = 0; m < CRAM_MAX_METHOD; m++)
2147 metrics->sz[m] *= 1+(meth_cost[m]-1)/2;
2148 } else if (fd->level <= 7) {
2149 for (m = 0; m < CRAM_MAX_METHOD; m++)
2150 metrics->sz[m] *= 1+(meth_cost[m]-1)/3;
2151 } // else cost is ignored
2152
2153 // Ensure these are never used; BSC and ZSTD
2154 metrics->sz[9] = metrics->sz[10] = INT_MAX;
2155
2156 for (m = 0; m < CRAM_MAX_METHOD; m++) {
2157 if ((!metrics->sz[m]) || (!(method & (1u<<m))))
2158 continue;
2159
2160 if (best_sz > metrics->sz[m])
2161 best_sz = metrics->sz[m], best_method = m;
2162 }
2163
2164 if (best_method != metrics->method) {
2165 //metrics->trial = (NTRIALS+1)/2; // be sure
2166 //metrics->next_trial /= 1.5;
2167 metrics->consistency = 0;
2168 } else {
2169 metrics->next_trial *= MIN(2, 1+metrics->consistency/4.0);
2170 metrics->consistency++;
2171 }
2172
2173 metrics->method = best_method;
2174 switch (best_method) {
2175 case GZIP: strat = Z_FILTERED; break;
2176 case GZIP_1: strat = Z_DEFAULT_STRATEGY; break;
2177 case GZIP_RLE: strat = Z_RLE; break;
2178 case FQZ: strat = CRAM_MAJOR_VERS(fd->version); break;
2179 case FQZ_b: strat = CRAM_MAJOR_VERS(fd->version)+256; break;
2180 case FQZ_c: strat = CRAM_MAJOR_VERS(fd->version)+2*256; break;
2181 case FQZ_d: strat = CRAM_MAJOR_VERS(fd->version)+3*256; break;
2182 default: strat = 0;
2183 }
2184 metrics->strat = strat;
2185
2186 // If we see at least MAXFAIL trials in a row for a specific
2187 // compression method with more than MAXDELTA aggregate
2188 // size then we drop this from the list of methods used
2189 // for this block type.
2190 #define MAXDELTA 0.20
2191 #define MAXFAILS 4
2192 for (m = 0; m < CRAM_MAX_METHOD; m++) {
2193 if (best_method == m) {
2194 metrics->cnt[m] = 0;
2195 metrics->extra[m] = 0;
2196 } else if (best_sz < metrics->sz[m]) {
2197 double r = (double)metrics->sz[m] / best_sz - 1;
2198 int mul = 1+(fd->level>=7);
2199 if (++metrics->cnt[m] >= MAXFAILS*mul &&
2200 (metrics->extra[m] += r) >= MAXDELTA*mul)
2201 method &= ~(1u<<m);
2202
2203 // Special case for fqzcomp as it rarely changes
2204 if (m == FQZ || m == FQZ_b || m == FQZ_c || m == FQZ_d) {
2205 if (metrics->sz[m] > best_sz)
2206 method &= ~(1u<<m);
2207 }
2208 }
2209 }
2210
2211 //if (fd->verbose > 1 && method != metrics->revised_method)
2212 // fprintf(stderr, "%d: revising method from %x to %x\n",
2213 // b->content_id, metrics->revised_method, method);
2214 metrics->revised_method = method;
2215 }
2216 pthread_mutex_unlock(&fd->metrics_lock);
2217 } else {
2218 metrics->input_avg_delta =
2219 0.9 * (metrics->input_avg_delta +
2220 ABS(b->uncomp_size - metrics->input_avg_sz));
2221
2222 metrics->input_avg_sz += b->uncomp_size*.2;
2223 metrics->input_avg_sz *= 0.8;
2224
2225 strat = metrics->strat;
2226 method = metrics->method;
2227
2228 pthread_mutex_unlock(&fd->metrics_lock);
2229 comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2230 b->content_id, &comp_size, method,
2231 method == GZIP_1 ? 1 : level,
2232 strat);
2233 if (!comp)
2234 return -1;
2235
2236 if (comp_size < b->uncomp_size) {
2237 free(b->data);
2238 b->data = (unsigned char *)comp;
2239 b->comp_size = comp_size;
2240 b->method = method;
2241 } else {
2242 free(comp);
2243 }
2244 }
2245
2246 } else {
2247 // no cached metrics, so just do zlib?
2248 comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2249 b->content_id, &comp_size, GZIP, level, Z_FILTERED);
2250 if (!comp) {
2251 hts_log_error("Compression failed!");
2252 return -1;
2253 }
2254
2255 if (comp_size < b->uncomp_size) {
2256 free(b->data);
2257 b->data = (unsigned char *)comp;
2258 b->comp_size = comp_size;
2259 b->method = GZIP;
2260 } else {
2261 free(comp);
2262 }
2263 strat = Z_FILTERED;
2264 }
2265
2266 hts_log_info("Compressed block ID %d from %d to %d by method %s",
2267 b->content_id, b->uncomp_size, b->comp_size,
2268 cram_block_method2str(b->method));
2269
2270 b->method = methmap[b->method];
2271
2272 return 0;
2273 }
cram_compress_block(cram_fd * fd,cram_block * b,cram_metrics * metrics,int method,int level)2274 int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
2275 int method, int level) {
2276 return cram_compress_block2(fd, NULL, b, metrics, method, level);
2277 }
2278
cram_new_metrics(void)2279 cram_metrics *cram_new_metrics(void) {
2280 cram_metrics *m = calloc(1, sizeof(*m));
2281 if (!m)
2282 return NULL;
2283 m->trial = NTRIALS-1;
2284 m->next_trial = TRIAL_SPAN/2; // learn quicker at start
2285 m->method = RAW;
2286 m->strat = 0;
2287 m->revised_method = 0;
2288 m->unpackable = 0;
2289
2290 return m;
2291 }
2292
cram_block_method2str(enum cram_block_method_int m)2293 char *cram_block_method2str(enum cram_block_method_int m) {
2294 switch(m) {
2295 case RAW: return "RAW";
2296 case GZIP: return "GZIP";
2297 case BZIP2: return "BZIP2";
2298 case LZMA: return "LZMA";
2299 case RANS0: return "RANS0";
2300 case RANS1: return "RANS1";
2301 case GZIP_RLE: return "GZIP_RLE";
2302 case GZIP_1: return "GZIP_1";
2303 case FQZ: return "FQZ";
2304 case FQZ_b: return "FQZ_b";
2305 case FQZ_c: return "FQZ_c";
2306 case FQZ_d: return "FQZ_d";
2307 case RANS_PR0: return "RANS_PR0";
2308 case RANS_PR1: return "RANS_PR1";
2309 case RANS_PR64: return "RANS_PR64";
2310 case RANS_PR9: return "RANS_PR9";
2311 case RANS_PR128: return "RANS_PR128";
2312 case RANS_PR129: return "RANS_PR129";
2313 case RANS_PR192: return "RANS_PR192";
2314 case RANS_PR193: return "RANS_PR193";
2315 case TOK3: return "TOK3_R";
2316 case TOKA: return "TOK3_A";
2317 case ARITH_PR0: return "ARITH_PR0";
2318 case ARITH_PR1: return "ARITH_PR1";
2319 case ARITH_PR64: return "ARITH_PR64";
2320 case ARITH_PR9: return "ARITH_PR9";
2321 case ARITH_PR128: return "ARITH_PR128";
2322 case ARITH_PR129: return "ARITH_PR129";
2323 case ARITH_PR192: return "ARITH_PR192";
2324 case ARITH_PR193: return "ARITH_PR193";
2325 case BM_ERROR: break;
2326 }
2327 return "?";
2328 }
2329
cram_content_type2str(enum cram_content_type t)2330 char *cram_content_type2str(enum cram_content_type t) {
2331 switch (t) {
2332 case FILE_HEADER: return "FILE_HEADER";
2333 case COMPRESSION_HEADER: return "COMPRESSION_HEADER";
2334 case MAPPED_SLICE: return "MAPPED_SLICE";
2335 case UNMAPPED_SLICE: return "UNMAPPED_SLICE";
2336 case EXTERNAL: return "EXTERNAL";
2337 case CORE: return "CORE";
2338 case CT_ERROR: break;
2339 }
2340 return "?";
2341 }
2342
2343 /* ----------------------------------------------------------------------
2344 * Reference sequence handling
2345 *
2346 * These revolve around the refs_t structure, which may potentially be
2347 * shared between multiple cram_fd.
2348 *
2349 * We start with refs_create() to allocate an empty refs_t and then
2350 * populate it with @SQ line data using refs_from_header(). This is done on
2351 * cram_open(). Also at start up we can call cram_load_reference() which
2352 * is used with "scramble -r foo.fa". This replaces the fd->refs with the
2353 * new one specified. In either case refs2id() is then called which
2354 * maps ref_entry names to @SQ ids (refs_t->ref_id[]).
2355 *
2356 * Later, possibly within a thread, we will want to know the actual ref
2357 * seq itself, obtained by calling cram_get_ref(). This may use the
2358 * UR: or M5: fields or the filename specified in the original
2359 * cram_load_reference() call.
2360 *
2361 * Given the potential for multi-threaded reference usage, we have
2362 * reference counting (sorry for the confusing double use of "ref") to
2363 * track the number of callers interested in any specific reference.
2364 */
2365
2366 /*
2367 * Frees/unmaps a reference sequence and associated file handles.
2368 */
ref_entry_free_seq(ref_entry * e)2369 static void ref_entry_free_seq(ref_entry *e) {
2370 if (e->mf)
2371 mfclose(e->mf);
2372 if (e->seq && !e->mf)
2373 free(e->seq);
2374
2375 e->seq = NULL;
2376 e->mf = NULL;
2377 }
2378
refs_free(refs_t * r)2379 void refs_free(refs_t *r) {
2380 RP("refs_free()\n");
2381
2382 if (--r->count > 0)
2383 return;
2384
2385 if (!r)
2386 return;
2387
2388 if (r->pool)
2389 string_pool_destroy(r->pool);
2390
2391 if (r->h_meta) {
2392 khint_t k;
2393
2394 for (k = kh_begin(r->h_meta); k != kh_end(r->h_meta); k++) {
2395 ref_entry *e;
2396
2397 if (!kh_exist(r->h_meta, k))
2398 continue;
2399 if (!(e = kh_val(r->h_meta, k)))
2400 continue;
2401 ref_entry_free_seq(e);
2402 free(e);
2403 }
2404
2405 kh_destroy(refs, r->h_meta);
2406 }
2407
2408 if (r->ref_id)
2409 free(r->ref_id);
2410
2411 if (r->fp)
2412 bgzf_close(r->fp);
2413
2414 pthread_mutex_destroy(&r->lock);
2415
2416 free(r);
2417 }
2418
refs_create(void)2419 static refs_t *refs_create(void) {
2420 refs_t *r = calloc(1, sizeof(*r));
2421
2422 RP("refs_create()\n");
2423
2424 if (!r)
2425 return NULL;
2426
2427 if (!(r->pool = string_pool_create(8192)))
2428 goto err;
2429
2430 r->ref_id = NULL; // see refs2id() to populate.
2431 r->count = 1;
2432 r->last = NULL;
2433 r->last_id = -1;
2434
2435 if (!(r->h_meta = kh_init(refs)))
2436 goto err;
2437
2438 pthread_mutex_init(&r->lock, NULL);
2439
2440 return r;
2441
2442 err:
2443 refs_free(r);
2444 return NULL;
2445 }
2446
2447 /*
2448 * Opens a reference fasta file as a BGZF stream, allowing for
2449 * compressed files. It automatically builds a .fai file if
2450 * required and if compressed a .gzi bgzf index too.
2451 *
2452 * Returns a BGZF handle on success;
2453 * NULL on failure.
2454 */
bgzf_open_ref(char * fn,char * mode,int is_md5)2455 static BGZF *bgzf_open_ref(char *fn, char *mode, int is_md5) {
2456 BGZF *fp;
2457
2458 if (!is_md5 && !hisremote(fn)) {
2459 char fai_file[PATH_MAX];
2460
2461 snprintf(fai_file, PATH_MAX, "%s.fai", fn);
2462 if (access(fai_file, R_OK) != 0)
2463 if (fai_build(fn) != 0)
2464 return NULL;
2465 }
2466
2467 if (!(fp = bgzf_open(fn, mode))) {
2468 perror(fn);
2469 return NULL;
2470 }
2471
2472 if (fp->is_compressed == 1 && bgzf_index_load(fp, fn, ".gzi") < 0) {
2473 hts_log_error("Unable to load .gzi index '%s.gzi'", fn);
2474 bgzf_close(fp);
2475 return NULL;
2476 }
2477
2478 return fp;
2479 }
2480
2481 /*
2482 * Loads a FAI file for a reference.fasta.
2483 * "is_err" indicates whether failure to load is worthy of emitting an
2484 * error message. In some cases (eg with embedded references) we
2485 * speculatively load, just in case, and silently ignore errors.
2486 *
2487 * Returns the refs_t struct on success (maybe newly allocated);
2488 * NULL on failure
2489 */
refs_load_fai(refs_t * r_orig,const char * fn,int is_err)2490 static refs_t *refs_load_fai(refs_t *r_orig, const char *fn, int is_err) {
2491 hFILE *fp = NULL;
2492 char fai_fn[PATH_MAX];
2493 char line[8192];
2494 refs_t *r = r_orig;
2495 size_t fn_l = strlen(fn);
2496 int id = 0, id_alloc = 0;
2497
2498 RP("refs_load_fai %s\n", fn);
2499
2500 if (!r)
2501 if (!(r = refs_create()))
2502 goto err;
2503
2504 if (r->fp)
2505 if (bgzf_close(r->fp) != 0)
2506 goto err;
2507 r->fp = NULL;
2508
2509 /* Look for a FASTA##idx##FAI format */
2510 char *fn_delim = strstr(fn, HTS_IDX_DELIM);
2511 if (fn_delim) {
2512 if (!(r->fn = string_ndup(r->pool, fn, fn_delim - fn)))
2513 goto err;
2514 fn_delim += strlen(HTS_IDX_DELIM);
2515 snprintf(fai_fn, PATH_MAX, "%s", fn_delim);
2516 } else {
2517 /* An index file was provided, instead of the actual reference file */
2518 if (fn_l > 4 && strcmp(&fn[fn_l-4], ".fai") == 0) {
2519 if (!r->fn) {
2520 if (!(r->fn = string_ndup(r->pool, fn, fn_l-4)))
2521 goto err;
2522 }
2523 snprintf(fai_fn, PATH_MAX, "%s", fn);
2524 } else {
2525 /* Only the reference file provided. Get the index file name from it */
2526 if (!(r->fn = string_dup(r->pool, fn)))
2527 goto err;
2528 sprintf(fai_fn, "%.*s.fai", PATH_MAX-5, fn);
2529 }
2530 }
2531
2532 if (!(r->fp = bgzf_open_ref(r->fn, "r", 0))) {
2533 hts_log_error("Failed to open reference file '%s'", r->fn);
2534 goto err;
2535 }
2536
2537 if (!(fp = hopen(fai_fn, "r"))) {
2538 hts_log_error("Failed to open index file '%s'", fai_fn);
2539 if (is_err)
2540 perror(fai_fn);
2541 goto err;
2542 }
2543 while (hgets(line, 8192, fp) != NULL) {
2544 ref_entry *e = malloc(sizeof(*e));
2545 char *cp;
2546 int n;
2547 khint_t k;
2548
2549 if (!e)
2550 return NULL;
2551
2552 // id
2553 for (cp = line; *cp && !isspace_c(*cp); cp++)
2554 ;
2555 *cp++ = 0;
2556 e->name = string_dup(r->pool, line);
2557
2558 // length
2559 while (*cp && isspace_c(*cp))
2560 cp++;
2561 e->length = strtoll(cp, &cp, 10);
2562
2563 // offset
2564 while (*cp && isspace_c(*cp))
2565 cp++;
2566 e->offset = strtoll(cp, &cp, 10);
2567
2568 // bases per line
2569 while (*cp && isspace_c(*cp))
2570 cp++;
2571 e->bases_per_line = strtol(cp, &cp, 10);
2572
2573 // line length
2574 while (*cp && isspace_c(*cp))
2575 cp++;
2576 e->line_length = strtol(cp, &cp, 10);
2577
2578 // filename
2579 e->fn = r->fn;
2580
2581 e->count = 0;
2582 e->seq = NULL;
2583 e->mf = NULL;
2584 e->is_md5 = 0;
2585
2586 k = kh_put(refs, r->h_meta, e->name, &n);
2587 if (-1 == n) {
2588 free(e);
2589 return NULL;
2590 }
2591
2592 if (n) {
2593 kh_val(r->h_meta, k) = e;
2594 } else {
2595 ref_entry *re = kh_val(r->h_meta, k);
2596 if (re && (re->count != 0 || re->length != 0)) {
2597 /* Keep old */
2598 free(e);
2599 } else {
2600 /* Replace old */
2601 if (re)
2602 free(re);
2603 kh_val(r->h_meta, k) = e;
2604 }
2605 }
2606
2607 if (id >= id_alloc) {
2608 ref_entry **new_refs;
2609 int x;
2610
2611 id_alloc = id_alloc ?id_alloc*2 : 16;
2612 new_refs = realloc(r->ref_id, id_alloc * sizeof(*r->ref_id));
2613 if (!new_refs)
2614 goto err;
2615 r->ref_id = new_refs;
2616
2617 for (x = id; x < id_alloc; x++)
2618 r->ref_id[x] = NULL;
2619 }
2620 r->ref_id[id] = e;
2621 r->nref = ++id;
2622 }
2623
2624 if(hclose(fp) < 0)
2625 goto err;
2626 return r;
2627
2628 err:
2629 if (fp)
2630 hclose_abruptly(fp);
2631
2632 if (!r_orig)
2633 refs_free(r);
2634
2635 return NULL;
2636 }
2637
2638 /*
2639 * Verifies that the CRAM @SQ lines and .fai files match.
2640 */
sanitise_SQ_lines(cram_fd * fd)2641 static void sanitise_SQ_lines(cram_fd *fd) {
2642 int i;
2643
2644 if (!fd->header || !fd->header->hrecs)
2645 return;
2646
2647 if (!fd->refs || !fd->refs->h_meta)
2648 return;
2649
2650 for (i = 0; i < fd->header->hrecs->nref; i++) {
2651 const char *name = fd->header->hrecs->ref[i].name;
2652 khint_t k = kh_get(refs, fd->refs->h_meta, name);
2653 ref_entry *r;
2654
2655 // We may have @SQ lines which have no known .fai, but do not
2656 // in themselves pose a problem because they are unused in the file.
2657 if (k == kh_end(fd->refs->h_meta))
2658 continue;
2659
2660 if (!(r = (ref_entry *)kh_val(fd->refs->h_meta, k)))
2661 continue;
2662
2663 if (r->length && r->length != fd->header->hrecs->ref[i].len) {
2664 assert(strcmp(r->name, fd->header->hrecs->ref[i].name) == 0);
2665
2666 // Should we also check MD5sums here to ensure the correct
2667 // reference was given?
2668 hts_log_warning("Header @SQ length mismatch for ref %s, %"PRIhts_pos" vs %d",
2669 r->name, fd->header->hrecs->ref[i].len, (int)r->length);
2670
2671 // Fixing the parsed @SQ header will make MD:Z: strings work
2672 // and also stop it producing N for the sequence.
2673 fd->header->hrecs->ref[i].len = r->length;
2674 }
2675 }
2676 }
2677
2678 /*
2679 * Indexes references by the order they appear in a BAM file. This may not
2680 * necessarily be the same order they appear in the fasta reference file.
2681 *
2682 * Returns 0 on success
2683 * -1 on failure
2684 */
refs2id(refs_t * r,sam_hdr_t * hdr)2685 int refs2id(refs_t *r, sam_hdr_t *hdr) {
2686 int i;
2687 sam_hrecs_t *h = hdr->hrecs;
2688
2689 if (r->ref_id)
2690 free(r->ref_id);
2691 if (r->last)
2692 r->last = NULL;
2693
2694 r->ref_id = calloc(h->nref, sizeof(*r->ref_id));
2695 if (!r->ref_id)
2696 return -1;
2697
2698 r->nref = h->nref;
2699 for (i = 0; i < h->nref; i++) {
2700 khint_t k = kh_get(refs, r->h_meta, h->ref[i].name);
2701 if (k != kh_end(r->h_meta)) {
2702 r->ref_id[i] = kh_val(r->h_meta, k);
2703 } else {
2704 hts_log_warning("Unable to find ref name '%s'", h->ref[i].name);
2705 }
2706 }
2707
2708 return 0;
2709 }
2710
2711 /*
2712 * Generates refs_t entries based on @SQ lines in the header.
2713 * Returns 0 on success
2714 * -1 on failure
2715 */
refs_from_header(cram_fd * fd)2716 static int refs_from_header(cram_fd *fd) {
2717 if (!fd)
2718 return -1;
2719
2720 refs_t *r = fd->refs;
2721 if (!r)
2722 return -1;
2723
2724 sam_hdr_t *h = fd->header;
2725 if (!h)
2726 return 0;
2727
2728 if (!h->hrecs) {
2729 if (-1 == sam_hdr_fill_hrecs(h))
2730 return -1;
2731 }
2732
2733 if (h->hrecs->nref == 0)
2734 return 0;
2735
2736 //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode);
2737
2738 /* Existing refs are fine, as long as they're compatible with the hdr. */
2739 ref_entry **new_ref_id = realloc(r->ref_id, (r->nref + h->hrecs->nref) * sizeof(*r->ref_id));
2740 if (!new_ref_id)
2741 return -1;
2742 r->ref_id = new_ref_id;
2743
2744 int i, j;
2745 /* Copy info from h->ref[i] over to r */
2746 for (i = 0, j = r->nref; i < h->hrecs->nref; i++) {
2747 sam_hrec_type_t *ty;
2748 sam_hrec_tag_t *tag;
2749 khint_t k;
2750 int n;
2751
2752 k = kh_get(refs, r->h_meta, h->hrecs->ref[i].name);
2753 if (k != kh_end(r->h_meta))
2754 // Ref already known about
2755 continue;
2756
2757 if (!(r->ref_id[j] = calloc(1, sizeof(ref_entry))))
2758 return -1;
2759
2760 if (!h->hrecs->ref[i].name)
2761 return -1;
2762
2763 r->ref_id[j]->name = string_dup(r->pool, h->hrecs->ref[i].name);
2764 if (!r->ref_id[j]->name) return -1;
2765 r->ref_id[j]->length = 0; // marker for not yet loaded
2766
2767 /* Initialise likely filename if known */
2768 if ((ty = sam_hrecs_find_type_id(h->hrecs, "SQ", "SN", h->hrecs->ref[i].name))) {
2769 if ((tag = sam_hrecs_find_key(ty, "M5", NULL))) {
2770 r->ref_id[j]->fn = string_dup(r->pool, tag->str+3);
2771 //fprintf(stderr, "Tagging @SQ %s / %s\n", r->ref_id[h]->name, r->ref_id[h]->fn);
2772 }
2773 }
2774
2775 k = kh_put(refs, r->h_meta, r->ref_id[j]->name, &n);
2776 if (n <= 0) // already exists or error
2777 return -1;
2778 kh_val(r->h_meta, k) = r->ref_id[j];
2779
2780 j++;
2781 }
2782 r->nref = j;
2783
2784 return 0;
2785 }
2786
2787 /*
2788 * Attaches a header to a cram_fd.
2789 *
2790 * This should be used when creating a new cram_fd for writing where
2791 * we have a header already constructed (eg from a file we've read
2792 * in).
2793 */
cram_set_header2(cram_fd * fd,const sam_hdr_t * hdr)2794 int cram_set_header2(cram_fd *fd, const sam_hdr_t *hdr) {
2795 if (!fd || !hdr )
2796 return -1;
2797
2798 if (fd->header != hdr) {
2799 if (fd->header)
2800 sam_hdr_destroy(fd->header);
2801 fd->header = sam_hdr_dup(hdr);
2802 if (!fd->header)
2803 return -1;
2804 }
2805 return refs_from_header(fd);
2806 }
2807
cram_set_header(cram_fd * fd,sam_hdr_t * hdr)2808 int cram_set_header(cram_fd *fd, sam_hdr_t *hdr) {
2809 return cram_set_header2(fd, hdr);
2810 }
2811
2812 /*
2813 * Returns whether the path refers to a directory.
2814 */
is_directory(char * fn)2815 static int is_directory(char *fn) {
2816 struct stat buf;
2817 if ( stat(fn,&buf) ) return 0;
2818 return S_ISDIR(buf.st_mode);
2819 }
2820
2821 /*
2822 * Converts a directory and a filename into an expanded path, replacing %s
2823 * in directory with the filename and %[0-9]+s with portions of the filename
2824 * Any remaining parts of filename are added to the end with /%s.
2825 */
expand_cache_path(char * path,char * dir,const char * fn)2826 static int expand_cache_path(char *path, char *dir, const char *fn) {
2827 char *cp, *start = path;
2828 size_t len;
2829 size_t sz = PATH_MAX;
2830
2831 while ((cp = strchr(dir, '%'))) {
2832 if (cp-dir >= sz) return -1;
2833 strncpy(path, dir, cp-dir);
2834 path += cp-dir;
2835 sz -= cp-dir;
2836
2837 if (*++cp == 's') {
2838 len = strlen(fn);
2839 if (len >= sz) return -1;
2840 strcpy(path, fn);
2841 path += len;
2842 sz -= len;
2843 fn += len;
2844 cp++;
2845 } else if (*cp >= '0' && *cp <= '9') {
2846 char *endp;
2847 long l;
2848
2849 l = strtol(cp, &endp, 10);
2850 l = MIN(l, strlen(fn));
2851 if (*endp == 's') {
2852 if (l >= sz) return -1;
2853 strncpy(path, fn, l);
2854 path += l;
2855 fn += l;
2856 sz -= l;
2857 *path = 0;
2858 cp = endp+1;
2859 } else {
2860 if (sz < 3) return -1;
2861 *path++ = '%';
2862 *path++ = *cp++;
2863 }
2864 } else {
2865 if (sz < 3) return -1;
2866 *path++ = '%';
2867 *path++ = *cp++;
2868 }
2869 dir = cp;
2870 }
2871
2872 len = strlen(dir);
2873 if (len >= sz) return -1;
2874 strcpy(path, dir);
2875 path += len;
2876 sz -= len;
2877
2878 len = strlen(fn) + ((*fn && path > start && path[-1] != '/') ? 1 : 0);
2879 if (len >= sz) return -1;
2880 if (*fn && path > start && path[-1] != '/')
2881 *path++ = '/';
2882 strcpy(path, fn);
2883 return 0;
2884 }
2885
2886 /*
2887 * Make the directory containing path and any prefix directories.
2888 */
mkdir_prefix(char * path,int mode)2889 static void mkdir_prefix(char *path, int mode) {
2890 char *cp = strrchr(path, '/');
2891 if (!cp)
2892 return;
2893
2894 *cp = 0;
2895 if (is_directory(path)) {
2896 *cp = '/';
2897 return;
2898 }
2899
2900 if (mkdir(path, mode) == 0) {
2901 chmod(path, mode);
2902 *cp = '/';
2903 return;
2904 }
2905
2906 mkdir_prefix(path, mode);
2907 mkdir(path, mode);
2908 chmod(path, mode);
2909 *cp = '/';
2910 }
2911
2912 /*
2913 * Return the cache directory to use, based on the first of these
2914 * environment variables to be set to a non-empty value.
2915 */
get_cache_basedir(const char ** extra)2916 static const char *get_cache_basedir(const char **extra) {
2917 char *base;
2918
2919 *extra = "";
2920
2921 base = getenv("XDG_CACHE_HOME");
2922 if (base && *base) return base;
2923
2924 base = getenv("HOME");
2925 if (base && *base) { *extra = "/.cache"; return base; }
2926
2927 base = getenv("TMPDIR");
2928 if (base && *base) return base;
2929
2930 base = getenv("TEMP");
2931 if (base && *base) return base;
2932
2933 return "/tmp";
2934 }
2935
2936 /*
2937 * Queries the M5 string from the header and attempts to populate the
2938 * reference from this using the REF_PATH environment.
2939 *
2940 * Returns 0 on success
2941 * -1 on failure
2942 */
cram_populate_ref(cram_fd * fd,int id,ref_entry * r)2943 static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
2944 char *ref_path = getenv("REF_PATH");
2945 sam_hrec_type_t *ty;
2946 sam_hrec_tag_t *tag;
2947 char path[PATH_MAX];
2948 kstring_t path_tmp = KS_INITIALIZE;
2949 char cache[PATH_MAX], cache_root[PATH_MAX];
2950 char *local_cache = getenv("REF_CACHE");
2951 mFILE *mf;
2952 int local_path = 0;
2953
2954 hts_log_info("Running cram_populate_ref on fd %p, id %d", (void *)fd, id);
2955
2956 cache_root[0] = '\0';
2957
2958 if (!ref_path || *ref_path == '\0') {
2959 /*
2960 * If we have no ref path, we use the EBI server.
2961 * However to avoid spamming it we require a local ref cache too.
2962 */
2963 ref_path = "https://www.ebi.ac.uk/ena/cram/md5/%s";
2964 if (!local_cache || *local_cache == '\0') {
2965 const char *extra;
2966 const char *base = get_cache_basedir(&extra);
2967 snprintf(cache_root, PATH_MAX, "%s%s/hts-ref", base, extra);
2968 snprintf(cache,PATH_MAX, "%s%s/hts-ref/%%2s/%%2s/%%s", base, extra);
2969 local_cache = cache;
2970 hts_log_info("Populating local cache: %s", local_cache);
2971 }
2972 }
2973
2974 if (!r->name)
2975 return -1;
2976
2977 if (!(ty = sam_hrecs_find_type_id(fd->header->hrecs, "SQ", "SN", r->name)))
2978 return -1;
2979
2980 if (!(tag = sam_hrecs_find_key(ty, "M5", NULL)))
2981 goto no_M5;
2982
2983 hts_log_info("Querying ref %s", tag->str+3);
2984
2985 /* Use cache if available */
2986 if (local_cache && *local_cache) {
2987 if (expand_cache_path(path, local_cache, tag->str+3) == 0)
2988 local_path = 1;
2989 }
2990
2991 #ifndef HAVE_MMAP
2992 char *path2;
2993 /* Search local files in REF_PATH; we can open them and return as above */
2994 if (!local_path && (path2 = find_path(tag->str+3, ref_path))) {
2995 int len = snprintf(path, PATH_MAX, "%s", path2);
2996 free(path2);
2997 if (len > 0 && len < PATH_MAX) // in case it's too long
2998 local_path = 1;
2999 }
3000 #endif
3001
3002 /* Found via REF_CACHE or local REF_PATH file */
3003 if (local_path) {
3004 struct stat sb;
3005 BGZF *fp;
3006
3007 if (0 == stat(path, &sb)
3008 && S_ISREG(sb.st_mode)
3009 && (fp = bgzf_open(path, "r"))) {
3010 r->length = sb.st_size;
3011 r->offset = r->line_length = r->bases_per_line = 0;
3012
3013 r->fn = string_dup(fd->refs->pool, path);
3014
3015 if (fd->refs->fp)
3016 if (bgzf_close(fd->refs->fp) != 0)
3017 return -1;
3018 fd->refs->fp = fp;
3019 fd->refs->fn = r->fn;
3020 r->is_md5 = 1;
3021
3022 // Fall back to cram_get_ref() where it'll do the actual
3023 // reading of the file.
3024 return 0;
3025 }
3026 }
3027
3028
3029 /* Otherwise search full REF_PATH; slower as loads entire file */
3030 if ((mf = open_path_mfile(tag->str+3, ref_path, NULL))) {
3031 size_t sz;
3032 r->seq = mfsteal(mf, &sz);
3033 if (r->seq) {
3034 r->mf = NULL;
3035 } else {
3036 // keep mf around as we couldn't detach
3037 r->seq = mf->data;
3038 r->mf = mf;
3039 }
3040 r->length = sz;
3041 r->is_md5 = 1;
3042 } else {
3043 refs_t *refs;
3044 const char *fn;
3045
3046 no_M5:
3047 /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */
3048 if (!(tag = sam_hrecs_find_key(ty, "UR", NULL)))
3049 return -1;
3050
3051 fn = (strncmp(tag->str+3, "file:", 5) == 0)
3052 ? tag->str+8
3053 : tag->str+3;
3054
3055 if (fd->refs->fp) {
3056 if (bgzf_close(fd->refs->fp) != 0)
3057 return -1;
3058 fd->refs->fp = NULL;
3059 }
3060 if (!(refs = refs_load_fai(fd->refs, fn, 0)))
3061 return -1;
3062 sanitise_SQ_lines(fd);
3063
3064 fd->refs = refs;
3065 if (fd->refs->fp) {
3066 if (bgzf_close(fd->refs->fp) != 0)
3067 return -1;
3068 fd->refs->fp = NULL;
3069 }
3070
3071 if (!fd->refs->fn)
3072 return -1;
3073
3074 if (-1 == refs2id(fd->refs, fd->header))
3075 return -1;
3076 if (!fd->refs->ref_id || !fd->refs->ref_id[id])
3077 return -1;
3078
3079 // Local copy already, so fall back to cram_get_ref().
3080 return 0;
3081 }
3082
3083 /* Populate the local disk cache if required */
3084 if (local_cache && *local_cache) {
3085 hFILE *fp;
3086
3087 if (*cache_root && !is_directory(cache_root)) {
3088 hts_log_warning("Creating reference cache directory %s\n"
3089 "This may become large; see the samtools(1) manual page REF_CACHE discussion",
3090 cache_root);
3091 }
3092
3093 if (expand_cache_path(path, local_cache, tag->str+3) < 0) {
3094 return 0; // Not fatal - we have the data already so keep going.
3095 }
3096 hts_log_info("Writing cache file '%s'", path);
3097 mkdir_prefix(path, 01777);
3098
3099 fp = hts_open_tmpfile(path, "wx", &path_tmp);
3100 if (!fp) {
3101 perror(path_tmp.s);
3102 free(path_tmp.s);
3103
3104 // Not fatal - we have the data already so keep going.
3105 return 0;
3106 }
3107
3108 // Check md5sum
3109 hts_md5_context *md5;
3110 char unsigned md5_buf1[16];
3111 char md5_buf2[33];
3112
3113 if (!(md5 = hts_md5_init())) {
3114 hclose_abruptly(fp);
3115 unlink(path_tmp.s);
3116 free(path_tmp.s);
3117 return -1;
3118 }
3119 hts_md5_update(md5, r->seq, r->length);
3120 hts_md5_final(md5_buf1, md5);
3121 hts_md5_destroy(md5);
3122 hts_md5_hex(md5_buf2, md5_buf1);
3123
3124 if (strncmp(tag->str+3, md5_buf2, 32) != 0) {
3125 hts_log_error("Mismatching md5sum for downloaded reference");
3126 hclose_abruptly(fp);
3127 unlink(path_tmp.s);
3128 free(path_tmp.s);
3129 return -1;
3130 }
3131
3132 ssize_t length_written = hwrite(fp, r->seq, r->length);
3133 if (hclose(fp) < 0 || length_written != r->length ||
3134 chmod(path_tmp.s, 0444) < 0 ||
3135 rename(path_tmp.s, path) < 0) {
3136 hts_log_error("Creating reference at %s failed: %s",
3137 path, strerror(errno));
3138 unlink(path_tmp.s);
3139 }
3140 }
3141
3142 free(path_tmp.s);
3143 return 0;
3144 }
3145
cram_ref_incr_locked(refs_t * r,int id)3146 static void cram_ref_incr_locked(refs_t *r, int id) {
3147 RP("%d INC REF %d, %d %p\n", gettid(), id,
3148 (int)(id>=0 && r->ref_id[id]?r->ref_id[id]->count+1:-999),
3149 id>=0 && r->ref_id[id]?r->ref_id[id]->seq:(char *)1);
3150
3151 if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq)
3152 return;
3153
3154 if (r->last_id == id)
3155 r->last_id = -1;
3156
3157 ++r->ref_id[id]->count;
3158 }
3159
cram_ref_incr(refs_t * r,int id)3160 void cram_ref_incr(refs_t *r, int id) {
3161 pthread_mutex_lock(&r->lock);
3162 cram_ref_incr_locked(r, id);
3163 pthread_mutex_unlock(&r->lock);
3164 }
3165
cram_ref_decr_locked(refs_t * r,int id)3166 static void cram_ref_decr_locked(refs_t *r, int id) {
3167 RP("%d DEC REF %d, %d %p\n", gettid(), id,
3168 (int)(id>=0 && r->ref_id[id]?r->ref_id[id]->count-1:-999),
3169 id>=0 && r->ref_id[id]?r->ref_id[id]->seq:(char *)1);
3170
3171 if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq) {
3172 return;
3173 }
3174
3175 if (--r->ref_id[id]->count <= 0) {
3176 assert(r->ref_id[id]->count == 0);
3177 if (r->last_id >= 0) {
3178 if (r->ref_id[r->last_id]->count <= 0 &&
3179 r->ref_id[r->last_id]->seq) {
3180 RP("%d FREE REF %d (%p)\n", gettid(),
3181 r->last_id, r->ref_id[r->last_id]->seq);
3182 ref_entry_free_seq(r->ref_id[r->last_id]);
3183 if (r->ref_id[r->last_id]->is_md5) r->ref_id[r->last_id]->length = 0;
3184 }
3185 }
3186 r->last_id = id;
3187 }
3188 }
3189
cram_ref_decr(refs_t * r,int id)3190 void cram_ref_decr(refs_t *r, int id) {
3191 pthread_mutex_lock(&r->lock);
3192 cram_ref_decr_locked(r, id);
3193 pthread_mutex_unlock(&r->lock);
3194 }
3195
3196 /*
3197 * Used by cram_ref_load and cram_ref_get. The file handle will have
3198 * already been opened, so we can catch it. The ref_entry *e informs us
3199 * of whether this is a multi-line fasta file or a raw MD5 style file.
3200 * Either way we create a single contiguous sequence.
3201 *
3202 * Returns all or part of a reference sequence on success (malloced);
3203 * NULL on failure.
3204 */
load_ref_portion(BGZF * fp,ref_entry * e,int start,int end)3205 static char *load_ref_portion(BGZF *fp, ref_entry *e, int start, int end) {
3206 off_t offset, len;
3207 char *seq;
3208
3209 if (end < start)
3210 end = start;
3211
3212 /*
3213 * Compute locations in file. This is trivial for the MD5 files, but
3214 * is still necessary for the fasta variants.
3215 */
3216 offset = e->line_length
3217 ? e->offset + (start-1)/e->bases_per_line * e->line_length +
3218 (start-1) % e->bases_per_line
3219 : start-1;
3220
3221 len = (e->line_length
3222 ? e->offset + (end-1)/e->bases_per_line * e->line_length +
3223 (end-1) % e->bases_per_line
3224 : end-1) - offset + 1;
3225
3226 if (bgzf_useek(fp, offset, SEEK_SET) < 0) {
3227 perror("bgzf_useek() on reference file");
3228 return NULL;
3229 }
3230
3231 if (len == 0 || !(seq = malloc(len))) {
3232 return NULL;
3233 }
3234
3235 if (len != bgzf_read(fp, seq, len)) {
3236 perror("bgzf_read() on reference file");
3237 free(seq);
3238 return NULL;
3239 }
3240
3241 /* Strip white-space if required. */
3242 if (len != end-start+1) {
3243 int i, j;
3244 char *cp = seq;
3245 char *cp_to;
3246
3247 for (i = j = 0; i < len; i++) {
3248 if (cp[i] >= '!' && cp[i] <= '~')
3249 cp[j++] = toupper_c(cp[i]);
3250 }
3251 cp_to = cp+j;
3252
3253 if (cp_to - seq != end-start+1) {
3254 hts_log_error("Malformed reference file");
3255 free(seq);
3256 return NULL;
3257 }
3258 } else {
3259 int i;
3260 for (i = 0; i < len; i++) {
3261 seq[i] = toupper_c(seq[i]);
3262 }
3263 }
3264
3265 return seq;
3266 }
3267
3268 /*
3269 * Load the entire reference 'id'.
3270 * This also increments the reference count by 1.
3271 *
3272 * Returns ref_entry on success;
3273 * NULL on failure
3274 */
cram_ref_load(refs_t * r,int id,int is_md5)3275 ref_entry *cram_ref_load(refs_t *r, int id, int is_md5) {
3276 ref_entry *e = r->ref_id[id];
3277 int start = 1, end = e->length;
3278 char *seq;
3279
3280 if (e->seq) {
3281 return e;
3282 }
3283
3284 assert(e->count == 0);
3285
3286 if (r->last) {
3287 #ifdef REF_DEBUG
3288 int idx = 0;
3289 for (idx = 0; idx < r->nref; idx++)
3290 if (r->last == r->ref_id[idx])
3291 break;
3292 RP("%d cram_ref_load DECR %d\n", gettid(), idx);
3293 #endif
3294 assert(r->last->count > 0);
3295 if (--r->last->count <= 0) {
3296 RP("%d FREE REF %d (%p)\n", gettid(), id, r->ref_id[id]->seq);
3297 if (r->last->seq)
3298 ref_entry_free_seq(r->last);
3299 }
3300 }
3301
3302 if (!r->fn)
3303 return NULL;
3304
3305 /* Open file if it's not already the current open reference */
3306 if (strcmp(r->fn, e->fn) || r->fp == NULL) {
3307 if (r->fp)
3308 if (bgzf_close(r->fp) != 0)
3309 return NULL;
3310 r->fn = e->fn;
3311 if (!(r->fp = bgzf_open_ref(r->fn, "r", is_md5)))
3312 return NULL;
3313 }
3314
3315 RP("%d Loading ref %d (%d..%d)\n", gettid(), id, start, end);
3316
3317 if (!(seq = load_ref_portion(r->fp, e, start, end))) {
3318 return NULL;
3319 }
3320
3321 RP("%d Loaded ref %d (%d..%d) = %p\n", gettid(), id, start, end, seq);
3322
3323 RP("%d INC REF %d, %"PRId64"\n", gettid(), id, (e->count+1));
3324 e->seq = seq;
3325 e->mf = NULL;
3326 e->count++;
3327
3328 /*
3329 * Also keep track of last used ref so incr/decr loops on the same
3330 * sequence don't cause load/free loops.
3331 */
3332 RP("%d cram_ref_load INCR %d => %"PRId64"\n", gettid(), id, e->count+1);
3333 r->last = e;
3334 e->count++;
3335
3336 return e;
3337 }
3338
3339 /*
3340 * Returns a portion of a reference sequence from start to end inclusive.
3341 * The returned pointer is owned by either the cram_file fd or by the
3342 * internal refs_t structure and should not be freed by the caller.
3343 *
3344 * The difference is whether or not this refs_t is in use by just the one
3345 * cram_fd or by multiples, or whether we have multiple threads accessing
3346 * references. In either case fd->shared will be true and we start using
3347 * reference counting to track the number of users of a specific reference
3348 * sequence.
3349 *
3350 * Otherwise the ref seq returned is allocated as part of cram_fd itself
3351 * and will be freed up on the next call to cram_get_ref or cram_close.
3352 *
3353 * To return the entire reference sequence, specify start as 1 and end
3354 * as 0.
3355 *
3356 * To cease using a reference, call cram_ref_decr().
3357 *
3358 * Returns reference on success,
3359 * NULL on failure
3360 */
cram_get_ref(cram_fd * fd,int id,int start,int end)3361 char *cram_get_ref(cram_fd *fd, int id, int start, int end) {
3362 ref_entry *r;
3363 char *seq;
3364 int ostart = start;
3365
3366 if (id == -1 || start < 1)
3367 return NULL;
3368
3369 /* FIXME: axiomatic query of r->seq being true?
3370 * Or shortcut for unsorted data where we load once and never free?
3371 */
3372
3373 //fd->shared_ref = 1; // hard code for now to simplify things
3374
3375 pthread_mutex_lock(&fd->ref_lock);
3376
3377 RP("%d cram_get_ref on fd %p, id %d, range %d..%d\n", gettid(), fd, id, start, end);
3378
3379 /*
3380 * Unsorted data implies we want to fetch an entire reference at a time.
3381 * We just deal with this at the moment by claiming we're sharing
3382 * references instead, which has the same requirement.
3383 */
3384 if (fd->unsorted)
3385 fd->shared_ref = 1;
3386
3387
3388 /* Sanity checking: does this ID exist? */
3389 if (id >= fd->refs->nref) {
3390 hts_log_error("No reference found for id %d", id);
3391 pthread_mutex_unlock(&fd->ref_lock);
3392 return NULL;
3393 }
3394
3395 if (!fd->refs || !fd->refs->ref_id[id]) {
3396 hts_log_error("No reference found for id %d", id);
3397 pthread_mutex_unlock(&fd->ref_lock);
3398 return NULL;
3399 }
3400
3401 if (!(r = fd->refs->ref_id[id])) {
3402 hts_log_error("No reference found for id %d", id);
3403 pthread_mutex_unlock(&fd->ref_lock);
3404 return NULL;
3405 }
3406
3407
3408 /*
3409 * It has an entry, but may not have been populated yet.
3410 * Any manually loaded .fai files have their lengths known.
3411 * A ref entry computed from @SQ lines (M5 or UR field) will have
3412 * r->length == 0 unless it's been loaded once and verified that we have
3413 * an on-disk filename for it.
3414 *
3415 * 19 Sep 2013: Moved the lock here as the cram_populate_ref code calls
3416 * open_path_mfile and libcurl, which isn't multi-thread safe unless I
3417 * rewrite my code to have one curl handle per thread.
3418 */
3419 pthread_mutex_lock(&fd->refs->lock);
3420 if (r->length == 0) {
3421 if (cram_populate_ref(fd, id, r) == -1) {
3422 hts_log_error("Failed to populate reference for id %d", id);
3423 pthread_mutex_unlock(&fd->refs->lock);
3424 pthread_mutex_unlock(&fd->ref_lock);
3425 return NULL;
3426 }
3427 r = fd->refs->ref_id[id];
3428 if (fd->unsorted)
3429 cram_ref_incr_locked(fd->refs, id);
3430 }
3431
3432
3433 /*
3434 * We now know that we the filename containing the reference, so check
3435 * for limits. If it's over half the reference we'll load all of it in
3436 * memory as this will speed up subsequent calls.
3437 */
3438 if (end < 1)
3439 end = r->length;
3440 if (end >= r->length)
3441 end = r->length;
3442
3443 if (end - start >= 0.5*r->length || fd->shared_ref) {
3444 start = 1;
3445 end = r->length;
3446 }
3447
3448 /*
3449 * Maybe we have it cached already? If so use it.
3450 *
3451 * Alternatively if we don't have the sequence but we're sharing
3452 * references and/or are asking for the entire length of it, then
3453 * load the full reference into the refs structure and return
3454 * a pointer to that one instead.
3455 */
3456 if (fd->shared_ref || r->seq || (start == 1 && end == r->length)) {
3457 char *cp;
3458
3459 if (id >= 0) {
3460 if (r->seq) {
3461 cram_ref_incr_locked(fd->refs, id);
3462 } else {
3463 ref_entry *e;
3464 if (!(e = cram_ref_load(fd->refs, id, r->is_md5))) {
3465 pthread_mutex_unlock(&fd->refs->lock);
3466 pthread_mutex_unlock(&fd->ref_lock);
3467 return NULL;
3468 }
3469
3470 /* unsorted data implies cache ref indefinitely, to avoid
3471 * continually loading and unloading.
3472 */
3473 if (fd->unsorted)
3474 cram_ref_incr_locked(fd->refs, id);
3475 }
3476
3477 fd->ref = NULL; /* We never access it directly */
3478 fd->ref_start = 1;
3479 fd->ref_end = r->length;
3480 fd->ref_id = id;
3481
3482 cp = fd->refs->ref_id[id]->seq + ostart-1;
3483 } else {
3484 fd->ref = NULL;
3485 cp = NULL;
3486 }
3487
3488 RP("%d cram_get_ref returning for id %d, count %d\n", gettid(), id, (int)r->count);
3489
3490 pthread_mutex_unlock(&fd->refs->lock);
3491 pthread_mutex_unlock(&fd->ref_lock);
3492 return cp;
3493 }
3494
3495 /*
3496 * Otherwise we're not sharing, we don't have a copy of it already and
3497 * we're only asking for a small portion of it.
3498 *
3499 * In this case load up just that segment ourselves, freeing any old
3500 * small segments in the process.
3501 */
3502
3503 /* Unmapped ref ID */
3504 if (id < 0 || !fd->refs->fn) {
3505 if (fd->ref_free) {
3506 free(fd->ref_free);
3507 fd->ref_free = NULL;
3508 }
3509 fd->ref = NULL;
3510 fd->ref_id = id;
3511 pthread_mutex_unlock(&fd->refs->lock);
3512 pthread_mutex_unlock(&fd->ref_lock);
3513 return NULL;
3514 }
3515
3516 /* Open file if it's not already the current open reference */
3517 if (strcmp(fd->refs->fn, r->fn) || fd->refs->fp == NULL) {
3518 if (fd->refs->fp)
3519 if (bgzf_close(fd->refs->fp) != 0)
3520 return NULL;
3521 fd->refs->fn = r->fn;
3522 if (!(fd->refs->fp = bgzf_open_ref(fd->refs->fn, "r", r->is_md5))) {
3523 pthread_mutex_unlock(&fd->refs->lock);
3524 pthread_mutex_unlock(&fd->ref_lock);
3525 return NULL;
3526 }
3527 }
3528
3529 if (!(fd->ref = load_ref_portion(fd->refs->fp, r, start, end))) {
3530 pthread_mutex_unlock(&fd->refs->lock);
3531 pthread_mutex_unlock(&fd->ref_lock);
3532 return NULL;
3533 }
3534
3535 if (fd->ref_free)
3536 free(fd->ref_free);
3537
3538 fd->ref_id = id;
3539 fd->ref_start = start;
3540 fd->ref_end = end;
3541 fd->ref_free = fd->ref;
3542 seq = fd->ref;
3543
3544 pthread_mutex_unlock(&fd->refs->lock);
3545 pthread_mutex_unlock(&fd->ref_lock);
3546
3547 return seq ? seq + ostart - start : NULL;
3548 }
3549
3550 /*
3551 * If fd has been opened for reading, it may be permitted to specify 'fn'
3552 * as NULL and let the code auto-detect the reference by parsing the
3553 * SAM header @SQ lines.
3554 */
cram_load_reference(cram_fd * fd,char * fn)3555 int cram_load_reference(cram_fd *fd, char *fn) {
3556 int ret = 0;
3557
3558 if (fn) {
3559 fd->refs = refs_load_fai(fd->refs, fn,
3560 !(fd->embed_ref && fd->mode == 'r'));
3561 fn = fd->refs ? fd->refs->fn : NULL;
3562 if (!fn)
3563 ret = -1;
3564 sanitise_SQ_lines(fd);
3565 }
3566 fd->ref_fn = fn;
3567
3568 if ((!fd->refs || (fd->refs->nref == 0 && !fn)) && fd->header) {
3569 if (fd->refs)
3570 refs_free(fd->refs);
3571 if (!(fd->refs = refs_create()))
3572 return -1;
3573 if (-1 == refs_from_header(fd))
3574 return -1;
3575 }
3576
3577 if (fd->header)
3578 if (-1 == refs2id(fd->refs, fd->header))
3579 return -1;
3580
3581 return ret;
3582 }
3583
3584 /* ----------------------------------------------------------------------
3585 * Containers
3586 */
3587
3588 /*
3589 * Creates a new container, specifying the maximum number of slices
3590 * and records permitted.
3591 *
3592 * Returns cram_container ptr on success
3593 * NULL on failure
3594 */
cram_new_container(int nrec,int nslice)3595 cram_container *cram_new_container(int nrec, int nslice) {
3596 cram_container *c = calloc(1, sizeof(*c));
3597 enum cram_DS_ID id;
3598
3599 if (!c)
3600 return NULL;
3601
3602 c->curr_ref = -2;
3603
3604 c->max_c_rec = nrec * nslice;
3605 c->curr_c_rec = 0;
3606
3607 c->max_rec = nrec;
3608 c->record_counter = 0;
3609 c->num_bases = 0;
3610 c->s_num_bases = 0;
3611
3612 c->max_slice = nslice;
3613 c->curr_slice = 0;
3614
3615 c->pos_sorted = 1;
3616 c->max_apos = 0;
3617 c->multi_seq = 0;
3618 c->qs_seq_orient = 1;
3619
3620 c->bams = NULL;
3621
3622 if (!(c->slices = calloc(nslice != 0 ? nslice : 1, sizeof(cram_slice *))))
3623 goto err;
3624 c->slice = NULL;
3625
3626 if (!(c->comp_hdr = cram_new_compression_header()))
3627 goto err;
3628 c->comp_hdr_block = NULL;
3629
3630 for (id = DS_RN; id < DS_TN; id++)
3631 if (!(c->stats[id] = cram_stats_create())) goto err;
3632
3633 //c->aux_B_stats = cram_stats_create();
3634
3635 if (!(c->tags_used = kh_init(m_tagmap)))
3636 goto err;
3637 c->refs_used = 0;
3638
3639 return c;
3640
3641 err:
3642 if (c) {
3643 if (c->slices)
3644 free(c->slices);
3645 free(c);
3646 }
3647 return NULL;
3648 }
3649
cram_free_container(cram_container * c)3650 void cram_free_container(cram_container *c) {
3651 enum cram_DS_ID id;
3652 int i;
3653
3654 if (!c)
3655 return;
3656
3657 if (c->refs_used)
3658 free(c->refs_used);
3659
3660 if (c->landmark)
3661 free(c->landmark);
3662
3663 if (c->comp_hdr)
3664 cram_free_compression_header(c->comp_hdr);
3665
3666 if (c->comp_hdr_block)
3667 cram_free_block(c->comp_hdr_block);
3668
3669 // Free the slices; filled out by encoder only
3670 if (c->slices) {
3671 for (i = 0; i < c->max_slice; i++) {
3672 if (c->slices[i])
3673 cram_free_slice(c->slices[i]);
3674 if (c->slices[i] == c->slice)
3675 c->slice = NULL;
3676 }
3677 free(c->slices);
3678 }
3679
3680 // Free the current slice; set by both encoder & decoder
3681 if (c->slice) {
3682 cram_free_slice(c->slice);
3683 c->slice = NULL;
3684 }
3685
3686 for (id = DS_RN; id < DS_TN; id++)
3687 if (c->stats[id]) cram_stats_free(c->stats[id]);
3688
3689 //if (c->aux_B_stats) cram_stats_free(c->aux_B_stats);
3690
3691 if (c->tags_used) {
3692 khint_t k;
3693
3694 for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
3695 if (!kh_exist(c->tags_used, k))
3696 continue;
3697
3698 cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k);
3699 if (tm) {
3700 cram_codec *c = tm->codec;
3701
3702 if (c) c->free(c);
3703 free(tm);
3704 }
3705 }
3706
3707 kh_destroy(m_tagmap, c->tags_used);
3708 }
3709
3710 free(c);
3711 }
3712
3713 /*
3714 * Reads a container header.
3715 *
3716 * Returns cram_container on success
3717 * NULL on failure or no container left (fd->err == 0).
3718 */
cram_read_container(cram_fd * fd)3719 cram_container *cram_read_container(cram_fd *fd) {
3720 cram_container c2, *c;
3721 int i, s;
3722 size_t rd = 0;
3723 uint32_t crc = 0;
3724
3725 fd->err = 0;
3726 fd->eof = 0;
3727
3728 memset(&c2, 0, sizeof(c2));
3729 if (CRAM_MAJOR_VERS(fd->version) == 1) {
3730 if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc)) == -1) {
3731 fd->eof = fd->empty_container ? 1 : 2;
3732 return NULL;
3733 } else {
3734 rd+=s;
3735 }
3736 } else if (CRAM_MAJOR_VERS(fd->version) < 4) {
3737 uint32_t len;
3738 if ((s = int32_decode(fd, &c2.length)) == -1) {
3739 if (CRAM_MAJOR_VERS(fd->version) == 2 &&
3740 CRAM_MINOR_VERS(fd->version) == 0)
3741 fd->eof = 1; // EOF blocks arrived in v2.1
3742 else
3743 fd->eof = fd->empty_container ? 1 : 2;
3744 return NULL;
3745 } else {
3746 rd+=s;
3747 }
3748 len = le_int4(c2.length);
3749 crc = crc32(0L, (unsigned char *)&len, 4);
3750 } else {
3751 if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc)) == -1) {
3752 fd->eof = fd->empty_container ? 1 : 2;
3753 return NULL;
3754 } else {
3755 rd+=s;
3756 }
3757 }
3758 if ((s = fd->vv.varint_decode32s_crc(fd, &c2.ref_seq_id, &crc)) == -1) return NULL; else rd+=s;
3759 if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3760 int64_t i64;
3761 if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc))== -1) return NULL; else rd+=s;
3762 c2.ref_seq_start = i64;
3763 if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc)) == -1) return NULL; else rd+=s;
3764 c2.ref_seq_span = i64;
3765 } else {
3766 int32_t i32;
3767 if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc))== -1) return NULL; else rd+=s;
3768 c2.ref_seq_start = i32;
3769 if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1) return NULL; else rd+=s;
3770 c2.ref_seq_span = i32;
3771 }
3772 if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_records, &crc)) == -1) return NULL; else rd+=s;
3773
3774 if (CRAM_MAJOR_VERS(fd->version) == 1) {
3775 c2.record_counter = 0;
3776 c2.num_bases = 0;
3777 } else {
3778 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3779 if ((s = fd->vv.varint_decode64_crc(fd, &c2.record_counter, &crc)) == -1)
3780 return NULL;
3781 else
3782 rd += s;
3783 } else {
3784 int32_t i32;
3785 if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1)
3786 return NULL;
3787 else
3788 rd += s;
3789 c2.record_counter = i32;
3790 }
3791
3792 if ((s = fd->vv.varint_decode64_crc(fd, &c2.num_bases, &crc))== -1)
3793 return NULL;
3794 else
3795 rd += s;
3796 }
3797 if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_blocks, &crc)) == -1)
3798 return NULL;
3799 else
3800 rd+=s;
3801 if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_landmarks, &crc))== -1)
3802 return NULL;
3803 else
3804 rd+=s;
3805
3806 if (c2.num_landmarks < 0 || c2.num_landmarks >= SIZE_MAX / sizeof(int32_t))
3807 return NULL;
3808
3809 if (!(c = calloc(1, sizeof(*c))))
3810 return NULL;
3811
3812 *c = c2;
3813
3814 if (c->num_landmarks && !(c->landmark = malloc(c->num_landmarks * sizeof(int32_t)))) {
3815 fd->err = errno;
3816 cram_free_container(c);
3817 return NULL;
3818 }
3819 for (i = 0; i < c->num_landmarks; i++) {
3820 if ((s = fd->vv.varint_decode32_crc(fd, &c->landmark[i], &crc)) == -1) {
3821 cram_free_container(c);
3822 return NULL;
3823 } else {
3824 rd += s;
3825 }
3826 }
3827
3828 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3829 if (-1 == int32_decode(fd, (int32_t *)&c->crc32)) {
3830 cram_free_container(c);
3831 return NULL;
3832 } else {
3833 rd+=4;
3834 }
3835
3836 if (crc != c->crc32) {
3837 hts_log_error("Container header CRC32 failure");
3838 cram_free_container(c);
3839 return NULL;
3840 }
3841 }
3842
3843 c->offset = rd;
3844 c->slices = NULL;
3845 c->slice = NULL;
3846 c->curr_slice = 0;
3847 c->max_slice = c->num_landmarks;
3848 c->slice_rec = 0;
3849 c->curr_rec = 0;
3850 c->max_rec = 0;
3851
3852 if (c->ref_seq_id == -2) {
3853 c->multi_seq = 1;
3854 fd->multi_seq = 1;
3855 }
3856
3857 fd->empty_container =
3858 (c->num_records == 0 &&
3859 c->ref_seq_id == -1 &&
3860 c->ref_seq_start == 0x454f46 /* EOF */) ? 1 : 0;
3861
3862 return c;
3863 }
3864
3865
3866 /* MAXIMUM storage size needed for the container. */
cram_container_size(cram_container * c)3867 int cram_container_size(cram_container *c) {
3868 return 55 + 5*c->num_landmarks;
3869 }
3870
3871
3872 /*
3873 * Stores the container structure in dat and returns *size as the
3874 * number of bytes written to dat[]. The input size of dat is also
3875 * held in *size and should be initialised to cram_container_size(c).
3876 *
3877 * Returns 0 on success;
3878 * -1 on failure
3879 */
cram_store_container(cram_fd * fd,cram_container * c,char * dat,int * size)3880 int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size)
3881 {
3882 char *cp = (char *)dat;
3883 int i;
3884
3885 // Check the input buffer is large enough according to our stated
3886 // requirements. (NOTE: it may actually take less.)
3887 if (cram_container_size(c) > *size)
3888 return -1;
3889
3890 if (CRAM_MAJOR_VERS(fd->version) == 1) {
3891 cp += itf8_put(cp, c->length);
3892 } else {
3893 *(int32_t *)cp = le_int4(c->length);
3894 cp += 4;
3895 }
3896 if (c->multi_seq) {
3897 cp += fd->vv.varint_put32(cp, NULL, -2);
3898 cp += fd->vv.varint_put32(cp, NULL, 0);
3899 cp += fd->vv.varint_put32(cp, NULL, 0);
3900 } else {
3901 cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id);
3902 if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3903 cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start);
3904 cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span);
3905 } else {
3906 cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start);
3907 cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span);
3908 }
3909 }
3910 cp += fd->vv.varint_put32(cp, NULL, c->num_records);
3911 if (CRAM_MAJOR_VERS(fd->version) == 2) {
3912 cp += fd->vv.varint_put64(cp, NULL, c->record_counter);
3913 } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3914 cp += fd->vv.varint_put32(cp, NULL, c->record_counter);
3915 }
3916 cp += fd->vv.varint_put64(cp, NULL, c->num_bases);
3917 cp += fd->vv.varint_put32(cp, NULL, c->num_blocks);
3918 cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks);
3919 for (i = 0; i < c->num_landmarks; i++)
3920 cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]);
3921
3922 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3923 c->crc32 = crc32(0L, (uc *)dat, cp-dat);
3924 cp[0] = c->crc32 & 0xff;
3925 cp[1] = (c->crc32 >> 8) & 0xff;
3926 cp[2] = (c->crc32 >> 16) & 0xff;
3927 cp[3] = (c->crc32 >> 24) & 0xff;
3928 cp += 4;
3929 }
3930
3931 *size = cp-dat; // actual used size
3932
3933 return 0;
3934 }
3935
3936
3937 /*
3938 * Writes a container structure.
3939 *
3940 * Returns 0 on success
3941 * -1 on failure
3942 */
cram_write_container(cram_fd * fd,cram_container * c)3943 int cram_write_container(cram_fd *fd, cram_container *c) {
3944 char buf_a[1024], *buf = buf_a, *cp;
3945 int i;
3946
3947 if (61 + c->num_landmarks * 10 >= 1024) {
3948 buf = malloc(61 + c->num_landmarks * 10);
3949 if (!buf)
3950 return -1;
3951 }
3952 cp = buf;
3953
3954 if (CRAM_MAJOR_VERS(fd->version) == 1) {
3955 cp += itf8_put(cp, c->length);
3956 } else if (CRAM_MAJOR_VERS(fd->version) <= 3) {
3957 *(int32_t *)cp = le_int4(c->length);
3958 cp += 4;
3959 } else {
3960 cp += fd->vv.varint_put32(cp, NULL, c->length);
3961 }
3962 if (c->multi_seq) {
3963 cp += fd->vv.varint_put32(cp, NULL, (uint32_t)-2);
3964 cp += fd->vv.varint_put32(cp, NULL, 0);
3965 cp += fd->vv.varint_put32(cp, NULL, 0);
3966 } else {
3967 cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id);
3968 if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3969 cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start);
3970 cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span);
3971 } else {
3972 cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start);
3973 cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span);
3974 }
3975 }
3976 cp += fd->vv.varint_put32(cp, NULL, c->num_records);
3977 if (CRAM_MAJOR_VERS(fd->version) >= 3)
3978 cp += fd->vv.varint_put64(cp, NULL, c->record_counter);
3979 else
3980 cp += fd->vv.varint_put32(cp, NULL, c->record_counter);
3981 cp += fd->vv.varint_put64(cp, NULL, c->num_bases);
3982 cp += fd->vv.varint_put32(cp, NULL, c->num_blocks);
3983 cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks);
3984 for (i = 0; i < c->num_landmarks; i++)
3985 cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]);
3986
3987 if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3988 c->crc32 = crc32(0L, (uc *)buf, cp-buf);
3989 cp[0] = c->crc32 & 0xff;
3990 cp[1] = (c->crc32 >> 8) & 0xff;
3991 cp[2] = (c->crc32 >> 16) & 0xff;
3992 cp[3] = (c->crc32 >> 24) & 0xff;
3993 cp += 4;
3994 }
3995
3996 if (cp-buf != hwrite(fd->fp, buf, cp-buf)) {
3997 if (buf != buf_a)
3998 free(buf);
3999 return -1;
4000 }
4001
4002 if (buf != buf_a)
4003 free(buf);
4004
4005 return 0;
4006 }
4007
4008 // common component shared by cram_flush_container{,_mt}
cram_flush_container2(cram_fd * fd,cram_container * c)4009 static int cram_flush_container2(cram_fd *fd, cram_container *c) {
4010 int i, j;
4011
4012 if (c->curr_slice > 0 && !c->slices)
4013 return -1;
4014
4015 //fprintf(stderr, "Writing container %d, sum %u\n", c->record_counter, sum);
4016
4017 off_t c_offset = htell(fd->fp); // File offset of container
4018
4019 /* Write the container struct itself */
4020 if (0 != cram_write_container(fd, c))
4021 return -1;
4022
4023 off_t hdr_size = htell(fd->fp) - c_offset;
4024
4025 /* And the compression header */
4026 if (0 != cram_write_block(fd, c->comp_hdr_block))
4027 return -1;
4028
4029 /* Followed by the slice blocks */
4030 off_t file_offset = htell(fd->fp);
4031 for (i = 0; i < c->curr_slice; i++) {
4032 cram_slice *s = c->slices[i];
4033 off_t spos = file_offset - c_offset - hdr_size;
4034
4035 if (0 != cram_write_block(fd, s->hdr_block))
4036 return -1;
4037
4038 for (j = 0; j < s->hdr->num_blocks; j++) {
4039 if (0 != cram_write_block(fd, s->block[j]))
4040 return -1;
4041 }
4042
4043 file_offset = htell(fd->fp);
4044 off_t sz = file_offset - c_offset - hdr_size - spos;
4045
4046 if (fd->idxfp) {
4047 if (cram_index_slice(fd, c, s, fd->idxfp, c_offset, spos, sz) < 0)
4048 return -1;
4049 }
4050 }
4051
4052 return 0;
4053 }
4054
4055 /*
4056 * Flushes a completely or partially full container to disk, writing
4057 * container structure, header and blocks. This also calls the encoder
4058 * functions.
4059 *
4060 * Returns 0 on success
4061 * -1 on failure
4062 */
cram_flush_container(cram_fd * fd,cram_container * c)4063 int cram_flush_container(cram_fd *fd, cram_container *c) {
4064 /* Encode the container blocks and generate compression header */
4065 if (0 != cram_encode_container(fd, c))
4066 return -1;
4067
4068 return cram_flush_container2(fd, c);
4069 }
4070
4071 typedef struct {
4072 cram_fd *fd;
4073 cram_container *c;
4074 } cram_job;
4075
cram_flush_thread(void * arg)4076 void *cram_flush_thread(void *arg) {
4077 cram_job *j = (cram_job *)arg;
4078
4079 /* Encode the container blocks and generate compression header */
4080 if (0 != cram_encode_container(j->fd, j->c)) {
4081 hts_log_error("Call to cram_encode_container failed");
4082 return NULL;
4083 }
4084
4085 return arg;
4086 }
4087
cram_flush_result(cram_fd * fd)4088 static int cram_flush_result(cram_fd *fd) {
4089 int i, ret = 0;
4090 hts_tpool_result *r;
4091 cram_container *lc = NULL;
4092
4093 // NB: we can have one result per slice, not per container,
4094 // so we need to free the container only after all slices
4095 // within it have been freed. (Automatic via reference counting.)
4096 while ((r = hts_tpool_next_result(fd->rqueue))) {
4097 cram_job *j = (cram_job *)hts_tpool_result_data(r);
4098 cram_container *c;
4099
4100 if (!j) {
4101 hts_tpool_delete_result(r, 0);
4102 return -1;
4103 }
4104
4105 fd = j->fd;
4106 c = j->c;
4107
4108 if (fd->mode == 'w')
4109 if (0 != cram_flush_container2(fd, c))
4110 return -1;
4111
4112 // Free the slices; filled out by encoder only
4113 if (c->slices) {
4114 for (i = 0; i < c->max_slice; i++) {
4115 if (c->slices[i])
4116 cram_free_slice(c->slices[i]);
4117 if (c->slices[i] == c->slice)
4118 c->slice = NULL;
4119 c->slices[i] = NULL;
4120 }
4121 }
4122
4123 // Free the current slice; set by both encoder & decoder
4124 if (c->slice) {
4125 cram_free_slice(c->slice);
4126 c->slice = NULL;
4127 }
4128 c->curr_slice = 0;
4129
4130 // Our jobs will be in order, so we free the last
4131 // container when our job has switched to a new one.
4132 if (c != lc) {
4133 if (lc) {
4134 if (fd->ctr == lc)
4135 fd->ctr = NULL;
4136 if (fd->ctr_mt == lc)
4137 fd->ctr_mt = NULL;
4138 cram_free_container(lc);
4139 }
4140 lc = c;
4141 }
4142
4143 hts_tpool_delete_result(r, 1);
4144 }
4145 if (lc) {
4146 if (fd->ctr == lc)
4147 fd->ctr = NULL;
4148 if (fd->ctr_mt == lc)
4149 fd->ctr_mt = NULL;
4150 cram_free_container(lc);
4151 }
4152
4153 return ret;
4154 }
4155
4156 // Note: called while metrics_lock is held.
4157 // Will be left in this state too, but may temporarily unlock.
reset_metrics(cram_fd * fd)4158 void reset_metrics(cram_fd *fd) {
4159 int i;
4160
4161 if (fd->pool) {
4162 // If multi-threaded we have multiple blocks being
4163 // compressed already and several on the to-do list
4164 // (fd->rqueue->pending). It's tricky to reset the
4165 // metrics exactly the correct point, so instead we
4166 // just flush the pool, reset, and then continue again.
4167
4168 // Don't bother starting a new trial before then though.
4169 for (i = 0; i < DS_END; i++) {
4170 cram_metrics *m = fd->m[i];
4171 if (!m)
4172 continue;
4173 m->next_trial = 999;
4174 }
4175
4176 pthread_mutex_unlock(&fd->metrics_lock);
4177 hts_tpool_process_flush(fd->rqueue);
4178 pthread_mutex_lock(&fd->metrics_lock);
4179 }
4180
4181 for (i = 0; i < DS_END; i++) {
4182 cram_metrics *m = fd->m[i];
4183 if (!m)
4184 continue;
4185
4186 m->trial = NTRIALS;
4187 m->next_trial = TRIAL_SPAN;
4188 m->revised_method = 0;
4189 m->unpackable = 0;
4190
4191 memset(m->sz, 0, sizeof(m->sz));
4192 }
4193 }
4194
cram_flush_container_mt(cram_fd * fd,cram_container * c)4195 int cram_flush_container_mt(cram_fd *fd, cram_container *c) {
4196 cram_job *j;
4197
4198 // At the junction of mapped to unmapped data the compression
4199 // methods may need to change due to very different statistical
4200 // properties; particularly BA if minhash sorted.
4201 //
4202 // However with threading we'll have several in-flight blocks
4203 // arriving out of order.
4204 //
4205 // So we do one trial reset of NThreads to last for NThreads
4206 // duration to get us over this transition period, followed
4207 // by another retrial of the usual ntrials & trial span.
4208 pthread_mutex_lock(&fd->metrics_lock);
4209 if (c->n_mapped < 0.3*c->curr_rec &&
4210 fd->last_mapped > 0.7*c->max_rec) {
4211 reset_metrics(fd);
4212 }
4213 fd->last_mapped = c->n_mapped * (c->max_rec+1)/(c->curr_rec+1) ;
4214 pthread_mutex_unlock(&fd->metrics_lock);
4215
4216 if (!fd->pool)
4217 return cram_flush_container(fd, c);
4218
4219 if (!(j = malloc(sizeof(*j))))
4220 return -1;
4221 j->fd = fd;
4222 j->c = c;
4223
4224 // Flush the job. Note our encoder queue may be full, so we
4225 // either have to keep trying in non-blocking mode (what we do) or
4226 // use a dedicated separate thread for draining the queue.
4227 for (;;) {
4228 errno = 0;
4229 hts_tpool_dispatch2(fd->pool, fd->rqueue, cram_flush_thread, j, 1);
4230 int pending = (errno == EAGAIN);
4231 if (cram_flush_result(fd) != 0)
4232 return -1;
4233 if (!pending)
4234 break;
4235
4236 usleep(1000);
4237 }
4238
4239 return 0;
4240 }
4241
4242 /* ----------------------------------------------------------------------
4243 * Compression headers; the first part of the container
4244 */
4245
4246 /*
4247 * Creates a new blank container compression header
4248 *
4249 * Returns header ptr on success
4250 * NULL on failure
4251 */
cram_new_compression_header(void)4252 cram_block_compression_hdr *cram_new_compression_header(void) {
4253 cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
4254 if (!hdr)
4255 return NULL;
4256
4257 if (!(hdr->TD_blk = cram_new_block(CORE, 0))) {
4258 free(hdr);
4259 return NULL;
4260 }
4261
4262 if (!(hdr->TD_hash = kh_init(m_s2i))) {
4263 cram_free_block(hdr->TD_blk);
4264 free(hdr);
4265 return NULL;
4266 }
4267
4268 if (!(hdr->TD_keys = string_pool_create(8192))) {
4269 kh_destroy(m_s2i, hdr->TD_hash);
4270 cram_free_block(hdr->TD_blk);
4271 free(hdr);
4272 return NULL;
4273 }
4274
4275 return hdr;
4276 }
4277
cram_free_compression_header(cram_block_compression_hdr * hdr)4278 void cram_free_compression_header(cram_block_compression_hdr *hdr) {
4279 int i;
4280
4281 if (hdr->landmark)
4282 free(hdr->landmark);
4283
4284 if (hdr->preservation_map)
4285 kh_destroy(map, hdr->preservation_map);
4286
4287 for (i = 0; i < CRAM_MAP_HASH; i++) {
4288 cram_map *m, *m2;
4289 for (m = hdr->rec_encoding_map[i]; m; m = m2) {
4290 m2 = m->next;
4291 if (m->codec)
4292 m->codec->free(m->codec);
4293 free(m);
4294 }
4295 }
4296
4297 for (i = 0; i < CRAM_MAP_HASH; i++) {
4298 cram_map *m, *m2;
4299 for (m = hdr->tag_encoding_map[i]; m; m = m2) {
4300 m2 = m->next;
4301 if (m->codec)
4302 m->codec->free(m->codec);
4303 free(m);
4304 }
4305 }
4306
4307 for (i = 0; i < DS_END; i++) {
4308 if (hdr->codecs[i])
4309 hdr->codecs[i]->free(hdr->codecs[i]);
4310 }
4311
4312 if (hdr->TL)
4313 free(hdr->TL);
4314 if (hdr->TD_blk)
4315 cram_free_block(hdr->TD_blk);
4316 if (hdr->TD_hash)
4317 kh_destroy(m_s2i, hdr->TD_hash);
4318 if (hdr->TD_keys)
4319 string_pool_destroy(hdr->TD_keys);
4320
4321 free(hdr);
4322 }
4323
4324
4325 /* ----------------------------------------------------------------------
4326 * Slices and slice headers
4327 */
4328
cram_free_slice_header(cram_block_slice_hdr * hdr)4329 void cram_free_slice_header(cram_block_slice_hdr *hdr) {
4330 if (!hdr)
4331 return;
4332
4333 if (hdr->block_content_ids)
4334 free(hdr->block_content_ids);
4335
4336 free(hdr);
4337
4338 return;
4339 }
4340
cram_free_slice(cram_slice * s)4341 void cram_free_slice(cram_slice *s) {
4342 if (!s)
4343 return;
4344
4345 if (s->hdr_block)
4346 cram_free_block(s->hdr_block);
4347
4348 if (s->block) {
4349 int i;
4350
4351 if (s->hdr) {
4352 for (i = 0; i < s->hdr->num_blocks; i++) {
4353 if (i > 0 && s->block[i] == s->block[0])
4354 continue;
4355 cram_free_block(s->block[i]);
4356 }
4357 }
4358 free(s->block);
4359 }
4360
4361 if (s->block_by_id)
4362 free(s->block_by_id);
4363
4364 if (s->hdr)
4365 cram_free_slice_header(s->hdr);
4366
4367 if (s->seqs_blk)
4368 cram_free_block(s->seqs_blk);
4369
4370 if (s->qual_blk)
4371 cram_free_block(s->qual_blk);
4372
4373 if (s->name_blk)
4374 cram_free_block(s->name_blk);
4375
4376 if (s->aux_blk)
4377 cram_free_block(s->aux_blk);
4378
4379 if (s->base_blk)
4380 cram_free_block(s->base_blk);
4381
4382 if (s->soft_blk)
4383 cram_free_block(s->soft_blk);
4384
4385 if (s->cigar)
4386 free(s->cigar);
4387
4388 if (s->crecs)
4389 free(s->crecs);
4390
4391 if (s->features)
4392 free(s->features);
4393
4394 if (s->TN)
4395 free(s->TN);
4396
4397 if (s->pair_keys)
4398 string_pool_destroy(s->pair_keys);
4399
4400 if (s->pair[0])
4401 kh_destroy(m_s2i, s->pair[0]);
4402 if (s->pair[1])
4403 kh_destroy(m_s2i, s->pair[1]);
4404
4405 if (s->aux_block)
4406 free(s->aux_block);
4407
4408 free(s);
4409 }
4410
4411 /*
4412 * Creates a new empty slice in memory, for subsequent writing to
4413 * disk.
4414 *
4415 * Returns cram_slice ptr on success
4416 * NULL on failure
4417 */
cram_new_slice(enum cram_content_type type,int nrecs)4418 cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) {
4419 cram_slice *s = calloc(1, sizeof(*s));
4420 if (!s)
4421 return NULL;
4422
4423 if (!(s->hdr = (cram_block_slice_hdr *)calloc(1, sizeof(*s->hdr))))
4424 goto err;
4425 s->hdr->content_type = type;
4426
4427 s->hdr_block = NULL;
4428 s->block = NULL;
4429 s->block_by_id = NULL;
4430 s->last_apos = 0;
4431 if (!(s->crecs = malloc(nrecs * sizeof(cram_record)))) goto err;
4432 s->cigar_alloc = 1024;
4433 if (!(s->cigar = malloc(s->cigar_alloc * sizeof(*s->cigar)))) goto err;
4434 s->ncigar = 0;
4435
4436 if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err;
4437 if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS))) goto err;
4438 if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN))) goto err;
4439 if (!(s->aux_blk = cram_new_block(EXTERNAL, DS_aux))) goto err;
4440 if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN))) goto err;
4441 if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC))) goto err;
4442
4443 s->features = NULL;
4444 s->nfeatures = s->afeatures = 0;
4445
4446 #ifndef TN_external
4447 s->TN = NULL;
4448 s->nTN = s->aTN = 0;
4449 #endif
4450
4451 // Volatile keys as we do realloc in dstring
4452 if (!(s->pair_keys = string_pool_create(8192))) goto err;
4453 if (!(s->pair[0] = kh_init(m_s2i))) goto err;
4454 if (!(s->pair[1] = kh_init(m_s2i))) goto err;
4455
4456 #ifdef BA_external
4457 s->BA_len = 0;
4458 #endif
4459
4460 return s;
4461
4462 err:
4463 if (s)
4464 cram_free_slice(s);
4465
4466 return NULL;
4467 }
4468
4469 /*
4470 * Loads an entire slice.
4471 * FIXME: In 1.0 the native unit of slices within CRAM is broken
4472 * as slices contain references to objects in other slices.
4473 * To work around this while keeping the slice oriented outer loop
4474 * we read all slices and stitch them together into a fake large
4475 * slice instead.
4476 *
4477 * Returns cram_slice ptr on success
4478 * NULL on failure
4479 */
cram_read_slice(cram_fd * fd)4480 cram_slice *cram_read_slice(cram_fd *fd) {
4481 cram_block *b = cram_read_block(fd);
4482 cram_slice *s = calloc(1, sizeof(*s));
4483 int i, n, max_id, min_id;
4484
4485 if (!b || !s)
4486 goto err;
4487
4488 s->hdr_block = b;
4489 switch (b->content_type) {
4490 case MAPPED_SLICE:
4491 case UNMAPPED_SLICE:
4492 if (!(s->hdr = cram_decode_slice_header(fd, b)))
4493 goto err;
4494 break;
4495
4496 default:
4497 hts_log_error("Unexpected block of type %s",
4498 cram_content_type2str(b->content_type));
4499 goto err;
4500 }
4501
4502 if (s->hdr->num_blocks < 1) {
4503 hts_log_error("Slice does not include any data blocks");
4504 goto err;
4505 }
4506
4507 s->block = calloc(n = s->hdr->num_blocks, sizeof(*s->block));
4508 if (!s->block)
4509 goto err;
4510
4511 for (max_id = i = 0, min_id = INT_MAX; i < n; i++) {
4512 if (!(s->block[i] = cram_read_block(fd)))
4513 goto err;
4514
4515 if (s->block[i]->content_type == EXTERNAL) {
4516 if (max_id < s->block[i]->content_id)
4517 max_id = s->block[i]->content_id;
4518 if (min_id > s->block[i]->content_id)
4519 min_id = s->block[i]->content_id;
4520 }
4521 }
4522
4523 if (!(s->block_by_id = calloc(512, sizeof(s->block[0]))))
4524 goto err;
4525
4526 for (i = 0; i < n; i++) {
4527 if (s->block[i]->content_type != EXTERNAL)
4528 continue;
4529 uint32_t v = s->block[i]->content_id;
4530 if (v >= 256)
4531 v = 256 + v % 251;
4532 s->block_by_id[v] = s->block[i];
4533 }
4534
4535 /* Initialise encoding/decoding tables */
4536 s->cigar_alloc = 1024;
4537 if (!(s->cigar = malloc(s->cigar_alloc * sizeof(*s->cigar)))) goto err;
4538 s->ncigar = 0;
4539
4540 if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err;
4541 if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS))) goto err;
4542 if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN))) goto err;
4543 if (!(s->aux_blk = cram_new_block(EXTERNAL, DS_aux))) goto err;
4544 if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN))) goto err;
4545 if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC))) goto err;
4546
4547 s->crecs = NULL;
4548
4549 s->last_apos = s->hdr->ref_seq_start;
4550 s->decode_md = fd->decode_md;
4551
4552 return s;
4553
4554 err:
4555 if (b)
4556 cram_free_block(b);
4557 if (s) {
4558 s->hdr_block = NULL;
4559 cram_free_slice(s);
4560 }
4561 return NULL;
4562 }
4563
4564
4565 /* ----------------------------------------------------------------------
4566 * CRAM file definition (header)
4567 */
4568
4569 /*
4570 * Reads a CRAM file definition structure.
4571 * Returns file_def ptr on success
4572 * NULL on failure
4573 */
cram_read_file_def(cram_fd * fd)4574 cram_file_def *cram_read_file_def(cram_fd *fd) {
4575 cram_file_def *def = malloc(sizeof(*def));
4576 if (!def)
4577 return NULL;
4578
4579 if (26 != hread(fd->fp, &def->magic[0], 26)) {
4580 free(def);
4581 return NULL;
4582 }
4583
4584 if (memcmp(def->magic, "CRAM", 4) != 0) {
4585 free(def);
4586 return NULL;
4587 }
4588
4589 if (def->major_version > 4) {
4590 hts_log_error("CRAM version number mismatch. Expected 1.x, 2.x, 3.x or 4.x, got %d.%d",
4591 def->major_version, def->minor_version);
4592 free(def);
4593 return NULL;
4594 }
4595
4596 fd->first_container += 26;
4597 fd->curr_position = fd->first_container;
4598 fd->last_slice = 0;
4599
4600 return def;
4601 }
4602
4603 /*
4604 * Writes a cram_file_def structure to cram_fd.
4605 * Returns 0 on success
4606 * -1 on failure
4607 */
cram_write_file_def(cram_fd * fd,cram_file_def * def)4608 int cram_write_file_def(cram_fd *fd, cram_file_def *def) {
4609 return (hwrite(fd->fp, &def->magic[0], 26) == 26) ? 0 : -1;
4610 }
4611
cram_free_file_def(cram_file_def * def)4612 void cram_free_file_def(cram_file_def *def) {
4613 if (def) free(def);
4614 }
4615
4616 /* ----------------------------------------------------------------------
4617 * SAM header I/O
4618 */
4619
4620
4621 /*
4622 * Reads the SAM header from the first CRAM data block.
4623 * Also performs minimal parsing to extract read-group
4624 * and sample information.
4625
4626 * Returns SAM hdr ptr on success
4627 * NULL on failure
4628 */
cram_read_SAM_hdr(cram_fd * fd)4629 sam_hdr_t *cram_read_SAM_hdr(cram_fd *fd) {
4630 int32_t header_len;
4631 char *header;
4632 sam_hdr_t *hdr;
4633
4634 /* 1.1 onwards stores the header in the first block of a container */
4635 if (CRAM_MAJOR_VERS(fd->version) == 1) {
4636 /* Length */
4637 if (-1 == int32_decode(fd, &header_len))
4638 return NULL;
4639
4640 /* Alloc and read */
4641 if (header_len < 0 || NULL == (header = malloc((size_t) header_len+1)))
4642 return NULL;
4643
4644 if (header_len != hread(fd->fp, header, header_len)) {
4645 free(header);
4646 return NULL;
4647 }
4648 header[header_len] = '\0';
4649
4650 fd->first_container += 4 + header_len;
4651 } else {
4652 cram_container *c = cram_read_container(fd);
4653 cram_block *b;
4654 int i;
4655 int64_t len;
4656
4657 if (!c)
4658 return NULL;
4659
4660 fd->first_container += c->length + c->offset;
4661 fd->curr_position = fd->first_container;
4662
4663 if (c->num_blocks < 1) {
4664 cram_free_container(c);
4665 return NULL;
4666 }
4667
4668 if (!(b = cram_read_block(fd))) {
4669 cram_free_container(c);
4670 return NULL;
4671 }
4672 if (cram_uncompress_block(b) != 0) {
4673 cram_free_container(c);
4674 cram_free_block(b);
4675 return NULL;
4676 }
4677
4678 len = b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
4679 fd->vv.varint_size(b->content_id) +
4680 fd->vv.varint_size(b->uncomp_size) +
4681 fd->vv.varint_size(b->comp_size);
4682
4683 /* Extract header from 1st block */
4684 if (-1 == int32_get_blk(b, &header_len) ||
4685 header_len < 0 || /* Spec. says signed... why? */
4686 b->uncomp_size - 4 < header_len) {
4687 cram_free_container(c);
4688 cram_free_block(b);
4689 return NULL;
4690 }
4691 if (NULL == (header = malloc((size_t) header_len+1))) {
4692 cram_free_container(c);
4693 cram_free_block(b);
4694 return NULL;
4695 }
4696 memcpy(header, BLOCK_END(b), header_len);
4697 header[header_len] = '\0';
4698 cram_free_block(b);
4699
4700 /* Consume any remaining blocks */
4701 for (i = 1; i < c->num_blocks; i++) {
4702 if (!(b = cram_read_block(fd))) {
4703 cram_free_container(c);
4704 free(header);
4705 return NULL;
4706 }
4707 len += b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
4708 fd->vv.varint_size(b->content_id) +
4709 fd->vv.varint_size(b->uncomp_size) +
4710 fd->vv.varint_size(b->comp_size);
4711 cram_free_block(b);
4712 }
4713
4714 if (c->length > 0 && len > 0 && c->length > len) {
4715 // Consume padding
4716 char *pads = malloc(c->length - len);
4717 if (!pads) {
4718 cram_free_container(c);
4719 free(header);
4720 return NULL;
4721 }
4722
4723 if (c->length - len != hread(fd->fp, pads, c->length - len)) {
4724 cram_free_container(c);
4725 free(header);
4726 free(pads);
4727 return NULL;
4728 }
4729 free(pads);
4730 }
4731
4732 cram_free_container(c);
4733 }
4734
4735 /* Parse */
4736 hdr = sam_hdr_init();
4737 if (!hdr) {
4738 free(header);
4739 return NULL;
4740 }
4741
4742 if (-1 == sam_hdr_add_lines(hdr, header, header_len)) {
4743 free(header);
4744 sam_hdr_destroy(hdr);
4745 return NULL;
4746 }
4747
4748 hdr->l_text = header_len;
4749 hdr->text = header;
4750
4751 return hdr;
4752
4753 }
4754
4755 /*
4756 * Converts 'in' to a full pathname to store in out.
4757 * Out must be at least PATH_MAX bytes long.
4758 */
full_path(char * out,char * in)4759 static void full_path(char *out, char *in) {
4760 size_t in_l = strlen(in);
4761 if (hisremote(in)) {
4762 if (in_l > PATH_MAX) {
4763 hts_log_error("Reference path is longer than %d", PATH_MAX);
4764 return;
4765 }
4766 strncpy(out, in, PATH_MAX-1);
4767 out[PATH_MAX-1] = 0;
4768 return;
4769 }
4770 if (*in == '/' ||
4771 // Windows paths
4772 (in_l > 3 && toupper_c(*in) >= 'A' && toupper_c(*in) <= 'Z' &&
4773 in[1] == ':' && (in[2] == '/' || in[2] == '\\'))) {
4774 strncpy(out, in, PATH_MAX-1);
4775 out[PATH_MAX-1] = 0;
4776 } else {
4777 int len;
4778
4779 // unable to get dir or out+in is too long
4780 if (!getcwd(out, PATH_MAX) ||
4781 (len = strlen(out))+1+strlen(in) >= PATH_MAX) {
4782 strncpy(out, in, PATH_MAX-1);
4783 out[PATH_MAX-1] = 0;
4784 return;
4785 }
4786
4787 sprintf(out+len, "/%.*s", PATH_MAX - 2 - len, in);
4788
4789 // FIXME: cope with `pwd`/../../../foo.fa ?
4790 }
4791 }
4792
4793 /*
4794 * Writes a CRAM SAM header.
4795 * Returns 0 on success
4796 * -1 on failure
4797 */
cram_write_SAM_hdr(cram_fd * fd,sam_hdr_t * hdr)4798 int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) {
4799 size_t header_len;
4800 int blank_block = (CRAM_MAJOR_VERS(fd->version) >= 3);
4801
4802 /* Write CRAM MAGIC if not yet written. */
4803 if (fd->file_def->major_version == 0) {
4804 fd->file_def->major_version = CRAM_MAJOR_VERS(fd->version);
4805 fd->file_def->minor_version = CRAM_MINOR_VERS(fd->version);
4806 if (0 != cram_write_file_def(fd, fd->file_def))
4807 return -1;
4808 }
4809
4810 /* 1.0 requires an UNKNOWN read-group */
4811 if (CRAM_MAJOR_VERS(fd->version) == 1) {
4812 if (!sam_hrecs_find_rg(hdr->hrecs, "UNKNOWN"))
4813 if (sam_hdr_add_line(hdr, "RG",
4814 "ID", "UNKNOWN", "SM", "UNKNOWN", NULL))
4815 return -1;
4816 }
4817
4818 /* Fix M5 strings */
4819 if (fd->refs && !fd->no_ref) {
4820 int i;
4821 for (i = 0; i < hdr->hrecs->nref; i++) {
4822 sam_hrec_type_t *ty;
4823 char *ref;
4824
4825 if (!(ty = sam_hrecs_find_type_id(hdr->hrecs, "SQ", "SN", hdr->hrecs->ref[i].name)))
4826 return -1;
4827
4828 if (!sam_hrecs_find_key(ty, "M5", NULL)) {
4829 char unsigned buf[16];
4830 char buf2[33];
4831 int rlen;
4832 hts_md5_context *md5;
4833
4834 if (!fd->refs ||
4835 !fd->refs->ref_id ||
4836 !fd->refs->ref_id[i]) {
4837 return -1;
4838 }
4839 rlen = fd->refs->ref_id[i]->length;
4840 if (!(md5 = hts_md5_init()))
4841 return -1;
4842 ref = cram_get_ref(fd, i, 1, rlen);
4843 if (NULL == ref) return -1;
4844 rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */
4845 hts_md5_update(md5, ref, rlen);
4846 hts_md5_final(buf, md5);
4847 hts_md5_destroy(md5);
4848 cram_ref_decr(fd->refs, i);
4849
4850 hts_md5_hex(buf2, buf);
4851 if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "M5", buf2, NULL))
4852 return -1;
4853 }
4854
4855 if (fd->ref_fn) {
4856 char ref_fn[PATH_MAX];
4857 full_path(ref_fn, fd->ref_fn);
4858 if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "UR", ref_fn, NULL))
4859 return -1;
4860 }
4861 }
4862 }
4863
4864 /* Length */
4865 header_len = sam_hdr_length(hdr);
4866 if (header_len > INT32_MAX) {
4867 hts_log_error("Header is too long for CRAM format");
4868 return -1;
4869 }
4870 if (CRAM_MAJOR_VERS(fd->version) == 1) {
4871 if (-1 == int32_encode(fd, header_len))
4872 return -1;
4873
4874 /* Text data */
4875 if (header_len != hwrite(fd->fp, sam_hdr_str(hdr), header_len))
4876 return -1;
4877 } else {
4878 /* Create block(s) inside a container */
4879 cram_block *b = cram_new_block(FILE_HEADER, 0);
4880 cram_container *c = cram_new_container(0, 0);
4881 int padded_length;
4882 char *pads;
4883 int is_cram_3 = (CRAM_MAJOR_VERS(fd->version) >= 3);
4884
4885 if (!b || !c) {
4886 if (b) cram_free_block(b);
4887 if (c) cram_free_container(c);
4888 return -1;
4889 }
4890
4891 if (int32_put_blk(b, header_len) < 0)
4892 return -1;
4893 if (header_len)
4894 BLOCK_APPEND(b, sam_hdr_str(hdr), header_len);
4895 BLOCK_UPLEN(b);
4896
4897 // Compress header block if V3.0 and above
4898 if (CRAM_MAJOR_VERS(fd->version) >= 3)
4899 if (cram_compress_block(fd, b, NULL, -1, -1) < 0)
4900 return -1;
4901
4902 if (blank_block) {
4903 c->length = b->comp_size + 2 + 4*is_cram_3 +
4904 fd->vv.varint_size(b->content_id) +
4905 fd->vv.varint_size(b->uncomp_size) +
4906 fd->vv.varint_size(b->comp_size);
4907
4908 c->num_blocks = 2;
4909 c->num_landmarks = 2;
4910 if (!(c->landmark = malloc(2*sizeof(*c->landmark)))) {
4911 cram_free_block(b);
4912 cram_free_container(c);
4913 return -1;
4914 }
4915 c->landmark[0] = 0;
4916 c->landmark[1] = c->length;
4917
4918 // Plus extra storage for uncompressed secondary blank block
4919 padded_length = MIN(c->length*.5, 10000);
4920 c->length += padded_length + 2 + 4*is_cram_3 +
4921 fd->vv.varint_size(b->content_id) +
4922 fd->vv.varint_size(padded_length)*2;
4923 } else {
4924 // Pad the block instead.
4925 c->num_blocks = 1;
4926 c->num_landmarks = 1;
4927 if (!(c->landmark = malloc(sizeof(*c->landmark))))
4928 return -1;
4929 c->landmark[0] = 0;
4930
4931 padded_length = MAX(c->length*1.5, 10000) - c->length;
4932
4933 c->length = b->comp_size + padded_length +
4934 2 + 4*is_cram_3 +
4935 fd->vv.varint_size(b->content_id) +
4936 fd->vv.varint_size(b->uncomp_size) +
4937 fd->vv.varint_size(b->comp_size);
4938
4939 if (NULL == (pads = calloc(1, padded_length))) {
4940 cram_free_block(b);
4941 cram_free_container(c);
4942 return -1;
4943 }
4944 BLOCK_APPEND(b, pads, padded_length);
4945 BLOCK_UPLEN(b);
4946 free(pads);
4947 }
4948
4949 if (-1 == cram_write_container(fd, c)) {
4950 cram_free_block(b);
4951 cram_free_container(c);
4952 return -1;
4953 }
4954
4955 if (-1 == cram_write_block(fd, b)) {
4956 cram_free_block(b);
4957 cram_free_container(c);
4958 return -1;
4959 }
4960
4961 if (blank_block) {
4962 BLOCK_RESIZE(b, padded_length);
4963 memset(BLOCK_DATA(b), 0, padded_length);
4964 BLOCK_SIZE(b) = padded_length;
4965 BLOCK_UPLEN(b);
4966 b->method = RAW;
4967 if (-1 == cram_write_block(fd, b)) {
4968 cram_free_block(b);
4969 cram_free_container(c);
4970 return -1;
4971 }
4972 }
4973
4974 cram_free_block(b);
4975 cram_free_container(c);
4976 }
4977
4978 if (-1 == refs_from_header(fd))
4979 return -1;
4980 if (-1 == refs2id(fd->refs, fd->header))
4981 return -1;
4982
4983 if (0 != hflush(fd->fp))
4984 return -1;
4985
4986 RP("=== Finishing saving header ===\n");
4987
4988 return 0;
4989
4990 block_err:
4991 return -1;
4992 }
4993
4994 /* ----------------------------------------------------------------------
4995 * The top-level cram opening, closing and option handling
4996 */
4997
4998 /*
4999 * Sets CRAM variable sized integer decode function tables.
5000 * CRAM 1, 2, and 3.x all used ITF8 for uint32 and UTF8 for uint64.
5001 * CRAM 4.x uses the same encoding mechanism for 32-bit and 64-bit
5002 * (or anything inbetween), but also now supports signed values.
5003 *
5004 * Version is the CRAM major version number.
5005 * vv is the vector table (probably &cram_fd->vv)
5006 */
cram_init_varint(varint_vec * vv,int version)5007 static void cram_init_varint(varint_vec *vv, int version) {
5008 if (version >= 4) {
5009 vv->varint_get32 = uint7_get_32; // FIXME: varint.h API should be size agnostic
5010 vv->varint_get32s = sint7_get_32;
5011 vv->varint_get64 = uint7_get_64;
5012 vv->varint_get64s = sint7_get_64;
5013 vv->varint_put32 = uint7_put_32;
5014 vv->varint_put32s = sint7_put_32;
5015 vv->varint_put64 = uint7_put_64;
5016 vv->varint_put64s = sint7_put_64;
5017 vv->varint_put32_blk = uint7_put_blk_32;
5018 vv->varint_put32s_blk = sint7_put_blk_32;
5019 vv->varint_put64_blk = uint7_put_blk_64;
5020 vv->varint_put64s_blk = sint7_put_blk_64;
5021 vv->varint_size = uint7_size;
5022 vv->varint_decode32_crc = uint7_decode_crc32;
5023 vv->varint_decode32s_crc = sint7_decode_crc32;
5024 vv->varint_decode64_crc = uint7_decode_crc64;
5025 } else {
5026 vv->varint_get32 = safe_itf8_get;
5027 vv->varint_get32s = safe_itf8_get;
5028 vv->varint_get64 = safe_ltf8_get;
5029 vv->varint_get64s = safe_ltf8_get;
5030 vv->varint_put32 = safe_itf8_put;
5031 vv->varint_put32s = safe_itf8_put;
5032 vv->varint_put64 = safe_ltf8_put;
5033 vv->varint_put64s = safe_ltf8_put;
5034 vv->varint_put32_blk = itf8_put_blk;
5035 vv->varint_put32s_blk = itf8_put_blk;
5036 vv->varint_put64_blk = ltf8_put_blk;
5037 vv->varint_put64s_blk = ltf8_put_blk;
5038 vv->varint_size = itf8_size;
5039 vv->varint_decode32_crc = itf8_decode_crc;
5040 vv->varint_decode32s_crc = itf8_decode_crc;
5041 vv->varint_decode64_crc = ltf8_decode_crc;
5042 }
5043 }
5044
5045 /*
5046 * Initialises the lookup tables. These could be global statics, but they're
5047 * clumsy to setup in a multi-threaded environment unless we generate
5048 * verbatim code and include that.
5049 */
cram_init_tables(cram_fd * fd)5050 static void cram_init_tables(cram_fd *fd) {
5051 int i;
5052
5053 memset(fd->L1, 4, 256);
5054 fd->L1['A'] = 0; fd->L1['a'] = 0;
5055 fd->L1['C'] = 1; fd->L1['c'] = 1;
5056 fd->L1['G'] = 2; fd->L1['g'] = 2;
5057 fd->L1['T'] = 3; fd->L1['t'] = 3;
5058
5059 memset(fd->L2, 5, 256);
5060 fd->L2['A'] = 0; fd->L2['a'] = 0;
5061 fd->L2['C'] = 1; fd->L2['c'] = 1;
5062 fd->L2['G'] = 2; fd->L2['g'] = 2;
5063 fd->L2['T'] = 3; fd->L2['t'] = 3;
5064 fd->L2['N'] = 4; fd->L2['n'] = 4;
5065
5066 if (CRAM_MAJOR_VERS(fd->version) == 1) {
5067 for (i = 0; i < 0x200; i++) {
5068 int f = 0;
5069
5070 if (i & CRAM_FPAIRED) f |= BAM_FPAIRED;
5071 if (i & CRAM_FPROPER_PAIR) f |= BAM_FPROPER_PAIR;
5072 if (i & CRAM_FUNMAP) f |= BAM_FUNMAP;
5073 if (i & CRAM_FREVERSE) f |= BAM_FREVERSE;
5074 if (i & CRAM_FREAD1) f |= BAM_FREAD1;
5075 if (i & CRAM_FREAD2) f |= BAM_FREAD2;
5076 if (i & CRAM_FSECONDARY) f |= BAM_FSECONDARY;
5077 if (i & CRAM_FQCFAIL) f |= BAM_FQCFAIL;
5078 if (i & CRAM_FDUP) f |= BAM_FDUP;
5079
5080 fd->bam_flag_swap[i] = f;
5081 }
5082
5083 for (i = 0; i < 0x1000; i++) {
5084 int g = 0;
5085
5086 if (i & BAM_FPAIRED) g |= CRAM_FPAIRED;
5087 if (i & BAM_FPROPER_PAIR) g |= CRAM_FPROPER_PAIR;
5088 if (i & BAM_FUNMAP) g |= CRAM_FUNMAP;
5089 if (i & BAM_FREVERSE) g |= CRAM_FREVERSE;
5090 if (i & BAM_FREAD1) g |= CRAM_FREAD1;
5091 if (i & BAM_FREAD2) g |= CRAM_FREAD2;
5092 if (i & BAM_FSECONDARY) g |= CRAM_FSECONDARY;
5093 if (i & BAM_FQCFAIL) g |= CRAM_FQCFAIL;
5094 if (i & BAM_FDUP) g |= CRAM_FDUP;
5095
5096 fd->cram_flag_swap[i] = g;
5097 }
5098 } else {
5099 /* NOP */
5100 for (i = 0; i < 0x1000; i++)
5101 fd->bam_flag_swap[i] = i;
5102 for (i = 0; i < 0x1000; i++)
5103 fd->cram_flag_swap[i] = i;
5104 }
5105
5106 memset(fd->cram_sub_matrix, 4, 32*32);
5107 for (i = 0; i < 32; i++) {
5108 fd->cram_sub_matrix[i]['A'&0x1f]=0;
5109 fd->cram_sub_matrix[i]['C'&0x1f]=1;
5110 fd->cram_sub_matrix[i]['G'&0x1f]=2;
5111 fd->cram_sub_matrix[i]['T'&0x1f]=3;
5112 fd->cram_sub_matrix[i]['N'&0x1f]=4;
5113 }
5114 for (i = 0; i < 20; i+=4) {
5115 int j;
5116 for (j = 0; j < 20; j++) {
5117 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5118 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5119 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5120 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5121 }
5122 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+0]&0x1f]=0;
5123 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+1]&0x1f]=1;
5124 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+2]&0x1f]=2;
5125 fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+3]&0x1f]=3;
5126 }
5127
5128 cram_init_varint(&fd->vv, CRAM_MAJOR_VERS(fd->version));
5129 }
5130
5131 // Default version numbers for CRAM
5132 static int major_version = 3;
5133 static int minor_version = 0;
5134
5135 /*
5136 * Opens a CRAM file for read (mode "rb") or write ("wb").
5137 * The filename may be "-" to indicate stdin or stdout.
5138 *
5139 * Returns file handle on success
5140 * NULL on failure.
5141 */
cram_open(const char * filename,const char * mode)5142 cram_fd *cram_open(const char *filename, const char *mode) {
5143 hFILE *fp;
5144 cram_fd *fd;
5145 char fmode[3]= { mode[0], '\0', '\0' };
5146
5147 if (strlen(mode) > 1 && (mode[1] == 'b' || mode[1] == 'c')) {
5148 fmode[1] = 'b';
5149 }
5150
5151 fp = hopen(filename, fmode);
5152 if (!fp)
5153 return NULL;
5154
5155 fd = cram_dopen(fp, filename, mode);
5156 if (!fd)
5157 hclose_abruptly(fp);
5158
5159 return fd;
5160 }
5161
5162 /* Opens an existing stream for reading or writing.
5163 *
5164 * Returns file handle on success;
5165 * NULL on failure.
5166 */
cram_dopen(hFILE * fp,const char * filename,const char * mode)5167 cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) {
5168 int i;
5169 char *cp;
5170 cram_fd *fd = calloc(1, sizeof(*fd));
5171 if (!fd)
5172 return NULL;
5173
5174 fd->level = CRAM_DEFAULT_LEVEL;
5175 for (i = 0; mode[i]; i++) {
5176 if (mode[i] >= '0' && mode[i] <= '9') {
5177 fd->level = mode[i] - '0';
5178 break;
5179 }
5180 }
5181
5182 fd->fp = fp;
5183 fd->mode = *mode;
5184 fd->first_container = 0;
5185 fd->curr_position = 0;
5186
5187 if (fd->mode == 'r') {
5188 /* Reader */
5189
5190 if (!(fd->file_def = cram_read_file_def(fd)))
5191 goto err;
5192
5193 fd->version = fd->file_def->major_version * 256 +
5194 fd->file_def->minor_version;
5195
5196 cram_init_tables(fd);
5197
5198 if (!(fd->header = cram_read_SAM_hdr(fd))) {
5199 cram_free_file_def(fd->file_def);
5200 goto err;
5201 }
5202
5203 } else {
5204 /* Writer */
5205 cram_file_def *def = calloc(1, sizeof(*def));
5206 if (!def)
5207 return NULL;
5208
5209 fd->file_def = def;
5210
5211 def->magic[0] = 'C';
5212 def->magic[1] = 'R';
5213 def->magic[2] = 'A';
5214 def->magic[3] = 'M';
5215 def->major_version = 0; // Indicator to write file def later.
5216 def->minor_version = 0;
5217 memset(def->file_id, 0, 20);
5218 strncpy(def->file_id, filename, 20);
5219
5220 fd->version = major_version * 256 + minor_version;
5221 cram_init_tables(fd);
5222
5223 /* SAM header written later along with this file_def */
5224 }
5225
5226 fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename);
5227 if (!fd->prefix)
5228 goto err;
5229 fd->first_base = fd->last_base = -1;
5230 fd->record_counter = 0;
5231
5232 fd->ctr = NULL;
5233 fd->ctr_mt = NULL;
5234 fd->refs = refs_create();
5235 if (!fd->refs)
5236 goto err;
5237 fd->ref_id = -2;
5238 fd->ref = NULL;
5239
5240 fd->decode_md = 0;
5241 fd->seqs_per_slice = SEQS_PER_SLICE;
5242 fd->bases_per_slice = BASES_PER_SLICE;
5243 fd->slices_per_container = SLICE_PER_CNT;
5244 fd->embed_ref = 0;
5245 fd->no_ref = 0;
5246 fd->ap_delta = 0;
5247 fd->ignore_md5 = 0;
5248 fd->lossy_read_names = 0;
5249 fd->use_bz2 = 0;
5250 fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3);
5251 fd->use_tok = (CRAM_MAJOR_VERS(fd->version) >= 3) && (CRAM_MINOR_VERS(fd->version) >= 1);
5252 fd->use_lzma = 0;
5253 fd->multi_seq = -1;
5254 fd->multi_seq_user = -1;
5255 fd->unsorted = 0;
5256 fd->shared_ref = 0;
5257 fd->store_md = 0;
5258 fd->store_nm = 0;
5259 fd->last_RI_count = 0;
5260
5261 fd->index = NULL;
5262 fd->own_pool = 0;
5263 fd->pool = NULL;
5264 fd->rqueue = NULL;
5265 fd->job_pending = NULL;
5266 fd->ooc = 0;
5267 fd->required_fields = INT_MAX;
5268
5269 for (i = 0; i < DS_END; i++) {
5270 fd->m[i] = cram_new_metrics();
5271 if (!fd->m[i])
5272 goto err;
5273 }
5274
5275 if (!(fd->tags_used = kh_init(m_metrics)))
5276 goto err;
5277
5278 fd->range.refid = -2; // no ref.
5279 fd->eof = 1; // See samtools issue #150
5280 fd->ref_fn = NULL;
5281
5282 fd->bl = NULL;
5283
5284 /* Initialise dummy refs from the @SQ headers */
5285 if (-1 == refs_from_header(fd))
5286 goto err;
5287
5288 return fd;
5289
5290 err:
5291 if (fd)
5292 free(fd);
5293
5294 return NULL;
5295 }
5296
5297 /*
5298 * Seek within a CRAM file.
5299 *
5300 * Returns 0 on success
5301 * -1 on failure
5302 */
cram_seek(cram_fd * fd,off_t offset,int whence)5303 int cram_seek(cram_fd *fd, off_t offset, int whence) {
5304 char buf[65536];
5305
5306 fd->ooc = 0;
5307
5308 cram_drain_rqueue(fd);
5309
5310 if (hseek(fd->fp, offset, whence) >= 0) {
5311 return 0;
5312 }
5313
5314 if (!(whence == SEEK_CUR && offset >= 0))
5315 return -1;
5316
5317 /* Couldn't fseek, but we're in SEEK_CUR mode so read instead */
5318 while (offset > 0) {
5319 int len = MIN(65536, offset);
5320 if (len != hread(fd->fp, buf, len))
5321 return -1;
5322 offset -= len;
5323 }
5324
5325 return 0;
5326 }
5327
5328 /*
5329 * Flushes a CRAM file.
5330 * Useful for when writing to stdout without wishing to close the stream.
5331 *
5332 * Returns 0 on success
5333 * -1 on failure
5334 */
cram_flush(cram_fd * fd)5335 int cram_flush(cram_fd *fd) {
5336 if (!fd)
5337 return -1;
5338
5339 if (fd->mode == 'w' && fd->ctr) {
5340 if(fd->ctr->slice)
5341 cram_update_curr_slice(fd->ctr);
5342
5343 if (-1 == cram_flush_container_mt(fd, fd->ctr))
5344 return -1;
5345 }
5346
5347 return 0;
5348 }
5349
5350 /*
5351 * Writes an EOF block to a CRAM file.
5352 *
5353 * Returns 0 on success
5354 * -1 on failure
5355 */
cram_write_eof_block(cram_fd * fd)5356 int cram_write_eof_block(cram_fd *fd) {
5357 // EOF block is a container with special values to aid detection
5358 if (CRAM_MAJOR_VERS(fd->version) >= 2) {
5359 // Empty container with
5360 // ref_seq_id -1
5361 // start pos 0x454f46 ("EOF")
5362 // span 0
5363 // nrec 0
5364 // counter 0
5365 // nbases 0
5366 // 1 block (landmark 0)
5367 // (CRC32)
5368 cram_container c;
5369 memset(&c, 0, sizeof(c));
5370 c.ref_seq_id = -1;
5371 c.ref_seq_start = 0x454f46; // "EOF"
5372 c.ref_seq_span = 0;
5373 c.record_counter = 0;
5374 c.num_bases = 0;
5375 c.num_blocks = 1;
5376 int32_t land[1] = {0};
5377 c.landmark = land;
5378
5379 // An empty compression header block with
5380 // method raw (0)
5381 // type comp header (1)
5382 // content id 0
5383 // block contents size 6
5384 // raw size 6
5385 // empty preservation map (01 00)
5386 // empty data series map (01 00)
5387 // empty tag map (01 00)
5388 // block CRC
5389 cram_block_compression_hdr ch;
5390 memset(&ch, 0, sizeof(ch));
5391 c.comp_hdr_block = cram_encode_compression_header(fd, &c, &ch);
5392
5393 c.length = c.comp_hdr_block->byte // Landmark[0]
5394 + 5 // block struct
5395 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3); // CRC
5396 if (cram_write_container(fd, &c) < 0 ||
5397 cram_write_block(fd, c.comp_hdr_block) < 0) {
5398 cram_close(fd);
5399 cram_free_block(c.comp_hdr_block);
5400 return -1;
5401 }
5402 if (ch.preservation_map)
5403 kh_destroy(map, ch.preservation_map);
5404 cram_free_block(c.comp_hdr_block);
5405
5406 // V2.1 bytes
5407 // 0b 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id
5408 // e0 45 4f 46 00 00 00 // Cont HDR: pos, span, nrec, counter
5409 // 00 01 00 // Cont HDR: nbase, nblk, landmark
5410 // 00 01 00 06 06 // Comp.HDR blk
5411 // 01 00 01 00 01 00 // Comp.HDR blk
5412
5413 // V3.0 bytes:
5414 // 0f 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id
5415 // e0 45 4f 46 00 00 00 // Cont HDR: pos, span, nrec, counter
5416 // 00 01 00 // Cont HDR: nbase, nblk, landmark
5417 // 05 bd d9 4f // CRC32
5418 // 00 01 00 06 06 // Comp.HDR blk
5419 // 01 00 01 00 01 00 // Comp.HDR blk
5420 // ee 63 01 4b // CRC32
5421
5422 // V4.0 bytes:
5423 // 0f 00 00 00 8f ff ff ff // Cont HDR: size, ref seq id
5424 // 82 95 9e 46 00 00 00 // Cont HDR: pos, span, nrec, counter
5425 // 00 01 00 // Cont HDR: nbase, nblk, landmark
5426 // ac d6 05 bc // CRC32
5427 // 00 01 00 06 06 // Comp.HDR blk
5428 // 01 00 01 00 01 00 // Comp.HDR blk
5429 // ee 63 01 4b // CRC32
5430 }
5431
5432 return 0;
5433 }
5434 /*
5435 * Closes a CRAM file.
5436 * Returns 0 on success
5437 * -1 on failure
5438 */
cram_close(cram_fd * fd)5439 int cram_close(cram_fd *fd) {
5440 spare_bams *bl, *next;
5441 int i;
5442
5443 if (!fd)
5444 return -1;
5445
5446 if (fd->mode == 'w' && fd->ctr) {
5447 if(fd->ctr->slice)
5448 cram_update_curr_slice(fd->ctr);
5449
5450 if (-1 == cram_flush_container_mt(fd, fd->ctr))
5451 return -1;
5452 }
5453
5454 if (fd->mode != 'w')
5455 cram_drain_rqueue(fd);
5456
5457 if (fd->pool && fd->eof >= 0 && fd->rqueue) {
5458 hts_tpool_process_flush(fd->rqueue);
5459
5460 if (0 != cram_flush_result(fd))
5461 return -1;
5462
5463 if (fd->mode == 'w')
5464 fd->ctr = NULL; // prevent double freeing
5465
5466 pthread_mutex_destroy(&fd->metrics_lock);
5467 pthread_mutex_destroy(&fd->ref_lock);
5468 pthread_mutex_destroy(&fd->bam_list_lock);
5469
5470 //fprintf(stderr, "CRAM: destroy queue %p\n", fd->rqueue);
5471
5472 hts_tpool_process_destroy(fd->rqueue);
5473 }
5474
5475 if (fd->mode == 'w') {
5476 /* Write EOF block */
5477 if (0 != cram_write_eof_block(fd))
5478 return -1;
5479 }
5480
5481 for (bl = fd->bl; bl; bl = next) {
5482 int i, max_rec = fd->seqs_per_slice * fd->slices_per_container;
5483
5484 next = bl->next;
5485 for (i = 0; i < max_rec; i++) {
5486 if (bl->bams[i])
5487 bam_free(bl->bams[i]);
5488 }
5489 free(bl->bams);
5490 free(bl);
5491 }
5492
5493 if (hclose(fd->fp) != 0)
5494 return -1;
5495
5496 if (fd->file_def)
5497 cram_free_file_def(fd->file_def);
5498
5499 if (fd->header)
5500 sam_hdr_destroy(fd->header);
5501
5502 free(fd->prefix);
5503
5504 if (fd->ctr)
5505 cram_free_container(fd->ctr);
5506
5507 if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
5508 cram_free_container(fd->ctr_mt);
5509
5510 if (fd->refs)
5511 refs_free(fd->refs);
5512 if (fd->ref_free)
5513 free(fd->ref_free);
5514
5515 for (i = 0; i < DS_END; i++)
5516 if (fd->m[i])
5517 free(fd->m[i]);
5518
5519 if (fd->tags_used) {
5520 khint_t k;
5521
5522 for (k = kh_begin(fd->tags_used); k != kh_end(fd->tags_used); k++) {
5523 if (kh_exist(fd->tags_used, k))
5524 free(kh_val(fd->tags_used, k));
5525 }
5526
5527 kh_destroy(m_metrics, fd->tags_used);
5528 }
5529
5530 if (fd->index)
5531 cram_index_free(fd);
5532
5533 if (fd->own_pool && fd->pool)
5534 hts_tpool_destroy(fd->pool);
5535
5536 if (fd->idxfp)
5537 if (bgzf_close(fd->idxfp) < 0)
5538 return -1;
5539
5540 free(fd);
5541 return 0;
5542 }
5543
5544 /*
5545 * Returns 1 if we hit an EOF while reading.
5546 */
cram_eof(cram_fd * fd)5547 int cram_eof(cram_fd *fd) {
5548 return fd->eof;
5549 }
5550
5551
5552 /*
5553 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
5554 * Use this immediately after opening.
5555 *
5556 * Returns 0 on success
5557 * -1 on failure
5558 */
cram_set_option(cram_fd * fd,enum hts_fmt_option opt,...)5559 int cram_set_option(cram_fd *fd, enum hts_fmt_option opt, ...) {
5560 int r;
5561 va_list args;
5562
5563 va_start(args, opt);
5564 r = cram_set_voption(fd, opt, args);
5565 va_end(args);
5566
5567 return r;
5568 }
5569
5570 /*
5571 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
5572 * Use this immediately after opening.
5573 *
5574 * Returns 0 on success
5575 * -1 on failure
5576 */
cram_set_voption(cram_fd * fd,enum hts_fmt_option opt,va_list args)5577 int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) {
5578 refs_t *refs;
5579
5580 if (!fd) {
5581 errno = EBADF;
5582 return -1;
5583 }
5584
5585 switch (opt) {
5586 case CRAM_OPT_DECODE_MD:
5587 fd->decode_md = va_arg(args, int);
5588 break;
5589
5590 case CRAM_OPT_PREFIX:
5591 if (fd->prefix)
5592 free(fd->prefix);
5593 if (!(fd->prefix = strdup(va_arg(args, char *))))
5594 return -1;
5595 break;
5596
5597 case CRAM_OPT_VERBOSITY:
5598 break;
5599
5600 case CRAM_OPT_SEQS_PER_SLICE:
5601 fd->seqs_per_slice = va_arg(args, int);
5602 if (fd->bases_per_slice == BASES_PER_SLICE)
5603 fd->bases_per_slice = fd->seqs_per_slice * 500;
5604 break;
5605
5606 case CRAM_OPT_BASES_PER_SLICE:
5607 fd->bases_per_slice = va_arg(args, int);
5608 break;
5609
5610 case CRAM_OPT_SLICES_PER_CONTAINER:
5611 fd->slices_per_container = va_arg(args, int);
5612 break;
5613
5614 case CRAM_OPT_EMBED_REF:
5615 fd->embed_ref = va_arg(args, int);
5616 break;
5617
5618 case CRAM_OPT_NO_REF:
5619 fd->no_ref = va_arg(args, int);
5620 break;
5621
5622 case CRAM_OPT_POS_DELTA:
5623 fd->ap_delta = va_arg(args, int);
5624 break;
5625
5626 case CRAM_OPT_IGNORE_MD5:
5627 fd->ignore_md5 = va_arg(args, int);
5628 break;
5629
5630 case CRAM_OPT_LOSSY_NAMES:
5631 fd->lossy_read_names = va_arg(args, int);
5632 // Currently lossy read names required paired (attached) reads.
5633 // TLEN 0 or being 1 out causes read pairs to be detached, breaking
5634 // the lossy read name compression, so we have extra options to
5635 // slacken the exact TLEN round-trip checks.
5636 fd->tlen_approx = fd->lossy_read_names;
5637 fd->tlen_zero = fd->lossy_read_names;
5638 break;
5639
5640 case CRAM_OPT_USE_BZIP2:
5641 fd->use_bz2 = va_arg(args, int);
5642 break;
5643
5644 case CRAM_OPT_USE_RANS:
5645 fd->use_rans = va_arg(args, int);
5646 break;
5647
5648 case CRAM_OPT_USE_TOK:
5649 fd->use_tok = va_arg(args, int);
5650 break;
5651
5652 case CRAM_OPT_USE_FQZ:
5653 fd->use_fqz = va_arg(args, int);
5654 break;
5655
5656 case CRAM_OPT_USE_ARITH:
5657 fd->use_arith = va_arg(args, int);
5658 break;
5659
5660 case CRAM_OPT_USE_LZMA:
5661 fd->use_lzma = va_arg(args, int);
5662 break;
5663
5664 case CRAM_OPT_SHARED_REF:
5665 fd->shared_ref = 1;
5666 refs = va_arg(args, refs_t *);
5667 if (refs != fd->refs) {
5668 if (fd->refs)
5669 refs_free(fd->refs);
5670 fd->refs = refs;
5671 fd->refs->count++;
5672 }
5673 break;
5674
5675 case CRAM_OPT_RANGE: {
5676 int r = cram_seek_to_refpos(fd, va_arg(args, cram_range *));
5677 pthread_mutex_lock(&fd->range_lock);
5678 if (fd->range.refid != -2)
5679 fd->required_fields |= SAM_POS;
5680 pthread_mutex_unlock(&fd->range_lock);
5681 return r;
5682 }
5683
5684 case CRAM_OPT_RANGE_NOSEEK: {
5685 // As per CRAM_OPT_RANGE, but no seeking
5686 pthread_mutex_lock(&fd->range_lock);
5687 cram_range *r = va_arg(args, cram_range *);
5688 fd->range = *r;
5689 if (r->refid == HTS_IDX_NOCOOR) {
5690 fd->range.refid = -1;
5691 fd->range.start = 0;
5692 } else if (r->refid == HTS_IDX_START || r->refid == HTS_IDX_REST) {
5693 fd->range.refid = -2; // special case in cram_next_slice
5694 }
5695 if (fd->range.refid != -2)
5696 fd->required_fields |= SAM_POS;
5697 fd->ooc = 0;
5698 fd->eof = 0;
5699 pthread_mutex_unlock(&fd->range_lock);
5700 return 0;
5701 }
5702
5703 case CRAM_OPT_REFERENCE:
5704 return cram_load_reference(fd, va_arg(args, char *));
5705
5706 case CRAM_OPT_VERSION: {
5707 int major, minor;
5708 char *s = va_arg(args, char *);
5709 if (2 != sscanf(s, "%d.%d", &major, &minor)) {
5710 hts_log_error("Malformed version string %s", s);
5711 return -1;
5712 }
5713 if (!((major == 1 && minor == 0) ||
5714 (major == 2 && (minor == 0 || minor == 1)) ||
5715 (major == 3 && (minor == 0 || minor == 1)) ||
5716 (major == 4 && minor == 0))) {
5717 hts_log_error("Unknown version string; use 1.0, 2.0, 2.1, 3.0, 3.1 or 4.0");
5718 errno = EINVAL;
5719 return -1;
5720 }
5721
5722 if (major > 3 || (major == 3 && minor > 0)) {
5723 hts_log_warning(
5724 "CRAM version %s is still a draft and subject to change.\n"
5725 "This is a technology demonstration that should not be "
5726 "used for archival data.", s);
5727 }
5728
5729 fd->version = major*256 + minor;
5730
5731 fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3) ? 1 : 0;
5732
5733 fd->use_tok = ((CRAM_MAJOR_VERS(fd->version) == 3 &&
5734 CRAM_MINOR_VERS(fd->version) >= 1) ||
5735 CRAM_MAJOR_VERS(fd->version) >= 4) ? 1 : 0;
5736 cram_init_tables(fd);
5737
5738 break;
5739 }
5740
5741 case CRAM_OPT_MULTI_SEQ_PER_SLICE:
5742 fd->multi_seq_user = fd->multi_seq = va_arg(args, int);
5743 break;
5744
5745 case CRAM_OPT_NTHREADS: {
5746 int nthreads = va_arg(args, int);
5747 if (nthreads >= 1) {
5748 if (!(fd->pool = hts_tpool_init(nthreads)))
5749 return -1;
5750
5751 fd->rqueue = hts_tpool_process_init(fd->pool, nthreads*2, 0);
5752 pthread_mutex_init(&fd->metrics_lock, NULL);
5753 pthread_mutex_init(&fd->ref_lock, NULL);
5754 pthread_mutex_init(&fd->range_lock, NULL);
5755 pthread_mutex_init(&fd->bam_list_lock, NULL);
5756 fd->shared_ref = 1;
5757 fd->own_pool = 1;
5758 }
5759 break;
5760 }
5761
5762 case CRAM_OPT_THREAD_POOL: {
5763 htsThreadPool *p = va_arg(args, htsThreadPool *);
5764 fd->pool = p ? p->pool : NULL;
5765 if (fd->pool) {
5766 fd->rqueue = hts_tpool_process_init(fd->pool,
5767 p->qsize ? p->qsize : hts_tpool_size(fd->pool)*2,
5768 0);
5769 pthread_mutex_init(&fd->metrics_lock, NULL);
5770 pthread_mutex_init(&fd->ref_lock, NULL);
5771 pthread_mutex_init(&fd->range_lock, NULL);
5772 pthread_mutex_init(&fd->bam_list_lock, NULL);
5773 }
5774 fd->shared_ref = 1; // Needed to avoid clobbering ref between threads
5775 fd->own_pool = 0;
5776
5777 //fd->qsize = 1;
5778 //fd->decoded = calloc(fd->qsize, sizeof(cram_container *));
5779 //hts_tpool_dispatch(fd->pool, cram_decoder_thread, fd);
5780 break;
5781 }
5782
5783 case CRAM_OPT_REQUIRED_FIELDS:
5784 fd->required_fields = va_arg(args, int);
5785 if (fd->range.refid != -2)
5786 fd->required_fields |= SAM_POS;
5787 break;
5788
5789 case CRAM_OPT_STORE_MD:
5790 fd->store_md = va_arg(args, int);
5791 break;
5792
5793 case CRAM_OPT_STORE_NM:
5794 fd->store_nm = va_arg(args, int);
5795 break;
5796
5797 case HTS_OPT_COMPRESSION_LEVEL:
5798 fd->level = va_arg(args, int);
5799 break;
5800
5801 case HTS_OPT_PROFILE: {
5802 enum hts_profile_option prof = va_arg(args, int);
5803 switch (prof) {
5804 case HTS_PROFILE_FAST:
5805 if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 1;
5806 fd->use_tok = 0;
5807 fd->seqs_per_slice = 10000;
5808 break;
5809
5810 case HTS_PROFILE_NORMAL:
5811 break;
5812
5813 case HTS_PROFILE_SMALL:
5814 if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 6;
5815 fd->use_bz2 = 1;
5816 fd->use_fqz = 1;
5817 fd->seqs_per_slice = 25000;
5818 break;
5819
5820 case HTS_PROFILE_ARCHIVE:
5821 if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 7;
5822 fd->use_bz2 = 1;
5823 fd->use_fqz = 1;
5824 fd->use_arith = 1;
5825 if (fd->level > 7)
5826 fd->use_lzma = 1;
5827 fd->seqs_per_slice = 100000;
5828 break;
5829 }
5830
5831 if (fd->bases_per_slice == BASES_PER_SLICE)
5832 fd->bases_per_slice = fd->seqs_per_slice * 500;
5833 break;
5834 }
5835
5836 default:
5837 hts_log_error("Unknown CRAM option code %d", opt);
5838 errno = EINVAL;
5839 return -1;
5840 }
5841
5842 return 0;
5843 }
5844
cram_check_EOF(cram_fd * fd)5845 int cram_check_EOF(cram_fd *fd)
5846 {
5847 // Byte 9 in these templates is & with 0x0f to resolve differences
5848 // between ITF-8 interpretations between early Java and C
5849 // implementations of CRAM
5850 static const unsigned char TEMPLATE_2_1[30] = {
5851 0x0b, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
5852 0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
5853 0x01, 0x00, 0x06, 0x06, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00
5854 };
5855 static const unsigned char TEMPLATE_3[38] = {
5856 0x0f, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
5857 0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x05,
5858 0xbd, 0xd9, 0x4f, 0x00, 0x01, 0x00, 0x06, 0x06, 0x01, 0x00,
5859 0x01, 0x00, 0x01, 0x00, 0xee, 0x63, 0x01, 0x4b
5860 };
5861
5862 unsigned char buf[38]; // max(sizeof TEMPLATE_*)
5863
5864 uint8_t major = CRAM_MAJOR_VERS(fd->version);
5865 uint8_t minor = CRAM_MINOR_VERS(fd->version);
5866
5867 const unsigned char *template;
5868 ssize_t template_len;
5869 if ((major < 2) ||
5870 (major == 2 && minor == 0)) {
5871 return 3; // No EOF support in cram versions less than 2.1
5872 } else if (major == 2 && minor == 1) {
5873 template = TEMPLATE_2_1;
5874 template_len = sizeof TEMPLATE_2_1;
5875 } else {
5876 template = TEMPLATE_3;
5877 template_len = sizeof TEMPLATE_3;
5878 }
5879
5880 off_t offset = htell(fd->fp);
5881 if (hseek(fd->fp, -template_len, SEEK_END) < 0) {
5882 if (errno == ESPIPE) {
5883 hclearerr(fd->fp);
5884 return 2;
5885 }
5886 else {
5887 return -1;
5888 }
5889 }
5890 if (hread(fd->fp, buf, template_len) != template_len) return -1;
5891 if (hseek(fd->fp, offset, SEEK_SET) < 0) return -1;
5892 buf[8] &= 0x0f;
5893 return (memcmp(template, buf, template_len) == 0)? 1 : 0;
5894 }
5895