1 /* Tests for fqz codec */
2 /*
3 * Copyright (c) 2019,2020 Genome Research Ltd.
4 * Author(s): James Bonfield
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright notice,
10 * this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above
13 * copyright notice, this list of conditions and the following
14 * disclaimer in the documentation and/or other materials provided
15 * with the distribution.
16 *
17 * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
18 * Institute nor the names of its contributors may be used to endorse
19 * or promote products derived from this software without specific
20 * prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
23 * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
24 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
25 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
26 * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 */
34
35 #include <stdio.h>
36 #include <stdint.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <unistd.h>
40 #include <assert.h>
41 #include <fcntl.h>
42 #include <ctype.h>
43 #include <limits.h>
44
45 #include "htscodecs/fqzcomp_qual.h"
46 #include "htscodecs/varint.h"
47
48 #ifndef MAX_REC
49 #define MAX_REC 1000000
50 #endif
51
52 #ifndef MAX_SEQ
53 # define MAX_SEQ 100000
54 #endif
55
56 #ifndef MIN
57 # define MIN(a,b) ((a)<(b)?(a):(b))
58 # define MAX(a,b) ((a)>(b)?(a):(b))
59 #endif
60
61 static fqz_slice fixed_slice = {0};
62
fake_slice(size_t buf_len,int * len,int * r2,int * sel,int nlen)63 fqz_slice *fake_slice(size_t buf_len, int *len, int *r2, int *sel, int nlen) {
64 fixed_slice.num_records = (nlen == 1) ? (buf_len+len[0]-1) / len[0] : nlen;
65 assert(fixed_slice.num_records <= MAX_REC);
66 int i;
67 if (!fixed_slice.len)
68 fixed_slice.len = malloc(MAX_REC * sizeof(*fixed_slice.len));
69 if (!fixed_slice.flags)
70 fixed_slice.flags = malloc(MAX_REC * sizeof(*fixed_slice.flags));
71 for (i = 0; i < fixed_slice.num_records; i++) {
72 int idx = i < nlen ? i : nlen-1;
73 fixed_slice.len[i] = len[idx];
74 fixed_slice.flags[i] = r2 ? r2[idx]*FQZ_FREAD2 : 0;
75 fixed_slice.flags[i] |= sel ? (sel[idx]<<16) : 0;
76 }
77
78 return &fixed_slice;
79 }
80
81 static uint64_t manual_strats[10] = {0};
82 static int manual_nstrat = 0;
83
84 /*
85 * Manually specified strategies held in global manual_strats[].
86 */
87 static inline
fqz_manual_parameters(fqz_gparams * gp,fqz_slice * s,unsigned char * in,size_t in_size)88 int fqz_manual_parameters(fqz_gparams *gp,
89 fqz_slice *s,
90 unsigned char *in,
91 size_t in_size) {
92 int i, p;
93 int dsqr[] = {
94 0, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
95 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
96 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
97 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
98 };
99
100 gp->vers = FQZ_VERS;
101 gp->nparam = manual_nstrat;
102 gp->gflags = GFLAG_MULTI_PARAM | GFLAG_HAVE_STAB;
103 for (i = 0; i < 256; i++)
104 gp->stab[i] = 0;
105
106 // Fill these out later
107 gp->max_sel = 0;
108 gp->max_sym = 0;
109 gp->p = malloc(gp->nparam * sizeof(*gp->p));
110
111 for (p = 0; p < gp->nparam; p++) {
112 fqz_param *pm = &gp->p[p];
113 uint64_t st = manual_strats[p];
114
115 pm->do_qa = st & 15; st >>= 4;
116 pm->do_r2 = st & 15; st >>= 4;
117 pm->dloc = st & 15; st >>= 4;
118 pm->ploc = st & 15; st >>= 4;
119 pm->sloc = st & 15; st >>= 4;
120 pm->qloc = st & 15; st >>= 4;
121 pm->dshift = st & 15; st >>= 4;
122 pm->dbits = st & 15; st >>= 4;
123 pm->pshift = st & 15; st >>= 4;
124 pm->pbits = st & 15; st >>= 4;
125 pm->qshift = st & 15; st >>= 4;
126 pm->qbits = st & 15; st >>= 4;
127
128 // Gather some stats, as per qual_stats func.
129 // r in rec count.
130 // i = index to in[]
131 // j = index within this rec
132 uint32_t qhist[256] = {0};
133
134 // qual stats for seqs using this parameter only
135 fqz_qual_stats(s, in, in_size, pm, qhist, p);
136 int max_sel = pm->max_sel;
137
138 // Update max_sel running total. Eg with 4 sub-params:
139 //
140 // sel param no. => new
141 // 0 0 0
142 // 0/1 1 1,2
143 // 0/1 2 3,4
144 // 0 3 5
145 for (i = gp->max_sel; i < gp->max_sel + max_sel+1; i++)
146 gp->stab[i] = p;
147 gp->max_sel += max_sel+1;
148
149 pm->fixed_len = pm->fixed_len > 0;
150 pm->use_qtab = 0; // unused by current encoder
151 pm->store_qmap = pm->nsym <= 8;
152
153 // Adjust parameters based on quality stats.
154 // FIXME: dup from fqz_pick_parameters.
155 for (i = 0; i < sizeof(dsqr)/sizeof(*dsqr); i++)
156 if (dsqr[i] > (1<<pm->dbits)-1)
157 dsqr[i] = (1<<pm->dbits)-1;
158
159 if (pm->store_qmap) {
160 int j;
161 for (i = j = 0; i < 256; i++)
162 if (qhist[i])
163 pm->qmap[i] = j++;
164 else
165 pm->qmap[i] = INT_MAX;
166 pm->max_sym = pm->nsym;
167 } else {
168 pm->nsym = 255;
169 for (i = 0; i < 256; i++)
170 pm->qmap[i] = i;
171 }
172 if (gp->max_sym < pm->max_sym)
173 gp->max_sym = pm->max_sym;
174
175 // Produce ptab from pshift.
176 if (pm->qbits) {
177 for (i = 0; i < 256; i++) {
178 pm->qtab[i] = i; // 1:1
179
180 // Alternative mappings:
181 //qtab[i] = i > 30 ? MIN(max_sym,i)-15 : i/2; // eg for 9827 BAM
182 }
183
184 }
185 pm->qmask = (1<<pm->qbits)-1;
186
187 if (pm->pbits) {
188 for (i = 0; i < 1024; i++)
189 pm->ptab[i] = MIN((1<<pm->pbits)-1, i>>pm->pshift);
190
191 // Alternatively via analysis of quality distributions we
192 // may select a bunch of positions that are special and
193 // have a non-uniform ptab[].
194 // Manual experimentation on a NovaSeq run saved 2.8% here.
195 }
196
197 if (pm->dbits) {
198 for (i = 0; i < 256; i++)
199 pm->dtab[i] = dsqr[MIN(sizeof(dsqr)/sizeof(*dsqr)-1, i>>pm->dshift)];
200 }
201
202 pm->use_ptab = (pm->pbits > 0);
203 pm->use_dtab = (pm->dbits > 0);
204
205 pm->pflags =
206 (pm->use_qtab ?PFLAG_HAVE_QTAB :0)|
207 (pm->use_dtab ?PFLAG_HAVE_DTAB :0)|
208 (pm->use_ptab ?PFLAG_HAVE_PTAB :0)|
209 (pm->do_sel ?PFLAG_DO_SEL :0)|
210 (pm->fixed_len ?PFLAG_DO_LEN :0)|
211 (pm->do_dedup ?PFLAG_DO_DEDUP :0)|
212 (pm->store_qmap ?PFLAG_HAVE_QMAP :0);
213 }
214
215 for (i = gp->max_sel; i < 256; i++)
216 gp->stab[i] = gp->stab[gp->max_sel-1];
217
218 return 0;
219 }
220
221 #define BS 1024*1024
load(char * fn,size_t * lenp)222 static unsigned char *load(char *fn, size_t *lenp) {
223 unsigned char *data = NULL;
224 uint64_t dsize = 0;
225 uint64_t dcurr = 0;
226 signed int len;
227
228 //build_rcp_freq();
229
230 #ifndef _O_BINARY
231 #define _O_BINARY 0
232 #endif
233
234 int fd = open(fn, O_RDONLY | _O_BINARY);
235 if (!fd) {
236 perror(fn);
237 return NULL;
238 }
239
240 do {
241 if (dsize - dcurr < BS) {
242 dsize = dsize ? dsize * 2 : BS;
243 data = realloc(data, dsize);
244 }
245
246 len = read(fd, data + dcurr, BS);
247 if (len > 0)
248 dcurr += len;
249 } while (len > 0);
250
251 if (len == -1) {
252 perror("read");
253 }
254 close(fd);
255
256 *lenp = dcurr;
257 return data;
258 }
259
260 #define BLK_SIZE 300*1000000
261 //#define BLK_SIZE 100*100000
262
count_lines(unsigned char * in,size_t len)263 int count_lines(unsigned char *in, size_t len) {
264 size_t i;
265 int lines = 0;
266
267 for (i = 0; i < len; i++)
268 if (in[i] == '\n')
269 lines++;
270
271 return lines;
272 }
273
274 // QUAL [is_read2 [selector]]
parse_lines(unsigned char * in,size_t len,int * rec_len,int * rec_r2,int * rec_sel,size_t * new_len)275 void parse_lines(unsigned char *in, size_t len,
276 int *rec_len, int *rec_r2, int *rec_sel, size_t *new_len) {
277 size_t i, j, start;
278 int rec = 0;
279
280 for (start = i = j = 0; i < len; i++) {
281 if (in[i] == '\n' || in[i] == ' ' || in[i] == '\t') {
282 rec_len[rec] = i-start;
283
284 // Read2 marker
285 while (i < len && in[i] != '\n' && isspace(in[i]))
286 i++;
287
288 if (in[i] != '\n')
289 rec_r2[rec] = atoi((char *)&in[i]);
290 else
291 rec_r2[rec] = 0;
292
293 while (i < len && !isspace(in[i]))
294 i++;
295
296 // selector
297 while (i < len && in[i] != '\n' && isspace(in[i]))
298 i++;
299
300 if (in[i] != '\n')
301 rec_sel[rec] = atoi((char *)&in[i]);
302 else
303 rec_sel[rec] = 0;
304
305 while (i < len && in[i] != '\n')
306 i++;
307
308 start = i+1;
309 rec++;
310 } else {
311 in[j++] = in[i]-33; // ASCII phred to qual
312 }
313 }
314 *new_len = j;
315 }
316
main(int argc,char ** argv)317 int main(int argc, char **argv) {
318 unsigned char *in, *out;
319 size_t in_len, out_len;
320 int decomp = 0, vers = 4; // CRAM version 4.0 (4) or 3.1 (3)
321 int strat = 0, raw = 0;
322 fqz_gparams *gp = NULL, gp_local;
323 int blk_size = BLK_SIZE; // MAX
324
325 #ifdef _WIN32
326 _setmode(_fileno(stdin), _O_BINARY);
327 _setmode(_fileno(stdout), _O_BINARY);
328 #endif
329
330 extern char *optarg;
331 extern int optind;
332 int opt;
333
334 while ((opt = getopt(argc, argv, "ds:s:b:rx:")) != -1) {
335 switch (opt) {
336 case 'd':
337 decomp = 1;
338 break;
339
340 case 'b':
341 blk_size = atoi(optarg);
342 if (blk_size > BLK_SIZE)
343 blk_size = BLK_SIZE;
344 break;
345
346 case 's':
347 strat = atoi(optarg);
348 break;
349
350 case 'x': {
351 // Hex digits are:
352 // qbits qshift
353 // pbits pshift
354 // dbits dshift
355 // qloc sloc
356 // ploc dloc
357 // do_r2 do_qavg
358 //
359 // Examples: -x 0x5570000d6e14 q40+dir = 3473340
360 // -x 0x8252120e8d04 q4 = 724989
361 uint64_t x = strtol(optarg, NULL, 0);
362 manual_strats[manual_nstrat++] = x;
363
364 gp = &gp_local;
365 break;
366 }
367 case 'r':
368 raw = 1;
369 break;
370 }
371 }
372
373 in = load(optind < argc ? argv[optind] : "/dev/stdin", &in_len);
374 if (!in)
375 exit(1);
376
377 if (raw)
378 blk_size = in_len;
379
380 // Block based, for arbitrary sizes of input
381 if (decomp) {
382 unsigned char *in2 = in;
383 while (in_len > 0) {
384 // Read sizes as 32-bit
385 size_t in2_len, out_len;
386 if (raw) {
387 uint32_t u32;
388 var_get_u32(in2, in2+in_len, &u32);
389 out_len = u32;
390 in2_len = in_len;
391 } else {
392 out_len = *(uint32_t *)in2; in2 += 4;
393 in2_len = *(uint32_t *)in2; in2 += 4;
394 }
395
396 fprintf(stderr, "out_len %ld, in_len %ld\n", (long)out_len, (long)in2_len);
397
398 int *lengths = malloc(MAX_REC * sizeof(int));
399 out = (unsigned char *)fqz_decompress((char *)in2, in_len-(raw?0:8), &out_len, lengths, MAX_REC);
400 if (!out) {
401 fprintf(stderr, "Failed to decompress\n");
402 return 1;
403 }
404
405 // Convert from binary back to ASCII with newlines
406 int i = 0, j = 0;
407 while (j < out_len) {
408 int k;
409 char seq[MAX_SEQ];
410 for (k = 0; k < lengths[i]; k++)
411 seq[k] = out[j+k]+33;
412 seq[k] = 0;
413 puts(seq);
414 j += lengths[i++];
415 }
416 free(out);
417 in2 += in2_len;
418 in_len -= in2_len+(raw?0:8);
419
420 free(lengths);
421
422 break; // One cycle only until we fix blocking to be \n based
423 }
424 } else {
425 // Convert from ASCII newline separated file to binary block.
426 // We return an array of line lengths and optionally param selectors.
427 int nlines = count_lines(in, in_len);
428 fprintf(stderr, "nlines=%d\n", nlines);
429 int *rec_len = calloc(nlines, sizeof(*rec_len));
430 int *rec_r2 = calloc(nlines, sizeof(*rec_r2));
431 int *rec_sel = calloc(nlines, sizeof(*rec_sel));
432 parse_lines(in, in_len, rec_len, rec_r2, rec_sel, &in_len);
433
434 unsigned char *in2 = in;
435 long t_out = 0;
436 out = NULL;
437 while (in_len > 0) {
438 // FIXME: blk_size no longer working in test. One cycle only!
439 size_t in2_len = in_len <= blk_size ? in_len : blk_size;
440 fqz_slice *s = fake_slice(in2_len, rec_len, rec_r2, rec_sel, nlines);
441 if (gp == &gp_local)
442 if (fqz_manual_parameters(gp, s, in2, in2_len) < 0)
443 return 1;
444 out = (unsigned char *)fqz_compress(vers, s, (char *)in2, in2_len, &out_len, strat, gp);
445
446 // Write out 32-bit sizes.
447 if (!raw) {
448 uint32_t u32;
449 u32 = in2_len; if (write(1, &u32, 4) != 4) return 1;
450 u32 = out_len; if (write(1, &u32, 4) != 4) return 1;
451 }
452 if (write(1, out, out_len) < 0) return 1;
453 in_len -= in2_len;
454 in2 += in2_len;
455 t_out += out_len + (raw?0:8);
456
457 break; // One cycle only until we fix blocking to be \n based
458 }
459 free(out);
460 free(rec_len);
461 free(rec_r2);
462 free(rec_sel);
463 fprintf(stderr, "Total output = %ld\n", t_out);
464 }
465
466 free(in);
467
468 return 0;
469 }
470