1 /* Tests for fqz codec */
2 /*
3  * Copyright (c) 2019,2020 Genome Research Ltd.
4  * Author(s): James Bonfield
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  *    1. Redistributions of source code must retain the above copyright notice,
10  *       this list of conditions and the following disclaimer.
11  *
12  *    2. Redistributions in binary form must reproduce the above
13  *       copyright notice, this list of conditions and the following
14  *       disclaimer in the documentation and/or other materials provided
15  *       with the distribution.
16  *
17  *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
18  *       Institute nor the names of its contributors may be used to endorse
19  *       or promote products derived from this software without specific
20  *       prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
23  * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
24  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
25  * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
26  * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  */
34 
35 #include <stdio.h>
36 #include <stdint.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <unistd.h>
40 #include <assert.h>
41 #include <fcntl.h>
42 #include <ctype.h>
43 #include <limits.h>
44 
45 #include "htscodecs/fqzcomp_qual.h"
46 #include "htscodecs/varint.h"
47 
48 #ifndef MAX_REC
49 #define MAX_REC 1000000
50 #endif
51 
52 #ifndef MAX_SEQ
53 #  define MAX_SEQ 100000
54 #endif
55 
56 #ifndef MIN
57 #  define MIN(a,b) ((a)<(b)?(a):(b))
58 #  define MAX(a,b) ((a)>(b)?(a):(b))
59 #endif
60 
61 static fqz_slice fixed_slice = {0};
62 
fake_slice(size_t buf_len,int * len,int * r2,int * sel,int nlen)63 fqz_slice *fake_slice(size_t buf_len, int *len, int *r2, int *sel, int nlen) {
64     fixed_slice.num_records = (nlen == 1) ? (buf_len+len[0]-1) / len[0] : nlen;
65     assert(fixed_slice.num_records <= MAX_REC);
66     int i;
67     if (!fixed_slice.len)
68 	fixed_slice.len = malloc(MAX_REC * sizeof(*fixed_slice.len));
69     if (!fixed_slice.flags)
70 	fixed_slice.flags = malloc(MAX_REC * sizeof(*fixed_slice.flags));
71     for (i = 0; i < fixed_slice.num_records; i++) {
72 	int idx = i < nlen ? i : nlen-1;
73 	fixed_slice.len[i] = len[idx];
74 	fixed_slice.flags[i] = r2 ? r2[idx]*FQZ_FREAD2 : 0;
75 	fixed_slice.flags[i] |= sel ? (sel[idx]<<16) : 0;
76     }
77 
78     return &fixed_slice;
79 }
80 
81 static uint64_t manual_strats[10] = {0};
82 static int manual_nstrat = 0;
83 
84 /*
85  * Manually specified strategies held in global manual_strats[].
86  */
87 static inline
fqz_manual_parameters(fqz_gparams * gp,fqz_slice * s,unsigned char * in,size_t in_size)88 int fqz_manual_parameters(fqz_gparams *gp,
89 			  fqz_slice *s,
90 			  unsigned char *in,
91 			  size_t in_size) {
92     int i, p;
93     int dsqr[] = {
94 	0, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
95 	4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
96 	5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
97 	6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
98     };
99 
100     gp->vers = FQZ_VERS;
101     gp->nparam = manual_nstrat;
102     gp->gflags = GFLAG_MULTI_PARAM | GFLAG_HAVE_STAB;
103     for (i = 0; i < 256; i++)
104 	gp->stab[i] = 0;
105 
106     // Fill these out later
107     gp->max_sel = 0;
108     gp->max_sym = 0;
109     gp->p = malloc(gp->nparam * sizeof(*gp->p));
110 
111     for (p = 0; p < gp->nparam; p++) {
112 	fqz_param *pm = &gp->p[p];
113 	uint64_t st = manual_strats[p];
114 
115 	pm->do_qa  = st & 15; st >>= 4;
116 	pm->do_r2  = st & 15; st >>= 4;
117 	pm->dloc   = st & 15; st >>= 4;
118 	pm->ploc   = st & 15; st >>= 4;
119 	pm->sloc   = st & 15; st >>= 4;
120 	pm->qloc   = st & 15; st >>= 4;
121 	pm->dshift = st & 15; st >>= 4;
122 	pm->dbits  = st & 15; st >>= 4;
123 	pm->pshift = st & 15; st >>= 4;
124 	pm->pbits  = st & 15; st >>= 4;
125 	pm->qshift = st & 15; st >>= 4;
126 	pm->qbits  = st & 15; st >>= 4;
127 
128 	// Gather some stats, as per qual_stats func.
129 	// r in rec count.
130 	// i = index to in[]
131 	// j = index within this rec
132 	uint32_t qhist[256] = {0};
133 
134 	// qual stats for seqs using this parameter only
135 	fqz_qual_stats(s, in, in_size, pm, qhist, p);
136 	int max_sel = pm->max_sel;
137 
138 	// Update max_sel running total. Eg with 4 sub-params:
139 	//
140 	// sel    param no.   => new
141 	// 0      0              0
142 	// 0/1    1              1,2
143 	// 0/1    2              3,4
144 	// 0      3              5
145 	for (i = gp->max_sel; i < gp->max_sel + max_sel+1; i++)
146 	    gp->stab[i] = p;
147 	gp->max_sel += max_sel+1;
148 
149 	pm->fixed_len = pm->fixed_len > 0;
150 	pm->use_qtab = 0;  // unused by current encoder
151 	pm->store_qmap = pm->nsym <= 8;
152 
153 	// Adjust parameters based on quality stats.
154 	// FIXME:  dup from fqz_pick_parameters.
155 	for (i = 0; i < sizeof(dsqr)/sizeof(*dsqr); i++)
156 	    if (dsqr[i] > (1<<pm->dbits)-1)
157 		dsqr[i] = (1<<pm->dbits)-1;
158 
159 	if (pm->store_qmap) {
160 	    int j;
161 	    for (i = j = 0; i < 256; i++)
162 		if (qhist[i])
163 		    pm->qmap[i] = j++;
164 		else
165 		    pm->qmap[i] = INT_MAX;
166 	    pm->max_sym = pm->nsym;
167 	} else {
168 	    pm->nsym = 255;
169 	    for (i = 0; i < 256; i++)
170 		pm->qmap[i] = i;
171 	}
172 	if (gp->max_sym < pm->max_sym)
173 	    gp->max_sym = pm->max_sym;
174 
175 	// Produce ptab from pshift.
176 	if (pm->qbits) {
177 	    for (i = 0; i < 256; i++) {
178 		pm->qtab[i] = i; // 1:1
179 
180 		// Alternative mappings:
181 		//qtab[i] = i > 30 ? MIN(max_sym,i)-15 : i/2;  // eg for 9827 BAM
182 	    }
183 
184 	}
185 	pm->qmask = (1<<pm->qbits)-1;
186 
187 	if (pm->pbits) {
188 	    for (i = 0; i < 1024; i++)
189 		pm->ptab[i] = MIN((1<<pm->pbits)-1, i>>pm->pshift);
190 
191 	    // Alternatively via analysis of quality distributions we
192 	    // may select a bunch of positions that are special and
193 	    // have a non-uniform ptab[].
194 	    // Manual experimentation on a NovaSeq run saved 2.8% here.
195 	}
196 
197 	if (pm->dbits) {
198 	    for (i = 0; i < 256; i++)
199 		pm->dtab[i] = dsqr[MIN(sizeof(dsqr)/sizeof(*dsqr)-1, i>>pm->dshift)];
200 	}
201 
202 	pm->use_ptab = (pm->pbits > 0);
203 	pm->use_dtab = (pm->dbits > 0);
204 
205 	pm->pflags =
206 	    (pm->use_qtab   ?PFLAG_HAVE_QTAB :0)|
207 	    (pm->use_dtab   ?PFLAG_HAVE_DTAB :0)|
208 	    (pm->use_ptab   ?PFLAG_HAVE_PTAB :0)|
209 	    (pm->do_sel     ?PFLAG_DO_SEL    :0)|
210 	    (pm->fixed_len  ?PFLAG_DO_LEN    :0)|
211 	    (pm->do_dedup   ?PFLAG_DO_DEDUP  :0)|
212 	    (pm->store_qmap ?PFLAG_HAVE_QMAP :0);
213     }
214 
215     for (i = gp->max_sel; i < 256; i++)
216 	gp->stab[i] = gp->stab[gp->max_sel-1];
217 
218     return 0;
219 }
220 
221 #define BS 1024*1024
load(char * fn,size_t * lenp)222 static unsigned char *load(char *fn, size_t *lenp) {
223     unsigned char *data = NULL;
224     uint64_t dsize = 0;
225     uint64_t dcurr = 0;
226     signed int len;
227 
228     //build_rcp_freq();
229 
230 #ifndef _O_BINARY
231 #define _O_BINARY 0
232 #endif
233 
234     int fd = open(fn, O_RDONLY | _O_BINARY);
235     if (!fd) {
236 	perror(fn);
237 	return NULL;
238     }
239 
240     do {
241 	if (dsize - dcurr < BS) {
242 	    dsize = dsize ? dsize * 2 : BS;
243 	    data = realloc(data, dsize);
244 	}
245 
246 	len = read(fd, data + dcurr, BS);
247 	if (len > 0)
248 	    dcurr += len;
249     } while (len > 0);
250 
251     if (len == -1) {
252 	perror("read");
253     }
254     close(fd);
255 
256     *lenp = dcurr;
257     return data;
258 }
259 
260 #define BLK_SIZE 300*1000000
261 //#define BLK_SIZE 100*100000
262 
count_lines(unsigned char * in,size_t len)263 int count_lines(unsigned char *in, size_t len) {
264     size_t i;
265     int lines = 0;
266 
267     for (i = 0; i < len; i++)
268 	if (in[i] == '\n')
269 	    lines++;
270 
271     return lines;
272 }
273 
274 // QUAL [is_read2 [selector]]
parse_lines(unsigned char * in,size_t len,int * rec_len,int * rec_r2,int * rec_sel,size_t * new_len)275 void parse_lines(unsigned char *in, size_t len,
276 		 int *rec_len, int *rec_r2, int *rec_sel, size_t *new_len) {
277     size_t i, j, start;
278     int rec = 0;
279 
280     for (start = i = j = 0; i < len; i++) {
281 	if (in[i] == '\n' || in[i] == ' ' || in[i] == '\t') {
282 	    rec_len[rec] = i-start;
283 
284 	    // Read2 marker
285 	    while (i < len && in[i] != '\n' && isspace(in[i]))
286 		i++;
287 
288 	    if (in[i] != '\n')
289 		rec_r2[rec] = atoi((char *)&in[i]);
290 	    else
291 		rec_r2[rec] = 0;
292 
293 	    while (i < len && !isspace(in[i]))
294 		i++;
295 
296 	    // selector
297 	    while (i < len && in[i] != '\n' && isspace(in[i]))
298 		i++;
299 
300 	    if (in[i] != '\n')
301 		rec_sel[rec] = atoi((char *)&in[i]);
302 	    else
303 		rec_sel[rec] = 0;
304 
305 	    while (i < len && in[i] != '\n')
306 		i++;
307 
308 	    start = i+1;
309 	    rec++;
310 	} else {
311 	    in[j++] = in[i]-33; // ASCII phred to qual
312 	}
313     }
314     *new_len = j;
315 }
316 
main(int argc,char ** argv)317 int main(int argc, char **argv) {
318     unsigned char *in, *out;
319     size_t in_len, out_len;
320     int decomp = 0, vers = 4;  // CRAM version 4.0 (4) or 3.1 (3)
321     int strat = 0, raw = 0;
322     fqz_gparams *gp = NULL, gp_local;
323     int blk_size = BLK_SIZE; // MAX
324 
325 #ifdef _WIN32
326         _setmode(_fileno(stdin),  _O_BINARY);
327         _setmode(_fileno(stdout), _O_BINARY);
328 #endif
329 
330     extern char *optarg;
331     extern int optind;
332     int opt;
333 
334     while ((opt = getopt(argc, argv, "ds:s:b:rx:")) != -1) {
335 	switch (opt) {
336 	case 'd':
337 	    decomp = 1;
338 	    break;
339 
340 	case 'b':
341 	    blk_size = atoi(optarg);
342 	    if (blk_size > BLK_SIZE)
343 		blk_size = BLK_SIZE;
344 	    break;
345 
346 	case 's':
347 	    strat = atoi(optarg);
348 	    break;
349 
350 	case 'x': {
351 	    // Hex digits are:
352 	    // qbits  qshift
353 	    // pbits  pshift
354 	    // dbits  dshift
355 	    // qloc   sloc
356 	    // ploc   dloc
357 	    // do_r2  do_qavg
358 	    //
359 	    // Examples: -x 0x5570000d6e14 q40+dir =  3473340
360 	    //           -x 0x8252120e8d04 q4      =  724989
361 	    uint64_t x = strtol(optarg, NULL, 0);
362 	    manual_strats[manual_nstrat++] = x;
363 
364 	    gp = &gp_local;
365 	    break;
366 	}
367 	case 'r':
368 	    raw = 1;
369 	    break;
370 	}
371     }
372 
373     in = load(optind < argc ? argv[optind] : "/dev/stdin", &in_len);
374     if (!in)
375 	exit(1);
376 
377     if (raw)
378 	blk_size = in_len;
379 
380     // Block based, for arbitrary sizes of input
381     if (decomp) {
382 	unsigned char *in2 = in;
383 	while (in_len > 0) {
384 	    // Read sizes as 32-bit
385 	    size_t in2_len, out_len;
386 	    if (raw) {
387 		uint32_t u32;
388 		var_get_u32(in2, in2+in_len, &u32);
389 		out_len = u32;
390 		in2_len = in_len;
391 	    } else {
392 		out_len = *(uint32_t *)in2;  in2 += 4;
393 		in2_len = *(uint32_t *)in2;  in2 += 4;
394 	    }
395 
396 	    fprintf(stderr, "out_len %ld, in_len %ld\n", (long)out_len, (long)in2_len);
397 
398 	    int *lengths = malloc(MAX_REC * sizeof(int));
399 	    out = (unsigned char *)fqz_decompress((char *)in2, in_len-(raw?0:8), &out_len, lengths, MAX_REC);
400 	    if (!out) {
401 		fprintf(stderr, "Failed to decompress\n");
402 		return 1;
403 	    }
404 
405 	    // Convert from binary back to ASCII with newlines
406 	    int i = 0, j = 0;
407 	    while (j < out_len) {
408 		int k;
409 		char seq[MAX_SEQ];
410 		for (k = 0; k < lengths[i]; k++)
411 		    seq[k] = out[j+k]+33;
412 		seq[k] = 0;
413 		puts(seq);
414 		j += lengths[i++];
415 	    }
416 	    free(out);
417 	    in2 += in2_len;
418 	    in_len -= in2_len+(raw?0:8);
419 
420 	    free(lengths);
421 
422 	    break; // One cycle only until we fix blocking to be \n based
423 	}
424     } else {
425 	// Convert from ASCII newline separated file to binary block.
426 	// We return an array of line lengths and optionally param selectors.
427 	int nlines = count_lines(in, in_len);
428 	fprintf(stderr, "nlines=%d\n", nlines);
429 	int *rec_len = calloc(nlines, sizeof(*rec_len));
430 	int *rec_r2  = calloc(nlines, sizeof(*rec_r2));
431 	int *rec_sel = calloc(nlines, sizeof(*rec_sel));
432 	parse_lines(in, in_len, rec_len, rec_r2, rec_sel, &in_len);
433 
434 	unsigned char *in2 = in;
435 	long t_out = 0;
436 	out = NULL;
437 	while (in_len > 0) {
438 	    // FIXME: blk_size no longer working in test.  One cycle only!
439 	    size_t in2_len = in_len <= blk_size ? in_len : blk_size;
440 	    fqz_slice *s = fake_slice(in2_len, rec_len, rec_r2, rec_sel, nlines);
441 	    if (gp == &gp_local)
442 		if (fqz_manual_parameters(gp, s, in2, in2_len) < 0)
443 		    return 1;
444 	    out = (unsigned char *)fqz_compress(vers, s, (char *)in2, in2_len, &out_len, strat, gp);
445 
446 	    // Write out 32-bit sizes.
447 	    if (!raw) {
448 		uint32_t u32;
449 		u32 = in2_len; if (write(1, &u32, 4) != 4) return 1;
450 		u32 = out_len; if (write(1, &u32, 4) != 4) return 1;
451 	    }
452 	    if (write(1, out, out_len) < 0) return 1;
453 	    in_len -= in2_len;
454 	    in2 += in2_len;
455 	    t_out += out_len + (raw?0:8);
456 
457 	    break; // One cycle only until we fix blocking to be \n based
458 	}
459 	free(out);
460 	free(rec_len);
461 	free(rec_r2);
462 	free(rec_sel);
463 	fprintf(stderr, "Total output = %ld\n", t_out);
464     }
465 
466     free(in);
467 
468     return 0;
469 }
470