1 /*
2  * Copyright (c) 2011-2013, 2018-2019 Genome Research Ltd.
3  * Author(s): James Bonfield
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  *    1. Redistributions of source code must retain the above copyright notice,
9  *       this list of conditions and the following disclaimer.
10  *
11  *    2. Redistributions in binary form must reproduce the above
12  *       copyright notice, this list of conditions and the following
13  *       disclaimer in the documentation and/or other materials provided
14  *       with the distribution.
15  *
16  *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17  *       Institute nor the names of its contributors may be used to endorse
18  *       or promote products derived from this software without specific
19  *       prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22  * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24  * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25  * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #ifndef FQZ_COMP_QUAL_H
35 #define FQZ_COMP_QUAL_H
36 
37 #ifdef __cplusplus
38 extern "C" {
39 #endif
40 
#include <stddef.h>
#include <stdint.h>
42 
/*
 * Per-record bit flags.  The values are deliberately the same as the
 * corresponding BAM flag bits (16 == 0x10, 128 == 0x80).
 */
#define FQZ_FREVERSE    16
#define FQZ_FREAD2      128

/* The current version of the FQZ format. */
#define FQZ_VERS        5

/* Largest strategy number; see the strat argument of fqz_compress(). */
#define FQZ_MAX_STRAT   3
51 
/*
 * fqz_slice: the minimal per-record information taken from a cram slice.
 *
 * Compression has to know where one quality string ends and the next
 * begins (len), and for each record whether it is a first or second read
 * and whether it is reverse complemented (flags).
 */
typedef struct {
    /* Number of records; both arrays below hold this many items. */
    int num_records;

    /* Per-record lengths, num_records entries. */
    uint32_t *len;

    /*
     * Per-record flags, num_records entries.
     * Presumably built from the FQZ_F* bits -- confirm against callers.
     */
    uint32_t *flags;
} fqz_slice;
64 
65 
// Global flags.
// (Presumably the bits stored in fqz_gparams.gflags -- confirm against
// the implementation.)
//
// These are enumeration constants rather than "static const int" objects:
// in C an enumeration constant is an integer constant expression, so the
// names can be used in case labels, static assertions and array sizes, and
// including this header does not define an unused object in every
// translation unit.  Names and values are unchanged.
enum {
    GFLAG_MULTI_PARAM = 1,
    GFLAG_HAVE_STAB   = 2,
    GFLAG_DO_REV      = 4
};
70 
// Param flags.
// (Presumably the bits stored in fqz_param.pflags -- confirm against
// the implementation.)
// Add PFLAG_HAVE_DMAP and a dmap[] for delta incr?
//
// These are enumeration constants rather than "static const int" objects:
// in C an enumeration constant is an integer constant expression, so the
// names can be used in case labels, static assertions and array sizes, and
// including this header does not define an unused object in every
// translation unit.  Names and values are unchanged.
enum {
    PFLAG_DO_DEDUP  = 2,
    PFLAG_DO_LEN    = 4,
    PFLAG_DO_SEL    = 8,
    PFLAG_HAVE_QMAP = 16,
    PFLAG_HAVE_PTAB = 32,
    PFLAG_HAVE_DTAB = 64,
    PFLAG_HAVE_QTAB = 128
};
80 
81 /*
82  * FQZ parameters.  These may be simply passed in as NULL to fqz_compress
83  * and it'll automatically choose, but if we wish to have complete control
84  * then this (long) struct contains all the details.
85  *
86  * TODO: document all this!
87  */
88 
/*
 * fqz_param: a single parameter block.
 *
 * Members are declared one per line; their order and types define the
 * structure layout and must not be rearranged.
 */
typedef struct {
    /* Starting context value */
    uint16_t context;

    /*
     * Flags.
     * pflags is presumably a combination of the PFLAG_* bits -- confirm.
     */
    unsigned int pflags;
    unsigned int do_sel;
    unsigned int do_dedup;
    unsigned int store_qmap;
    unsigned int fixed_len;
    unsigned char use_qtab;
    unsigned char use_dtab;
    unsigned char use_ptab;

    /* Context bits and locations */
    unsigned int qbits;
    unsigned int qloc;
    unsigned int pbits;
    unsigned int ploc;
    unsigned int dbits;
    unsigned int dloc;
    unsigned int sbits;
    unsigned int sloc;

    /* Models */
    int max_sym;
    int nsym;
    int max_sel;

    /* Tables / maps */
    unsigned int qmap[256];
    unsigned int qtab[256];
    unsigned int ptab[1024];
    unsigned int dtab[256];

    /*
     * Not stored parameters, but computed as part of encoder
     * parameterisation.
     */
    int qshift;
    int pshift;
    int dshift;
    int sshift;
    unsigned int qmask;     /* (1<<qbits)-1 */
    int do_r2;
    int do_qa;
} fqz_param;
123 
124 // The global params, which is a collection of parameter blocks plus
125 // a few pieces of meta-data.
126 typedef struct {
127     int vers;               // Format version; Set to FQZ_VERS
128     unsigned int gflags;    // global param flags
129     int nparam;             // Number of fqz_param blocks
130     int max_sel;            // Number of selector values
131     unsigned int stab[256]; // Selector to parameter no. table
132 
133     int max_sym;            // max symbol value across all sub-params
134 
135     fqz_param *p;           // 1 or more parameter blocks
136 } fqz_gparams;
137 
138 
139 /** Compress a block of quality values.
140  *
141  * @param vers          The CRAM version number (<<8) plus fqz strategy (0-3)
142  * @param s             Length and flag data CRAM per-record
143  * @param in            Buffer of concatenated quality values (no separator)
144  * @param in_size       Size of in buffer
145  * @param out_size      Size of returned output
146  * @param strat         FQZ compression strategy (0 to FQZ_MAX_STRAT)
147  * @param gp            Optional fqzcomp paramters (may be NULL).
148  *
149  * @return              The compressed quality buffer on success,
150  *                      NULL on failure.
151  */
152 char *fqz_compress(int vers, fqz_slice *s, char *in, size_t in_size,
153                    size_t *out_size, int strat, fqz_gparams *gp);
154 
/** Decompress a block of quality values.
 *
 * @param in            Buffer of compressed quality values
 * @param in_size       Size of the in buffer
 * @param out_size      Receives the size of the returned output
 * @param lengths       Optional array that is filled out with the record
 *                      lengths.  May be NULL.  If it is not NULL it must be
 *                      preallocated to the correct size.
 * @param nlengths      Presumably the number of entries available in
 *                      lengths -- TODO confirm against the implementation.
 *
 * @return              The uncompressed concatenated qualities on success,
 *                      NULL on failure.
 */
char *fqz_decompress(char *in,
                     size_t in_size,
                     size_t *out_size,
                     int *lengths,
                     int nlengths);
168 
169 /** A utlity function to analyse a quality buffer to gather statistical
170  *  information.  This is written into qhist and pm.  This function is only
171  *  useful if you intend on passing your own fqz_gparams block to
172  *  fqz_compress.
173  */
174 void fqz_qual_stats(fqz_slice *s,
175 		    unsigned char *in, size_t in_size,
176 		    fqz_param *pm,
177 		    uint32_t qhist[256],
178 		    int one_param);
179 
180 #ifdef __cplusplus
181 }
182 #endif
183 
184 #endif
185