1 /*===========================================================================
2  *
3  *                            PUBLIC DOMAIN NOTICE
4  *               National Center for Biotechnology Information
5  *
6  *  This software/database is a "United States Government Work" under the
7  *  terms of the United States Copyright Act.  It was written as part of
8  *  the author's official duties as a United States Government employee and
9  *  thus cannot be copyrighted.  This software/database is freely available
10  *  to the public for use. The National Library of Medicine and the U.S.
11  *  Government have not placed any restriction on its use or reproduction.
12  *
13  *  Although all reasonable efforts have been taken to ensure the accuracy
14  *  and reliability of the software and data, the NLM and the U.S.
15  *  Government do not and cannot warrant the performance or results that
16  *  may be obtained by using this software or data. The NLM and the U.S.
17  *  Government disclaim all warranties, express or implied, including
18  *  warranties of performance, merchantability or fitness for any particular
19  *  purpose.
20  *
21  *  Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  */
26 #include <vdb/extern.h>
27 
28 #include <klib/rc.h>
29 #include <insdc/insdc.h>
30 #include <sra/sradb.h>
31 #include <klib/data-buffer.h>
32 #include <sysalloc.h>
33 #include <assert.h>
34 
35 #ifndef UNIT_TEST_FUNCTION
36 #include <vdb/xform.h>
37 #include <vdb/schema.h>
38 #endif
39 
40 /*
41     This is a schema-function to synthesize quality values.
42     Its input are the read-len and the spot-filter column.
43     It sums up the values in read-len to determin the length of the produced column
44     It reads the spot-filter values and puts 'good' or 'bad' values into the output
45     The literals for 'good' and 'bad' are not hardcoded but passed into the function
46     from the schema.
47 */
48 
49 typedef struct syn_qual_params
50 {
51     INSDC_quality_phred good;
52     INSDC_quality_phred bad;
53 } syn_qual_params;
54 
sum_read_len(size_t const count,INSDC_coord_len const * const lengths)55 static INSDC_coord_len sum_read_len(size_t const count, INSDC_coord_len const *const lengths)
56 {
57     INSDC_coord_len result = 0;
58     size_t i;
59 
60     for (i = 0; i < count; ++i) {
61         result += lengths[i];
62     }
63     return result;
64 }
65 
is_good(size_t const count,INSDC_SRA_spot_filter const * const filters)66 static bool is_good(size_t const count, INSDC_SRA_spot_filter const *const filters)
67 {
68     return (count == 0 || filters[0] == SRA_SPOT_FILTER_PASS) ? true : false;
69 }
70 
syn_quality_impl(syn_qual_params const * const params,size_t numreads,INSDC_coord_len const * const lengths,size_t numfilts,INSDC_SRA_spot_filter const * const filters,KDataBuffer * rslt)71 static rc_t syn_quality_impl(syn_qual_params const *const params,
72                              size_t numreads, INSDC_coord_len const *const lengths,
73                              size_t numfilts, INSDC_SRA_spot_filter const *const filters,
74                              KDataBuffer *rslt)
75 {
76     rc_t rc = 0;
77     INSDC_coord_len const total_read_len = sum_read_len(numreads, lengths);
78     INSDC_quality_phred const q = is_good(numfilts, filters) ? params->good : params->bad;
79 
80     rslt->elem_bits = 8;
81     rc = KDataBufferResize(rslt, total_read_len);
82     if ( rc == 0 && total_read_len > 0 )
83         memset(rslt->base, q, total_read_len);
84     return rc;
85 }
86 
gen_syn_quality(syn_qual_params const * const params,uint8_t * const dst,size_t const total_length,size_t const numreads,INSDC_coord_zero const * const start,INSDC_coord_len const * const length,INSDC_SRA_xread_type const * const type,INSDC_SRA_read_filter const * const filter)87 static void gen_syn_quality( syn_qual_params const *const params
88                            , uint8_t *const dst
89                            , size_t const total_length
90                            , size_t const numreads
91                            , INSDC_coord_zero const *const start
92                            , INSDC_coord_len const *const length
93                            , INSDC_SRA_xread_type const *const type
94                            , INSDC_SRA_read_filter const *const filter
95                            )
96 {
97     unsigned i;
98 
99     memset(dst, params->good, total_length);
100     for (i = 0; i < numreads; ++i) {
101         if ((type[i] & SRA_READ_TYPE_BIOLOGICAL) != SRA_READ_TYPE_BIOLOGICAL)
102             continue;
103         if (filter[i] == SRA_READ_FILTER_PASS)
104             continue;
105         assert(start[i] + length[i] <= total_length);
106         memset(dst + start[i], params->bad, length[i]);
107     }
108 }
109 
syn_quality_read_impl(syn_qual_params const * const params,size_t const numreads,INSDC_coord_zero const * const start,INSDC_coord_len const * const length,INSDC_SRA_xread_type const * const type,INSDC_SRA_read_filter const * const filter,KDataBuffer * rslt)110 static rc_t syn_quality_read_impl( syn_qual_params const *const params
111                                  , size_t const numreads
112                                  , INSDC_coord_zero const *const start
113                                  , INSDC_coord_len const *const length
114                                  , INSDC_SRA_xread_type const *const type
115                                  , INSDC_SRA_read_filter const *const filter
116                                  , KDataBuffer *rslt)
117 {
118     rc_t rc = 0;
119     INSDC_coord_len const total_read_len = sum_read_len(numreads, length);
120 
121     rslt->elem_bits = 8;
122     rc = KDataBufferResize(rslt, total_read_len);
123     if ( rc == 0 && total_read_len > 0 )
124         gen_syn_quality(params, (uint8_t *)rslt->base, total_read_len, numreads, start, length, type, filter);
125     return rc;
126 }
127 
128 #ifndef UNIT_TEST_FUNCTION
129 
130 #define SAFE_BASE(ELEM, DTYPE) ((ELEM < argc && sizeof(DTYPE) * 8 == (size_t)argv[ELEM].u.data.elem_bits) ? (((DTYPE const *)argv[ELEM].u.data.base) + argv[ELEM].u.data.first_elem) : ((DTYPE const *)NULL))
131 #define BIND_COLUMN(ELEM, DTYPE, POINTER) DTYPE const *const POINTER = SAFE_BASE(ELEM, DTYPE)
132 #define SAFE_COUNT(ELEM) (ELEM < argc ? argv[ELEM].u.data.elem_count : 0)
133 
syn_quality_spot_drvr(void * self,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])134 static rc_t CC syn_quality_spot_drvr ( void * self,
135                                   const VXformInfo * info,
136                                   int64_t row_id,
137                                   VRowResult * rslt,
138                                   uint32_t argc,
139                                   const VRowData argv [] )
140 {
141     enum {
142         COL_READ_LEN,
143         COL_SPOT_FILTER,
144     };
145     rc_t rc;
146     assert(argc == 2);
147     rc = syn_quality_impl
148         (self,
149          SAFE_COUNT(COL_READ_LEN), SAFE_BASE(COL_READ_LEN, INSDC_coord_len),
150          SAFE_COUNT(COL_SPOT_FILTER), SAFE_BASE(COL_SPOT_FILTER, INSDC_SRA_spot_filter),
151          rslt->data);
152     rslt->elem_count = rslt->data->elem_count;
153     return rc;
154 }
155 
syn_quality_read_drvr(void * self,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])156 static rc_t CC syn_quality_read_drvr ( void * self,
157                                   const VXformInfo * info,
158                                   int64_t row_id,
159                                   VRowResult * rslt,
160                                   uint32_t argc,
161                                   const VRowData argv [] )
162 {
163     enum {
164         COL_START,
165         COL_LEN,
166         COL_TYPE,
167         COL_FILTER,
168     };
169     rc_t rc;
170     assert(argc == 4);
171     rc = syn_quality_read_impl(self
172                               , SAFE_COUNT(COL_START)
173                               , SAFE_BASE(COL_START, INSDC_coord_zero)
174                               , SAFE_BASE(COL_LEN, INSDC_coord_len)
175                               , SAFE_BASE(COL_TYPE, INSDC_SRA_xread_type)
176                               , SAFE_BASE(COL_FILTER, INSDC_SRA_read_filter)
177                               , rslt->data
178                               );
179     rslt->elem_count = rslt->data->elem_count;
180     return rc;
181 }
182 
make_params(syn_qual_params * const params,VFactoryParams const * const fp)183 static void make_params(syn_qual_params *const params, VFactoryParams const *const fp)
184 {
185     params->good = 30;
186     params->bad = 3;
187     if (fp->argc > 0) {
188         assert(fp->argv[0].desc.domain == vtdUint && fp->argv[0].count == 1);
189         params->good = fp->argv[0].data.u8[0];
190 
191         if (fp->argc > 1) {
192             assert(fp->argv[1].desc.domain == vtdUint && fp->argv[1].count == 1);
193             params->bad = fp->argv[1].data.u8[0];
194         }
195     }
196 }
197 
198 enum syn_quality_flavor {
199     sqf_spot,
200     sqf_read,
201 };
202 
NCBI_SRA_syn_quality_factory(VFuncDesc * rslt,const VFactoryParams * cp,const VFunctionParams * dp,enum syn_quality_flavor flavor)203 static rc_t NCBI_SRA_syn_quality_factory(
204     VFuncDesc * rslt,
205     const VFactoryParams * cp,
206     const VFunctionParams * dp,
207     enum syn_quality_flavor flavor )
208 {
209     /* expecting 2 or 4 data arguments and 0, 1, or 2 factory arguments */
210     assert((dp->argc == 2 || dp->argc == 4) && 0 <= cp->argc && cp->argc <= 2);
211 
212     rslt->whack = free;
213     rslt->u.rf = (flavor == sqf_read) ? syn_quality_read_drvr : syn_quality_spot_drvr;
214     rslt->variant = vftRow;
215     rslt->self = malloc(sizeof(syn_qual_params));
216     if (rslt->self) {
217         make_params(rslt->self, cp);
218         return 0;
219     }
220     return RC(rcXF, rcFunction, rcConstructing, rcMemory, rcExhausted);
221 }
222 
223 /*
224  * function INSDC:quality:phred NCBI:syn_quality #1
225  *      < * INSDC:quality:phred good_quality, INSDC:quality:phred bad_quality >
226  *      ( INSDC:coord:len read_len, INSDC:SRA:spot_filter spot_filter );
227  */
228 VTRANSFACT_IMPL ( NCBI_SRA_syn_quality, 1, 0, 0 ) ( const void * Self,
229                                            const VXfactInfo * info,
230                                            VFuncDesc * rslt,
231                                            const VFactoryParams * cp,
232                                            const VFunctionParams * dp )
233 {
234     return NCBI_SRA_syn_quality_factory(rslt, cp, dp, sqf_spot);
235 }
236 
237 /*
238  * function INSDC:quality:phred NCBI:SRA:syn_quality #2
239  *     < INSDC:quality:phred good_quality, INSDC:quality:phred bad_quality >
240  *     ( INSDC:coord:zero read_start, INSDC:coord:len read_len,
241  *       INSDC:SRA:xread_type read_type, INSDC:SRA:read_filter read_filter )
242  *     = NCBI:SRA:syn_quality_read;
243  */
244 VTRANSFACT_IMPL ( NCBI_SRA_syn_quality_read, 1, 0, 0 ) ( const void * Self,
245                                            const VXfactInfo * info,
246                                            VFuncDesc * rslt,
247                                            const VFactoryParams * cp,
248                                            const VFunctionParams * dp )
249 {
250     return NCBI_SRA_syn_quality_factory(rslt, cp, dp, sqf_read);
251 }
252 
253 
254 #else /* ifndef UNIT_TEST_FUNCTION */
255 
256 #define ASSERT(X) do { if (!(X)) return -1; } while(0)
257 
UnitTest_0Read_NoFilter(void)258 static int UnitTest_0Read_NoFilter(void)
259 {
260     syn_qual_params p; p.good = 30; p.bad = 3;
261     KDataBuffer buffer; memset(&buffer, 0, sizeof(buffer));
262     INSDC_coord_len length[] = { 20, 40 };
263     INSDC_SRA_spot_filter filter[] = { SRA_SPOT_FILTER_PASS, SRA_SPOT_FILTER_REJECT };
264     {
265         rc_t rc = syn_quality_impl(&p, 0, NULL, 0, NULL, &buffer);
266         ASSERT(rc == 0);
267         ASSERT(buffer.elem_bits == 8);
268         ASSERT(buffer.elem_count == 0);
269         (void)(length[0]);
270         (void)(filter[0]);
271     }
272     KDataBufferWhack(&buffer);
273     return 0;
274 }
275 
UnitTest_1Read_NoFilter(void)276 static int UnitTest_1Read_NoFilter(void)
277 {
278     syn_qual_params p; p.good = 30; p.bad = 3;
279     KDataBuffer buffer; memset(&buffer, 0, sizeof(buffer));
280     INSDC_coord_len length[] = { 20, 40 };
281     INSDC_SRA_spot_filter filter[] = { SRA_SPOT_FILTER_PASS, SRA_SPOT_FILTER_REJECT };
282     {
283         unsigned i;
284         rc_t rc = syn_quality_impl(&p, 1, length, 0, NULL, &buffer);
285         ASSERT(rc == 0);
286         ASSERT(buffer.elem_bits == 8);
287         ASSERT(buffer.elem_count == length[0]);
288         for (i = 0; i < buffer.elem_count; ++i) {
289             INSDC_quality_phred const qv = ((INSDC_quality_phred *)buffer.base)[i];
290             ASSERT(qv == p.good);
291         }
292         (void)(length[0]);
293         (void)(filter[0]);
294     }
295     KDataBufferWhack(&buffer);
296     return 0;
297 }
298 
UnitTest_2Read_NoFilter(void)299 static int UnitTest_2Read_NoFilter(void)
300 {
301     syn_qual_params p; p.good = 30; p.bad = 3;
302     KDataBuffer buffer; memset(&buffer, 0, sizeof(buffer));
303     INSDC_coord_len length[] = { 20, 40 };
304     INSDC_SRA_spot_filter filter[] = { SRA_SPOT_FILTER_PASS, SRA_SPOT_FILTER_REJECT };
305     {
306         unsigned i;
307         rc_t rc = syn_quality_impl(&p, 2, length, 0, NULL, &buffer);
308         ASSERT(rc == 0);
309         ASSERT(buffer.elem_bits == 8);
310         ASSERT(buffer.elem_count == length[0] + length[1]);
311         for (i = 0; i < buffer.elem_count; ++i) {
312             INSDC_quality_phred const qv = ((INSDC_quality_phred *)buffer.base)[i];
313             ASSERT(qv == p.good);
314         }
315         (void)(length[0]);
316         (void)(filter[0]);
317     }
318     KDataBufferWhack(&buffer);
319     return 0;
320 }
321 
UnitTest_1Read_Pass(void)322 static int UnitTest_1Read_Pass(void)
323 {
324     syn_qual_params p; p.good = 30; p.bad = 3;
325     KDataBuffer buffer; memset(&buffer, 0, sizeof(buffer));
326     INSDC_coord_len length[] = { 20, 40 };
327     INSDC_SRA_spot_filter filter[] = { SRA_SPOT_FILTER_PASS, SRA_SPOT_FILTER_REJECT };
328     {
329         unsigned i;
330         rc_t rc = syn_quality_impl(&p, 1, length, 1, filter, &buffer);
331         ASSERT(rc == 0);
332         ASSERT(buffer.elem_bits == 8);
333         ASSERT(buffer.elem_count == length[0]);
334         for (i = 0; i < buffer.elem_count; ++i) {
335             INSDC_quality_phred const qv = ((INSDC_quality_phred *)buffer.base)[i];
336             ASSERT(qv == p.good);
337         }
338         (void)(length[0]);
339         (void)(filter[0]);
340     }
341     KDataBufferWhack(&buffer);
342     return 0;
343 }
344 
UnitTest_1Read_Fail(void)345 static int UnitTest_1Read_Fail(void)
346 {
347     syn_qual_params p; p.good = 30; p.bad = 3;
348     KDataBuffer buffer; memset(&buffer, 0, sizeof(buffer));
349     INSDC_coord_len length[] = { 20, 40 };
350     INSDC_SRA_spot_filter filter[] = { SRA_SPOT_FILTER_PASS, SRA_SPOT_FILTER_REJECT };
351     {
352         unsigned i;
353         rc_t rc = syn_quality_impl(&p, 1, length, 1, filter + 1, &buffer);
354         ASSERT(rc == 0);
355         ASSERT(buffer.elem_bits == 8);
356         ASSERT(buffer.elem_count == length[0]);
357         for (i = 0; i < buffer.elem_count; ++i) {
358             INSDC_quality_phred const qv = ((INSDC_quality_phred *)buffer.base)[i];
359             ASSERT(qv == p.bad);
360         }
361         (void)(length[0]);
362         (void)(filter[0]);
363     }
364     KDataBufferWhack(&buffer);
365     return 0;
366 }
367 
UnitTest_2Read_Pass(void)368 static int UnitTest_2Read_Pass(void)
369 {
370     syn_qual_params p; p.good = 30; p.bad = 3;
371     KDataBuffer buffer; memset(&buffer, 0, sizeof(buffer));
372     INSDC_coord_len length[] = { 20, 40 };
373     INSDC_SRA_spot_filter filter[] = { SRA_SPOT_FILTER_PASS, SRA_SPOT_FILTER_REJECT };
374     {
375         unsigned i;
376         rc_t rc = syn_quality_impl(&p, 2, length, 1, filter, &buffer);
377         ASSERT(rc == 0);
378         ASSERT(buffer.elem_bits == 8);
379         ASSERT(buffer.elem_count == length[0] + length[1]);
380         for (i = 0; i < buffer.elem_count; ++i) {
381             INSDC_quality_phred const qv = ((INSDC_quality_phred *)buffer.base)[i];
382             ASSERT(qv == p.good);
383         }
384         (void)(length[0]);
385         (void)(filter[0]);
386     }
387     KDataBufferWhack(&buffer);
388     return 0;
389 }
390 
UnitTest_2Read_Fail(void)391 static int UnitTest_2Read_Fail(void)
392 {
393     syn_qual_params p; p.good = 30; p.bad = 3;
394     KDataBuffer buffer; memset(&buffer, 0, sizeof(buffer));
395     INSDC_coord_len length[] = { 20, 40 };
396     INSDC_SRA_spot_filter filter[] = { SRA_SPOT_FILTER_PASS, SRA_SPOT_FILTER_REJECT };
397     {
398         unsigned i;
399         rc_t rc = syn_quality_impl(&p, 2, length, 1, filter + 1, &buffer);
400         ASSERT(rc == 0);
401         ASSERT(buffer.elem_bits == 8);
402         ASSERT(buffer.elem_count == length[0] + length[1]);
403         for (i = 0; i < buffer.elem_count; ++i) {
404             INSDC_quality_phred const qv = ((INSDC_quality_phred *)buffer.base)[i];
405             ASSERT(qv == p.bad);
406         }
407         (void)(length[0]);
408         (void)(filter[0]);
409     }
410     KDataBufferWhack(&buffer);
411     return 0;
412 }
413 #endif
414