1 /*===========================================================================
2 *
3 *                            PUBLIC DOMAIN NOTICE
4 *               National Center for Biotechnology Information
5 *
6 *  This software/database is a "United States Government Work" under the
7 *  terms of the United States Copyright Act.  It was written as part of
8 *  the author's official duties as a United States Government employee and
9 *  thus cannot be copyrighted.  This software/database is freely available
10 *  to the public for use. The National Library of Medicine and the U.S.
11 *  Government have not placed any restriction on its use or reproduction.
12 *
13 *  Although all reasonable efforts have been taken to ensure the accuracy
14 *  and reliability of the software and data, the NLM and the U.S.
15 *  Government do not and cannot warrant the performance or results that
16 *  may be obtained by using this software or data. The NLM and the U.S.
17 *  Government disclaim all warranties, express or implied, including
18 *  warranties of performance, merchantability or fitness for any particular
19 *  purpose.
20 *
21 *  Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26 
27 #include "CSRA1_Read.h"
28 
29 #include "NGS_String.h"
30 #include "NGS_Cursor.h"
31 #include "NGS_Id.h"
32 
33 #include <kfc/ctx.h>
34 #include <kfc/rsrc.h>
35 #include <kfc/except.h>
36 #include <kfc/xc.h>
37 #include <klib/text.h>
38 #include <klib/printf.h>
39 #include <klib/refcount.h>
40 #include <vdb/cursor.h>
41 #include <vdb/schema.h>
42 #include <vdb/vdb-priv.h>
43 #include <insdc/insdc.h>
44 
45 #include <stddef.h>
46 #include <assert.h>
47 
48 #include <sysalloc.h>
49 
50 #ifndef min
51 #   define min(a,b) ( (a) < (b) ? (a) : (b) )
52 #endif
53 
54 static bool CSRA1_ReadIteratorNext ( CSRA1_Read * self, ctx_t ctx );
55 
56 /*--------------------------------------------------------------------------
57  * CSRA1_Read
58  */
59 struct CSRA1_Read
60 {
61     SRA_Read dad;
62 };
63 
64 static bool                CSRA1_FragmentIsAligned ( CSRA1_Read * self, ctx_t ctx );
65 static bool                CSRA1_ReadFragIsAligned ( CSRA1_Read * self, ctx_t ctx, uint32_t frag_idx );
66 
67 static NGS_Read_vt CSRA1_Read_vt_inst =
68 {
69     {
70         {
71             /* NGS_Refcount */
72             SRA_ReadWhack
73         },
74 
75         /* NGS_Fragment */
76         SRA_FragmentGetId,
77         SRA_FragmentGetSequence,
78         SRA_FragmentGetQualities,
79         SRA_FragmentIsPaired,
80         CSRA1_FragmentIsAligned,
81         SRA_FragmentNext
82     },
83 
84     /* NGS_Read */
85     SRA_ReadGetId,
86     SRA_ReadGetName,
87     SRA_ReadGetReadGroup,
88     SRA_ReadGetCategory,
89     SRA_ReadGetSequence,
90     SRA_ReadGetQualities,
91     SRA_ReadNumFragments,
92     CSRA1_ReadFragIsAligned,
93     CSRA1_ReadIteratorNext,
94     SRA_ReadIteratorGetCount,
95 };
96 
97 /* Init
98  */
99 static
CSRA1_ReadInit(ctx_t ctx,SRA_Read * self,const char * clsname,const char * instname,const NGS_String * run_name)100 void CSRA1_ReadInit ( ctx_t ctx, SRA_Read * self, const char *clsname, const char *instname, const NGS_String * run_name )
101 {
102     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcConstructing );
103 
104     if ( self == NULL )
105         INTERNAL_ERROR ( xcParamNull, "bad object reference" );
106     else
107     {
108         TRY ( NGS_ReadInit ( ctx, & self -> dad, & CSRA1_Read_vt_inst, clsname, instname ) )
109         {
110             TRY ( self -> run_name = NGS_StringDuplicate ( run_name, ctx ) )
111             {
112                 self -> wants_full      = true;
113                 self -> wants_partial   = true;
114                 self -> wants_unaligned = true;
115             }
116         }
117     }
118 }
119 
120 /* Release
121  *  release reference
122  */
CSRA1_ReadRelease(const CSRA1_Read * self,ctx_t ctx)123 void CSRA1_ReadRelease ( const CSRA1_Read * self, ctx_t ctx )
124 {
125     if ( self != NULL )
126     {
127         NGS_ReadRelease ( & self -> dad . dad, ctx );
128     }
129 }
130 
131 /*--------------------------------------------------------------------------
132  * NGS_ReadIterator
133  */
134 
135 static
CSRA1_ReadIteratorInit(ctx_t ctx,CSRA1_Read * cself,const char * clsname,const char * instname,const NGS_String * run_name,bool wants_full,bool wants_partial,bool wants_unaligned)136 void CSRA1_ReadIteratorInit ( ctx_t ctx,
137                              CSRA1_Read * cself,
138                              const char *clsname,
139                              const char *instname,
140                              const NGS_String * run_name,
141                              bool wants_full,
142                              bool wants_partial,
143                              bool wants_unaligned )
144 {
145     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcConstructing );
146 
147     if ( cself == NULL )
148         INTERNAL_ERROR ( xcParamNull, "bad object reference" );
149     else
150     {
151         SRA_Read * self = & cself -> dad;
152         TRY ( NGS_ReadIteratorInit ( ctx, & self -> dad, & CSRA1_Read_vt_inst, clsname, instname ) )
153         {
154             TRY ( self -> run_name = NGS_StringDuplicate ( run_name, ctx ) )
155             {
156                 self -> wants_full      = wants_full;
157                 self -> wants_partial   = wants_partial;
158                 self -> wants_unaligned = wants_unaligned;
159             }
160         }
161     }
162 }
163 
164 /* Make
165  */
CSRA1_ReadIteratorMake(ctx_t ctx,const NGS_Cursor * curs,const NGS_String * run_name,bool wants_full,bool wants_partial,bool wants_unaligned)166 NGS_Read * CSRA1_ReadIteratorMake ( ctx_t ctx,
167                                  const NGS_Cursor * curs,
168                                  const NGS_String * run_name,
169                                  bool wants_full,
170                                  bool wants_partial,
171                                  bool wants_unaligned )
172 {
173     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcConstructing );
174 
175     CSRA1_Read * cref;
176     SRA_Read * ref;
177 
178     assert ( curs != NULL );
179 
180     cref = calloc ( 1, sizeof * cref );
181     if ( cref == NULL )
182         SYSTEM_ERROR ( xcNoMemory, "allocating CSRA1_ReadIterator on '%.*s'", NGS_StringSize ( run_name, ctx ), NGS_StringData ( run_name, ctx ) );
183     else
184     {
185 #if _DEBUGGING
186         char instname [ 256 ];
187         string_printf ( instname, sizeof instname, NULL, "%.*s", NGS_StringSize ( run_name, ctx ), NGS_StringData ( run_name, ctx ) );
188         instname [ sizeof instname - 1 ] = 0;
189 #else
190         const char *instname = "";
191 #endif
192         TRY ( CSRA1_ReadIteratorInit ( ctx, cref, "CSRA1_ReadIterator", instname, run_name, wants_full, wants_partial, wants_unaligned ) )
193         {
194             ref = & cref -> dad;
195 
196             ref -> curs = NGS_CursorDuplicate ( curs, ctx );
197             TRY ( NGS_CursorGetRowRange ( ref -> curs, ctx, & ref -> cur_row, & ref -> row_count ) )
198             {
199                 ref -> row_max = ref -> cur_row + ref -> row_count;
200                 return & ref -> dad;
201             }
202             CSRA1_ReadRelease ( cref, ctx );
203             return NULL;
204         }
205 
206         free ( cref );
207     }
208 
209     return NULL;
210 }
211 
212 /* MakeRange
213  */
CSRA1_ReadIteratorMakeRange(ctx_t ctx,const NGS_Cursor * curs,const NGS_String * run_name,uint64_t first,uint64_t count,bool wants_full,bool wants_partial,bool wants_unaligned)214 NGS_Read * CSRA1_ReadIteratorMakeRange ( ctx_t ctx,
215                                       const NGS_Cursor * curs,
216                                       const NGS_String * run_name,
217                                       uint64_t first,
218                                       uint64_t count,
219                                       bool wants_full,
220                                       bool wants_partial,
221                                       bool wants_unaligned )
222 {
223     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcConstructing );
224 
225     CSRA1_Read * cref;
226     SRA_Read * ref;
227 
228     assert ( curs != NULL );
229 
230     cref = calloc ( 1, sizeof * ref );
231     if ( cref == NULL )
232         SYSTEM_ERROR ( xcNoMemory, "allocating CSRA1_ReadIterator on '%.*s'", NGS_StringSize ( run_name, ctx ), NGS_StringData ( run_name, ctx ) );
233     else
234     {
235 #if _DEBUGGING
236         char instname [ 256 ];
237         string_printf ( instname, sizeof instname, NULL, "%.*s", NGS_StringSize ( run_name, ctx ), NGS_StringData ( run_name, ctx ) );
238         instname [ sizeof instname - 1 ] = 0;
239 #else
240         const char *instname = "";
241 #endif
242         TRY ( CSRA1_ReadIteratorInit ( ctx, cref, "CSRA1_ReadIterator", instname, run_name, wants_full, wants_partial, wants_unaligned ) )
243         {
244             ref = & cref -> dad;
245 
246             ref -> curs = NGS_CursorDuplicate ( curs, ctx );
247             TRY ( NGS_CursorGetRowRange ( ref -> curs, ctx, & ref -> cur_row, & ref -> row_count ) )
248             {
249                 ref -> row_max = min ( first + count, ref -> cur_row + ref -> row_count );
250                 ref -> cur_row = first;
251                 return & ref -> dad;
252             }
253             CSRA1_ReadRelease ( cref, ctx );
254             return NULL;
255         }
256 
257         free ( cref );
258     }
259 
260     return NULL;
261 }
262 
263 /* MakeReadGroup
264  */
CSRA1_ReadIteratorMakeReadGroup(ctx_t ctx,const NGS_Cursor * curs,const NGS_String * run_name,const NGS_String * group_name,uint64_t first,uint64_t count,bool wants_full,bool wants_partial,bool wants_unaligned)265 NGS_Read * CSRA1_ReadIteratorMakeReadGroup ( ctx_t ctx,
266                                           const NGS_Cursor * curs,
267                                           const NGS_String * run_name,
268                                           const NGS_String * group_name,
269                                           uint64_t first,
270                                           uint64_t count,
271                                           bool wants_full,
272                                           bool wants_partial,
273                                           bool wants_unaligned )
274 {
275     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcConstructing );
276 
277     TRY ( CSRA1_Read * cref = (CSRA1_Read*) CSRA1_ReadIteratorMakeRange ( ctx,
278                                                                   curs,
279                                                                   run_name,
280                                                                   first,
281                                                                   count,
282                                                                   wants_full,
283                                                                   wants_partial,
284                                                                   wants_unaligned ) )
285     {
286         SRA_Read * ref = & cref -> dad;
287         TRY ( ref -> group_name = NGS_StringDuplicate ( group_name, ctx ) )
288         {
289             return & ref -> dad;
290         }
291 
292         CSRA1_ReadRelease ( cref, ctx );
293     }
294     return NULL;
295 }
296 
CSRA1_ReadMake(ctx_t ctx,const NGS_Cursor * curs,int64_t readId,const struct NGS_String * run_name)297 NGS_Read * CSRA1_ReadMake ( ctx_t ctx, const NGS_Cursor * curs, int64_t readId, const struct NGS_String * run_name )
298 {
299     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcConstructing );
300 
301     CSRA1_Read * cref;
302     SRA_Read * ref;
303 
304     assert ( curs != NULL );
305 
306     cref = calloc ( 1, sizeof * cref );
307     if ( cref == NULL )
308         SYSTEM_ERROR ( xcNoMemory, "allocating SRA_Read(%lu) on '%.*s'", readId, NGS_StringSize ( run_name, ctx ), NGS_StringData ( run_name, ctx ) );
309     else
310     {
311 #if _DEBUGGING
312         char instname [ 256 ];
313         string_printf ( instname, sizeof instname, NULL, "%.*s(%lu)", NGS_StringSize ( run_name, ctx ), NGS_StringData ( run_name, ctx ), readId );
314         instname [ sizeof instname - 1 ] = 0;
315 #else
316         const char *instname = "";
317 #endif
318         ref = & cref -> dad;
319 
320         TRY ( CSRA1_ReadInit ( ctx, ref, "CSRA1_Read", instname, run_name ) )
321         {
322             uint64_t row_count = NGS_CursorGetRowCount ( curs, ctx );
323 
324             /* validate the requested rowId and seek to it */
325             if ( readId <= 0 || (uint64_t)readId > row_count )
326             {
327                 INTERNAL_ERROR ( xcCursorAccessFailed, "rowId ( %li ) out of range for %.*s", readId, NGS_StringSize ( run_name, ctx ), NGS_StringData ( run_name, ctx ) );
328             }
329             else
330             {
331                 ref -> curs = NGS_CursorDuplicate ( curs, ctx );
332                 ref -> cur_row = readId;
333                 TRY ( SRA_ReadIteratorInitFragment ( cref , ctx ) )
334                 {
335                     ref -> row_max = readId + 1;
336                     ref -> row_count = 1;
337                     ref -> seen_first = true;
338                     return & ref -> dad;
339                 }
340             }
341 
342             CSRA1_ReadRelease ( cref, ctx );
343             return NULL;
344         }
345         free ( cref );
346     }
347 
348     return NULL;
349 }
350 
CSRA1_FragmentIsAligned(CSRA1_Read * cself,ctx_t ctx)351 bool CSRA1_FragmentIsAligned ( CSRA1_Read * cself, ctx_t ctx )
352 {
353     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcAccessing );
354     const SRA_Read * self;
355 
356     assert ( cself != NULL );
357 
358     self = & cself -> dad;
359 
360     if ( ! self -> seen_first )
361     {
362         USER_ERROR ( xcIteratorUninitialized, "Read accessed before a call to nextRead()" );
363         return false;
364     }
365 
366     if ( self -> cur_row >= self -> row_max )
367     {
368         USER_ERROR ( xcCursorExhausted, "No more rows available" );
369         return false;
370     }
371 
372     if ( ! self -> seen_first_frag )
373     {
374         USER_ERROR ( xcIteratorUninitialized, "Fragment accessed before a call to nextFragment()" );
375         return false;
376     }
377 
378     if ( self -> frag_idx >= self -> frag_max )
379     {
380         USER_ERROR ( xcCursorExhausted, "No more fragments available" );
381         return false;
382     }
383 
384     {
385         const void * base;
386         uint32_t elem_bits, boff, row_len;
387         ON_FAIL ( NGS_CursorCellDataDirect ( self -> curs, ctx, self -> cur_row, seq_PRIMARY_ALIGNMENT_ID, & elem_bits, & base, & boff, & row_len ) )
388         {
389             CLEAR();
390             return false;
391         }
392 
393         {
394             const int64_t * orig = base;
395             assert(elem_bits == 64);
396             assert(boff == 0);
397 
398             return orig[self -> frag_idx] != 0;
399         }
400     }
401 
402 }
403 
CSRA1_ReadFragIsAligned(CSRA1_Read * cself,ctx_t ctx,uint32_t frag_idx)404 bool CSRA1_ReadFragIsAligned ( CSRA1_Read * cself, ctx_t ctx, uint32_t frag_idx )
405 {
406     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcAccessing );
407     const SRA_Read * self;
408 
409     assert ( cself != NULL );
410 
411     self = & cself -> dad;
412 
413     if ( ! self -> seen_first )
414     {
415         USER_ERROR ( xcIteratorUninitialized, "Read accessed before a call to nextRead()" );
416         return false;
417     }
418 
419     if ( self -> cur_row >= self -> row_max )
420     {
421         USER_ERROR ( xcCursorExhausted, "No more rows available" );
422         return false;
423     }
424 
425     if ( frag_idx >= self -> bio_frags )
426     {
427         USER_ERROR ( xcIntegerOutOfBounds, "bad fragment index" );
428         return false;
429     }
430 
431     {
432         const void * base;
433         uint32_t elem_bits, boff, row_len;
434         TRY ( NGS_CursorCellDataDirect ( self -> curs, ctx, self -> cur_row, seq_PRIMARY_ALIGNMENT_ID, & elem_bits, & base, & boff, & row_len ) )
435         {
436             uint32_t idx, bidx;
437             const int64_t * orig = base;
438             assert ( base != NULL );
439             assert ( elem_bits == 64 );
440             assert ( boff == 0 );
441             assert ( row_len == self -> frag_max );
442 
443             /* technically, we do not expect technical reads (fragments) within CSRA1,
444                but it is correct to check for this possibility
445                same applies to 0-length biological fragments (VDB-3132)
446             */
447             if ( self -> bio_frags == self -> frag_max )
448                 return orig [ frag_idx ] != 0;
449 
450             for ( idx = bidx = 0; idx < row_len; ++ idx )
451             {
452                 if ( ( self -> READ_TYPE [ idx ] & READ_TYPE_BIOLOGICAL ) != 0 && self -> READ_LEN [ idx ] != 0 )
453                 {
454                     if ( bidx == frag_idx )
455                         return orig [ idx ] != 0;
456 
457                     ++ bidx;
458                 }
459             }
460         }
461     }
462 
463     CLEAR();
464     return false;
465 }
466 
467 bool
CSRA1_ReadIteratorNext(CSRA1_Read * cself,ctx_t ctx)468 CSRA1_ReadIteratorNext ( CSRA1_Read * cself, ctx_t ctx )
469 {
470     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcAccessing );
471     SRA_Read * self;
472 
473     assert ( cself != NULL );
474 
475     self = & cself -> dad;
476 
477     if ( self -> wants_full )
478     {
479         return SRA_ReadIteratorNext ( cself, ctx );
480     }
481 
482     /* to iterate over partially aligned and unaligned reads, use column CMP_READ */
483 
484     self -> seen_first_frag = false;
485     self -> seen_last_frag = false;
486 
487     self -> cur_frag = 0;
488     self -> bio_frags = 0;
489     self -> frag_idx = 0;
490     self -> frag_max = 0;
491     self -> frag_start = 0;
492     self -> frag_len = 0;
493 
494     self -> READ_TYPE = NULL;
495     self -> READ_LEN = NULL;
496 
497     if ( self -> seen_first )
498     {   /* move to next row */
499         ++ self -> cur_row;
500     }
501     else
502     {
503         self -> seen_first = true;
504     }
505 
506     while ( self -> cur_row < self -> row_max )
507     {
508         enum NGS_ReadCategory cat;
509         NGS_String * read;
510         bool aligned;
511         ON_FAIL ( read = NGS_CursorGetString ( self -> curs, ctx, self -> cur_row, seq_CMP_READ ) )
512             return false;
513 
514         aligned = NGS_StringSize ( read, ctx ) == 0;
515         NGS_StringRelease( read, ctx );
516         if ( aligned )
517         {   // aligned read - skip
518             ++ self -> cur_row;
519             continue;
520         }
521 
522         /* work the category filter, we know wants_full is false */
523         ON_FAIL ( cat = SRA_ReadGetCategory ( cself, ctx ) )
524             return false;
525 
526         if ( ( cat == NGS_ReadCategory_partiallyAligned && ! self -> wants_partial )
527                 ||
528              ( cat == NGS_ReadCategory_unaligned && ! self -> wants_unaligned ) )
529         {
530             ++ self -> cur_row;
531             continue;
532         }
533 
534         /* work the read group filter if required */
535         if ( self -> group_name != NULL )
536         {
537             uint32_t size;
538 
539             ON_FAIL ( NGS_String* group = NGS_CursorGetString ( self -> curs, ctx, self -> cur_row, seq_GROUP ) )
540                 return false;
541 
542             size = ( uint32_t ) NGS_StringSize ( group, ctx );
543             if ( string_cmp ( NGS_StringData ( self -> group_name, ctx ),
544                               NGS_StringSize ( self -> group_name, ctx ),
545                               NGS_StringData ( group, ctx ),
546                               size,
547                               size ) != 0 )
548             {
549                 NGS_StringRelease ( group, ctx );
550                 ++ self -> cur_row;
551                 continue;
552             }
553             NGS_StringRelease ( group, ctx );
554         }
555 
556         TRY ( SRA_ReadIteratorInitFragment ( cself, ctx ) )
557         {
558             return true;
559         }
560 
561         break;
562     }
563 
564     return false;
565 }
566