1 /*===========================================================================
2 *
3 *                            PUBLIC DOMAIN NOTICE
4 *               National Center for Biotechnology Information
5 *
6 *  This software/database is a "United States Government Work" under the
7 *  terms of the United States Copyright Act.  It was written as part of
8 *  the author's official duties as a United States Government employee and
9 *  thus cannot be copyrighted.  This software/database is freely available
10 *  to the public for use. The National Library of Medicine and the U.S.
11 *  Government have not placed any restriction on its use or reproduction.
12 *
13 *  Although all reasonable efforts have been taken to ensure the accuracy
14 *  and reliability of the software and data, the NLM and the U.S.
15 *  Government do not and cannot warrant the performance or results that
16 *  may be obtained by using this software or data. The NLM and the U.S.
17 *  Government disclaim all warranties, express or implied, including
18 *  warranties of performance, merchantability or fitness for any particular
19 *  purpose.
20 *
21 *  Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26 
27 #include "SRA_Read.h"
28 
29 #include "NGS_String.h"
30 #include "NGS_Cursor.h"
31 #include "NGS_Id.h"
32 
33 #include <kfc/ctx.h>
34 #include <kfc/rsrc.h>
35 #include <kfc/except.h>
36 #include <kfc/xc.h>
37 #include <klib/text.h>
38 #include <klib/printf.h>
39 #include <klib/refcount.h>
40 #include <klib/rc.h>
41 #include <vdb/cursor.h>
42 #include <vdb/schema.h>
43 #include <vdb/vdb-priv.h>
44 #include <insdc/insdc.h>
45 
46 #include <stddef.h>
47 #include <assert.h>
48 
49 #include <sysalloc.h>
50 
51 #ifndef min
52 #   define min(a,b) ( (a) < (b) ? (a) : (b) )
53 #endif
54 
55 /*--------------------------------------------------------------------------
56  * SRA_Read
57  */
58 
59 const char * sequence_col_specs [] =
60 {
61     "(INSDC:dna:text)READ",
62     "READ_TYPE",
63     "(INSDC:quality:phred)QUALITY",
64     "(INSDC:coord:len)READ_LEN",
65     "NAME",
66     "SPOT_GROUP",
67     "PRIMARY_ALIGNMENT_ID",
68     "(U64)SPOT_COUNT",
69     "(INSDC:dna:text)CMP_READ",
70 };
71 
72 static NGS_Read_vt NGS_Read_vt_inst =
73 {
74     {
75         {
76             /* NGS_Refcount */
77             SRA_ReadWhack
78         },
79 
80         /* NGS_Fragment */
81         SRA_FragmentGetId,
82         SRA_FragmentGetSequence,
83         SRA_FragmentGetQualities,
84         SRA_FragmentIsPaired,
85         SRA_FragmentIsAligned,
86         SRA_FragmentNext
87     },
88 
89     /* NGS_Read */
90     SRA_ReadGetId,
91     SRA_ReadGetName,
92     SRA_ReadGetReadGroup,
93     SRA_ReadGetCategory,
94     SRA_ReadGetSequence,
95     SRA_ReadGetQualities,
96     SRA_ReadNumFragments,
97     SRA_ReadFragIsAligned,
98     SRA_ReadIteratorNext,
99     SRA_ReadIteratorGetCount,
100 };
101 
102 /* Init
103  */
104 static
SRA_ReadInit(ctx_t ctx,SRA_Read * self,const char * clsname,const char * instname,const NGS_String * run_name)105 void SRA_ReadInit ( ctx_t ctx, SRA_Read * self, const char *clsname, const char *instname, const NGS_String * run_name )
106 {
107     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcConstructing );
108 
109     if ( self == NULL )
110         INTERNAL_ERROR ( xcParamNull, "bad object reference" );
111     else
112     {
113         TRY ( NGS_ReadInit ( ctx, & self -> dad, & NGS_Read_vt_inst, clsname, instname ) )
114         {
115             TRY ( self -> run_name = NGS_StringDuplicate ( run_name, ctx ) )
116             {
117                 self -> wants_full      = true;
118                 self -> wants_partial   = true;
119                 self -> wants_unaligned = true;
120             }
121         }
122     }
123 }
124 
125 static
SRA_ReadIteratorInit(ctx_t ctx,SRA_Read * self,const char * clsname,const char * instname,const NGS_String * run_name,bool wants_full,bool wants_partial,bool wants_unaligned)126 void SRA_ReadIteratorInit ( ctx_t ctx,
127                             SRA_Read * self,
128                             const char *clsname,
129                             const char *instname,
130                             const NGS_String * run_name,
131                             bool wants_full,
132                             bool wants_partial,
133                             bool wants_unaligned )
134 {
135     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcConstructing );
136 
137     if ( self == NULL )
138         INTERNAL_ERROR ( xcParamNull, "bad object reference" );
139     else
140     {
141         TRY ( NGS_ReadIteratorInit ( ctx, & self -> dad, & NGS_Read_vt_inst, clsname, instname ) )
142         {
143             TRY ( self -> run_name = NGS_StringDuplicate ( run_name, ctx ) )
144             {
145                 self -> wants_full      = wants_full;
146                 self -> wants_partial   = wants_partial;
147                 self -> wants_unaligned = wants_unaligned;
148             }
149         }
150     }
151 }
152 
153 /* Whack
154  */
SRA_ReadWhack(SRA_Read * self,ctx_t ctx)155 void SRA_ReadWhack ( SRA_Read * self, ctx_t ctx )
156 {
157     NGS_CursorRelease ( self -> curs, ctx );
158 
159     NGS_StringRelease ( self -> group_name, ctx );
160     NGS_StringRelease ( self -> run_name, ctx );
161 }
162 
163 /* Release
164  *  release reference
165  */
SRA_ReadRelease(const SRA_Read * self,ctx_t ctx)166 void SRA_ReadRelease ( const SRA_Read * self, ctx_t ctx )
167 {
168     if ( self != NULL )
169     {
170         NGS_ReadRelease ( & self -> dad, ctx );
171     }
172 }
173 
174 /* Duplicate
175  *  duplicate reference
176  */
SRA_ReadDuplicate(const SRA_Read * self,ctx_t ctx)177 SRA_Read * SRA_ReadDuplicate ( const SRA_Read * self, ctx_t ctx )
178 {
179     return NGS_RefcountDuplicate ( NGS_ReadToRefcount ( & self -> dad ), ctx );
180 }
181 
SRA_ReadIteratorInitFragment(SRA_Read * self,ctx_t ctx)182 void SRA_ReadIteratorInitFragment ( SRA_Read * self, ctx_t ctx )
183 {
184     const void * base;
185     uint32_t elem_bits, boff, row_len;
186 
187     /* read from READ_TYPE must succeed */
188     TRY ( NGS_CursorCellDataDirect ( self -> curs, ctx, self -> cur_row, seq_READ_TYPE, & elem_bits, & base, & boff, & row_len ) )
189     {
190         assert ( elem_bits == 8 );
191         assert ( boff == 0 );
192         self -> READ_TYPE = base;
193 
194         TRY ( NGS_CursorCellDataDirect ( self -> curs, ctx, self -> cur_row, seq_READ_LEN, & elem_bits, & base, & boff, & row_len ) )
195         {
196             uint32_t i;
197 
198             assert ( elem_bits == 32 );
199             assert ( boff == 0 );
200             self -> READ_LEN = base;
201             self -> frag_max = row_len;
202 
203             /* naked hackery to quickly scan types */
204             assert ( READ_TYPE_TECHNICAL == 0 );
205             assert ( READ_TYPE_BIOLOGICAL == 1 );
206 
207             /* NB - should also be taking READ_FILTER into account */
208             for ( i = 0; i < row_len; ++ i )
209             {
210                 if ( self -> READ_LEN [ i ] != 0 )
211                 {
212                     self -> bio_frags += self -> READ_TYPE [ i ] & READ_TYPE_BIOLOGICAL;
213                 }
214             }
215         }
216     }
217 }
218 
SRA_ReadMake(ctx_t ctx,const NGS_Cursor * curs,int64_t readId,const struct NGS_String * run_name)219 NGS_Read * SRA_ReadMake ( ctx_t ctx, const NGS_Cursor * curs, int64_t readId, const struct NGS_String * run_name )
220 {
221     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcConstructing );
222 
223     SRA_Read * ref;
224 
225     assert ( curs != NULL );
226 
227     ref = calloc ( 1, sizeof * ref );
228     if ( ref == NULL )
229         SYSTEM_ERROR ( xcNoMemory, "allocating SRA_Read(%lu) on '%.*s'", readId, NGS_StringSize ( run_name, ctx ), NGS_StringData ( run_name, ctx ) );
230     else
231     {
232 #if _DEBUGGING
233         char instname [ 256 ];
234         string_printf ( instname, sizeof instname, NULL, "%.*s(%lu)", NGS_StringSize ( run_name, ctx ), NGS_StringData ( run_name, ctx ), readId );
235         instname [ sizeof instname - 1 ] = 0;
236 #else
237         const char *instname = "";
238 #endif
239         TRY ( SRA_ReadInit ( ctx, ref, "SRA_Read", instname, run_name ) )
240         {
241             uint64_t row_count = NGS_CursorGetRowCount ( curs, ctx );
242 
243             /* validate the requested rowId and seek to it */
244             if ( readId <= 0 || (uint64_t)readId > row_count )
245             {
246                 INTERNAL_ERROR ( xcCursorAccessFailed, "rowId ( %li ) out of range for %.*s", readId, NGS_StringSize ( run_name, ctx ), NGS_StringData ( run_name, ctx ) );
247             }
248             else
249             {
250                 ref -> curs = NGS_CursorDuplicate ( curs, ctx );
251                 ref -> cur_row = readId;
252                 TRY ( SRA_ReadIteratorInitFragment ( ref, ctx ) )
253                 {
254                     ref -> row_max = readId + 1;
255                     ref -> row_count = 1;
256                     ref -> seen_first = true;
257                     return & ref -> dad;
258                 }
259             }
260 
261             SRA_ReadRelease ( ref, ctx );
262             return NULL;
263         }
264         free ( ref );
265     }
266 
267     return NULL;
268 }
269 
270 /* GetReadId
271  */
SRA_ReadGetId(SRA_Read * self,ctx_t ctx)272 NGS_String * SRA_ReadGetId ( SRA_Read * self, ctx_t ctx )
273 {
274     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcAccessing );
275 
276     assert ( self != NULL );
277 
278     if ( ! self -> seen_first )
279     {
280         USER_ERROR ( xcIteratorUninitialized, "Read accessed before a call to ReadIteratorNext()" );
281         return NULL;
282     }
283 
284     /* if current row is valid, read data */
285     if ( self -> cur_row < self -> row_max )
286         return NGS_IdMake ( ctx, self -> run_name, NGSObject_Read, self -> cur_row );
287 
288     USER_ERROR ( xcCursorExhausted, "No more rows available" );
289     return NULL;
290 }
291 
292 /* GetReadName
293  */
SRA_ReadGetName(SRA_Read * self,ctx_t ctx)294 NGS_String * SRA_ReadGetName ( SRA_Read * self, ctx_t ctx )
295 {
296     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcReading );
297 
298     assert ( self != NULL );
299 
300     if ( ! self -> seen_first )
301     {
302         USER_ERROR ( xcIteratorUninitialized, "Read accessed before a call to ReadIteratorNext()" );
303         return NULL;
304     }
305     else
306     {
307         NGS_String * ret;
308         ON_FAIL ( ret = NGS_CursorGetString( self -> curs, ctx, self -> cur_row, seq_NAME ) )
309         {
310             if ( GetRCObject ( ctx -> rc ) == (int)rcColumn && GetRCState ( ctx -> rc ) == rcNotFound )
311             {   /* no NAME column; synthesize a read name based on run_name and row_id */
312                 CLEAR ();
313                 ret = NGS_IdMake ( ctx, self -> run_name, NGSObject_Read, self -> cur_row );
314             }
315         }
316 
317         return ret;
318     }
319 }
320 
321 /* GetReadGroup
322  */
SRA_ReadGetReadGroup(SRA_Read * self,ctx_t ctx)323 NGS_String * SRA_ReadGetReadGroup ( SRA_Read * self, ctx_t ctx )
324 {
325     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcReading );
326 
327     assert ( self != NULL );
328 
329     if ( ! self -> seen_first )
330     {
331         USER_ERROR ( xcIteratorUninitialized, "Read accessed before a call to ReadIteratorNext()" );
332         return NULL;
333     }
334 
335     return NGS_CursorGetString ( self -> curs, ctx, self -> cur_row, seq_GROUP );
336 }
337 
SRA_ReadGetCategory(const SRA_Read * self,ctx_t ctx)338 enum NGS_ReadCategory SRA_ReadGetCategory ( const SRA_Read * self, ctx_t ctx )
339 {
340     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcAccessing );
341 
342     assert ( self != NULL );
343 
344     if ( ! self -> seen_first )
345     {
346         USER_ERROR ( xcIteratorUninitialized, "Read accessed before a call to ReadIteratorNext()" );
347         return NGS_ReadCategory_unaligned;
348     }
349 
350     if ( self -> cur_row < self -> row_max )
351     {
352         const void * base;
353         uint32_t elem_bits, boff, row_len;
354         ON_FAIL ( NGS_CursorCellDataDirect ( self -> curs, ctx, self -> cur_row, seq_PRIMARY_ALIGNMENT_ID, & elem_bits, & base, & boff, & row_len ) )
355         {
356             CLEAR();
357             return NGS_ReadCategory_unaligned;
358         }
359 
360         {
361             uint32_t i;
362             bool seen_aligned = false;
363             bool seen_unaligned = false;
364             const int64_t * orig = base;
365             assert(elem_bits == 64);
366             for ( i = 0; i < row_len; ++ i )
367             {
368                 if (orig[i] == 0)
369                 {
370                     if (!seen_unaligned)
371                         seen_unaligned = true;
372                 }
373                 else
374                 {
375                     if (!seen_aligned)
376                         seen_aligned = true;
377                 }
378             }
379             if (seen_aligned)
380             {
381                 return seen_unaligned ? NGS_ReadCategory_partiallyAligned : NGS_ReadCategory_fullyAligned;
382             }
383             else
384             {   /* no aligned fragments */
385                 return NGS_ReadCategory_unaligned;
386             }
387         }
388     }
389     USER_ERROR ( xcCursorExhausted, "No more rows available" );
390     return NGS_ReadCategory_unaligned;
391 }
392 
393 /* GetReadSequence
394  */
SRA_ReadGetSequence(SRA_Read * self,ctx_t ctx,uint64_t offset,uint64_t size)395 NGS_String * SRA_ReadGetSequence ( SRA_Read * self, ctx_t ctx, uint64_t offset, uint64_t size )
396 {
397     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcReading );
398     NGS_String * seq;
399 
400     assert ( self != NULL );
401 
402     if ( ! self -> seen_first )
403     {
404         USER_ERROR ( xcIteratorUninitialized, "Read accessed before a call to ReadIteratorNext()" );
405         return NULL;
406     }
407 
408     TRY ( seq = NGS_CursorGetString ( self -> curs, ctx, self -> cur_row, seq_READ ) )
409     {
410         NGS_String * sub;
411         TRY ( sub = NGS_StringSubstrOffsetSize ( seq, ctx, offset, size ) )
412         {
413             NGS_StringRelease ( seq, ctx );
414             seq = sub;
415         }
416     }
417 
418     return seq;
419 }
420 
421 /* GetReadQualities
422  * GetReadSubQualities
423  */
424 static
GetReadQualities(SRA_Read * self,ctx_t ctx)425 NGS_String * GetReadQualities ( SRA_Read * self, ctx_t ctx )
426 {
427     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcReading );
428 
429     /* if current row is valid, read data */
430     if ( self -> cur_row < self -> row_max )
431     {
432         const void * base;
433         uint32_t elem_bits, boff, row_len;
434         TRY ( NGS_CursorCellDataDirect ( self -> curs, ctx, self -> cur_row, seq_QUALITY, & elem_bits, & base, & boff, & row_len ) )
435         {
436             NGS_String * new_data = NULL;
437 
438             assert ( elem_bits == 8 );
439             assert ( boff == 0 );
440 
441             {   /* convert to ascii-33 */
442                 char * copy = malloc ( row_len + 1 );
443                 if ( copy == NULL )
444                     SYSTEM_ERROR ( xcNoMemory, "allocating %u bytes for QUALITY row %ld", row_len + 1, self -> cur_row );
445                 else
446                 {
447                     uint32_t i;
448                     const uint8_t * orig = base;
449                     for ( i = 0; i < row_len; ++ i )
450                         copy [ i ] = ( char ) ( orig [ i ] + 33 );
451                     copy [ i ] = 0;
452 
453                     new_data = NGS_StringMakeOwned ( ctx, copy, row_len );
454                     if ( FAILED () )
455                         free ( copy );
456                 }
457             }
458 
459             if ( ! FAILED () )
460             {
461                 return new_data;
462             }
463         }
464     }
465 
466     return NULL;
467 }
468 
SRA_ReadGetQualities(SRA_Read * self,ctx_t ctx,uint64_t offset,uint64_t size)469 NGS_String * SRA_ReadGetQualities ( SRA_Read * self, ctx_t ctx, uint64_t offset, uint64_t size )
470 {
471     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcReading );
472     NGS_String * qual;
473 
474     assert ( self != NULL );
475 
476     if ( ! self -> seen_first )
477     {
478         USER_ERROR ( xcIteratorUninitialized, "Read accessed before a call to ReadIteratorNext()" );
479         return NULL;
480     }
481 
482     TRY ( qual = GetReadQualities ( self, ctx ) )
483     {
484         NGS_String * sub;
485         TRY ( sub = NGS_StringSubstrOffsetSize ( qual, ctx, offset, size ) )
486         {
487             NGS_StringRelease ( qual, ctx );
488             qual = sub;
489         }
490     }
491 
492     return qual;
493 }
494 
495 /* NumFragments
496  */
SRA_ReadNumFragments(SRA_Read * self,ctx_t ctx)497 uint32_t SRA_ReadNumFragments ( SRA_Read * self, ctx_t ctx )
498 {
499     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcReading );
500 
501     assert ( self != NULL );
502 
503     if ( ! self -> seen_first )
504     {
505         USER_ERROR ( xcIteratorUninitialized, "Read accessed before a call to ReadIteratorNext()" );
506         return 0;
507     }
508 
509     if ( self -> cur_row >= self -> row_max )
510     {
511         USER_ERROR ( xcCursorExhausted, "No more rows available" );
512         return false;
513     }
514 
515     return self -> bio_frags;
516 }
517 
518 /* FragIsAligned
519  */
SRA_ReadFragIsAligned(SRA_Read * self,ctx_t ctx,uint32_t frag_idx)520 bool SRA_ReadFragIsAligned ( SRA_Read * self, ctx_t ctx, uint32_t frag_idx )
521 {
522     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcReading );
523 
524     assert ( self != NULL );
525 
526     if ( ! self -> seen_first )
527     {
528         USER_ERROR ( xcIteratorUninitialized, "Read accessed before a call to ReadIteratorNext()" );
529         return 0;
530     }
531 
532     if ( self -> cur_row >= self -> row_max )
533     {
534         USER_ERROR ( xcCursorExhausted, "No more rows available" );
535         return false;
536     }
537 
538     if ( frag_idx >= self -> bio_frags )
539     {
540         USER_ERROR ( xcIntegerOutOfBounds, "bad fragment index" );
541         return false;
542     }
543 
544     return false;
545 }
546 
547 /*--------------------------------------------------------------------------
548  * NGS_ReadIterator
549  */
550 
551 /* Make
552  */
SRA_ReadIteratorMake(ctx_t ctx,const NGS_Cursor * curs,const NGS_String * run_name,bool wants_full,bool wants_partial,bool wants_unaligned)553 NGS_Read * SRA_ReadIteratorMake ( ctx_t ctx,
554                                         const NGS_Cursor * curs,
555                                         const NGS_String * run_name,
556                                         bool wants_full,
557                                         bool wants_partial,
558                                         bool wants_unaligned )
559 {
560     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcConstructing );
561 
562     SRA_Read * ref;
563 
564     assert ( curs != NULL );
565 
566     ref = calloc ( 1, sizeof * ref );
567     if ( ref == NULL )
568         SYSTEM_ERROR ( xcNoMemory, "allocating NGS_ReadIterator on '%.*s'", NGS_StringSize ( run_name, ctx ), NGS_StringData ( run_name, ctx ) );
569     else
570     {
571 #if _DEBUGGING
572         char instname [ 256 ];
573         string_printf ( instname, sizeof instname, NULL, "%.*s", NGS_StringSize ( run_name, ctx ), NGS_StringData ( run_name, ctx ) );
574         instname [ sizeof instname - 1 ] = 0;
575 #else
576         const char *instname = "";
577 #endif
578         TRY ( SRA_ReadIteratorInit ( ctx, ref, "NGS_ReadIterator", instname, run_name, wants_full, wants_partial, wants_unaligned ) )
579         {
580             ref -> curs = NGS_CursorDuplicate ( curs, ctx );
581             TRY ( NGS_CursorGetRowRange ( ref -> curs, ctx, & ref -> cur_row, & ref -> row_count ) )
582             {
583                 ref -> row_max = ref -> cur_row + ref -> row_count;
584                 return & ref -> dad;
585             }
586             SRA_ReadRelease ( ref, ctx );
587             return NULL;
588         }
589 
590         free ( ref );
591     }
592 
593     return NULL;
594 }
595 
596 /* MakeRange
597  */
SRA_ReadIteratorMakeRange(ctx_t ctx,const NGS_Cursor * curs,const NGS_String * run_name,uint64_t first,uint64_t count,bool wants_full,bool wants_partial,bool wants_unaligned)598 NGS_Read * SRA_ReadIteratorMakeRange ( ctx_t ctx,
599                                        const NGS_Cursor * curs,
600                                        const NGS_String * run_name,
601                                        uint64_t first,
602                                        uint64_t count,
603                                        bool wants_full,
604                                        bool wants_partial,
605                                        bool wants_unaligned )
606 {
607     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcConstructing );
608 
609     SRA_Read * ref;
610 
611     assert ( curs != NULL );
612 
613     ref = calloc ( 1, sizeof * ref );
614     if ( ref == NULL )
615         SYSTEM_ERROR ( xcNoMemory, "allocating NGS_ReadIterator on '%.*s'", NGS_StringSize ( run_name, ctx ), NGS_StringData ( run_name, ctx ) );
616     else
617     {
618 #if _DEBUGGING
619         char instname [ 256 ];
620         string_printf ( instname, sizeof instname, NULL, "%.*s", NGS_StringSize ( run_name, ctx ), NGS_StringData ( run_name, ctx ) );
621         instname [ sizeof instname - 1 ] = 0;
622 #else
623         const char *instname = "";
624 #endif
625         TRY ( SRA_ReadIteratorInit ( ctx, ref, "NGS_ReadIterator", instname, run_name, wants_full, wants_partial, wants_unaligned ) )
626         {
627             ref -> curs = NGS_CursorDuplicate ( curs, ctx );
628             TRY ( NGS_CursorGetRowRange ( ref -> curs, ctx, & ref -> cur_row, & ref -> row_count ) )
629             {
630                 ref -> row_max = min ( first + count, ref -> cur_row + ref -> row_count );
631                 ref -> cur_row = first;
632                 return & ref -> dad;
633             }
634             SRA_ReadRelease ( ref, ctx );
635             return NULL;
636         }
637 
638         free ( ref );
639     }
640 
641     return NULL;
642 }
643 
644 /* MakeReadGroup
645  */
SRA_ReadIteratorMakeReadGroup(ctx_t ctx,const NGS_Cursor * curs,const NGS_String * run_name,const NGS_String * group_name,uint64_t first,uint64_t count,bool wants_full,bool wants_partial,bool wants_unaligned)646 NGS_Read * SRA_ReadIteratorMakeReadGroup ( ctx_t ctx,
647                                            const NGS_Cursor * curs,
648                                            const NGS_String * run_name,
649                                            const NGS_String * group_name,
650                                            uint64_t first,
651                                            uint64_t count,
652                                            bool wants_full,
653                                            bool wants_partial,
654                                            bool wants_unaligned )
655 {
656     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcConstructing );
657 
658     TRY ( SRA_Read * ref = (SRA_Read*) SRA_ReadIteratorMakeRange ( ctx,
659                                                                    curs,
660                                                                    run_name,
661                                                                    first,
662                                                                    count,
663                                                                    wants_full,
664                                                                    wants_partial,
665                                                                    wants_unaligned ) )
666     {
667         TRY ( ref -> group_name = NGS_StringDuplicate ( group_name, ctx ) )
668         {
669             return & ref -> dad;
670         }
671         SRA_ReadWhack ( ref, ctx );
672     }
673     return NULL;
674 }
675 
676 /* Next
677  */
SRA_ReadIteratorNext(SRA_Read * self,ctx_t ctx)678 bool SRA_ReadIteratorNext ( SRA_Read * self, ctx_t ctx )
679 {
680     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcConstructing );
681 
682     assert ( self != NULL );
683 
684     /* reset fragment */
685     self -> seen_first_frag = false;
686     self -> seen_last_frag = false;
687 
688     self -> cur_frag = 0;
689     self -> bio_frags = 0;
690     self -> frag_idx = 0;
691     self -> frag_max = 0;
692     self -> frag_start = 0;
693     self -> frag_len = 0;
694 
695     self -> READ_TYPE = NULL;
696     self -> READ_LEN = NULL;
697 
698     if ( self -> seen_first )
699     {   /* move to next row */
700         ++ self -> cur_row;
701     }
702     else
703     {
704         self -> seen_first = true;
705     }
706 
707     while ( self -> cur_row < self -> row_max )
708     {
709         /* work the category filter */
710         if ( ! self -> wants_full ||
711              ! self -> wants_partial ||
712              ! self -> wants_unaligned )
713         {
714             ON_FAIL ( enum NGS_ReadCategory cat = SRA_ReadGetCategory ( self, ctx ) )
715                 return false;
716 
717             if ( ( cat == NGS_ReadCategory_fullyAligned && ! self -> wants_full )
718                  ||
719                  ( cat == NGS_ReadCategory_partiallyAligned && ! self -> wants_partial )
720                  ||
721                  ( cat == NGS_ReadCategory_unaligned && ! self -> wants_unaligned ) )
722             {
723                 ++ self -> cur_row;
724                 continue;
725             }
726         }
727 
728         /* work the read group filter if required */
729         if ( self -> group_name != NULL )
730         {
731             uint32_t size;
732 
733             ON_FAIL ( NGS_String* group = NGS_CursorGetString ( self -> curs, ctx, self -> cur_row, seq_GROUP ) )
734                 return false;
735 
736             size = ( uint32_t ) NGS_StringSize ( group, ctx );
737             if ( string_cmp ( NGS_StringData ( self -> group_name, ctx ),
738                               NGS_StringSize ( self -> group_name, ctx ),
739                               NGS_StringData ( group, ctx ),
740                               size,
741                               size ) != 0 )
742             {
743                 NGS_StringRelease ( group, ctx );
744                 ++ self -> cur_row;
745                 continue;
746             }
747             NGS_StringRelease ( group, ctx );
748         }
749 
750         TRY ( SRA_ReadIteratorInitFragment ( self, ctx ) )
751         {
752             return true;
753         }
754 
755         break;
756     }
757 
758     return false;
759 }
760 
761 /* GetCount
762  *  TEMPORARY
763  */
SRA_ReadIteratorGetCount(const SRA_Read * self,ctx_t ctx)764 uint64_t SRA_ReadIteratorGetCount ( const SRA_Read * self, ctx_t ctx )
765 {
766     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcConstructing );
767 
768     assert ( self != NULL );
769     return self -> row_count;
770 }
771 
SRA_FragmentGetId(SRA_Read * self,ctx_t ctx)772 NGS_String * SRA_FragmentGetId ( SRA_Read * self, ctx_t ctx )
773 {
774     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcConstructing );
775 
776     assert ( self != NULL );
777     if ( ! self -> seen_first_frag )
778     {
779         USER_ERROR ( xcIteratorUninitialized, "Fragment accessed before a call to FragmentIteratorNext()" );
780         return NULL;
781     }
782     if ( self -> seen_last_frag )
783     {
784         USER_ERROR ( xcCursorExhausted, "No more rows available" );
785         return NULL;
786     }
787 
788     return NGS_IdMakeFragment ( ctx, self -> run_name, false, self -> cur_row, self -> cur_frag );
789 }
790 
791 static
GetFragmentString(const SRA_Read * self,ctx_t ctx,NGS_String * str)792 NGS_String * GetFragmentString ( const SRA_Read * self, ctx_t ctx, NGS_String * str )
793 {
794     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcReading );
795 
796     assert ( self != NULL );
797     if ( ! self -> seen_first_frag )
798     {
799         USER_ERROR ( xcIteratorUninitialized, "Fragment accessed before a call to FragmentIteratorNext()" );
800         return NULL;
801     }
802     if ( self -> seen_last_frag )
803     {
804         USER_ERROR ( xcCursorExhausted, "No more rows available" );
805         return NULL;
806     }
807 
808     if ( self -> cur_row < self -> row_max )
809     {
810         TRY ( NGS_String * frag = NGS_StringSubstrOffsetSize ( str, ctx, self -> frag_start, self -> frag_len ) )
811         {
812             return frag;
813         }
814     }
815 
816     return NULL;
817 }
818 
SRA_FragmentGetSequence(SRA_Read * self,ctx_t ctx,uint64_t offset,uint64_t length)819 struct NGS_String * SRA_FragmentGetSequence ( SRA_Read * self, ctx_t ctx, uint64_t offset, uint64_t length )
820 {
821     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcConstructing );
822     NGS_String * ret = NULL;
823 
824     assert ( self != NULL );
825     if ( ! self -> seen_first_frag )
826     {
827         USER_ERROR ( xcIteratorUninitialized, "Fragment accessed before a call to FragmentIteratorNext()" );
828         return NULL;
829     }
830     if ( self -> seen_last_frag )
831     {
832         USER_ERROR ( xcCursorExhausted, "No more rows available" );
833         return NULL;
834     }
835 
836     {
837         TRY ( NGS_String * read = NGS_CursorGetString ( self -> curs, ctx, self -> cur_row, seq_READ ) )
838         {
839             TRY ( NGS_String * seq = GetFragmentString ( self, ctx, read ) )
840             {
841                 ret = NGS_StringSubstrOffsetSize ( seq, ctx, offset, length );
842                 NGS_StringRelease ( seq, ctx );
843             }
844             NGS_StringRelease ( read, ctx );
845         }
846     }
847     return ret;
848 }
849 
SRA_FragmentGetQualities(SRA_Read * self,ctx_t ctx,uint64_t offset,uint64_t length)850 struct NGS_String * SRA_FragmentGetQualities ( SRA_Read * self, ctx_t ctx, uint64_t offset, uint64_t length )
851 {
852     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcConstructing );
853     NGS_String * ret = NULL;
854 
855     assert ( self != NULL );
856     if ( ! self -> seen_first_frag )
857     {
858         USER_ERROR ( xcIteratorUninitialized, "Fragment accessed before a call to FragmentIteratorNext()" );
859         return NULL;
860     }
861     if ( self -> seen_last_frag )
862     {
863         USER_ERROR ( xcCursorExhausted, "No more rows available" );
864         return NULL;
865     }
866 
867     {
868         TRY ( NGS_String * readQual = GetReadQualities ( self, ctx ) )
869         {
870             TRY ( NGS_String * fragQual = GetFragmentString ( self, ctx, readQual ) )
871             {
872                 ret = NGS_StringSubstrOffsetSize ( fragQual, ctx, offset, length );
873                 NGS_StringRelease ( fragQual, ctx );
874             }
875             NGS_StringRelease ( readQual, ctx );
876         }
877     }
878     return ret;
879 }
880 
SRA_FragmentIsPaired(SRA_Read * self,ctx_t ctx)881 bool SRA_FragmentIsPaired ( SRA_Read * self, ctx_t ctx )
882 {
883     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcAccessing );
884 
885     assert ( self != NULL );
886     if ( ! self -> seen_first_frag )
887     {
888         USER_ERROR ( xcIteratorUninitialized, "Fragment accessed before a call to FragmentIteratorNext()" );
889         return false;
890     }
891     if ( self -> seen_last_frag )
892     {
893         USER_ERROR ( xcCursorExhausted, "No more rows available" );
894         return false;
895     }
896 
897     return self -> bio_frags > 1;
898 }
899 
SRA_FragmentIsAligned(SRA_Read * self,ctx_t ctx)900 bool SRA_FragmentIsAligned ( SRA_Read * self, ctx_t ctx )
901 {
902     assert ( self != NULL );
903     return false;
904 }
905 
SRA_FragmentNext(SRA_Read * self,ctx_t ctx)906 bool SRA_FragmentNext ( SRA_Read * self, ctx_t ctx )
907 {
908     FUNC_ENTRY ( ctx, rcSRA, rcCursor, rcConstructing );
909 
910     assert ( self != NULL );
911 
912     if ( ! self -> seen_first )
913     {
914         USER_ERROR ( xcIteratorUninitialized, "Read accessed before a call to ReadIteratorNext()" );
915         return false;
916     }
917 
918     if ( self -> seen_first_frag )
919     {   /* move to next fragment */
920         ++ self -> cur_frag;
921         ++ self -> frag_idx;
922     }
923 
924     /* advance to next non-empty biological fragment */
925     for ( self -> seen_first_frag = true; self -> frag_idx < self -> frag_max; ++ self -> frag_idx )
926     {
927         if ( self -> READ_LEN [ self -> frag_idx ] != 0 )
928         {
929             /* get coordinates */
930             self -> frag_start += self -> frag_len;
931             self -> frag_len = self -> READ_LEN [ self -> frag_idx ];
932 
933             /* test for biological fragment */
934             assert ( READ_TYPE_TECHNICAL == 0 );
935             assert ( READ_TYPE_BIOLOGICAL == 1 );
936             if ( ( self -> READ_TYPE [ self -> frag_idx ] & READ_TYPE_BIOLOGICAL ) != 0 )
937                 return true;
938         }
939     }
940 
941     self -> seen_last_frag = true;
942     return false;
943 }
944 
945