1 /*===========================================================================
2 *
3 *                            PUBLIC DOMAIN NOTICE
4 *               National Center for Biotechnology Information
5 *
6 *  This software/database is a "United States Government Work" under the
7 *  terms of the United States Copyright Act.  It was written as part of
8 *  the author's official duties as a United States Government employee and
9 *  thus cannot be copyrighted.  This software/database is freely available
10 *  to the public for use. The National Library of Medicine and the U.S.
11 *  Government have not placed any restriction on its use or reproduction.
12 *
13 *  Although all reasonable efforts have been taken to ensure the accuracy
14 *  and reliability of the software and data, the NLM and the U.S.
15 *  Government do not and cannot warrant the performance or results that
16 *  may be obtained by using this software or data. The NLM and the U.S.
17 *  Government disclaim all warranties, express or implied, including
18 *  warranties of performance, merchantability or fitness for any particular
19 *  purpose.
20 *
21 *  Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26 #include <klib/log.h>
27 #include <klib/out.h>
28 #include <klib/container.h>
29 #include <klib/text.h>
30 #include <kapp/main.h>
31 
32 #include <sra/sradb.h>
33 #include <sra/types.h>
34 #include <sra/fastq.h>
35 
36 #include <os-native.h>
37 #include <sysalloc.h>
38 
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <string.h>
42 #include <ctype.h>
43 #include <errno.h>
44 #include <strtol.h>
45 
46 #include "core.h"
47 #include "debug.h"
48 
49 #define DATABUFFERINITSIZE 10240
50 
51 typedef struct TAlignedRegion_struct
52 {
53     const char* name;
54     uint32_t name_len;
55     /* 1-based, 0 - means not set */
56     uint64_t from;
57     uint64_t to;
58 } TAlignedRegion;
59 
60 
61 typedef struct TMatepairDistance_struct
62 {
63     uint64_t from;
64     uint64_t to;
65 } TMatepairDistance;
66 
67 
68 struct FastqArgs_struct
69 {
70     bool is_platform_cs_native;
71 
72     int maxReads;
73     bool skipTechnical;
74 
75     uint32_t minReadLen;
76     bool applyClip;
77     bool dumpOrigFmt;
78     bool dumpBase;
79     bool dumpCs;
80     bool readIds;
81     bool SuppressQualForCSKey;      /* added Jan 15th 2014 ( a new fastq-variation! ) */
82     int offset;
83     bool qual_filter;
84     bool qual_filter1;
85     const char* b_deffmt;
86     SLList* b_defline;
87     const char* q_deffmt;
88     SLList* q_defline;
89     const char *desiredCsKey;
90     bool split_files;
91     bool split_3;
92     bool split_spot;
93     uint64_t fasta;
94     const char* file_extension;
95     bool aligned;
96     bool unaligned;
97     TAlignedRegion* alregion;
98     uint32_t alregion_qty;
99     bool mp_dist_unknown;
100     TMatepairDistance* mp_dist;
101     uint32_t mp_dist_qty;
102 } FastqArgs;
103 
104 
105 typedef enum DefNodeType_enum
106 {
107     DefNode_Unknown = 0,
108     DefNode_Text = 1,
109     DefNode_Optional,
110     DefNode_Accession,
111     DefNode_SpotId,
112     DefNode_SpotName,
113     DefNode_SpotGroup,
114     DefNode_SpotLen,
115     DefNode_ReadId,
116     DefNode_ReadName,
117     DefNode_ReadLen,
118     DefNode_Last
119 } DefNodeType;
120 
121 
122 typedef struct DefNode_struct
123 {
124     SLNode node;
125     DefNodeType type;
126     union
127     {
128         SLList* optional;
129         char* text;
130     } data;
131 } DefNode;
132 
133 
134 typedef struct DeflineData_struct
135 {
136     rc_t rc;
137     bool optional;
138     union
139     {
140         spotid_t* id;
141         struct
142         {
143             const char* s;
144             size_t sz;
145         } str;
146         uint32_t* u32;
147     } values[ DefNode_Last ];
148     char* buf;
149     size_t buf_sz;
150     size_t* writ;
151 } DeflineData;
152 
153 
Defline_Builder(SLNode * node,void * data)154 static bool CC Defline_Builder( SLNode *node, void *data )
155 {
156     DefNode* n = ( DefNode* )node;
157     DeflineData* d = ( DeflineData* )data;
158     char s[ 256 ];
159     size_t w;
160     int x = 0;
161 
162     s[ 0 ] = '\0';
163     switch ( n->type )
164     {
165         case DefNode_Optional :
166             d->optional = true;
167             w = *d->writ;
168             *d->writ = 0;
169             SLListDoUntil( n->data.optional, Defline_Builder, data );
170             x = ( *d->writ == 0 ) ? 0 : 1;
171             d->optional = false;
172             *d->writ = w;
173             if ( x > 0 )
174             {
175                 SLListDoUntil( n->data.optional, Defline_Builder, data );
176             }
177             break;
178 
179         case DefNode_Text :
180             if ( d->optional )
181             {
182                 break;
183             }
184             d->values[n->type].str.s = n->data.text;
185             d->values[n->type].str.sz = strlen( n->data.text );
186 
187         case DefNode_Accession :
188         case DefNode_SpotName :
189         case DefNode_SpotGroup :
190         case DefNode_ReadName :
191             if ( d->values[n->type].str.s != NULL )
192             {
193                 x = d->values[ n->type ].str.sz;
194                 if ( x < sizeof( s ) )
195                 {
196                     strncpy( s, d->values[ n->type ].str.s, x );
197                     s[ x ] = '\0';
198                     *d->writ = *d->writ + x;
199                 }
200                 else
201                 {
202                     d->rc = RC( rcExe, rcNamelist, rcExecuting, rcBuffer, rcInsufficient );
203                 }
204             }
205             break;
206 
207         case DefNode_SpotId :
208             if ( d->values[ n->type ].id != NULL &&
209                 ( !d->optional || *d->values[ n->type ].id > 0 ) )
210             {
211                 x = snprintf( s, sizeof( s ), "%ld", *d->values[ n->type ].id );
212                 if ( x < 0 || x >= sizeof( s ) )
213                 {
214                     d->rc = RC( rcExe, rcNamelist, rcExecuting, rcBuffer, rcInsufficient );
215                 }
216                 else
217                 {
218                     *d->writ = *d->writ + x;
219                 }
220             }
221             break;
222 
223         case DefNode_ReadId :
224         case DefNode_SpotLen :
225         case DefNode_ReadLen :
226             if ( d->values[ n->type] .u32 != NULL &&
227                  ( !d->optional || *d->values[ n->type ].u32 > 0 ) )
228             {
229                 x = snprintf( s, sizeof( s ), "%u", *d->values[ n->type ].u32 );
230                 if ( x < 0 || x >= sizeof( s ) )
231                 {
232                     d->rc = RC( rcExe, rcNamelist, rcExecuting, rcBuffer, rcInsufficient );
233                 }
234                 else
235                 {
236                     *d->writ = *d->writ + x;
237                 }
238             }
239             break;
240 
241         default:
242             d->rc = RC( rcExe, rcNamelist, rcExecuting, rcId, rcInvalid );
243     } /* switch */
244 
245     if ( d->rc == 0 && !d->optional && n->type != DefNode_Optional )
246     {
247         if ( *d->writ < d->buf_sz )
248         {
249             strcat( d->buf, s );
250         }
251         else
252         {
253             d->rc = RC( rcExe, rcNamelist, rcExecuting, rcBuffer, rcInsufficient );
254         }
255     }
256     return d->rc != 0;
257 }
258 
259 
Defline_Bind(DeflineData * data,const char * accession,spotid_t * spotId,const char * spot_name,size_t spotname_sz,const char * spot_group,size_t sgrp_sz,uint32_t * spot_len,uint32_t * readId,const char * read_name,INSDC_coord_len rlabel_sz,INSDC_coord_len * read_len)260 static rc_t Defline_Bind( DeflineData* data, const char* accession,
261             spotid_t* spotId, const char* spot_name, size_t spotname_sz,
262             const char* spot_group, size_t sgrp_sz, uint32_t* spot_len,
263             uint32_t* readId, const char* read_name, INSDC_coord_len rlabel_sz,
264             INSDC_coord_len* read_len )
265 {
266     if ( data == NULL )
267     {
268         return RC( rcExe, rcNamelist, rcExecuting, rcMemory, rcInsufficient );
269     }
270     data->values[ DefNode_Unknown ].str.s = NULL;
271     data->values[ DefNode_Text ].str.s = NULL;
272     data->values[ DefNode_Optional ].str.s = NULL;
273     data->values[ DefNode_Accession ].str.s = accession;
274     data->values[ DefNode_Accession ].str.sz = strlen( accession );
275     data->values[ DefNode_SpotId ].id = spotId;
276     data->values[ DefNode_SpotName ].str.s = spot_name;
277     data->values[ DefNode_SpotName ].str.sz = spotname_sz;
278     data->values[ DefNode_SpotGroup ].str.s = spot_group;
279     data->values[ DefNode_SpotGroup ].str.sz = sgrp_sz;
280     data->values[ DefNode_SpotLen ].u32 = spot_len;
281     data->values[ DefNode_ReadId ].u32 = readId;
282     data->values[ DefNode_ReadName ].str.s = read_name;
283     data->values[ DefNode_ReadName ].str.sz = rlabel_sz;
284     data->values[ DefNode_ReadLen ].u32 = read_len;
285     return 0;
286 }
287 
288 
Defline_Build(const SLList * def,DeflineData * data,char * buf,size_t buf_sz,size_t * writ)289 static rc_t Defline_Build( const SLList* def, DeflineData* data, char* buf,
290                            size_t buf_sz, size_t* writ )
291 {
292     if ( data == NULL )
293     {
294         return RC( rcExe, rcNamelist, rcExecuting, rcMemory, rcInsufficient );
295     }
296 
297     data->rc = 0;
298     data->optional = false;
299     data->buf = buf;
300     data->buf_sz = buf_sz;
301     data->writ = writ;
302 
303     data->buf[ 0 ] = '\0';
304     *data->writ = 0;
305 
306     SLListDoUntil( def, Defline_Builder, data );
307     return data->rc;
308 }
309 
310 
DeflineNode_Add(SLList * list,DefNode ** node,DefNodeType type,const char * text,size_t text_sz)311 static rc_t DeflineNode_Add( SLList* list, DefNode** node,
312                             DefNodeType type, const char* text, size_t text_sz )
313 {
314     rc_t rc = 0;
315 
316     *node = calloc( 1, sizeof( **node ) );
317     if ( *node == NULL )
318     {
319         rc = RC( rcExe, rcNamelist, rcConstructing, rcMemory, rcExhausted );
320     }
321     else if ( type == DefNode_Text && ( text == NULL || text_sz == 0 ) )
322     {
323         rc = RC( rcExe, rcNamelist, rcConstructing, rcParam, rcInvalid );
324     }
325     else
326     {
327         ( *node )->type = type;
328         if ( type == DefNode_Text )
329         {
330             ( *node )->data.text = string_dup( text, text_sz );
331             if ( ( *node )->data.text == NULL )
332             {
333                 rc = RC( rcExe, rcNamelist, rcConstructing, rcMemory, rcExhausted );
334                 free( *node );
335             }
336         }
337         else if ( type == DefNode_Optional )
338         {
339             ( *node )->data.optional = malloc( sizeof( SLList ) );
340             if ( ( *node )->data.optional == NULL )
341             {
342                 rc = RC( rcExe, rcNamelist, rcConstructing, rcMemory, rcExhausted );
343                 free( *node );
344             }
345             else
346             {
347                 SLListInit( ( *node )->data.optional );
348             }
349         }
350         if ( rc == 0 )
351         {
352             SLListPushTail( list, &( *node )->node );
353         }
354     }
355     return rc;
356 }
357 
358 
359 static void Defline_Release( SLList* list );
360 
361 
DeflineNode_Whack(SLNode * node,void * data)362 static void CC DeflineNode_Whack( SLNode* node, void* data )
363 {
364     if ( node != NULL )
365     {
366         DefNode* n = (DefNode*)node;
367         if ( n->type == DefNode_Text )
368         {
369             free( n->data.text );
370         }
371         else if ( n->type == DefNode_Optional )
372         {
373             Defline_Release( n->data.optional );
374         }
375         free( node );
376     }
377 }
378 
379 
Defline_Release(SLList * list)380 static void Defline_Release( SLList* list )
381 {
382     if ( list != NULL )
383     {
384         SLListForEach( list, DeflineNode_Whack, NULL );
385         free( list );
386     }
387 }
388 
389 
390 #if _DEBUGGING
Defline_Dump(SLNode * node,void * data)391 static void CC Defline_Dump( SLNode* node, void* data )
392 {
393     DefNode* n = ( DefNode* )node;
394     const char* s = NULL, *t;
395     if ( n->type == DefNode_Text )
396     {
397         s = n->data.text;
398     }
399     switch ( n->type )
400     {
401         case DefNode_Unknown :      t = "Unknown";   break;
402         case DefNode_Text :         t = "Text";      break;
403         case DefNode_Optional :     t = "Optional";  break;
404         case DefNode_Accession :    t = "Accession"; break;
405         case DefNode_SpotId :       t = "SpotId";    break;
406         case DefNode_SpotName :     t = "SpotName";  break;
407         case DefNode_SpotGroup :    t = "SpotGroup"; break;
408         case DefNode_SpotLen :      t = "SpotLen";   break;
409         case DefNode_ReadId :       t = "ReadId";    break;
410         case DefNode_ReadName :     t = "ReadName";  break;
411         case DefNode_ReadLen :      t = "ReadLen";   break;
412         default :  t = "ERROR";
413     }
414 
415     SRA_DUMP_DBG( 3, ( "%s type %s", data, t ) );
416     if ( s )
417     {
418         SRA_DUMP_DBG( 3, ( ": '%s'", s ) );
419     }
420     SRA_DUMP_DBG( 3, ( "\n" ) );
421     if ( n->type == DefNode_Optional )
422     {
423         SLListForEach( n->data.optional, Defline_Dump, "+-->" );
424     }
425 }
426 #endif
427 
428 
Defline_Parse(SLList ** def,const char * line)429 static rc_t Defline_Parse( SLList** def, const char* line )
430 {
431     rc_t rc = 0;
432     size_t i, sz, text = 0, opt_vars = 0;
433     DefNode* node;
434     SLList* list = NULL;
435 
436     if ( def == NULL || line == NULL )
437     {
438         return RC( rcExe, rcNamelist, rcConstructing, rcParam, rcInvalid );
439     }
440     *def = malloc( sizeof( SLList ) );
441     if ( *def  == NULL )
442     {
443         return RC( rcExe, rcNamelist, rcConstructing, rcMemory, rcExhausted );
444     }
445     sz = strlen( line );
446     list = *def;
447     SLListInit( list );
448 
449     for ( i = 0; rc == 0 && i < sz; i++ )
450     {
451         if ( line[ i ] == '[' )
452         {
453             if ( ( i + 1 ) < sz && line[ i + 1 ] == '[' )
454             {
455                 i++;
456             }
457             else
458             {
459                 if ( list != *def )
460                 {
461                     rc = RC( rcExe, rcNamelist, rcConstructing, rcFormat, rcIncorrect );
462                 }
463                 else if ( i > text )
464                 {
465                     rc = DeflineNode_Add( list, &node, DefNode_Text, &line[text], i - text );
466                 }
467                 if ( rc == 0 )
468                 {
469                     rc = DeflineNode_Add( list, &node, DefNode_Optional, NULL, 0 );
470                     if ( rc == 0 )
471                     {
472                         opt_vars = 0;
473                         list = node->data.optional;
474                         text = i + 1;
475                     }
476                 }
477             }
478         }
479         else if ( line[ i ] == ']' )
480         {
481             if ( ( i + 1 ) < sz && line[ i + 1 ] == ']' )
482             {
483                 i++;
484             }
485             else
486             {
487                 if ( list == *def )
488                 {
489                     rc = RC( rcExe, rcNamelist, rcConstructing, rcFormat, rcIncorrect );
490                 }
491                 else
492                 {
493                     if ( opt_vars < 1 )
494                     {
495                         rc = RC( rcExe, rcNamelist, rcConstructing, rcConstraint, rcEmpty );
496                     }
497                     else if ( i > text )
498                     {
499                         rc = DeflineNode_Add( list, &node, DefNode_Text, &line[text], i - text );
500                     }
501                     list = *def;
502                     text = i + 1;
503                 }
504             }
505         }
506         else if ( line[ i ] == '$' )
507         {
508             if ( ( i + 1 ) < sz && line[ i + 1 ] == '$' )
509             {
510                 i++;
511             }
512             else
513             {
514                 DefNodeType type = DefNode_Unknown;
515                 switch ( line[ ++i] )
516                 {
517                     case 'a' :
518                         switch( line[ ++i ] )
519                         {
520                             case 'c': type = DefNode_Accession; break;
521                         }
522                         break;
523 
524                     case 's' :
525                         switch ( line[ ++i ] )
526                         {
527                             case 'i': type = DefNode_SpotId; break;
528                             case 'n': type = DefNode_SpotName; break;
529                             case 'g': type = DefNode_SpotGroup; break;
530                             case 'l': type = DefNode_SpotLen; break;
531                         }
532                         break;
533                     case 'r' :
534                         switch( line[ ++i ] )
535                         {
536                             case 'i': type = DefNode_ReadId; break;
537                             case 'n': type = DefNode_ReadName; break;
538                             case 'l': type = DefNode_ReadLen; break;
539                         }
540                         break;
541                 }
542                 if ( type == DefNode_Unknown )
543                 {
544                     rc = RC( rcExe, rcNamelist, rcConstructing, rcName, rcUnrecognized );
545                 }
546                 else
547                 {
548                     if ( ( i - 2 ) > text )
549                     {
550                         rc = DeflineNode_Add( list, &node, DefNode_Text, &line[ text ], i - text - 2 );
551                     }
552                     if ( rc == 0 )
553                     {
554                         rc = DeflineNode_Add( list, &node, type, NULL, 0 );
555                         if ( rc == 0 )
556                         {
557                             opt_vars++;
558                             text = i + 1;
559                         }
560                     }
561                 }
562             }
563         }
564     }
565     if ( rc == 0 )
566     {
567         if ( list != *def )
568         {
569             rc = RC( rcExe, rcNamelist, rcConstructing, rcFormat, rcInvalid );
570         }
571         else if ( i > text )
572         {
573             rc = DeflineNode_Add( list, &node, DefNode_Text, &line[ text ], i - text );
574         }
575     }
576     if ( rc != 0 )
577     {
578         i = i < sz ? i : sz;
579         PLOGERR( klogErr, ( klogErr, rc, "$(l1) -->$(c)<-- $(l2)",
580                  "l1=%.*s,c=%c,l2=%.*s", i - 1, line, line[ i - 1 ], sz - i, &line[ i ] ) );
581     }
582 #if _DEBUGGING
583     SRA_DUMP_DBG( 3, ( "| defline\n" ) );
584     SLListForEach( *def, Defline_Dump, "+->" );
585 #endif
586     return rc;
587 }
588 
589 /* ### ALIGNMENT_COUNT based filtering ##################################################### */
590 
591 typedef struct AlignedFilter_struct
592 {
593     const SRAColumn* col;
594     uint64_t rejected_reads;
595     bool aligned;
596     bool unaligned;
597 } AlignedFilter;
598 
599 
AlignedFilter_GetKey(const SRASplitter * cself,const char ** key,spotid_t spot,readmask_t * readmask)600 static rc_t AlignedFilter_GetKey( const SRASplitter* cself, const char** key,
601                                   spotid_t spot, readmask_t* readmask )
602 {
603     rc_t rc = 0;
604     AlignedFilter* self = ( AlignedFilter* )cself;
605 
606     if ( self == NULL || key == NULL )
607     {
608         rc = RC( rcSRA, rcNode, rcExecuting, rcParam, rcNull );
609     }
610     else
611     {
612         *key = "";
613         if ( self->col != NULL )
614         {
615             const uint8_t* ac = NULL;
616             bitsz_t o = 0, sz = 0;
617             rc = SRAColumnRead( self->col, spot, (const void **)&ac, &o, &sz );
618             if ( rc == 0 && sz > 0 )
619             {
620                 sz = sz / 8 / sizeof( *ac );
621                 for( o = 0; o < sz; o++ )
622                 {
623                     if ( ( self->aligned   && !self->unaligned && ac[ o ] == 0 ) ||
624                          ( self->unaligned && !self->aligned   && ac[ o ] != 0 ) )
625                     {
626                         unset_readmask( readmask, o );
627                         self->rejected_reads ++;
628                     }
629                 }
630             }
631         }
632     }
633     return rc;
634 }
635 
636 
AlignedFilter_Release(const SRASplitter * cself)637 static rc_t AlignedFilter_Release( const SRASplitter* cself )
638 {
639     rc_t rc = 0;
640     AlignedFilter* self = ( AlignedFilter* )cself;
641     if ( self == NULL )
642     {
643         rc = RC( rcSRA, rcNode, rcExecuting, rcParam, rcNull );
644     }
645     else
646     {
647         if ( self->rejected_reads > 0 && !g_legacy_report )
648             rc = KOutMsg( "Rejected %lu READS because of aligned/unaligned filter\n", self->rejected_reads );
649     }
650     return rc;
651 }
652 
653 typedef struct AlignedFilterFactory_struct
654 {
655     const SRATable* table;
656     const SRAColumn* col;
657     bool aligned;
658     bool unaligned;
659 } AlignedFilterFactory;
660 
661 
AlignedFilterFactory_Init(const SRASplitterFactory * cself)662 static rc_t AlignedFilterFactory_Init( const SRASplitterFactory* cself )
663 {
664     rc_t rc = 0;
665     AlignedFilterFactory* self = ( AlignedFilterFactory* )cself;
666 
667     if ( self == NULL )
668     {
669         rc = RC( rcSRA, rcType, rcConstructing, rcParam, rcNull );
670     }
671     else
672     {
673         rc = SRATableOpenColumnRead( self->table, &self->col, "ALIGNMENT_COUNT", vdb_uint8_t );
674         if ( rc != 0 )
675         {
676             if ( GetRCState( rc ) == rcNotFound )
677             {
678                 LOGMSG( klogWarn, "Column ALIGNMENT_COUNT was not found, param ignored" );
679                 rc = 0;
680             }
681             else if ( GetRCState( rc ) == rcExists )
682             {
683                 rc = 0;
684             }
685         }
686     }
687     return rc;
688 }
689 
690 
AlignedFilterFactory_NewObj(const SRASplitterFactory * cself,const SRASplitter ** splitter)691 static rc_t AlignedFilterFactory_NewObj( const SRASplitterFactory* cself, const SRASplitter** splitter )
692 {
693     rc_t rc = 0;
694     AlignedFilterFactory* self = ( AlignedFilterFactory* )cself;
695 
696     if ( self == NULL )
697     {
698         rc = RC( rcSRA, rcType, rcExecuting, rcParam, rcNull );
699     }
700     else
701     {
702         rc = SRASplitter_Make( splitter, sizeof( AlignedFilter ),
703                                AlignedFilter_GetKey, NULL, NULL, AlignedFilter_Release );
704         if ( rc == 0 )
705         {
706             AlignedFilter * filter = ( AlignedFilter * )( * splitter );
707             filter->col = self->col;
708             filter->aligned = self->aligned;
709             filter->unaligned = self->unaligned;
710             filter->rejected_reads = 0;
711         }
712     }
713     return rc;
714 }
715 
716 
AlignedFilterFactory_Release(const SRASplitterFactory * cself)717 static void AlignedFilterFactory_Release( const SRASplitterFactory* cself )
718 {
719     if ( cself != NULL )
720     {
721         AlignedFilterFactory* self = ( AlignedFilterFactory* )cself;
722         SRAColumnRelease( self->col );
723     }
724 }
725 
726 
AlignedFilterFactory_Make(const SRASplitterFactory ** cself,const SRATable * table,bool aligned,bool unaligned)727 static rc_t AlignedFilterFactory_Make( const SRASplitterFactory** cself,
728                     const SRATable* table, bool aligned, bool unaligned )
729 {
730     rc_t rc = 0;
731     AlignedFilterFactory* obj = NULL;
732 
733     if ( cself == NULL || table == NULL )
734     {
735         rc = RC( rcSRA, rcType, rcAllocating, rcParam, rcNull );
736     }
737     else
738     {
739         SRASplitterFactory_Make( cself, eSplitterSpot, sizeof( *obj ),
740                                  AlignedFilterFactory_Init,
741                                  AlignedFilterFactory_NewObj,
742                                  AlignedFilterFactory_Release );
743         if ( rc == 0 )
744         {
745             obj = ( AlignedFilterFactory* )*cself;
746             obj->table = table;
747             obj->aligned = aligned;
748             obj->unaligned = unaligned;
749         }
750     }
751     return rc;
752 }
753 
754 
755 /* ### alignment region filtering ##################################################### */
756 
757 typedef struct AlignRegionFilter_struct
758 {
759     const SRAColumn* col_pos;
760     const SRAColumn* col_name;
761     const SRAColumn* col_seqid;
762     const TAlignedRegion* alregion;
763     uint64_t rejected_spots;
764     uint32_t alregion_qty;
765 } AlignRegionFilter;
766 
767 
AlignRegionFilter_GetKey(const SRASplitter * cself,const char ** key,spotid_t spot,readmask_t * readmask)768 static rc_t AlignRegionFilter_GetKey( const SRASplitter* cself, const char** key,
769                                       spotid_t spot, readmask_t* readmask )
770 {
771     rc_t rc = 0;
772     AlignRegionFilter* self = ( AlignRegionFilter* )cself;
773 
774     if ( self == NULL || key == NULL )
775     {
776         rc = RC( rcSRA, rcNode, rcExecuting, rcParam, rcNull );
777     }
778     else
779     {
780         *key = "";
781         reset_readmask( readmask );
782         if ( self->col_name || self->col_seqid )
783         {
784             bool match = false;
785             uint32_t i;
786             const char* nm, *si;
787             bitsz_t o, nm_len = 0, si_len = 0;
788 
789             if ( self->col_name )
790             {
791                 rc = SRAColumnRead( self->col_name, spot, (const void **)&nm, &o, &nm_len );
792                 if ( rc == 0 && nm_len > 0 )
793                     nm_len /= 8;
794             }
795             if ( rc == 0 && self->col_seqid )
796             {
797                 rc = SRAColumnRead( self->col_seqid, spot, (const void **)&si, &o, &si_len );
798                 if ( rc == 0 && si_len > 0 )
799                     si_len /= 8;
800             }
801             for ( i = 0; rc == 0 && i < self->alregion_qty; i++ )
802             {
803                 if ( ( self->alregion[i].name_len == nm_len &&
804                        strncasecmp( self->alregion[ i ].name, nm, nm_len ) == 0 ) ||
805                      ( self->alregion[i].name_len == si_len &&
806                        strncasecmp(self->alregion[i].name, si, si_len) == 0 ) )
807                 {
808                     if ( self->col_pos )
809                     {
810                         INSDC_coord_zero* pos;
811                         bitsz_t nreads, j;
812                         rc = SRAColumnRead( self->col_pos, spot, ( const void ** )&pos, &o, &nreads );
813                         if ( rc == 0 && nreads > 0 )
814                         {
815                             nreads = nreads / sizeof(*pos) / 8;
816                             for ( i = 0; !match && i < self->alregion_qty; i++ )
817                             {
818                                 for ( j = 0; j < nreads; j++ )
819                                 {
820                                     if ( self->alregion[i].from <= pos[j] &&
821                                          pos[j] <= self->alregion[i].to )
822                                     {
823                                         match = true;
824                                         break;
825                                     }
826                                 }
827                             }
828                         }
829                     }
830                     else
831                     {
832                         match = true;
833                     }
834                     break;
835                 }
836             }
837             if ( !match )
838             {
839                 clear_readmask( readmask );
840                 self->rejected_spots ++;
841             }
842         }
843     }
844     return rc;
845 }
846 
847 
AlignRegionFilter_Release(const SRASplitter * cself)848 static rc_t AlignRegionFilter_Release( const SRASplitter* cself )
849 {
850     rc_t rc = 0;
851     AlignRegionFilter* self = ( AlignRegionFilter* )cself;
852 
853     if ( self == NULL )
854     {
855         rc = RC( rcSRA, rcNode, rcExecuting, rcParam, rcNull );
856     }
857     else
858     {
859         if ( self->rejected_spots > 0 && !g_legacy_report )
860             rc = KOutMsg( "Rejected %lu SPOTS because of AlignRegionFilter\n", self->rejected_spots );
861     }
862     return rc;
863 }
864 
865 
866 typedef struct AlignRegionFilterFactory_struct
867 {
868     const SRATable* table;
869     const SRAColumn* col_pos;
870     const SRAColumn* col_name;
871     const SRAColumn* col_seqid;
872     TAlignedRegion* alregion;
873     uint32_t alregion_qty;
874 } AlignRegionFilterFactory;
875 
876 
AlignRegionFilterFactory_Init(const SRASplitterFactory * cself)877 static rc_t AlignRegionFilterFactory_Init( const SRASplitterFactory* cself )
878 {
879     rc_t rc = 0;
880     AlignRegionFilterFactory* self = ( AlignRegionFilterFactory* )cself;
881 
882     if ( self == NULL )
883     {
884         rc = RC( rcSRA, rcType, rcConstructing, rcParam, rcNull );
885     }
886     else
887     {
888 
889 #define COLNF(v,n,t) (!(rc == 0 && ((rc = SRATableOpenColumnRead(self->table, &v, n, t)) == 0 || \
890                        GetRCState(rc) == rcExists)) && GetRCState(rc) == rcNotFound)
891 
892         if ( COLNF( self->col_name, "REF_NAME", vdb_ascii_t ) )
893         {
894             SRAColumnRelease( self->col_name );
895             self->col_name = NULL;
896             rc = 0;
897         }
898 
899         if ( COLNF( self->col_seqid, "REF_SEQ_ID", vdb_ascii_t ) )
900         {
901             SRAColumnRelease( self->col_seqid );
902             self->col_seqid = NULL;
903             rc = 0;
904         }
905 
906         if ( self->col_name == NULL && self->col_seqid == NULL )
907         {
908             LOGMSG( klogWarn, "None of columns REF_NAME or REF_SEQ_ID was found, no region filtering" );
909         }
910         else
911         {
912             bool range_used = false;
913             uint32_t i;
914             /* see if there are any actual ranges set and open col only if yes */
915             for ( i = 0; i < self->alregion_qty; i++ )
916             {
917                 if ( self->alregion[ i ].from != 0 || self->alregion[ i ].to != 0 )
918                 {
919                     range_used = true;
920                     break;
921                 }
922             }
923             if ( range_used )
924             {
925                 if ( COLNF( self->col_pos, "REF_POS", "INSDC:coord:zero" ) )
926                 {
927                     LOGMSG( klogWarn, "Column REF_POS is not found, no region position filtering" );
928                     rc = 0;
929                 }
930                 else
931                 {
932                     for ( i = 0; i < self->alregion_qty; i++ )
933                     {
934                         if ( self->alregion[ i ].from > 0 )
935                         {
936                             self->alregion[ i ].from--;
937                         }
938                         if ( self->alregion[ i ].to > 0 )
939                         {
940                             self->alregion[ i ].to--;
941                         }
942                         else if ( self->alregion[ i ].to == 0 )
943                         {
944                             self->alregion[ i ].to = ~0;
945                         }
946                     }
947                 }
948             }
949         }
950     }
951 #undef COLNF
952     return rc;
953 }
954 
955 
AlignRegionFilterFactory_NewObj(const SRASplitterFactory * cself,const SRASplitter ** splitter)956 static rc_t AlignRegionFilterFactory_NewObj( const SRASplitterFactory* cself, const SRASplitter** splitter )
957 {
958     rc_t rc = 0;
959     AlignRegionFilterFactory* self = ( AlignRegionFilterFactory* )cself;
960 
961     if ( self == NULL )
962     {
963         rc = RC( rcSRA, rcType, rcExecuting, rcParam, rcNull );
964     }
965     else
966     {
967         rc = SRASplitter_Make( splitter, sizeof( AlignRegionFilter ),
968                                AlignRegionFilter_GetKey, NULL, NULL, AlignRegionFilter_Release );
969         if ( rc == 0 )
970         {
971             AlignRegionFilter * filter = ( AlignRegionFilter * )( * splitter );
972             filter->col_pos = self->col_pos;
973             filter->col_name = self->col_name;
974             filter->col_seqid = self->col_seqid;
975             filter->alregion = self->alregion;
976             filter->alregion_qty = self->alregion_qty;
977             filter->rejected_spots = 0;
978         }
979     }
980     return rc;
981 }
982 
983 
AlignRegionFilterFactory_Release(const SRASplitterFactory * cself)984 static void AlignRegionFilterFactory_Release( const SRASplitterFactory* cself )
985 {
986     if ( cself != NULL )
987     {
988         AlignRegionFilterFactory* self = ( AlignRegionFilterFactory* )cself;
989         SRAColumnRelease( self->col_pos );
990         SRAColumnRelease( self->col_name );
991         SRAColumnRelease( self->col_seqid );
992     }
993 }
994 
995 
AlignRegionFilterFactory_Make(const SRASplitterFactory ** cself,const SRATable * table,TAlignedRegion * alregion,uint32_t alregion_qty)996 static rc_t AlignRegionFilterFactory_Make( const SRASplitterFactory** cself,
997             const SRATable* table, TAlignedRegion* alregion, uint32_t alregion_qty )
998 {
999     rc_t rc = 0;
1000     AlignRegionFilterFactory* obj = NULL;
1001 
1002     if ( cself == NULL || table == NULL )
1003     {
1004         rc = RC( rcSRA, rcType, rcAllocating, rcParam, rcNull );
1005     }
1006     else
1007     {
1008         rc = SRASplitterFactory_Make( cself, eSplitterSpot, sizeof( *obj ),
1009                                       AlignRegionFilterFactory_Init,
1010                                       AlignRegionFilterFactory_NewObj,
1011                                       AlignRegionFilterFactory_Release );
1012         if ( rc == 0 )
1013         {
1014             obj = ( AlignRegionFilterFactory* )*cself;
1015             obj->table = table;
1016             obj->alregion = alregion;
1017             obj->alregion_qty = alregion_qty;
1018         }
1019     }
1020     return rc;
1021 }
1022 
1023 /* ### alignment meta-pair distance filtering ##################################################### */
1024 
1025 typedef struct AlignPairDistanceFilter_struct
1026 {
1027     const SRAColumn* col_tlen;
1028     const TMatepairDistance* mp_dist;
1029     uint64_t rejected_reads;
1030     bool mp_dist_unknown;
1031     uint32_t mp_dist_qty;
1032 } AlignPairDistanceFilter;
1033 
1034 
AlignPairDistanceFilter_GetKey(const SRASplitter * cself,const char ** key,spotid_t spot,readmask_t * readmask)1035 static rc_t AlignPairDistanceFilter_GetKey( const SRASplitter* cself,
1036                 const char** key, spotid_t spot, readmask_t* readmask )
1037 {
1038     rc_t rc = 0;
1039     AlignPairDistanceFilter* self = ( AlignPairDistanceFilter* )cself;
1040 
1041     if ( self == NULL || key == NULL )
1042     {
1043         rc = RC( rcSRA, rcNode, rcExecuting, rcParam, rcNull );
1044     }
1045     else
1046     {
1047         *key = "";
1048         reset_readmask( readmask );
1049         if ( self->col_tlen )
1050         {
1051             uint32_t i, j;
1052             bitsz_t o, nreads;
1053             int32_t* tlen;
1054             rc = SRAColumnRead( self->col_tlen, spot, (const void **)&tlen, &o, &nreads );
1055             if ( rc == 0 )
1056             {
1057                 nreads = nreads / sizeof(*tlen) / 8;
1058                 for ( i = 0; i < self->mp_dist_qty; i++ )
1059                 {
1060                     for ( j = 0; j < nreads; j++ )
1061                     {
1062                         if ( ( tlen[ j ] == 0 && !self->mp_dist_unknown) ||
1063                              ( tlen[ j ] != 0 &&
1064                                ( tlen[ j ] < self->mp_dist[ i ].from || self->mp_dist[ i ].to < tlen[ j ] ) ) )
1065                         {
1066                             unset_readmask( readmask, j );
1067                             self->rejected_reads ++;
1068                         }
1069                     }
1070                 }
1071             }
1072         }
1073     }
1074     return rc;
1075 }
1076 
1077 
AlignPairDistanceFilter_Release(const SRASplitter * cself)1078 static rc_t AlignPairDistanceFilter_Release( const SRASplitter* cself )
1079 {
1080     rc_t rc = 0;
1081     AlignPairDistanceFilter* self = ( AlignPairDistanceFilter* )cself;
1082 
1083     if ( self == NULL )
1084     {
1085         rc = RC( rcSRA, rcNode, rcExecuting, rcParam, rcNull );
1086     }
1087     else
1088     {
1089         if ( self->rejected_reads > 0 && !g_legacy_report )
1090             rc = KOutMsg( "Rejected %lu READS because of AlignPairDistanceFilter\n", self->rejected_reads );
1091     }
1092     return rc;
1093 }
1094 
1095 typedef struct AlignPairDistanceFilterFactory_struct
1096 {
1097     const SRATable* table;
1098     const SRAColumn* col_tlen;
1099     bool mp_dist_unknown;
1100     TMatepairDistance* mp_dist;
1101     uint32_t mp_dist_qty;
1102 } AlignPairDistanceFilterFactory;
1103 
1104 
AlignPairDistanceFilterFactory_Init(const SRASplitterFactory * cself)1105 static rc_t AlignPairDistanceFilterFactory_Init( const SRASplitterFactory* cself )
1106 {
1107     rc_t rc = 0;
1108     AlignPairDistanceFilterFactory* self = ( AlignPairDistanceFilterFactory* )cself;
1109 
1110     if ( self == NULL )
1111     {
1112         rc = RC( rcSRA, rcType, rcConstructing, rcParam, rcNull );
1113     }
1114     else
1115     {
1116         rc = SRATableOpenColumnRead( self->table, &self->col_tlen, "TEMPLATE_LEN", "I32" );
1117         if( !( rc == 0 || GetRCState( rc ) == rcExists ) && GetRCState( rc ) == rcNotFound )
1118         {
1119             LOGMSG( klogWarn, "Column TLEN was not found, no distance filtering" );
1120             self->col_tlen = NULL;
1121             rc = 0;
1122         }
1123         if ( rc == 0 )
1124         {
1125             uint32_t i;
1126             for ( i = 0; i < self->mp_dist_qty; i++ )
1127             {
1128                 if ( self->mp_dist[ i ].to == 0 )
1129                 {
1130                     self->mp_dist[ i ].to = ~0;
1131                 }
1132             }
1133         }
1134     }
1135     return rc;
1136 }
1137 
1138 
AlignPairDistanceFilterFactory_NewObj(const SRASplitterFactory * cself,const SRASplitter ** splitter)1139 static rc_t AlignPairDistanceFilterFactory_NewObj( const SRASplitterFactory* cself,
1140                                                    const SRASplitter** splitter )
1141 {
1142     rc_t rc = 0;
1143     AlignPairDistanceFilterFactory* self = ( AlignPairDistanceFilterFactory* )cself;
1144 
1145     if ( self == NULL )
1146     {
1147         rc = RC( rcSRA, rcType, rcExecuting, rcParam, rcNull );
1148     }
1149     else
1150     {
1151         rc = SRASplitter_Make( splitter, sizeof( AlignPairDistanceFilter ),
1152                                AlignPairDistanceFilter_GetKey, NULL, NULL, AlignPairDistanceFilter_Release );
1153         if ( rc == 0 )
1154         {
1155             AlignPairDistanceFilter * filter = ( AlignPairDistanceFilter * )( * splitter );
1156             filter->col_tlen = self->col_tlen;
1157             filter->mp_dist_unknown = self->mp_dist_unknown;
1158             filter->mp_dist = self->mp_dist;
1159             filter->mp_dist_qty = self->mp_dist_qty;
1160             filter->rejected_reads = 0;
1161         }
1162     }
1163     return rc;
1164 }
1165 
1166 
AlignPairDistanceFilterFactory_Release(const SRASplitterFactory * cself)1167 static void AlignPairDistanceFilterFactory_Release( const SRASplitterFactory* cself )
1168 {
1169     if ( cself != NULL )
1170     {
1171         AlignPairDistanceFilterFactory* self = ( AlignPairDistanceFilterFactory* )cself;
1172         SRAColumnRelease( self->col_tlen );
1173     }
1174 }
1175 
1176 
AlignPairDistanceFilterFactory_Make(const SRASplitterFactory ** cself,const SRATable * table,bool mp_dist_unknown,TMatepairDistance * mp_dist,uint32_t mp_dist_qty)1177 static rc_t AlignPairDistanceFilterFactory_Make( const SRASplitterFactory** cself,
1178         const SRATable* table, bool mp_dist_unknown, TMatepairDistance* mp_dist, uint32_t mp_dist_qty )
1179 {
1180     rc_t rc = 0;
1181     AlignPairDistanceFilterFactory* obj = NULL;
1182 
1183     if ( cself == NULL || table == NULL )
1184     {
1185         rc = RC( rcSRA, rcType, rcAllocating, rcParam, rcNull );
1186     }
1187     else
1188     {
1189         rc = SRASplitterFactory_Make( cself, eSplitterSpot, sizeof( *obj ),
1190                                       AlignPairDistanceFilterFactory_Init,
1191                                       AlignPairDistanceFilterFactory_NewObj,
1192                                       AlignPairDistanceFilterFactory_Release );
1193         if ( rc == 0 )
1194         {
1195             obj = ( AlignPairDistanceFilterFactory* ) *cself;
1196             obj->table = table;
1197             obj->mp_dist_unknown = mp_dist_unknown;
1198             obj->mp_dist = mp_dist;
1199             obj->mp_dist_qty = mp_dist_qty;
1200         }
1201     }
1202     return rc;
1203 }
1204 
1205 
1206 /* ============== FASTQ read type (bio/tech) filter ============================ */
1207 
1208 typedef struct FastqBioFilter_struct
1209 {
1210     const FastqReader* reader;
1211     uint64_t rejected_reads;
1212 } FastqBioFilter;
1213 
1214 
1215 /* filter out non-bio reads */
FastqBioFilter_GetKey(const SRASplitter * cself,const char ** key,spotid_t spot,readmask_t * readmask)1216 static rc_t FastqBioFilter_GetKey( const SRASplitter* cself, const char** key,
1217                                    spotid_t spot, readmask_t* readmask )
1218 {
1219     rc_t rc = 0;
1220     FastqBioFilter* self = ( FastqBioFilter* )cself;
1221 
1222     if ( self == NULL || key == NULL )
1223     {
1224         rc = RC( rcExe, rcNode, rcExecuting, rcParam, rcInvalid );
1225     }
1226     else
1227     {
1228         uint32_t num_reads = 0;
1229 
1230         *key = NULL;
1231         rc = FastqReaderSeekSpot( self->reader, spot );
1232         if ( rc == 0 )
1233         {
1234             rc = FastqReader_SpotInfo( self->reader, NULL, NULL, NULL, NULL, NULL, &num_reads );
1235             if ( rc == 0 )
1236             {
1237                 uint32_t readId;
1238                 SRA_DUMP_DBG( 3, ( "%s %u row reads: ", __func__, spot ) );
1239                 for (readId = 0; rc == 0 && readId < num_reads; readId++ )
1240                 {
1241                     if ( isset_readmask( readmask, readId ) )
1242                     {
1243                         SRAReadTypes read_type;
1244                         rc = FastqReader_SpotReadInfo( self->reader, readId + 1, &read_type,
1245                                                        NULL, NULL, NULL, NULL);
1246                         if ( rc == 0 )
1247                         {
1248                             if ( !( read_type & SRA_READ_TYPE_BIOLOGICAL ) )
1249                             {
1250                                 unset_readmask( readmask, readId );
1251                                 self->rejected_reads ++;
1252                             }
1253                             else
1254                             {
1255                                 SRA_DUMP_DBG( 3, ( " %u", readId ) );
1256                             }
1257                         }
1258                     }
1259                 }
1260                 *key = "";
1261                 SRA_DUMP_DBG( 3, ( " key '%s'\n", *key ) );
1262             }
1263         }
1264         else if ( GetRCObject( rc ) == rcRow && GetRCState( rc ) == rcNotFound )
1265         {
1266             SRA_DUMP_DBG( 3, ( "%s skipped %u row\n", __func__, spot ) );
1267             rc = 0;
1268         }
1269     }
1270     return rc;
1271 }
1272 
1273 
FastqBioFilter_Release(const SRASplitter * cself)1274 static rc_t FastqBioFilter_Release( const SRASplitter * cself )
1275 {
1276     rc_t rc = 0;
1277     FastqBioFilter * self = ( FastqBioFilter * )cself;
1278 
1279     if ( self == NULL )
1280         rc = RC( rcExe, rcNode, rcExecuting, rcParam, rcInvalid );
1281     else
1282     {
1283         if ( self->rejected_reads > 0 && !g_legacy_report )
1284             rc = KOutMsg( "Rejected %lu READS because of filtering out non-biological READS\n", self->rejected_reads );
1285     }
1286     return rc;
1287 }
1288 
1289 
1290 typedef struct FastqBioFilterFactory_struct
1291 {
1292     const char* accession;
1293     const SRATable* table;
1294     const FastqReader* reader;
1295 } FastqBioFilterFactory;
1296 
1297 
FastqBioFilterFactory_Init(const SRASplitterFactory * cself)1298 static rc_t FastqBioFilterFactory_Init( const SRASplitterFactory* cself )
1299 {
1300     rc_t rc = 0;
1301     FastqBioFilterFactory* self = ( FastqBioFilterFactory* )cself;
1302 
1303     if ( self == NULL )
1304     {
1305         rc = RC( rcExe, rcType, rcConstructing, rcParam, rcNull );
1306     }
1307     else
1308     {
1309         rc = FastqReaderMake( &self->reader, self->table, self->accession,
1310                               /* preserve orig spot format to save on conversions */
1311                               FastqArgs.is_platform_cs_native, false, FastqArgs.fasta > 0, false,
1312                               false, !FastqArgs.applyClip, FastqArgs.SuppressQualForCSKey, 0,
1313                               FastqArgs.offset, '\0', 0, 0) ;
1314     }
1315     return rc;
1316 }
1317 
1318 
FastqBioFilterFactory_NewObj(const SRASplitterFactory * cself,const SRASplitter ** splitter)1319 static rc_t FastqBioFilterFactory_NewObj( const SRASplitterFactory* cself, const SRASplitter** splitter )
1320 {
1321     rc_t rc = 0;
1322     FastqBioFilterFactory* self = ( FastqBioFilterFactory* )cself;
1323 
1324     if ( self == NULL )
1325     {
1326         rc = RC( rcExe, rcType, rcExecuting, rcParam, rcNull );
1327     }
1328     else
1329     {
1330         rc = SRASplitter_Make( splitter, sizeof( FastqBioFilter ),
1331                                FastqBioFilter_GetKey, NULL, NULL, FastqBioFilter_Release );
1332         if ( rc == 0 )
1333         {
1334             FastqBioFilter * filter = ( FastqBioFilter * )( * splitter );
1335             filter->reader = self->reader;
1336             filter->rejected_reads = 0;
1337         }
1338     }
1339     return rc;
1340 }
1341 
1342 
FastqBioFilterFactory_Release(const SRASplitterFactory * cself)1343 static void FastqBioFilterFactory_Release( const SRASplitterFactory* cself )
1344 {
1345     if ( cself != NULL )
1346     {
1347         FastqBioFilterFactory* self = ( FastqBioFilterFactory* )cself;
1348         FastqReaderWhack( self->reader );
1349     }
1350 }
1351 
1352 
FastqBioFilterFactory_Make(const SRASplitterFactory ** cself,const char * accession,const SRATable * table)1353 static rc_t FastqBioFilterFactory_Make( const SRASplitterFactory** cself,
1354                         const char* accession, const SRATable* table )
1355 {
1356     rc_t rc = 0;
1357     FastqBioFilterFactory* obj = NULL;
1358 
1359     if ( cself == NULL || accession == NULL || table == NULL )
1360     {
1361         rc = RC( rcExe, rcType, rcConstructing, rcParam, rcNull );
1362     }
1363     else
1364     {
1365         rc = SRASplitterFactory_Make( cself, eSplitterSpot, sizeof( *obj ),
1366                                       FastqBioFilterFactory_Init,
1367                                       FastqBioFilterFactory_NewObj,
1368                                       FastqBioFilterFactory_Release );
1369 
1370         if ( rc == 0 )
1371         {
1372             obj = ( FastqBioFilterFactory* )*cself;
1373             obj->accession = accession;
1374             obj->table = table;
1375         }
1376     }
1377     return rc;
1378 }
1379 
1380 
1381 /* ============== FASTQ number of reads filter ============================ */
1382 
1383 typedef struct FastqRNumberFilter_struct
1384 {
1385     const FastqReader* reader;
1386     uint64_t rejected_reads;
1387 } FastqRNumberFilter;
1388 
1389 
FastqRNumberFilter_GetKey(const SRASplitter * cself,const char ** key,spotid_t spot,readmask_t * readmask)1390 static rc_t FastqRNumberFilter_GetKey( const SRASplitter* cself, const char** key,
1391                                        spotid_t spot, readmask_t* readmask )
1392 {
1393     rc_t rc = 0;
1394     FastqRNumberFilter* self = ( FastqRNumberFilter* )cself;
1395 
1396     if ( self == NULL || key == NULL )
1397     {
1398         rc = RC( rcExe, rcNode, rcExecuting, rcParam, rcInvalid );
1399     }
1400     else
1401     {
1402         uint32_t num_reads = 0;
1403 
1404         *key = NULL;
1405         rc = FastqReaderSeekSpot( self->reader, spot );
1406         if ( rc == 0 )
1407         {
1408             rc = FastqReader_SpotInfo( self->reader, NULL, NULL, NULL, NULL, NULL, &num_reads );
1409             if ( rc == 0 )
1410             {
1411                 int readId, q;
1412 
1413                 SRA_DUMP_DBG( 3, ( "%s %u row reads: ", __func__, spot ) );
1414                 for ( readId = 0, q = 0; readId < num_reads; readId++ )
1415                 {
1416                     if ( isset_readmask( readmask, readId ) )
1417                     {
1418                         if ( ++q > FastqArgs.maxReads )
1419                         {
1420                             unset_readmask( readmask, 1 << readId );
1421                         }
1422                         else
1423                         {
1424                             SRA_DUMP_DBG( 3, ( " %u", readId ) );
1425                         }
1426                     }
1427                 }
1428                 *key = "";
1429                 SRA_DUMP_DBG( 3, ( " key '%s'\n", *key ) );
1430             }
1431         }
1432         else if ( GetRCObject( rc ) == rcRow && GetRCState( rc ) == rcNotFound )
1433         {
1434             SRA_DUMP_DBG( 3, ( "%s skipped %u row\n", __func__, spot ) );
1435             rc = 0;
1436         }
1437     }
1438     return rc;
1439 }
1440 
1441 
FastqRNumberFilter_Release(const SRASplitter * cself)1442 static rc_t FastqRNumberFilter_Release( const SRASplitter* cself )
1443 {
1444     rc_t rc = 0;
1445     FastqRNumberFilter * self = ( FastqRNumberFilter * )cself;
1446 
1447     if ( self == NULL )
1448     {
1449         rc = RC( rcExe, rcNode, rcExecuting, rcParam, rcInvalid );
1450     }
1451     else
1452     {
1453         if ( self->rejected_reads > 0 && !g_legacy_report )
1454             rc = KOutMsg( "Rejected %lu READS because of max. number of READS = %u\n", self->rejected_reads, FastqArgs.maxReads );
1455     }
1456     return rc;
1457 }
1458 
1459 typedef struct FastqRNumberFilterFactory_struct
1460 {
1461     const char* accession;
1462     const SRATable* table;
1463     const FastqReader* reader;
1464 } FastqRNumberFilterFactory;
1465 
1466 
FastqRNumberFilterFactory_Init(const SRASplitterFactory * cself)1467 static rc_t FastqRNumberFilterFactory_Init( const SRASplitterFactory* cself )
1468 {
1469     rc_t rc = 0;
1470     FastqRNumberFilterFactory* self = ( FastqRNumberFilterFactory* )cself;
1471 
1472     if ( self == NULL )
1473     {
1474         rc = RC( rcExe, rcType, rcConstructing, rcParam, rcNull );
1475     }
1476     else
1477     {
1478         rc = FastqReaderMake( &self->reader, self->table, self->accession,
1479                               /* preserve orig spot format to save on conversions */
1480                               FastqArgs.is_platform_cs_native, false, FastqArgs.fasta > 0, false,
1481                               false, !FastqArgs.applyClip, FastqArgs.SuppressQualForCSKey, 0,
1482                               FastqArgs.offset, '\0', 0, 0 );
1483     }
1484     return rc;
1485 }
1486 
1487 
FastqRNumberFilterFactory_NewObj(const SRASplitterFactory * cself,const SRASplitter ** splitter)1488 static rc_t FastqRNumberFilterFactory_NewObj( const SRASplitterFactory* cself, const SRASplitter** splitter )
1489 {
1490     rc_t rc = 0;
1491     FastqRNumberFilterFactory* self = ( FastqRNumberFilterFactory* )cself;
1492 
1493     if ( self == NULL )
1494     {
1495         rc = RC( rcExe, rcType, rcExecuting, rcParam, rcNull );
1496     }
1497     else
1498     {
1499         rc = SRASplitter_Make( splitter, sizeof( FastqRNumberFilter ),
1500                                FastqRNumberFilter_GetKey, NULL, NULL, FastqRNumberFilter_Release );
1501         if ( rc == 0 )
1502         {
1503             FastqRNumberFilter * filter = ( FastqRNumberFilter * )( * splitter );
1504             filter->reader = self->reader;
1505             filter->rejected_reads = 0;
1506         }
1507     }
1508     return rc;
1509 }
1510 
1511 
FastqRNumberFilterFactory_Release(const SRASplitterFactory * cself)1512 static void FastqRNumberFilterFactory_Release( const SRASplitterFactory* cself )
1513 {
1514     if ( cself != NULL )
1515     {
1516         FastqRNumberFilterFactory* self = ( FastqRNumberFilterFactory* )cself;
1517         FastqReaderWhack( self->reader );
1518     }
1519 }
1520 
1521 
FastqRNumberFilterFactory_Make(const SRASplitterFactory ** cself,const char * accession,const SRATable * table)1522 static rc_t FastqRNumberFilterFactory_Make( const SRASplitterFactory** cself,
1523                                 const char* accession, const SRATable* table )
1524 {
1525     rc_t rc = 0;
1526     FastqRNumberFilterFactory* obj = NULL;
1527 
1528     if ( cself == NULL || accession == NULL || table == NULL )
1529     {
1530         rc = RC( rcExe, rcType, rcConstructing, rcParam, rcNull );
1531     }
1532     else
1533     {
1534         rc = SRASplitterFactory_Make( cself, eSplitterSpot, sizeof( *obj ),
1535                                       FastqRNumberFilterFactory_Init,
1536                                       FastqRNumberFilterFactory_NewObj,
1537                                       FastqRNumberFilterFactory_Release );
1538         if ( rc == 0 )
1539         {
1540             obj = ( FastqRNumberFilterFactory* )*cself;
1541             obj->accession = accession;
1542             obj->table = table;
1543         }
1544     }
1545     return rc;
1546 }
1547 
1548 /* ============== FASTQ quality filter ============================ */
1549 
1550 typedef struct FastqQFilter_struct
1551 {
1552     const FastqReader * reader;
1553     KDataBuffer * buffer_for_read_column;
1554     KDataBuffer * buffer_for_quality;
1555     uint64_t rejected_spots;
1556     uint64_t rejected_reads;
1557 } FastqQFilter;
1558 
1559 
1560 #define MIN_QUAL_FOR_RULE_3 ( 33 + 2 )
1561 
QFilter_Test_new(const char * read,const char * quality,uint32_t readlen,uint32_t minlen,char N_char)1562 static bool QFilter_Test_new( const char * read, const char * quality, uint32_t readlen, uint32_t minlen, char N_char )
1563 {
1564     uint32_t idx, cnt, cnt1, start;
1565     char b0;
1566 
1567     /* 1st criteria : READ has to be longer than the minlen */
1568     if ( readlen < minlen ) return false;
1569 
1570     /* 2nd criteria : READ does not contain any N's in the first minlen bases */
1571     for ( idx = 0; idx < minlen; ++idx )
1572     {
1573         if ( read[ idx ] == N_char ) return false;
1574     }
1575 
1576     start = ( N_char == '.' ) ? 1 : 0;
1577     /* 3rd criteria : QUALITY values are all 2 or higher in the first minlen values */
1578     for ( idx = start; idx < minlen; ++idx )
1579     {
1580         if ( quality[ idx ] < MIN_QUAL_FOR_RULE_3 ) return false;
1581     }
1582 
1583     /* 4th criteria : READ contains more than one type of base in the first minlen bases */
1584     b0 = read[ 0 ];
1585     for ( idx = 1, cnt = 1; idx < minlen; ++idx )
1586     {
1587         if ( read[ idx ] == b0 ) cnt++;
1588     }
1589     if ( cnt == minlen ) return false;
1590 
1591     /* 5th criteria : READ does not contain more than 50% N's in its whole length */
1592     cnt = ( readlen / 2 );
1593     for ( idx = 0, cnt1 = 0; idx < readlen; ++idx )
1594     {
1595         if ( read[ idx ] == N_char ) cnt1++;
1596     }
1597     if ( cnt1 > cnt ) return false;
1598 
1599     /* 6th criteria : READ does not contain values other than ACGTN in its whole length */
1600     if ( N_char != '.' )
1601     {
1602         for ( idx = 0; idx < readlen; ++idx )
1603         {
1604             if ( ! ( ( read[ idx ] == 'A' ) || ( read[ idx ] == 'C' ) || ( read[ idx ] == 'G' ) ||
1605                     ( read[ idx ] == 'T' ) || ( read[ idx ] == 'N' ) ) )
1606                 return false;
1607         }
1608     }
1609 
1610     return true;
1611 }
1612 
1613 
QFilter_Test_old(const char * read,uint32_t readlen,bool cs_native)1614 static bool QFilter_Test_old( const char * read, uint32_t readlen, bool cs_native )
1615 {
1616     static char const* const baseStr = "NNNNNNNNNN";
1617     static char const* const colorStr = "..........";
1618     static const uint32_t strLen = 10;
1619     uint32_t xLen = readlen < strLen ? readlen : strLen;
1620 
1621     if ( cs_native )
1622     {
1623         if ( strncmp( &read[ 1 ], colorStr, xLen ) == 0 || strcmp( &read[ readlen - xLen + 1 ], colorStr ) == 0 )
1624             return false;
1625     }
1626     else
1627     {
1628         if ( strncmp( read, baseStr, xLen ) == 0 || strcmp( &read[ readlen - xLen ], baseStr ) == 0 )
1629             return false;
1630     }
1631     return true;
1632 }
1633 
1634 
1635 /* filter out reads by leading/trialing quality */
FastqQFilter_GetKey(const SRASplitter * cself,const char ** key,spotid_t spot,readmask_t * readmask)1636 static rc_t FastqQFilter_GetKey( const SRASplitter* cself, const char** key,
1637                                  spotid_t spot, readmask_t* readmask )
1638 {
1639     rc_t rc = 0;
1640     FastqQFilter* self = ( FastqQFilter* )cself;
1641 
1642     if ( self == NULL || key == NULL )
1643     {
1644         rc = RC( rcExe, rcNode, rcExecuting, rcParam, rcInvalid );
1645     }
1646     else
1647     {
1648         uint32_t num_reads = 0, spot_len;
1649 
1650         *key = NULL;
1651         rc = FastqReaderSeekSpot( self->reader, spot );
1652         if ( rc == 0 )
1653         {
1654             rc = FastqReader_SpotInfo( self->reader, NULL, NULL, NULL, NULL, &spot_len, &num_reads );
1655             if ( rc == 0 )
1656             {
1657                 SRA_DUMP_DBG( 3, ( "%s %u row reads: ", __func__, spot ) );
1658                 if ( FastqArgs.split_spot )
1659                 {
1660                     uint32_t readId;
1661                     INSDC_coord_len read_len;
1662                     for ( readId = 0; rc == 0 && readId < num_reads; readId++ )
1663                     {
1664                         if ( isset_readmask( readmask, readId ) )
1665                         {
1666                             rc = FastqReader_SpotReadInfo( self->reader, readId + 1, NULL, NULL, NULL, NULL, &read_len );
1667                             if ( rc == 0 )
1668                             {
1669                                 /* FastqReaderBase() in libsra! libs/sra/fastq.h
1670                                    IF_BUF ... very, very bad macro defined in factory.h !!
1671                                    it combines reading from table with resizing of a KDatabuffer in a loop until rc == 0 is reached!
1672                                 */
1673                                 IF_BUF( ( FastqReaderBase( self->reader, readId + 1,
1674                                           self->buffer_for_read_column->base,
1675                                           KDataBufferBytes( self->buffer_for_read_column ), NULL) ),
1676                                           self->buffer_for_read_column, read_len )
1677                                 {
1678                                     const char * b = self->buffer_for_read_column->base;
1679                                     bool filter_passed = true;
1680 
1681                                     if ( FastqArgs.qual_filter1 )
1682                                     {
1683                                         if ( quality_N_limit > 0 )
1684                                         {
1685                                             IF_BUF( ( FastqReaderQuality( self->reader, readId + 1,
1686                                                       self->buffer_for_quality->base,
1687                                                       KDataBufferBytes( self->buffer_for_quality ), NULL) ),
1688                                                       self->buffer_for_quality, read_len )
1689                                             {
1690                                                 const char * q = self->buffer_for_quality->base;
1691                                                 char N_char = FastqArgs.is_platform_cs_native ? '.' : 'N';
1692                                                 filter_passed = QFilter_Test_new( b, q, read_len, quality_N_limit, N_char );
1693                                             }
1694                                         }
1695                                     }
1696                                     else
1697                                         filter_passed = QFilter_Test_old( b, read_len, FastqArgs.is_platform_cs_native );
1698 
1699                                     if ( filter_passed )
1700                                         SRA_DUMP_DBG( 3, ( " %u", readId ) );
1701                                     else
1702                                     {
1703                                         unset_readmask( readmask, readId );
1704                                         /* READ did not pass filter */
1705                                         self->rejected_reads ++;
1706                                     }
1707                                 }
1708                             }
1709                         }
1710                     }
1711                     *key = "";
1712                 }
1713                 else
1714                 {
1715                     IF_BUF( ( FastqReaderBase( self->reader, 0, self->buffer_for_read_column->base,
1716                               KDataBufferBytes( self->buffer_for_read_column ), NULL ) ), self->buffer_for_read_column, spot_len )
1717                     {
1718                         const char* b = self->buffer_for_read_column->base;
1719                         bool filter_passed = true;
1720 
1721                         if ( FastqArgs.qual_filter1 )
1722                         {
1723                             if ( quality_N_limit > 0 )
1724                             {
1725                                 IF_BUF( ( FastqReaderQuality( self->reader, 0, self->buffer_for_quality->base,
1726                                           KDataBufferBytes( self->buffer_for_quality ), NULL ) ), self->buffer_for_quality, spot_len )
1727                                 {
1728                                     const char * q = self->buffer_for_quality->base;
1729                                     char N_char = FastqArgs.is_platform_cs_native ? '.' : 'N';
1730                                     filter_passed = QFilter_Test_new( b, q, spot_len, quality_N_limit, N_char );
1731                                 }
1732                             }
1733                         }
1734                         else
1735                             filter_passed = QFilter_Test_old( b, spot_len, FastqArgs.is_platform_cs_native );
1736 
1737                         if ( filter_passed )
1738                         {
1739                             *key = "";
1740                             SRA_DUMP_DBG( 3, ( " whole spot" ) );
1741                         }
1742                         else
1743                         {
1744                             /* SPOT did not pass filter */
1745                             self->rejected_spots ++;
1746                         }
1747                     }
1748                 }
1749                 SRA_DUMP_DBG( 3, ( " key '%s'\n", *key ) );
1750             }
1751         }
1752         else if ( GetRCObject( rc ) == rcRow && GetRCState( rc ) == rcNotFound )
1753         {
1754             SRA_DUMP_DBG( 3, ( "%s skipped %u row\n", __func__, spot ) );
1755             rc = 0;
1756         }
1757     }
1758     return rc;
1759 }
1760 
1761 
FastqQFilter_Release(const SRASplitter * cself)1762 static rc_t FastqQFilter_Release( const SRASplitter * cself )
1763 {
1764     rc_t rc = 0;
1765     FastqQFilter* self = ( FastqQFilter* )cself;
1766 
1767     if ( self == NULL )
1768         rc = RC( rcExe, rcNode, rcExecuting, rcParam, rcInvalid );
1769     else if ( !g_legacy_report )
1770     {
1771         if ( self->rejected_reads > 0 )
1772             rc = KOutMsg( "Rejected %lu READS because of Quality-Filtering\n", self->rejected_reads );
1773         if ( rc == 0 && self->rejected_spots > 0 )
1774             rc = KOutMsg( "Rejected %lu SPOTS because of Quality-Filtering\n", self->rejected_spots );
1775 
1776     }
1777     return rc;
1778 }
1779 
1780 
1781 typedef struct FastqQFilterFactory_struct
1782 {
1783     const char* accession;
1784     const SRATable* table;
1785     const FastqReader* reader;
1786     KDataBuffer kdbuf;
1787     KDataBuffer kdbuf2;
1788 } FastqQFilterFactory;
1789 
1790 
FastqQFilterFactory_Init(const SRASplitterFactory * cself)1791 static rc_t FastqQFilterFactory_Init( const SRASplitterFactory* cself )
1792 {
1793     rc_t rc = 0;
1794     FastqQFilterFactory* self = ( FastqQFilterFactory* )cself;
1795 
1796     if ( self == NULL )
1797     {
1798         rc = RC( rcExe, rcType, rcConstructing, rcParam, rcNull );
1799     }
1800     else
1801     {
1802         rc = KDataBufferMakeBytes( &self->kdbuf, DATABUFFERINITSIZE );
1803         if ( rc == 0 )
1804         {
1805             rc = KDataBufferMakeBytes( &self->kdbuf2, DATABUFFERINITSIZE );
1806             if ( rc == 0 )
1807             {
1808                 rc = FastqReaderMake( &self->reader, self->table, self->accession,
1809                              /* preserve orig spot format to save on conversions */
1810                              FastqArgs.is_platform_cs_native, false, FastqArgs.fasta > 0, false,
1811                              false, !FastqArgs.applyClip, FastqArgs.SuppressQualForCSKey, 0,
1812                              FastqArgs.offset, '\0', 0, 0 );
1813             }
1814         }
1815     }
1816     return rc;
1817 }
1818 
1819 
FastqQFilterFactory_NewObj(const SRASplitterFactory * cself,const SRASplitter ** splitter)1820 static rc_t FastqQFilterFactory_NewObj( const SRASplitterFactory* cself, const SRASplitter** splitter )
1821 {
1822     rc_t rc = 0;
1823     FastqQFilterFactory* self = ( FastqQFilterFactory* )cself;
1824 
1825     if ( self == NULL )
1826     {
1827         rc = RC( rcExe, rcType, rcExecuting, rcParam, rcNull );
1828     }
1829     else
1830     {
1831         rc = SRASplitter_Make( splitter, sizeof( FastqQFilter ), FastqQFilter_GetKey, NULL, NULL, FastqQFilter_Release );
1832         if ( rc == 0 )
1833         {
1834             FastqQFilter * filter = ( FastqQFilter * )( * splitter );
1835             filter->reader = self->reader;
1836             filter->buffer_for_read_column = &self->kdbuf;
1837             filter->buffer_for_quality = &self->kdbuf2;
1838             filter->rejected_spots = 0;
1839             filter->rejected_reads = 0;
1840         }
1841     }
1842     return rc;
1843 }
1844 
1845 
FastqQFilterFactory_Release(const SRASplitterFactory * cself)1846 static void FastqQFilterFactory_Release( const SRASplitterFactory* cself )
1847 {
1848     if ( cself != NULL )
1849     {
1850         FastqQFilterFactory* self = ( FastqQFilterFactory* )cself;
1851         KDataBufferWhack( &self->kdbuf );
1852         KDataBufferWhack( &self->kdbuf2 );
1853         FastqReaderWhack( self->reader );
1854     }
1855 }
1856 
1857 
FastqQFilterFactory_Make(const SRASplitterFactory ** cself,const char * accession,const SRATable * table)1858 static rc_t FastqQFilterFactory_Make( const SRASplitterFactory** cself,
1859                                       const char* accession, const SRATable* table )
1860 {
1861     rc_t rc = 0;
1862     FastqQFilterFactory* obj = NULL;
1863 
1864     if ( cself == NULL || accession == NULL || table == NULL )
1865     {
1866         rc = RC( rcExe, rcType, rcConstructing, rcParam, rcNull );
1867     }
1868     else
1869     {
1870         rc = SRASplitterFactory_Make( cself, eSplitterSpot, sizeof( *obj ),
1871                                       FastqQFilterFactory_Init,
1872                                       FastqQFilterFactory_NewObj,
1873                                       FastqQFilterFactory_Release );
1874         if ( rc == 0 )
1875         {
1876             obj = ( FastqQFilterFactory* )*cself;
1877             obj->accession = accession;
1878             obj->table = table;
1879         }
1880     }
1881     return rc;
1882 }
1883 
1884 
1885 /* ============== FASTQ min read length filter ============================ */
1886 
1887 
1888 typedef struct FastqReadLenFilter_struct
1889 {
1890     const FastqReader* reader;
1891     uint64_t rejected_spots;
1892     uint64_t rejected_reads;
1893 } FastqReadLenFilter;
1894 
1895 
FastqReadLenFilter_GetKey(const SRASplitter * cself,const char ** key,spotid_t spot,readmask_t * readmask)1896 static rc_t FastqReadLenFilter_GetKey( const SRASplitter* cself, const char** key,
1897                                        spotid_t spot, readmask_t* readmask )
1898 {
1899     rc_t rc = 0;
1900     FastqReadLenFilter* self = ( FastqReadLenFilter* )cself;
1901 
1902     if ( self == NULL || key == NULL )
1903     {
1904         rc = RC( rcExe, rcNode, rcExecuting, rcParam, rcInvalid );
1905     }
1906     else
1907     {
1908         uint32_t num_reads = 0, spot_len = 0;
1909 
1910         *key = NULL;
1911         rc = FastqReaderSeekSpot( self->reader, spot );
1912         if ( rc == 0 )
1913         {
1914             rc = FastqReader_SpotInfo( self->reader, NULL, NULL, NULL, NULL, &spot_len, &num_reads );
1915             if ( rc == 0 )
1916             {
1917                 SRA_DUMP_DBG( 3, ( "%s %u row reads:", __func__, spot ) );
1918                 if ( FastqArgs.split_spot )
1919                 {
1920                     uint32_t readId;
1921                     INSDC_coord_len read_len = 0;
1922                     for ( readId = 0; rc == 0 && readId < num_reads; readId++ )
1923                     {
1924                         if ( isset_readmask( readmask, readId ) )
1925                         {
1926                             rc = FastqReader_SpotReadInfo( self->reader, readId + 1, NULL, NULL, NULL, NULL, &read_len );
1927                             if ( rc == 0 )
1928                             {
1929                                 if ( read_len < FastqArgs.minReadLen )
1930                                 {
1931                                     unset_readmask( readmask, readId );
1932                                     self->rejected_reads ++;
1933                                 }
1934                                 else
1935                                 {
1936                                     SRA_DUMP_DBG( 3, ( " %u", readId ) );
1937                                 }
1938                             }
1939                         }
1940                     }
1941                     *key = "";
1942                 }
1943                 else
1944                 {
1945                     if ( spot_len >= FastqArgs.minReadLen )
1946                     {
1947                         *key = "";
1948                         SRA_DUMP_DBG( 3, ( " whole spot" ) );
1949                     }
1950                     else
1951                         self->rejected_spots ++;
1952                 }
1953                 SRA_DUMP_DBG( 3, ( "\n" ) );
1954             }
1955         }
1956         else if ( GetRCObject( rc ) == rcRow && GetRCState( rc ) == rcNotFound )
1957         {
1958             SRA_DUMP_DBG( 3, ( "%s skipped %u row\n", __func__, spot ) );
1959             rc = 0;
1960         }
1961     }
1962     return rc;
1963 }
1964 
1965 
FastqReadLenFilter_Release(const SRASplitter * cself)1966 static rc_t FastqReadLenFilter_Release( const SRASplitter * cself )
1967 {
1968     rc_t rc = 0;
1969     FastqReadLenFilter * self = ( FastqReadLenFilter* )cself;
1970 
1971     if ( self == NULL )
1972         rc = RC( rcExe, rcNode, rcExecuting, rcParam, rcInvalid );
1973     else if ( !g_legacy_report )
1974     {
1975         if ( self->rejected_reads > 0 )
1976             rc = KOutMsg( "Rejected %lu READS because READLEN < %u\n", self->rejected_reads, FastqArgs.minReadLen );
1977         if ( self->rejected_spots > 0 && rc == 0 )
1978             rc = KOutMsg( "Rejected %lu SPOTS because SPOTLEN < %u\n", self->rejected_spots, FastqArgs.minReadLen );
1979     }
1980     return rc;
1981 }
1982 
1983 
1984 typedef struct FastqReadLenFilterFactory_struct
1985 {
1986     const char* accession;
1987     const SRATable* table;
1988     const FastqReader* reader;
1989 } FastqReadLenFilterFactory;
1990 
1991 
FastqReadLenFilterFactory_Init(const SRASplitterFactory * cself)1992 static rc_t FastqReadLenFilterFactory_Init( const SRASplitterFactory* cself )
1993 {
1994     rc_t rc = 0;
1995     FastqReadLenFilterFactory* self = ( FastqReadLenFilterFactory* )cself;
1996 
1997     if ( self == NULL )
1998     {
1999         rc = RC( rcExe, rcType, rcConstructing, rcParam, rcNull );
2000     }
2001     else
2002     {
2003         rc = FastqReaderMake( &self->reader, self->table, self->accession,
2004                              /* preserve orig spot format to save on conversions */
2005                               FastqArgs.is_platform_cs_native, false, FastqArgs.fasta > 0, false,
2006                               false, !FastqArgs.applyClip, FastqArgs.SuppressQualForCSKey, 0,
2007                               FastqArgs.offset, '\0', 0, 0 );
2008     }
2009     return rc;
2010 }
2011 
2012 
FastqReadLenFilterFactory_NewObj(const SRASplitterFactory * cself,const SRASplitter ** splitter)2013 static rc_t FastqReadLenFilterFactory_NewObj( const SRASplitterFactory* cself, const SRASplitter** splitter )
2014 {
2015     rc_t rc = 0;
2016     FastqReadLenFilterFactory* self = ( FastqReadLenFilterFactory* )cself;
2017 
2018     if ( self == NULL )
2019     {
2020         rc = RC( rcExe, rcType, rcExecuting, rcParam, rcNull );
2021     }
2022     else
2023     {
2024         rc = SRASplitter_Make( splitter, sizeof( FastqReadLenFilter ),
2025                                FastqReadLenFilter_GetKey, NULL, NULL, FastqReadLenFilter_Release );
2026         if ( rc == 0 )
2027         {
2028             FastqReadLenFilter * filter = ( FastqReadLenFilter* )( *splitter );
2029             filter->reader = self->reader;
2030             filter->rejected_spots = 0;
2031             filter->rejected_reads = 0;
2032         }
2033     }
2034     return rc;
2035 }
2036 
2037 
FastqReadLenFilterFactory_Release(const SRASplitterFactory * cself)2038 static void FastqReadLenFilterFactory_Release( const SRASplitterFactory* cself )
2039 {
2040     if ( cself != NULL )
2041     {
2042         FastqReadLenFilterFactory* self = ( FastqReadLenFilterFactory* )cself;
2043         FastqReaderWhack( self->reader );
2044     }
2045 }
2046 
2047 
FastqReadLenFilterFactory_Make(const SRASplitterFactory ** cself,const char * accession,const SRATable * table)2048 static rc_t FastqReadLenFilterFactory_Make( const SRASplitterFactory** cself,
2049                             const char* accession, const SRATable* table )
2050 {
2051     rc_t rc = 0;
2052     FastqReadLenFilterFactory* obj = NULL;
2053 
2054     if ( cself == NULL || accession == NULL || table == NULL )
2055     {
2056         rc = RC( rcExe, rcType, rcConstructing, rcParam, rcNull );
2057     }
2058     else
2059     {
2060         rc = SRASplitterFactory_Make( cself, eSplitterSpot, sizeof( *obj ),
2061                                       FastqReadLenFilterFactory_Init,
2062                                       FastqReadLenFilterFactory_NewObj,
2063                                       FastqReadLenFilterFactory_Release );
2064         if ( rc == 0 )
2065         {
2066             obj = ( FastqReadLenFilterFactory* )*cself;
2067             obj->accession = accession;
2068             obj->table = table;
2069         }
2070     }
2071     return rc;
2072 }
2073 
2074 
2075 /* ============== FASTQ read splitter ============================ */
2076 
2077 char* FastqReadSplitter_key_buf = NULL;
2078 
2079 
2080 typedef struct FastqReadSplitter_struct
2081 {
2082     const FastqReader* reader;
2083     SRASplitter_Keys* keys;
2084     uint32_t keys_max;
2085 } FastqReadSplitter;
2086 
2087 
FastqReadSplitter_GetKeySet(const SRASplitter * cself,const SRASplitter_Keys ** key,uint32_t * keys,spotid_t spot,const readmask_t * readmask)2088 static rc_t FastqReadSplitter_GetKeySet( const SRASplitter* cself, const SRASplitter_Keys** key,
2089                     uint32_t* keys, spotid_t spot, const readmask_t* readmask )
2090 {
2091     rc_t rc = 0;
2092     FastqReadSplitter* self = ( FastqReadSplitter* )cself;
2093     const size_t key_offset = 5;
2094 
2095     if ( self == NULL || key == NULL )
2096     {
2097         rc = RC( rcExe, rcNode, rcExecuting, rcParam, rcInvalid );
2098     }
2099     else
2100     {
2101         uint32_t num_reads = 0;
2102 
2103         *keys = 0;
2104         if ( FastqReadSplitter_key_buf == NULL )
2105         {
2106             /* initial alloc key_buf: "  1\0  2\0...\0  9\0 10\0 11\0...\03220..\08192\0" */
2107             if ( nreads_max > 9999 )
2108             {
2109                 /* key_offset and sprintf format size are insufficient for keys longer than 4 digits */
2110                 rc = RC( rcExe, rcNode, rcExecuting, rcBuffer, rcInsufficient );
2111             }
2112             else
2113             {
2114                 FastqReadSplitter_key_buf = malloc( nreads_max * key_offset );
2115                 if ( FastqReadSplitter_key_buf == NULL )
2116                 {
2117                     rc = RC( rcExe, rcNode, rcExecuting, rcMemory, rcExhausted );
2118                 }
2119                 else
2120                 {
2121                     /* fill buffer w/keys */
2122                     int i;
2123                     char* p = FastqReadSplitter_key_buf;
2124                     for ( i = 1; rc == 0 && i <= nreads_max; i++ )
2125                     {
2126                         if ( sprintf( p, "%4u", i ) <= 0 )
2127                         {
2128                             rc = RC( rcExe, rcNode, rcExecuting, rcTransfer, rcIncomplete );
2129                         }
2130                         p += key_offset;
2131                     }
2132                 }
2133             }
2134         }
2135 
2136         if ( rc == 0 )
2137         {
2138             rc = FastqReaderSeekSpot( self->reader, spot );
2139             if ( rc == 0 )
2140             {
2141                 rc = FastqReader_SpotInfo( self->reader, NULL, NULL, NULL, NULL, NULL, &num_reads );
2142                 if ( rc == 0 )
2143                 {
2144                     uint32_t readId, good = 0;
2145 
2146                     SRA_DUMP_DBG( 3, ( "%s %u row reads:", __func__, spot ) );
2147                     for ( readId = 0; rc == 0 && readId < num_reads; readId++ )
2148                     {
2149                         rc = FastqReader_SpotReadInfo( self->reader, readId + 1, NULL, NULL, NULL, NULL, NULL );
2150                         if ( !isset_readmask( readmask, readId ) )
2151                         {
2152                             continue;
2153                         }
2154                         if ( self->keys_max < ( good + 1 ) )
2155                         {
2156                             void* p = realloc( self->keys, sizeof( *self->keys ) * ( good + 1 ) );
2157                             if ( p == NULL )
2158                             {
2159                                 rc = RC( rcExe, rcNode, rcExecuting, rcMemory, rcExhausted );
2160                                 break;
2161                             }
2162                             else
2163                             {
2164                                 self->keys = p;
2165                                 self->keys_max = good + 1;
2166                             }
2167                         }
2168                         self->keys[ good ].key = &FastqReadSplitter_key_buf[ readId * key_offset ];
2169                         while ( self->keys[ good ].key[ 0 ] == ' ' && self->keys[ good ].key[0] != '\0' )
2170                         {
2171                             self->keys[ good ].key++;
2172                         }
2173                         clear_readmask( self->keys[ good ].readmask );
2174                         set_readmask( self->keys[good].readmask, readId );
2175                         SRA_DUMP_DBG( 3, ( " key['%s']+=%u", self->keys[ good ].key, readId ) );
2176                         good++;
2177                     }
2178                     if ( rc == 0 )
2179                     {
2180                         *key = self->keys;
2181                         *keys = good;
2182                     }
2183                     SRA_DUMP_DBG( 3, ( "\n" ) );
2184                 }
2185             }
2186             else if ( GetRCObject( rc ) == rcRow && GetRCState( rc ) == rcNotFound )
2187             {
2188                 SRA_DUMP_DBG( 3, ( "%s skipped %u row\n", __func__, spot ) );
2189                 rc = 0;
2190             }
2191         }
2192         else if ( GetRCObject( rc ) == rcRow && GetRCState( rc ) == rcNotFound )
2193         {
2194                 SRA_DUMP_DBG( 3, ( "%s skipped %u row\n", __func__, spot ) );
2195                 rc = 0;
2196         }
2197     }
2198     return rc;
2199 }
2200 
2201 
FastqReadSplitter_Release(const SRASplitter * cself)2202 rc_t FastqReadSplitter_Release( const SRASplitter* cself )
2203 {
2204     if ( cself != NULL )
2205     {
2206         FastqReadSplitter* self = ( FastqReadSplitter* )cself;
2207         free( self->keys );
2208     }
2209     return 0;
2210 }
2211 
2212 
2213 typedef struct FastqReadSplitterFactory_struct
2214 {
2215     const char* accession;
2216     const SRATable* table;
2217     const FastqReader* reader;
2218 } FastqReadSplitterFactory;
2219 
2220 
FastqReadSplitterFactory_Init(const SRASplitterFactory * cself)2221 static rc_t FastqReadSplitterFactory_Init( const SRASplitterFactory* cself )
2222 {
2223     rc_t rc = 0;
2224     FastqReadSplitterFactory* self = ( FastqReadSplitterFactory* )cself;
2225 
2226     if ( self == NULL )
2227     {
2228         rc = RC( rcExe, rcType, rcConstructing, rcParam, rcNull );
2229     }
2230     else
2231     {
2232         rc = FastqReaderMake( &self->reader, self->table, self->accession,
2233                               /* preserve orig spot format to save on conversions */
2234                               FastqArgs.is_platform_cs_native, false, FastqArgs.fasta > 0, false,
2235                               false, !FastqArgs.applyClip, FastqArgs.SuppressQualForCSKey, 0,
2236                               FastqArgs.offset, '\0', 0, 0 );
2237     }
2238     return rc;
2239 }
2240 
2241 
FastqReadSplitterFactory_NewObj(const SRASplitterFactory * cself,const SRASplitter ** splitter)2242 static rc_t FastqReadSplitterFactory_NewObj( const SRASplitterFactory* cself, const SRASplitter** splitter )
2243 {
2244     rc_t rc = 0;
2245     FastqReadSplitterFactory* self = ( FastqReadSplitterFactory* )cself;
2246 
2247     if ( self == NULL )
2248     {
2249         rc = RC( rcExe, rcType, rcExecuting, rcParam, rcNull );
2250     }
2251     else
2252     {
2253         rc = SRASplitter_Make( splitter, sizeof( FastqReadSplitter ), NULL,
2254                                FastqReadSplitter_GetKeySet, NULL, FastqReadSplitter_Release );
2255         if ( rc == 0 )
2256         {
2257             ( (FastqReadSplitter*)(*splitter) )->reader = self->reader;
2258         }
2259     }
2260     return rc;
2261 }
2262 
2263 
FastqReadSplitterFactory_Release(const SRASplitterFactory * cself)2264 static void FastqReadSplitterFactory_Release( const SRASplitterFactory* cself )
2265 {
2266     if ( cself != NULL )
2267     {
2268         FastqReadSplitterFactory* self = ( FastqReadSplitterFactory* )cself;
2269         FastqReaderWhack( self->reader );
2270         free( FastqReadSplitter_key_buf );
2271         FastqReadSplitter_key_buf = NULL;
2272     }
2273 }
2274 
2275 
FastqReadSplitterFactory_Make(const SRASplitterFactory ** cself,const char * accession,const SRATable * table)2276 static rc_t FastqReadSplitterFactory_Make( const SRASplitterFactory** cself,
2277                 const char* accession, const SRATable* table )
2278 {
2279     rc_t rc = 0;
2280     FastqReadSplitterFactory* obj = NULL;
2281 
2282     if ( cself == NULL || accession == NULL || table == NULL )
2283     {
2284         rc = RC( rcExe, rcType, rcConstructing, rcParam, rcNull );
2285     }
2286     else
2287     {
2288         rc = SRASplitterFactory_Make( cself, eSplitterRead, sizeof( *obj ),
2289                                       FastqReadSplitterFactory_Init,
2290                                       FastqReadSplitterFactory_NewObj,
2291                                       FastqReadSplitterFactory_Release );
2292         if ( rc == 0 )
2293         {
2294             obj = ( FastqReadSplitterFactory* )*cself;
2295             obj->accession = accession;
2296             obj->table = table;
2297         }
2298     }
2299     return rc;
2300 }
2301 
2302 
2303 /* ============== FASTQ 3 read splitter ============================ */
2304 
2305 char* Fastq3ReadSplitter_key_buf = NULL;
2306 
2307 typedef struct Fastq3ReadSplitter_struct
2308 {
2309     const FastqReader* reader;
2310     SRASplitter_Keys keys[ 2 ];
2311 } Fastq3ReadSplitter;
2312 
2313 
2314 /* if all active reads len >= minreadlen, reads are splitted into separate files, otherwise on single key "" is returnded for all reads */
Fastq3ReadSplitter_GetKeySet(const SRASplitter * cself,const SRASplitter_Keys ** key,uint32_t * keys,spotid_t spot,const readmask_t * readmask)2315 static rc_t Fastq3ReadSplitter_GetKeySet( const SRASplitter* cself, const SRASplitter_Keys** key,
2316             uint32_t* keys, spotid_t spot, const readmask_t* readmask )
2317 {
2318     rc_t rc = 0;
2319     Fastq3ReadSplitter* self = ( Fastq3ReadSplitter* )cself;
2320     const size_t key_offset = 5;
2321 
2322     if ( self == NULL || key == NULL )
2323     {
2324         rc = RC( rcExe, rcNode, rcExecuting, rcParam, rcInvalid );
2325     }
2326     else
2327     {
2328         uint32_t num_reads = 0;
2329 
2330         *keys = 0;
2331         if ( Fastq3ReadSplitter_key_buf == NULL )
2332         {
2333             /* initial alloc key_buf: "  1\0  2\0...\0  9\0 10\0 11\0...\03220..\08192\0" */
2334             if ( nreads_max > 9999 )
2335             {
2336                 /* key_offset and sprintf format size are insufficient for keys longer than 4 digits */
2337                 rc = RC( rcExe, rcNode, rcExecuting, rcBuffer, rcInsufficient );
2338             }
2339             else
2340             {
2341                 Fastq3ReadSplitter_key_buf = malloc( nreads_max * key_offset );
2342                 if ( Fastq3ReadSplitter_key_buf == NULL )
2343                 {
2344                     rc = RC( rcExe, rcNode, rcExecuting, rcMemory, rcExhausted );
2345                 }
2346                 else
2347                 {
2348                     /* fill buffer w/keys */
2349                     int i;
2350                     char* p = Fastq3ReadSplitter_key_buf;
2351                     for ( i = 1; rc == 0 && i <= nreads_max; i++ )
2352                     {
2353                         if ( sprintf( p, "%4u", i ) <= 0 )
2354                         {
2355                             rc = RC( rcExe, rcNode, rcExecuting, rcTransfer, rcIncomplete );
2356                         }
2357                         p += key_offset;
2358                     }
2359                 }
2360             }
2361         }
2362 
2363         if ( rc == 0 )
2364         {
2365             rc = FastqReaderSeekSpot( self->reader, spot );
2366             if ( rc == 0 )
2367             {
2368                 rc = FastqReader_SpotInfo( self->reader, NULL, NULL, NULL, NULL, NULL, &num_reads );
2369                 if ( rc == 0 )
2370                 {
2371                     uint32_t readId, good  = 0;
2372                     const uint32_t max_reads = sizeof( self->keys ) /sizeof( self->keys[ 0 ] );
2373 
2374                     SRA_DUMP_DBG( 3, ( "%s %u row reads:", __func__, spot ) );
2375                     for ( readId = 0; rc == 0 && readId < num_reads && good < max_reads; readId++ )
2376                     {
2377                         rc = FastqReader_SpotReadInfo( self->reader, readId + 1, NULL, NULL, NULL, NULL, NULL );
2378                         if ( !isset_readmask( readmask, readId ) )
2379                         {
2380                             continue;
2381                         }
2382                         self->keys[ good ].key = &Fastq3ReadSplitter_key_buf[ good * key_offset ];
2383                         while ( self->keys[ good ].key[ 0 ] == ' ' && self->keys[good].key[ 0 ] != '\0' )
2384                         {
2385                             self->keys[ good ].key++;
2386                         }
2387                         clear_readmask( self->keys[ good ].readmask );
2388                         set_readmask( self->keys[ good ].readmask, readId );
2389                         SRA_DUMP_DBG( 3, ( " key['%s']+=%u", self->keys[ good ].key, readId ) );
2390                         good++;
2391                     }
2392                     if ( rc == 0 )
2393                     {
2394                         *key = self->keys;
2395                         *keys = good;
2396                         if ( good != max_reads )
2397                         {
2398                             /* some are short -> reset keys to same value for all valid reads */
2399                             /* run has just one read -> no suffix */
2400                             /* or single file was requested */
2401                             for ( readId = 0; readId < good; readId++ )
2402                             {
2403                                 self->keys[ readId ].key = "";
2404                             }
2405                             SRA_DUMP_DBG( 3, ( " all keys joined to ''" ) );
2406                         }
2407                     }
2408                     SRA_DUMP_DBG( 3, ( "\n" ) );
2409                 }
2410             }
2411             else if ( GetRCObject( rc ) == rcRow && GetRCState( rc ) == rcNotFound )
2412             {
2413                 SRA_DUMP_DBG( 3, ( "%s skipped %u row\n", __func__, spot ) );
2414                 rc = 0;
2415             }
2416         }
2417         else if ( GetRCObject( rc ) == rcRow && GetRCState( rc ) == rcNotFound )
2418         {
2419             SRA_DUMP_DBG( 3, ( "%s skipped %u row\n", __func__, spot ) );
2420             rc = 0;
2421         }
2422     }
2423     return rc;
2424 }
2425 
2426 typedef struct Fastq3ReadSplitterFactory_struct
2427 {
2428     const char* accession;
2429     const SRATable* table;
2430     const FastqReader* reader;
2431 } Fastq3ReadSplitterFactory;
2432 
2433 
Fastq3ReadSplitterFactory_Init(const SRASplitterFactory * cself)2434 static rc_t Fastq3ReadSplitterFactory_Init( const SRASplitterFactory* cself )
2435 {
2436     rc_t rc = 0;
2437     Fastq3ReadSplitterFactory* self = ( Fastq3ReadSplitterFactory* )cself;
2438 
2439     if ( self == NULL )
2440     {
2441         rc = RC( rcExe, rcType, rcConstructing, rcParam, rcNull );
2442     }
2443     else
2444     {
2445         rc = FastqReaderMake( &self->reader, self->table, self->accession,
2446                               /* preserve orig spot format to save on conversions */
2447                               FastqArgs.is_platform_cs_native, false, FastqArgs.fasta > 0, false,
2448                               false, !FastqArgs.applyClip, FastqArgs.SuppressQualForCSKey, 0,
2449                               FastqArgs.offset, '\0', 0, 0 );
2450     }
2451     return rc;
2452 }
2453 
2454 
Fastq3ReadSplitterFactory_NewObj(const SRASplitterFactory * cself,const SRASplitter ** splitter)2455 static rc_t Fastq3ReadSplitterFactory_NewObj( const SRASplitterFactory* cself, const SRASplitter** splitter )
2456 {
2457     rc_t rc = 0;
2458     Fastq3ReadSplitterFactory* self = ( Fastq3ReadSplitterFactory* )cself;
2459 
2460     if ( self == NULL )
2461     {
2462         rc = RC( rcExe, rcType, rcExecuting, rcParam, rcNull );
2463     }
2464     else
2465     {
2466         rc = SRASplitter_Make( splitter, sizeof( Fastq3ReadSplitter ),
2467                                NULL, Fastq3ReadSplitter_GetKeySet, NULL, NULL );
2468         if ( rc == 0 )
2469         {
2470             ( (Fastq3ReadSplitter*)(*splitter) )->reader = self->reader;
2471         }
2472     }
2473     return rc;
2474 }
2475 
2476 
Fastq3ReadSplitterFactory_Release(const SRASplitterFactory * cself)2477 static void Fastq3ReadSplitterFactory_Release( const SRASplitterFactory* cself )
2478 {
2479     if ( cself != NULL )
2480     {
2481         Fastq3ReadSplitterFactory* self = ( Fastq3ReadSplitterFactory* )cself;
2482         FastqReaderWhack( self->reader );
2483         memset ( self, 0, sizeof * self );
2484 
2485         free( Fastq3ReadSplitter_key_buf );
2486         Fastq3ReadSplitter_key_buf = NULL;
2487     }
2488 }
2489 
2490 
Fastq3ReadSplitterFactory_Make(const SRASplitterFactory ** cself,const char * accession,const SRATable * table)2491 static rc_t Fastq3ReadSplitterFactory_Make( const SRASplitterFactory** cself,
2492                     const char* accession, const SRATable* table )
2493 {
2494     rc_t rc = 0;
2495     Fastq3ReadSplitterFactory* obj = NULL;
2496 
2497     if ( cself == NULL || accession == NULL || table == NULL )
2498     {
2499         rc = RC( rcExe, rcType, rcConstructing, rcParam, rcNull );
2500     }
2501     else
2502     {
2503         rc = SRASplitterFactory_Make( cself, eSplitterRead, sizeof( *obj ),
2504                                       Fastq3ReadSplitterFactory_Init,
2505                                       Fastq3ReadSplitterFactory_NewObj,
2506                                       Fastq3ReadSplitterFactory_Release );
2507         if ( rc == 0 )
2508         {
2509             obj = ( Fastq3ReadSplitterFactory* )*cself;
2510             obj->accession = accession;
2511             obj->table = table;
2512         }
2513     }
2514     return rc;
2515 }
2516 
2517 
2518 /* ============== FASTQ formatter object  ============================ */
2519 
2520 
2521 typedef struct FastqFormatterSplitter_struct
2522 {
2523     const char* accession;
2524     const FastqReader* reader;
2525     KDataBuffer* b[ 5 ];
2526     size_t bsz[ 5 ]; /* fifth is for fasta line wrap */
2527 } FastqFormatterSplitter;
2528 
2529 
FastqFormatterSplitter_WrapLine(const KDataBuffer * src,size_t src_sz,KDataBuffer * dst,size_t * dst_sz,const uint64_t width)2530 static rc_t FastqFormatterSplitter_WrapLine( const KDataBuffer* src, size_t src_sz,
2531                                              KDataBuffer* dst, size_t* dst_sz, const uint64_t width )
2532 {
2533     rc_t rc = 0;
2534     size_t sz = src_sz + ( src_sz - 1 ) / width;
2535 
2536     if ( KDataBufferBytes( dst ) < sz )
2537     {
2538         SRA_DUMP_DBG( 10, ( "%s grow buffer from %u to %u\n", __func__, KDataBufferBytes( dst ), sz + 10 ) );
2539         rc = KDataBufferResize( dst, sz + 10 );
2540     }
2541     if ( rc == 0 )
2542     {
2543         const char* s = src->base;
2544         char* d = dst->base;
2545         *dst_sz = sz = src_sz;
2546         while ( sz > 0 )
2547         {
2548             memmove( d, s, sz > width ? width : sz );
2549             if ( sz <= width )
2550             {
2551                 break;
2552             }
2553             d += width;
2554             *d++ = '\n';
2555             *dst_sz = *dst_sz + 1;
2556             s += width;
2557             sz -= width;
2558         }
2559     }
2560     return rc;
2561 }
2562 
2563 
FastqFormatterSplitter_DumpByRead(const SRASplitter * cself,spotid_t spot,const readmask_t * readmask)2564 static rc_t FastqFormatterSplitter_DumpByRead( const SRASplitter* cself, spotid_t spot, const readmask_t* readmask )
2565 {
2566     rc_t rc = 0;
2567     FastqFormatterSplitter* self = ( FastqFormatterSplitter* )cself;
2568 
2569     if ( self == NULL )
2570     {
2571         rc = RC( rcExe, rcType, rcExecuting, rcParam, rcNull );
2572     }
2573     else
2574     {
2575         rc = FastqReaderSeekSpot( self->reader, spot );
2576         if ( rc == 0 )
2577         {
2578             rc = SRASplitter_FileActivate( cself, FastqArgs.file_extension );
2579             if ( rc == 0 )
2580             {
2581                 DeflineData def_data;
2582                 const char* spot_name = NULL, *spot_group = NULL, *read_name = NULL;
2583                 uint32_t readIdx, spot_len = 0;
2584                 uint32_t num_reads, readId, k;
2585                 size_t sname_sz, sgrp_sz;
2586                 INSDC_coord_len rlabel_sz = 0, read_len = 0;
2587 
2588                 if ( FastqArgs.b_defline || FastqArgs.q_defline )
2589                 {
2590                     rc = FastqReader_SpotInfo( self->reader, &spot_name, &sname_sz, &spot_group, &sgrp_sz, &spot_len, &num_reads );
2591                 }
2592                 else
2593                 {
2594                     rc = FastqReader_SpotInfo( self->reader, NULL, NULL, NULL, NULL, NULL, &num_reads );
2595                 }
2596 
2597                 for ( readId = 1, readIdx = 1; rc == 0 && readId <= num_reads; readId++, readIdx++ )
2598                 {
2599                     if ( !isset_readmask( readmask, readId - 1 ) )
2600                     {
2601                         continue;
2602                     }
2603                     if ( FastqArgs.b_defline || FastqArgs.q_defline )
2604                     {
2605                         rc = FastqReader_SpotReadInfo( self->reader, readId, NULL, &read_name, &rlabel_sz, NULL, &read_len );
2606                         if ( rc == 0 )
2607                         {
2608                             rc = Defline_Bind( &def_data, self->accession, &spot, spot_name, sname_sz, spot_group, sgrp_sz,
2609                                                &spot_len, &readIdx, read_name, rlabel_sz, &read_len );
2610                         }
2611                     }
2612                     if ( rc == 0 )
2613                     {
2614                         if ( FastqArgs.b_defline )
2615                         {
2616                             IF_BUF( ( Defline_Build( FastqArgs.b_defline, &def_data, self->b[0]->base,
2617                                       KDataBufferBytes( self->b[0] ), &self->bsz[0] ) ), self->b[0], self->bsz[0] );
2618                         }
2619                         else
2620                         {
2621                             IF_BUF( ( FastqReaderBaseName( self->reader, readId, &FastqArgs.dumpCs, self->b[0]->base,
2622                                       KDataBufferBytes( self->b[0] ), &self->bsz[0] ) ), self->b[0], self->bsz[0] );
2623                         }
2624                     }
2625                     if ( rc == 0 )
2626                     {
2627                         if ( FastqArgs.fasta > 0 )
2628                         {
2629                             IF_BUF( ( FastqReaderBase( self->reader, readId, self->b[4]->base, KDataBufferBytes( self->b[4] ),
2630                                       &self->bsz[4] ) ), self->b[4], self->bsz[4] )
2631                             {
2632                                 rc = FastqFormatterSplitter_WrapLine( self->b[4], self->bsz[4], self->b[1], &self->bsz[1], FastqArgs.fasta );
2633                             }
2634                         }
2635                         else
2636                         {
2637                             IF_BUF( ( FastqReaderBase( self->reader, readId, self->b[1]->base,
2638                                       KDataBufferBytes( self->b[1] ), &self->bsz[1] ) ), self->b[1], self->bsz[1] );
2639                         }
2640                     }
2641                     if ( !FastqArgs.fasta && rc == 0 )
2642                     {
2643                         if ( FastqArgs.q_defline )
2644                         {
2645                             IF_BUF( ( Defline_Build( FastqArgs.q_defline, &def_data, self->b[2]->base,
2646                                       KDataBufferBytes( self->b[2] ), &self->bsz[2] ) ), self->b[2], self->bsz[2] );
2647                         }
2648                         else
2649                         {
2650                             IF_BUF( ( FastqReaderQualityName( self->reader, readId, &FastqArgs.dumpCs, self->b[2]->base,
2651                                       KDataBufferBytes( self->b[2] ), &self->bsz[2] ) ), self->b[2], self->bsz[2] );
2652                         }
2653                         if ( rc == 0 )
2654                         {
2655                             IF_BUF( ( FastqReaderQuality( self->reader, readId, self->b[3]->base,
2656                                       KDataBufferBytes( self->b[3] ), &self->bsz[3] ) ), self->b[3], self->bsz[3] );
2657                         }
2658                     }
2659                     else if ( ( (char*)(self->b[0]->base))[0] == '@' )
2660                     {
2661                         ( (char*)(self->b[0]->base) )[0] = '>';
2662                     }
2663                     for ( k = 0; rc == 0 && k < ( FastqArgs.fasta ? 2 : 4); k++ )
2664                     {
2665                         rc = SRASplitter_FileWrite( cself, spot, self->b[ k ]->base, self->bsz[ k ] );
2666                         if ( rc == 0 )
2667                         {
2668                             rc = SRASplitter_FileWrite( cself, spot, "\n", 1 );
2669                         }
2670                     }
2671                 }
2672             }
2673         }
2674         else if ( GetRCObject( rc ) == rcRow && GetRCState( rc ) == rcNotFound )
2675         {
2676             SRA_DUMP_DBG( 3, ( "%s skipped %u row\n", __func__, spot ) );
2677             rc = 0;
2678         }
2679     }
2680     return rc;
2681 }
2682 
2683 
Fasta_dump(const SRASplitter * cself,FastqFormatterSplitter * self,spotid_t spot,uint32_t columns)2684 static rc_t Fasta_dump( const SRASplitter* cself, FastqFormatterSplitter* self, spotid_t spot, uint32_t columns )
2685 {
2686     uint32_t readId;
2687     rc_t rc = FastqFormatterSplitter_WrapLine( self->b[4], self->bsz[4], self->b[1], &self->bsz[1], columns );
2688     for ( readId = 0; rc == 0 && readId < 2; readId++ )
2689     {
2690         rc = SRASplitter_FileWrite( cself, spot, self->b[readId]->base, self->bsz[ readId ] );
2691         if ( rc == 0 )
2692         {
2693             rc = SRASplitter_FileWrite( cself, spot, "\n", 1 );
2694         }
2695     }
2696     return rc;
2697 }
2698 
2699 
FastqFormatterSplitter_DumpBySpot(const SRASplitter * cself,spotid_t spot,const readmask_t * readmask)2700 static rc_t FastqFormatterSplitter_DumpBySpot( const SRASplitter* cself, spotid_t spot, const readmask_t* readmask )
2701 {
2702     rc_t rc = 0;
2703     FastqFormatterSplitter* self = ( FastqFormatterSplitter* )cself;
2704 
2705     if ( self == NULL )
2706     {
2707         rc = RC( rcExe, rcType, rcExecuting, rcParam, rcNull );
2708     }
2709     else
2710     {
2711         rc = FastqReaderSeekSpot( self->reader, spot );
2712         if ( rc == 0 )
2713         {
2714             uint32_t num_reads, readId;
2715 
2716             rc = FastqReader_SpotInfo( self->reader, NULL, NULL, NULL, NULL, NULL, &num_reads );
2717             for ( readId = 0; rc == 0 && readId < num_reads; readId++ )
2718             {
2719                 /* if any read in readmask is not set - skip whole spot */
2720                 if ( !isset_readmask( readmask, readId ) )
2721                 {
2722                     return 0;
2723                 }
2724             }
2725             if ( rc == 0 )
2726             {
2727                 rc = SRASplitter_FileActivate( cself, FastqArgs.file_extension );
2728                 if ( rc == 0 )
2729                 {
2730                     DeflineData def_data;
2731                     const char* spot_name = NULL, *spot_group = NULL;
2732                     uint32_t spot_len = 0;
2733                     size_t writ, sname_sz, sgrp_sz;
2734                     const int base_i = FastqArgs.fasta ? 4 : 1;
2735 
2736                     if ( FastqArgs.b_defline || FastqArgs.q_defline )
2737                     {
2738                         rc = FastqReader_SpotInfo( self->reader, &spot_name, &sname_sz,
2739                                                    &spot_group, &sgrp_sz, &spot_len, NULL );
2740                         if ( rc == 0 )
2741                         {
2742                             rc = Defline_Bind( &def_data, self->accession, &spot, spot_name,
2743                                                sname_sz, spot_group, sgrp_sz,
2744                                                &spot_len, &readId, NULL, 0, &spot_len );
2745                         }
2746                     }
2747 
2748                     if ( rc == 0 )
2749                     {
2750                         if ( FastqArgs.b_defline )
2751                         {
2752                             IF_BUF( ( Defline_Build( FastqArgs.b_defline, &def_data, self->b[0]->base,
2753                                       KDataBufferBytes( self->b[0] ), &self->bsz[0] ) ), self->b[0], self->bsz[0] );
2754                         }
2755                         else
2756                         {
2757                             IF_BUF( ( FastqReaderBaseName( self->reader, 0, &FastqArgs.dumpCs, self->b[0]->base,
2758                                       KDataBufferBytes( self->b[0] ), &self->bsz[0] ) ), self->b[0], self->bsz[0] );
2759                         }
2760                         self->bsz[ base_i ] = 0;
2761                         if ( !FastqArgs.fasta && rc == 0 )
2762                         {
2763                             if ( FastqArgs.q_defline )
2764                             {
2765                                 IF_BUF( ( Defline_Build( FastqArgs.q_defline, &def_data, self->b[2]->base,
2766                                           KDataBufferBytes( self->b[2] ), &self->bsz[2] ) ), self->b[2], self->bsz[2] );
2767                             }
2768                             else
2769                             {
2770                                 IF_BUF( ( FastqReaderQualityName( self->reader, 0, &FastqArgs.dumpCs, self->b[2]->base,
2771                                           KDataBufferBytes( self->b[2] ), &self->bsz[2] ) ), self->b[2], self->bsz[2] );
2772                             }
2773                             self->bsz[3] = 0;
2774                         }
2775                         else if ( ( (char*)(self->b[0]->base) )[0] == '@' )
2776                         {
2777                             ( (char*)(self->b[0]->base) )[0] = '>';
2778                         }
2779                     }
2780 
2781                     for ( readId = 1; rc == 0 && readId <= num_reads; readId++ )
2782                     {
2783                         IF_BUF( ( FastqReaderBase( self->reader, readId, &( (char*)(self->b[base_i]->base) )[ self->bsz[ base_i ] ],
2784                                   KDataBufferBytes( self->b[ base_i ] ) - self->bsz[ base_i ], &writ) ), self->b[ base_i ], self->bsz[ base_i ] + writ )
2785                         {
2786                             self->bsz[base_i] += writ;
2787                             if ( !FastqArgs.fasta )
2788                             {
2789                                 IF_BUF( ( FastqReaderQuality( self->reader, readId, &( (char*)(self->b[3]->base) )[ self->bsz[3] ],
2790                                         KDataBufferBytes( self->b[3] ) - self->bsz[3], &writ ) ), self->b[3], self->bsz[3] + writ )
2791                                 {
2792                                     self->bsz[ 3 ] += writ;
2793                                 }
2794                             }
2795                         }
2796                     }
2797 
2798                     if ( rc == 0 )
2799                     {
2800                         if ( FastqArgs.fasta > 0 )
2801                         {
2802                             /* special printint of fasta-output ( line-wrap... ) */
2803                             rc = Fasta_dump( cself, self, spot, FastqArgs.fasta );
2804                         }
2805                         else
2806                         {
2807                             for ( readId = 0; rc == 0 && readId < 4; readId++ )
2808                             {
2809                                 rc = SRASplitter_FileWrite( cself, spot, self->b[readId]->base, self->bsz[ readId ] );
2810                                 if ( rc == 0 )
2811                                 {
2812                                     rc = SRASplitter_FileWrite( cself, spot, "\n", 1 );
2813                                 }
2814                             }
2815                         }
2816                     }
2817                 }
2818             }
2819         }
2820         else if ( GetRCObject( rc ) == rcRow && GetRCState( rc ) == rcNotFound )
2821         {
2822             SRA_DUMP_DBG( 3, ( "%s skipped %u row\n", __func__, spot ) );
2823             rc = 0;
2824         }
2825     }
2826     return rc;
2827 }
2828 
2829 
2830 typedef struct FastqFormatterFactory_struct
2831 {
2832     const char* accession;
2833     const SRATable* table;
2834     const FastqReader* reader;
2835     KDataBuffer buf[ 5 ]; /* fifth is for fasta line wrap */
2836 } FastqFormatterFactory;
2837 
2838 
FastqFormatterFactory_Init(const SRASplitterFactory * cself)2839 static rc_t FastqFormatterFactory_Init( const SRASplitterFactory* cself )
2840 {
2841     rc_t rc = 0;
2842     FastqFormatterFactory* self = ( FastqFormatterFactory* )cself;
2843 
2844     if ( self == NULL )
2845     {
2846         rc = RC( rcExe, rcType, rcConstructing, rcParam, rcNull );
2847     }
2848     else
2849     {
2850         rc = FastqReaderMake( &self->reader, self->table, self->accession,
2851                      FastqArgs.dumpCs, FastqArgs.dumpOrigFmt, FastqArgs.fasta > 0, FastqArgs.dumpCs,
2852                      FastqArgs.readIds, !FastqArgs.applyClip, FastqArgs.SuppressQualForCSKey, 0,
2853                      FastqArgs.offset, FastqArgs.desiredCsKey[ 0 ], 0, 0 );
2854         if ( rc == 0 )
2855         {
2856             int i;
2857             for( i = 0; rc == 0 && i < sizeof( self->buf ) / sizeof( self->buf[ 0 ] ); i++ )
2858             {
2859                 rc = KDataBufferMakeBytes( &self->buf[ i ], DATABUFFERINITSIZE );
2860             }
2861         }
2862     }
2863     return rc;
2864 }
2865 
2866 
FastqFormatterFactory_NewObj(const SRASplitterFactory * cself,const SRASplitter ** splitter)2867 static rc_t FastqFormatterFactory_NewObj( const SRASplitterFactory* cself, const SRASplitter** splitter )
2868 {
2869     rc_t rc = 0;
2870     FastqFormatterFactory* self = ( FastqFormatterFactory* )cself;
2871 
2872     if ( self == NULL )
2873     {
2874         rc = RC( rcExe, rcType, rcExecuting, rcParam, rcNull );
2875     }
2876     else
2877     {
2878         rc = SRASplitter_Make ( splitter, sizeof( FastqFormatterSplitter ), NULL, NULL,
2879                                 FastqArgs.split_spot ?
2880                                     FastqFormatterSplitter_DumpByRead :
2881                                     FastqFormatterSplitter_DumpBySpot, NULL );
2882         if ( rc == 0 )
2883         {
2884             int i;
2885             ( (FastqFormatterSplitter*)(*splitter) )->accession = self->accession;
2886             ( (FastqFormatterSplitter*)(*splitter) )->reader = self->reader;
2887             for ( i = 0; i < sizeof( self->buf ) / sizeof( self->buf[ 0 ] ); i++ )
2888             {
2889                 ( (FastqFormatterSplitter*)(*splitter) )->b[ i ] = &self->buf[ i ];
2890             }
2891         }
2892     }
2893     return rc;
2894 }
2895 
2896 
FastqFormatterFactory_Release(const SRASplitterFactory * cself)2897 static void FastqFormatterFactory_Release( const SRASplitterFactory* cself )
2898 {
2899     if ( cself != NULL )
2900     {
2901         int i;
2902         FastqFormatterFactory* self = ( FastqFormatterFactory* )cself;
2903         FastqReaderWhack( self->reader );
2904         for ( i = 0; i < sizeof( self->buf ) / sizeof( self->buf[ 0 ] ); i++ )
2905         {
2906             KDataBufferWhack( &self->buf[ i ] );
2907         }
2908     }
2909 }
2910 
2911 
FastqFormatterFactory_Make(const SRASplitterFactory ** cself,const char * accession,const SRATable * table)2912 static rc_t FastqFormatterFactory_Make( const SRASplitterFactory** cself,
2913                 const char* accession, const SRATable* table )
2914 {
2915     rc_t rc = 0;
2916     FastqFormatterFactory* obj = NULL;
2917 
2918     if ( cself == NULL || accession == NULL || table == NULL )
2919     {
2920         rc = RC( rcExe, rcType, rcConstructing, rcParam, rcNull );
2921     }
2922     else
2923     {
2924         rc = SRASplitterFactory_Make( cself, eSplitterFormat, sizeof( *obj ),
2925                                       FastqFormatterFactory_Init,
2926                                       FastqFormatterFactory_NewObj,
2927                                       FastqFormatterFactory_Release );
2928         if ( rc == 0 )
2929         {
2930             obj = (FastqFormatterFactory*)*cself;
2931             obj->accession = accession;
2932             obj->table = table;
2933         }
2934     }
2935     return rc;
2936 }
2937 
2938 
2939 /* ### External entry points ##################################################### */
2940 
2941 
2942 const char UsageDefaultName[] = "fastq-dump";
2943 
2944 
UsageSummary(const char * progname)2945 rc_t CC UsageSummary ( const char * progname )
2946 {
2947     return 0;
2948 }
2949 
2950 #define H_splip_sot 0
2951 #define H_clip 1
2952 #define H_minReadLen 2
2953 #define H_qual_filter 3
2954 #define H_skip_tech 4
2955 #define H_split_files 5
2956 #define H_split_3 6
2957 #define H_dumpcs 7
2958 #define H_dumpbase 8
2959 #define H_offset 9
2960 #define H_fasta 10
2961 #define H_origfmt 11
2962 #define H_readids 12
2963 #define H_helicos 13
2964 #define H_defline_seq 14
2965 #define H_defline_qual 15
2966 #define H_aligned 16
2967 #define H_unaligned 17
2968 #define H_aligned_region 18
2969 #define H_matepair_distance 19
2970 #define H_qual_filter_1 20
2971 #define H_SuppressQualForCSKey 21
2972 
FastqDumper_Usage(const SRADumperFmt * fmt,const SRADumperFmt_Arg * core_args,int first)2973 rc_t FastqDumper_Usage( const SRADumperFmt* fmt, const SRADumperFmt_Arg* core_args, int first )
2974 {
2975     int i, j;
2976     /* known core options */
2977     const SRADumperFmt_Arg* core[ 13 ];
2978 
2979     memset( (void*)core, 0, sizeof( core ) );
2980     for ( i = first; core_args[i].abbr != NULL || core_args[ i ].full != NULL; i++ )
2981     {
2982         const char* nm = core_args[ i ].abbr;
2983         nm = nm ? nm : core_args[ i ].full;
2984         if ( strcmp( nm, "A" ) == 0 )
2985         {
2986             core[ 0 ] = &core_args[ i ];
2987         }
2988         else if ( strcmp( nm, "O" ) == 0 )
2989         {
2990             core[ 1 ] = &core_args[ i ];
2991         }
2992         else if ( strcmp( nm, "N" ) == 0 )
2993         {
2994             core[ 2 ] = &core_args[ i ];
2995         }
2996         else if ( strcmp( nm, "X" ) == 0 )
2997         {
2998             core[ 3 ] = &core_args[ i ];
2999         }
3000         else if ( strcmp( nm, "G" ) == 0 )
3001         {
3002             core[ 4 ] = &core_args[ i ];
3003         }
3004         else if ( strcmp( nm, "spot-groups" ) == 0 )
3005         {
3006             core[ 5 ] = &core_args[ i ];
3007         }
3008         else if ( strcmp( nm, "R" ) == 0 )
3009         {
3010             core[ 6 ] = &core_args[ i ];
3011         }
3012         else if ( strcmp( nm, "T" ) == 0 )
3013         {
3014             core[ 7 ] = &core_args[ i ];
3015         }
3016         else if ( strcmp( nm, "K" ) == 0 )
3017         {
3018             core[ 8 ] = &core_args[ i ];
3019         }
3020         else if ( strcmp( nm, "table" ) == 0 )
3021         {
3022             core[ 9 ] = &core_args[ i ];
3023         }
3024         else if ( strcmp( nm, "Z" ) == 0 )
3025         {
3026             core[ 10 ] = &core_args[ i ];
3027         }
3028         else if ( strcmp( nm, "gzip" ) == 0 )
3029         {
3030             core[ 11 ] = &core_args[ i ];
3031         }
3032         else if ( strcmp( nm, "bzip2" ) == 0 )
3033         {
3034             core[ 12 ] = &core_args[ i ];
3035         }
3036     }
3037 
3038 #define OARG(arg,msg) (arg ? HelpOptionLine((arg)->abbr, (arg)->full, (arg)->param, \
3039                        msg ? msg : (const char**)((arg)->descr)) : ((void)(0)))
3040 
3041     OUTMSG(( "INPUT\n" ));
3042     OARG( core[ 0 ], NULL );
3043     OARG( core[ 9 ], NULL );
3044 
3045     OUTMSG(( "\nPROCESSING\n" ));
3046     OUTMSG(( "\nRead Splitting                     Sequence data may be used in raw form or\n"));
3047     OUTMSG(( "                                     split into individual reads\n" ));
3048     OARG( &fmt->arg_desc[ 0 ], NULL);
3049 
3050     OUTMSG(( "\nFull Spot Filters                  Applied to the full spot independently\n" ));
3051     OUTMSG(( "                                     of --%s\n", fmt->arg_desc[0].full ));
3052     OARG( core[ 2 ], NULL );
3053     OARG( core[ 3 ], NULL );
3054     OARG( core[ 5 ], NULL );
3055     OARG( &fmt->arg_desc[ 1 ], NULL );
3056 
3057     OUTMSG(( "\nCommon Filters                     Applied to spots when --%s is not\n",
3058                                                   fmt->arg_desc[0].full ));
3059     OUTMSG(( "                                     set, otherwise - to individual reads\n" ));
3060     OARG( &fmt->arg_desc[ 2 ], NULL );
3061     OARG( core[ 6 ], NULL );
3062     OARG( &fmt->arg_desc[ 3 ], NULL );
3063     OARG( &fmt->arg_desc[ H_qual_filter_1 ], NULL );
3064 
3065     OUTMSG(( "\nFilters based on alignments        Filters are active when alignment\n" ));
3066     OUTMSG(( "                                     data are present\n" ));
3067     OARG( &fmt->arg_desc[ 16 ], NULL );
3068     OARG( &fmt->arg_desc[ 17 ], NULL );
3069     OARG( &fmt->arg_desc[ 18 ], NULL );
3070     OARG( &fmt->arg_desc[ 19 ], NULL );
3071 
3072     OUTMSG(( "\nFilters for individual reads       Applied only with --%s set\n",
3073                                                   fmt->arg_desc[0].full ));
3074     OARG( &fmt->arg_desc[ 4 ], NULL );
3075 
3076     OUTMSG(( "\nOUTPUT\n" ));
3077     OARG( core[ 1 ], NULL );
3078     OARG( core[ 10 ], NULL );
3079     OARG( core[ 11 ], NULL  );
3080     OARG( core[ 12 ], NULL);
3081 
3082     OUTMSG(( "\nMultiple File Options              Setting these options will produce more\n" ));
3083     OUTMSG(( "                                     than 1 file, each of which will be suffixed\n" ));
3084     OUTMSG(( "                                     according to splitting criteria.\n" ));
3085     OARG( &fmt->arg_desc[ 5 ], NULL );
3086     OARG( &fmt->arg_desc[ 6 ], NULL );
3087     OARG( core[ 4 ], NULL );
3088     OARG( core[ 6 ], NULL );
3089     OARG( core[ 7 ], NULL );
3090     OARG( core[ 8 ], NULL );
3091 
3092     OUTMSG(( "\nFORMATTING\n" ));
3093     OUTMSG(( "\nSequence\n" ));
3094     OARG( &fmt->arg_desc[ 7 ], NULL );
3095     OARG( &fmt->arg_desc[ 8 ], NULL );
3096 
3097     OUTMSG(( "\nQuality\n" ));
3098     OARG( &fmt->arg_desc[ 9 ], NULL );
3099     OARG( &fmt->arg_desc[ 10 ], NULL );
3100     OARG( &fmt->arg_desc[ H_SuppressQualForCSKey ], NULL ); /* added Jan 15th 2014 ( a new fastq-variation! ) */
3101 
3102     OUTMSG(( "\nDefline\n" ));
3103     OARG( &fmt->arg_desc[ 11 ], NULL );
3104     OARG( &fmt->arg_desc[ 12 ], NULL );
3105     OARG( &fmt->arg_desc[ 13 ], NULL );
3106     OARG( &fmt->arg_desc[ 14 ], NULL );
3107     OARG( &fmt->arg_desc[ 15 ], NULL );
3108 
3109     OUTMSG(( "OTHER:\n" ));
3110     for ( i = first; core_args[ i ].abbr != NULL || core_args[ i ].full != NULL; i++ )
3111     {
3112         bool print = true;
3113         for ( j = 0; j < sizeof( core ) / sizeof( core[ 0 ] ); j++ )
3114         {
3115             if ( core[ j ] == &core_args[ i ] )
3116             {
3117                 print = false;
3118                 break;
3119             }
3120         }
3121         if ( print )
3122         {
3123             OARG( &core_args[ i ], NULL );
3124         }
3125     }
3126     return 0;
3127 }
3128 
3129 
FastqDumper_Release(const SRADumperFmt * fmt)3130 rc_t FastqDumper_Release( const SRADumperFmt* fmt )
3131 {
3132     if ( fmt == NULL )
3133     {
3134         return RC( rcExe, rcFormatter, rcDestroying, rcParam, rcInvalid );
3135     }
3136     else
3137     {
3138         Defline_Release( FastqArgs.b_defline );
3139         Defline_Release( FastqArgs.q_defline );
3140         free( FastqArgs.alregion );
3141     }
3142     return 0;
3143 }
3144 
3145 
FastqDumper_AddArg(const SRADumperFmt * fmt,GetArg * f,int * i,int argc,char * argv[])3146 bool FastqDumper_AddArg( const SRADumperFmt* fmt, GetArg* f, int* i, int argc, char *argv[] )
3147 {
3148     const char* arg = NULL;
3149 
3150     if ( fmt == NULL || f == NULL || i == NULL || argv == NULL )
3151     {
3152         LOGERR( klogErr, RC( rcExe, rcArgv, rcReading, rcParam, rcInvalid ), "null param" );
3153         return false;
3154     }
3155     else if ( f( fmt, "M", "minReadLen", i, argc, argv, &arg ) )
3156     {
3157         FastqArgs.minReadLen = AsciiToU32( arg, NULL, NULL );
3158     }
3159     else if ( f( fmt, "W", "clip", i, argc, argv, NULL ) )
3160     {
3161         FastqArgs.applyClip = true;
3162     }
3163     else if ( f( fmt, "SU", "suppress-qual-for-cskey", i, argc, argv, NULL ) )
3164     {
3165         FastqArgs.SuppressQualForCSKey = true;
3166     }
3167     else if ( f( fmt, "F", "origfmt", i, argc, argv, NULL ) )
3168     {
3169         FastqArgs.dumpOrigFmt = true;
3170     }
3171     else if ( f( fmt, "C", "dumpcs", i, argc, argv, NULL ) )
3172     {
3173         int k = ( *i ) + 1;
3174         FastqArgs.dumpCs = true;
3175         if ( k < argc && strlen( argv[ k ] ) == 1 && strchr( "acgtACGT", argv[ k ][ 0 ] ) != NULL )
3176         {
3177             FastqArgs.desiredCsKey = argv[ k ];
3178             *i = k;
3179         }
3180     }
3181     else if ( f( fmt, "B", "dumpbase", i, argc, argv, NULL ) )
3182     {
3183         FastqArgs.dumpBase = true;
3184     }
3185     else if ( f( fmt, "Q", "offset", i, argc, argv, &arg ) )
3186     {
3187         FastqArgs.offset = AsciiToU32( arg, NULL, NULL );
3188     }
3189     else if ( f( fmt, "I", "readids", i, argc, argv, NULL ) )
3190     {
3191         FastqArgs.readIds = true;
3192     }
3193     else if ( f( fmt, "E", "qual-filter", i, argc, argv, NULL ) )
3194     {
3195         FastqArgs.qual_filter = true;
3196     }
3197     else if ( f( fmt, "E1", "qual-filter-1", i, argc, argv, NULL ) )
3198     {
3199         FastqArgs.qual_filter1 = true;
3200     }
3201     else if ( f( fmt, "DB", "defline-seq", i, argc, argv, &arg ) )
3202     {
3203         FastqArgs.b_deffmt = arg;
3204     }
3205     else if ( f( fmt, "DQ", "defline-qual", i, argc, argv, &arg ) )
3206     {
3207         FastqArgs.q_deffmt = arg;
3208     }
3209     else if ( f( fmt, "TR", "skip-technical", i, argc, argv, NULL ) )
3210     {
3211         FastqArgs.skipTechnical = true;
3212     }
3213     else if ( f( fmt, "SF", "split-files", i, argc, argv, NULL ) )
3214     {
3215         SRADumperFmt * nc_fmt = ( SRADumperFmt * )fmt;
3216         FastqArgs.split_spot = true;
3217         FastqArgs.split_files = true;
3218         nc_fmt->split_files = true;
3219     }
3220     else if ( f( fmt, NULL, "split-3", i, argc, argv, NULL ) )
3221     {
3222         FastqArgs.split_3 = true;
3223         FastqArgs.split_spot = true;
3224         FastqArgs.maxReads = 2;
3225         FastqArgs.skipTechnical = true;
3226     }
3227     else if ( f( fmt, "SL", "split-spot", i, argc, argv, NULL ) )
3228     {
3229         FastqArgs.split_spot = true;
3230     }
3231     else if ( f( fmt, "HS", "helicos", i, argc, argv, NULL ) )
3232     {
3233         FastqArgs.b_deffmt = "@$sn";
3234         FastqArgs.q_deffmt =  "+";
3235     }
3236     else if ( f( fmt, "FA", "fasta", i, argc, argv, NULL ) )
3237     {
3238         int k = (*i) + 1;
3239         FastqArgs.fasta = 70;
3240         if ( k < argc && isdigit( argv[ k ][ 0 ] ) )
3241         {
3242             char* endp = NULL;
3243             errno = 0;
3244             FastqArgs.fasta = strtou64( argv[ k ], &endp, 10 );
3245             if ( errno != 0 || endp == NULL || *endp != '\0' )
3246             {
3247                 return false;
3248             }
3249             *i = k;
3250             if ( FastqArgs.fasta == 0 )
3251             {
3252                 FastqArgs.fasta = ~0;
3253             }
3254         }
3255         FastqArgs.file_extension = ".fasta";
3256     }
3257     else if ( f( fmt, NULL, "aligned", i, argc, argv, NULL ) )
3258     {
3259         FastqArgs.aligned = true;
3260     }
3261     else if ( f( fmt, NULL, "unaligned", i, argc, argv, NULL ) )
3262     {
3263         FastqArgs.unaligned = true;
3264     }
3265     else if ( f( fmt, NULL, "aligned-region", i, argc, argv, &arg ) )
3266     {
3267         /* name[:[from][-[to]]] */
3268         TAlignedRegion* r = realloc( FastqArgs.alregion, sizeof( *FastqArgs.alregion ) * ++FastqArgs.alregion_qty );
3269         if ( r == NULL )
3270         {
3271             return false;
3272         }
3273         FastqArgs.alregion = r;
3274         r = &FastqArgs.alregion[ FastqArgs.alregion_qty - 1 ];
3275         r->name = strchr( arg, ':' );
3276         r->from = 0;
3277         r->to = 0;
3278         if ( r->name == NULL )
3279         {
3280             r->name = arg;
3281             r->name_len = strlen( arg );
3282         }
3283         else
3284         {
3285             const char* c;
3286             uint64_t* v;
3287 
3288             r->name_len = r->name - arg;
3289             c = r->name;
3290             r->name = arg;
3291 
3292             v = &r->from;
3293             while ( *++c != '\0')
3294             {
3295                 if ( *c == '-' )
3296                 {
3297                     v = &r->to;
3298                 }
3299                 else if ( *c == '+' )
3300                 {
3301                     if ( *v != 0 )
3302                     {
3303                         return false;
3304                     }
3305                 }
3306                 else if ( !isdigit( *c ) )
3307                 {
3308                     return false;
3309                 }
3310                 else
3311                 {
3312                     *v = *v * 10 + ( *c - '0' );
3313                 }
3314             }
3315             if ( r->from > r->to && r->to != 0 )
3316             {
3317                 uint64_t x = r->from;
3318                 r->from = r->to;
3319                 r->to = x;
3320             }
3321         }
3322     }
3323     else if ( f( fmt, NULL, "matepair-distance", i, argc, argv, &arg ) )
3324     {
3325         /* from[-to] | [from]-to | unknown */
3326         if ( strcmp( arg, "unknown" ) == 0 )
3327         {
3328             FastqArgs.mp_dist_unknown = true;
3329         }
3330         else
3331         {
3332             uint64_t* v;
3333             TMatepairDistance* p = realloc( FastqArgs.mp_dist, sizeof( *FastqArgs.mp_dist ) * ++FastqArgs.mp_dist_qty );
3334             if ( p == NULL )
3335             {
3336                 return false;
3337             }
3338             FastqArgs.mp_dist = p;
3339             p = &FastqArgs.mp_dist[ FastqArgs.mp_dist_qty - 1 ];
3340             p->from = 0;
3341             p->to = 0;
3342 
3343             v = &p->from;
3344             while ( *++arg != '\0' )
3345             {
3346                 if ( *arg == '-' )
3347                 {
3348                     v = &p->to;
3349                 }
3350                 else if ( *arg == '+' )
3351                 {
3352                     if ( *v != 0 )
3353                     {
3354                         return false;
3355                     }
3356                 }
3357                 else if ( !isdigit( *arg ) )
3358                 {
3359                     return false;
3360                 }
3361                 else
3362                 {
3363                     *v = *v * 10 + ( *arg - '0' );
3364                 }
3365             }
3366             if ( p->from > p->to && p->to != 0 )
3367             {
3368                 uint64_t x = p->from;
3369                 p->from = p->to;
3370                 p->to = x;
3371             }
3372             if ( p->from == 0 && p->to == 0 )
3373             {
3374                 FastqArgs.mp_dist_qty--;
3375             }
3376         }
3377     }
3378     else
3379     {
3380         return false;
3381     }
3382     return true;
3383 }
3384 
3385 
FastqDumper_Factories(const SRADumperFmt * fmt,const SRASplitterFactory ** factory)3386 rc_t FastqDumper_Factories( const SRADumperFmt* fmt, const SRASplitterFactory** factory )
3387 {
3388     rc_t rc = 0;
3389     const SRASplitterFactory* parent = NULL, *child = NULL;
3390 
3391     if ( fmt == NULL || factory == NULL )
3392     {
3393         return RC( rcExe, rcFormatter, rcReading, rcParam, rcInvalid );
3394     }
3395     *factory = NULL;
3396     {
3397         const SRAColumn* c = NULL;
3398         rc = SRATableOpenColumnRead( fmt->table, &c, "PLATFORM", sra_platform_id_t );
3399         if ( rc == 0 )
3400         {
3401             const INSDC_SRA_platform_id * platform;
3402             bitsz_t o, z;
3403             rc = SRAColumnRead( c, 1, ( const void ** )&platform, &o, &z );
3404             if ( rc == 0 )
3405             {
3406                 if ( !FastqArgs.dumpCs && !FastqArgs.dumpBase )
3407                 {
3408                     if ( platform != NULL && *platform == SRA_PLATFORM_ABSOLID )
3409                     {
3410                         FastqArgs.dumpCs = true;
3411                     }
3412                     else
3413                     {
3414                         FastqArgs.dumpBase = true;
3415                     }
3416                 }
3417                 if ( platform != NULL && *platform == SRA_PLATFORM_ABSOLID )
3418                 {
3419                     FastqArgs.is_platform_cs_native = true;
3420                 }
3421             }
3422             SRAColumnRelease( c );
3423         }
3424         else if ( GetRCState( rc ) == rcNotFound && GetRCObject( rc ) == ( enum RCObject )rcColumn )
3425         {
3426             rc = 0;
3427         }
3428     }
3429 
3430     if ( ( FastqArgs.aligned || FastqArgs.unaligned ) && !( FastqArgs.aligned && FastqArgs.unaligned ) )
3431     {
3432         rc = AlignedFilterFactory_Make( &child, fmt->table, FastqArgs.aligned, FastqArgs.unaligned );
3433         if ( rc == 0 )
3434         {
3435             if ( parent != NULL )
3436             {
3437                 rc = SRASplitterFactory_AddNext( parent, child );
3438                 if ( rc != 0 )
3439                 {
3440                     SRASplitterFactory_Release(child);
3441                 }
3442                 else
3443                 {
3444                     parent = child;
3445                 }
3446             }
3447             else
3448             {
3449                 parent = child;
3450                 *factory = parent;
3451             }
3452         }
3453     }
3454 
3455     if ( FastqArgs.alregion_qty > 0 )
3456     {
3457         rc = AlignRegionFilterFactory_Make( &child, fmt->table, FastqArgs.alregion, FastqArgs.alregion_qty );
3458         if ( rc == 0 )
3459         {
3460             if ( parent != NULL )
3461             {
3462                 rc = SRASplitterFactory_AddNext( parent, child );
3463                 if ( rc != 0 )
3464                 {
3465                     SRASplitterFactory_Release( child );
3466                 }
3467                 else
3468                 {
3469                     parent = child;
3470                 }
3471             }
3472             else
3473             {
3474                 parent = child;
3475                 *factory = parent;
3476             }
3477         }
3478     }
3479 
3480     if ( FastqArgs.mp_dist_unknown || FastqArgs.mp_dist_qty > 0 )
3481     {
3482         rc = AlignPairDistanceFilterFactory_Make( &child, fmt->table,
3483                 FastqArgs.mp_dist_unknown, FastqArgs.mp_dist, FastqArgs.mp_dist_qty);
3484         if ( rc == 0 )
3485         {
3486             if ( parent != NULL )
3487             {
3488                 rc = SRASplitterFactory_AddNext( parent, child );
3489                 if ( rc != 0 )
3490                 {
3491                     SRASplitterFactory_Release(child);
3492                 }
3493                 else
3494                 {
3495                     parent = child;
3496                 }
3497             }
3498             else
3499             {
3500                 parent = child;
3501                 *factory = parent;
3502             }
3503         }
3504     }
3505 
3506     if ( rc == 0 && FastqArgs.skipTechnical && FastqArgs.split_spot )
3507     {
3508         rc = FastqBioFilterFactory_Make( &child, fmt->accession, fmt->table );
3509         if ( rc == 0 )
3510         {
3511             if ( parent != NULL )
3512             {
3513                 rc = SRASplitterFactory_AddNext( parent, child );
3514                 if ( rc != 0 )
3515                 {
3516                     SRASplitterFactory_Release( child );
3517                 }
3518                 else
3519                 {
3520                     parent = child;
3521                 }
3522             }
3523             else
3524             {
3525                 parent = child;
3526                 *factory = parent;
3527             }
3528         }
3529     }
3530 
3531     if ( rc == 0 && FastqArgs.maxReads > 0 )
3532     {
3533         rc = FastqRNumberFilterFactory_Make( &child, fmt->accession, fmt->table );
3534         if ( rc == 0 )
3535         {
3536             if ( parent != NULL )
3537             {
3538                 rc = SRASplitterFactory_AddNext( parent, child );
3539                 if ( rc != 0 )
3540                 {
3541                     SRASplitterFactory_Release( child );
3542                 }
3543                 else
3544                 {
3545                     parent = child;
3546                 }
3547             }
3548             else
3549             {
3550                 parent = child;
3551                 *factory = parent;
3552             }
3553         }
3554     }
3555 
3556     if ( rc == 0 && ( FastqArgs.qual_filter || FastqArgs.qual_filter1 ) )
3557     {
3558         rc = FastqQFilterFactory_Make( &child, fmt->accession, fmt->table );
3559         if ( rc == 0 )
3560         {
3561             if ( parent != NULL )
3562             {
3563                 rc = SRASplitterFactory_AddNext( parent, child );
3564                 if ( rc != 0 )
3565                 {
3566                     SRASplitterFactory_Release( child );
3567                 }
3568                 else
3569                 {
3570                     parent = child;
3571                 }
3572             }
3573             else
3574             {
3575                 parent = child;
3576                 *factory = parent;
3577             }
3578         }
3579     }
3580 
3581     if ( rc == 0 )
3582     {
3583         rc = FastqReadLenFilterFactory_Make( &child, fmt->accession, fmt->table );
3584         if ( rc == 0 )
3585         {
3586             if ( parent != NULL )
3587             {
3588                 rc = SRASplitterFactory_AddNext( parent, child );
3589                 if ( rc != 0 )
3590                 {
3591                     SRASplitterFactory_Release( child );
3592                 }
3593                 else
3594                 {
3595                     parent = child;
3596                 }
3597             }
3598             else
3599             {
3600                 parent = child;
3601                 *factory = parent;
3602             }
3603         }
3604     }
3605 
3606     if ( rc == 0 )
3607     {
3608         if ( FastqArgs.split_3 )
3609         {
3610             rc = Fastq3ReadSplitterFactory_Make( &child, fmt->accession, fmt->table );
3611             if ( rc == 0 )
3612             {
3613                 if ( parent != NULL )
3614                 {
3615                     rc = SRASplitterFactory_AddNext( parent, child );
3616                     if ( rc != 0 )
3617                     {
3618                         SRASplitterFactory_Release( child );
3619                     }
3620                     else
3621                     {
3622                         parent = child;
3623                     }
3624                 }
3625                 else
3626                 {
3627                     parent = child;
3628                     *factory = parent;
3629                 }
3630             }
3631         }
3632         else if ( FastqArgs.split_files )
3633         {
3634             rc = FastqReadSplitterFactory_Make( &child, fmt->accession, fmt->table );
3635             if ( rc == 0 )
3636             {
3637                 if ( parent != NULL )
3638                 {
3639                     rc = SRASplitterFactory_AddNext( parent, child );
3640                     if ( rc != 0 )
3641                     {
3642                         SRASplitterFactory_Release(child);
3643                     }
3644                     else
3645                     {
3646                         parent = child;
3647                     }
3648                 }
3649                 else
3650                 {
3651                     parent = child;
3652                     *factory = parent;
3653                 }
3654             }
3655         }
3656     }
3657 
3658     if ( rc == 0 )
3659     {
3660         if ( FastqArgs.b_deffmt != NULL )
3661         {
3662             rc = Defline_Parse( &FastqArgs.b_defline, FastqArgs.b_deffmt );
3663         }
3664         if ( FastqArgs.q_deffmt != NULL )
3665         {
3666             rc = Defline_Parse( &FastqArgs.q_defline, FastqArgs.q_deffmt );
3667         }
3668         if ( rc == 0 )
3669         {
3670             rc = FastqFormatterFactory_Make( &child, fmt->accession, fmt->table );
3671             if ( rc == 0 )
3672             {
3673                 if ( parent != NULL )
3674                 {
3675                     rc = SRASplitterFactory_AddNext( parent, child );
3676                     if ( rc != 0 )
3677                     {
3678                         SRASplitterFactory_Release( child );
3679                     }
3680                 }
3681                 else
3682                 {
3683                     *factory = child;
3684                 }
3685             }
3686         }
3687     }
3688     return rc;
3689 }
3690 
3691 /* main entry point of the file */
SRADumper_Init(SRADumperFmt * fmt)3692 rc_t SRADumper_Init( SRADumperFmt* fmt )
3693 {
3694     static const SRADumperFmt_Arg arg[] =
3695         {
3696 
3697             /* DO NOT ADD IN THE MIDDLE ORDER IS IMPORTANT IN USAGE FUNCTION ABOVE!!! */
3698             {NULL, "split-spot", NULL, {"Split spots into individual reads", NULL}},            /* H_splip_sot = 0 */
3699 
3700             {"W", "clip", NULL, {"Remove adapter sequences from reads", NULL}},                          /* H_clip = 1 */
3701 
3702             {"M", "minReadLen", "len", {"Filter by sequence length >= <len>", NULL}},           /* H_minReadLen = 2 */
3703             {"E", "qual-filter", NULL, {"Filter used in early 1000 Genomes data:",              /* H_qual_filter = 3 */
3704                                         "no sequences starting or ending with >= 10N", NULL}},
3705 
3706             {NULL, "skip-technical", NULL, {"Dump only biological reads", NULL}},               /* H_skip_tech = 4 */
3707 
3708             {NULL, "split-files", NULL, {"Write reads into separate files. "                /* H_split_files = 5 */
3709                                          "Read number will be suffixed to the file name. ",
3710                                          "NOTE! The `--split-3` option is recommended.",
3711                                          "In cases where not all spots have the same number of reads,",
3712                                          "this option will produce files",
3713                                          "that WILL CAUSE ERRORS in most programs",
3714                                          "which process split pair fastq files.", NULL}},
3715 
3716             /* DO NOT ADD IN THE MIDDLE ORDER IS IMPORTANT IN USAGE FUNCTION ABOVE!!! */
3717             {NULL, "split-3", NULL, {"3-way splitting for mate-pairs.",          /* H_split_3 = 6 */
3718                                      "For each spot, if there are two biological reads satisfying filter conditions,",
3719                                      "the first is placed in the `*_1.fastq` file,",
3720                                      "and the second is placed in the `*_2.fastq` file.",
3721                                      "If there is only one biological read satisfying the filter conditions,",
3722                                      "it is placed in the `*.fastq` file.All other reads in the spot are ignored.",
3723                                      NULL}},
3724 
3725             {"C", "dumpcs", "[cskey]", {"Formats sequence using color space (default for SOLiD),"   /* H_dumpcs = 7 */
3726                                         "\"cskey\" may be specified for translation", NULL}},
3727             {"B", "dumpbase", NULL, {"Formats sequence using base space (default for other than SOLiD).", NULL}}, /* H_dumpbase = 8 */
3728 
3729             /* DO NOT ADD IN THE MIDDLE ORDER IS IMPORTANT IN USAGE FUNCTION ABOVE!!! */
3730             {"Q", "offset", "integer", {"Offset to use for quality conversion, default is 33", NULL}},  /* H_offset = 9 */
3731             {NULL, "fasta", "[line width]", {"FASTA only, no qualities, optional line wrap width (set to zero for no wrapping)", NULL}}, /* H_fasta = 10 */
3732 
3733             {"F", "origfmt", NULL, {"Defline contains only original sequence name", NULL}}, /* H_origfmt = 11 */
3734             {"I", "readids", NULL, {"Append read id after spot id as 'accession.spot.readid' on defline", NULL}}, /* H_readids = 12 */
3735             {NULL, "helicos", NULL, {"Helicos style defline", NULL}}, /* H_helicos = 13 */
3736             {NULL, "defline-seq", "fmt", {"Defline format specification for sequence.", NULL}}, /* H_defline_seq = 14 */
3737             {NULL, "defline-qual", "fmt", {"Defline format specification for quality.", /* H_defline_qual = 15 */
3738                               "<fmt> is a string of characters and/or variables. The variables can be one of:",
3739                               "$ac - accession, $si - spot id, $sn - spot name, $sg - spot group (barcode),",
3740                               "$sl - spot length in bases, $ri - read number, $rn - read name, $rl - read length in bases.",
3741                               "'[]' could be used for an optional output: if all vars in [] yield empty values whole group is not printed.",
3742                               "Empty value is empty string or 0 for numeric variables.",
3743                               "Ex: @$sn[_$rn]/$ri - '_$rn' is omitted if name is empty\n", NULL}},
3744 
3745             {NULL, "aligned", NULL, {"Dump only aligned sequences", NULL}}, /* H_aligned = 16 */
3746             {NULL, "unaligned", NULL, {"Dump only unaligned sequences", NULL}}, /* H_unaligned = 17 */
3747 
3748             /* DO NOT ADD IN THE MIDDLE ORDER IS IMPORTANT IN USAGE FUNCTION ABOVE!!! */
3749             {NULL, "aligned-region", "name[:from-to]", {"Filter by position on genome.", /* H_aligned_region = 18 */
3750                             "Name can either be accession.version (ex: NC_000001.10) or",
3751                             "file specific name (ex: \"chr1\" or \"1\").",
3752                             "\"from\" and \"to\" are 1-based coordinates", NULL}},
3753             {NULL, "matepair-distance", "from-to|unknown", {"Filter by distance between matepairs.", /* H_matepair-distance = 19 */
3754                             "Use \"unknown\" to find matepairs split between the references.",
3755                             "Use from-to to limit matepair distance on the same reference", NULL}},
3756 
3757             {NULL, "qual-filter-1", NULL, {"Filter used in current 1000 Genomes data", NULL}}, /* H_qual_filter_1 = 20 */
3758 
3759             {NULL, "suppress-qual-for-cskey", NULL, {"suppress quality-value for cskey", NULL}}, /* H_SuppressQualForCSKey = 21 */
3760 
3761             {NULL, NULL, NULL, {NULL}}
3762         };
3763 
3764     if ( fmt == NULL )
3765     {
3766         return RC( rcExe, rcFileFormat, rcConstructing, rcParam, rcNull );
3767     }
3768 
3769     memset( &FastqArgs, 0, sizeof( FastqArgs ) );
3770     FastqArgs.is_platform_cs_native = false;
3771     FastqArgs.maxReads = 0;
3772     FastqArgs.skipTechnical = false;
3773     FastqArgs.minReadLen = 1;
3774     FastqArgs.applyClip = false;
3775     FastqArgs.SuppressQualForCSKey = false;     /* added Jan 15th 2014 ( a new fastq-variation! ) */
3776     FastqArgs.dumpOrigFmt = false;
3777     FastqArgs.dumpBase = false;
3778     FastqArgs.dumpCs = false;
3779     FastqArgs.readIds = false;
3780     FastqArgs.offset = 33;
3781     FastqArgs.desiredCsKey = "";
3782     FastqArgs.qual_filter = false;
3783     FastqArgs.b_deffmt = NULL;
3784     FastqArgs.b_defline = NULL;
3785     FastqArgs.q_deffmt = NULL;
3786     FastqArgs.q_defline = NULL;
3787     FastqArgs.split_files = false;
3788     FastqArgs.split_3 = false;
3789     FastqArgs.split_spot = false;
3790     FastqArgs.fasta = 0;
3791     FastqArgs.file_extension = ".fastq";
3792     FastqArgs.aligned = false;
3793     FastqArgs.unaligned = false;
3794     FastqArgs.alregion = NULL;
3795     FastqArgs.alregion_qty = 0;
3796     FastqArgs.mp_dist = NULL;
3797     FastqArgs.mp_dist_unknown = false;
3798     FastqArgs.mp_dist_qty = 0;
3799 
3800     fmt->usage = FastqDumper_Usage;
3801     fmt->release = FastqDumper_Release;
3802     fmt->arg_desc = arg;
3803     fmt->add_arg = FastqDumper_AddArg;
3804     fmt->get_factory = FastqDumper_Factories;
3805     fmt->gzip = true;
3806     fmt->bzip2 = true;
3807     fmt->split_files = false;
3808 
3809     return 0;
3810 }
3811