1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26 #include <klib/log.h>
27 #include <klib/out.h>
28 #include <klib/container.h>
29 #include <klib/text.h>
30 #include <kapp/main.h>
31
32 #include <sra/sradb.h>
33 #include <sra/types.h>
34 #include <sra/fastq.h>
35
36 #include <os-native.h>
37 #include <sysalloc.h>
38
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <string.h>
42 #include <ctype.h>
43 #include <errno.h>
44 #include <strtol.h>
45
46 #include "core.h"
47 #include "debug.h"
48
49 #define DATABUFFERINITSIZE 10240
50
51 typedef struct TAlignedRegion_struct
52 {
53 const char* name;
54 uint32_t name_len;
55 /* 1-based, 0 - means not set */
56 uint64_t from;
57 uint64_t to;
58 } TAlignedRegion;
59
60
61 typedef struct TMatepairDistance_struct
62 {
63 uint64_t from;
64 uint64_t to;
65 } TMatepairDistance;
66
67
68 struct FastqArgs_struct
69 {
70 bool is_platform_cs_native;
71
72 int maxReads;
73 bool skipTechnical;
74
75 uint32_t minReadLen;
76 bool applyClip;
77 bool dumpOrigFmt;
78 bool dumpBase;
79 bool dumpCs;
80 bool readIds;
81 bool SuppressQualForCSKey; /* added Jan 15th 2014 ( a new fastq-variation! ) */
82 int offset;
83 bool qual_filter;
84 bool qual_filter1;
85 const char* b_deffmt;
86 SLList* b_defline;
87 const char* q_deffmt;
88 SLList* q_defline;
89 const char *desiredCsKey;
90 bool split_files;
91 bool split_3;
92 bool split_spot;
93 uint64_t fasta;
94 const char* file_extension;
95 bool aligned;
96 bool unaligned;
97 TAlignedRegion* alregion;
98 uint32_t alregion_qty;
99 bool mp_dist_unknown;
100 TMatepairDistance* mp_dist;
101 uint32_t mp_dist_qty;
102 } FastqArgs;
103
104
105 typedef enum DefNodeType_enum
106 {
107 DefNode_Unknown = 0,
108 DefNode_Text = 1,
109 DefNode_Optional,
110 DefNode_Accession,
111 DefNode_SpotId,
112 DefNode_SpotName,
113 DefNode_SpotGroup,
114 DefNode_SpotLen,
115 DefNode_ReadId,
116 DefNode_ReadName,
117 DefNode_ReadLen,
118 DefNode_Last
119 } DefNodeType;
120
121
122 typedef struct DefNode_struct
123 {
124 SLNode node;
125 DefNodeType type;
126 union
127 {
128 SLList* optional;
129 char* text;
130 } data;
131 } DefNode;
132
133
134 typedef struct DeflineData_struct
135 {
136 rc_t rc;
137 bool optional;
138 union
139 {
140 spotid_t* id;
141 struct
142 {
143 const char* s;
144 size_t sz;
145 } str;
146 uint32_t* u32;
147 } values[ DefNode_Last ];
148 char* buf;
149 size_t buf_sz;
150 size_t* writ;
151 } DeflineData;
152
153
Defline_Builder(SLNode * node,void * data)154 static bool CC Defline_Builder( SLNode *node, void *data )
155 {
156 DefNode* n = ( DefNode* )node;
157 DeflineData* d = ( DeflineData* )data;
158 char s[ 256 ];
159 size_t w;
160 int x = 0;
161
162 s[ 0 ] = '\0';
163 switch ( n->type )
164 {
165 case DefNode_Optional :
166 d->optional = true;
167 w = *d->writ;
168 *d->writ = 0;
169 SLListDoUntil( n->data.optional, Defline_Builder, data );
170 x = ( *d->writ == 0 ) ? 0 : 1;
171 d->optional = false;
172 *d->writ = w;
173 if ( x > 0 )
174 {
175 SLListDoUntil( n->data.optional, Defline_Builder, data );
176 }
177 break;
178
179 case DefNode_Text :
180 if ( d->optional )
181 {
182 break;
183 }
184 d->values[n->type].str.s = n->data.text;
185 d->values[n->type].str.sz = strlen( n->data.text );
186
187 case DefNode_Accession :
188 case DefNode_SpotName :
189 case DefNode_SpotGroup :
190 case DefNode_ReadName :
191 if ( d->values[n->type].str.s != NULL )
192 {
193 x = d->values[ n->type ].str.sz;
194 if ( x < sizeof( s ) )
195 {
196 strncpy( s, d->values[ n->type ].str.s, x );
197 s[ x ] = '\0';
198 *d->writ = *d->writ + x;
199 }
200 else
201 {
202 d->rc = RC( rcExe, rcNamelist, rcExecuting, rcBuffer, rcInsufficient );
203 }
204 }
205 break;
206
207 case DefNode_SpotId :
208 if ( d->values[ n->type ].id != NULL &&
209 ( !d->optional || *d->values[ n->type ].id > 0 ) )
210 {
211 x = snprintf( s, sizeof( s ), "%ld", *d->values[ n->type ].id );
212 if ( x < 0 || x >= sizeof( s ) )
213 {
214 d->rc = RC( rcExe, rcNamelist, rcExecuting, rcBuffer, rcInsufficient );
215 }
216 else
217 {
218 *d->writ = *d->writ + x;
219 }
220 }
221 break;
222
223 case DefNode_ReadId :
224 case DefNode_SpotLen :
225 case DefNode_ReadLen :
226 if ( d->values[ n->type] .u32 != NULL &&
227 ( !d->optional || *d->values[ n->type ].u32 > 0 ) )
228 {
229 x = snprintf( s, sizeof( s ), "%u", *d->values[ n->type ].u32 );
230 if ( x < 0 || x >= sizeof( s ) )
231 {
232 d->rc = RC( rcExe, rcNamelist, rcExecuting, rcBuffer, rcInsufficient );
233 }
234 else
235 {
236 *d->writ = *d->writ + x;
237 }
238 }
239 break;
240
241 default:
242 d->rc = RC( rcExe, rcNamelist, rcExecuting, rcId, rcInvalid );
243 } /* switch */
244
245 if ( d->rc == 0 && !d->optional && n->type != DefNode_Optional )
246 {
247 if ( *d->writ < d->buf_sz )
248 {
249 strcat( d->buf, s );
250 }
251 else
252 {
253 d->rc = RC( rcExe, rcNamelist, rcExecuting, rcBuffer, rcInsufficient );
254 }
255 }
256 return d->rc != 0;
257 }
258
259
Defline_Bind(DeflineData * data,const char * accession,spotid_t * spotId,const char * spot_name,size_t spotname_sz,const char * spot_group,size_t sgrp_sz,uint32_t * spot_len,uint32_t * readId,const char * read_name,INSDC_coord_len rlabel_sz,INSDC_coord_len * read_len)260 static rc_t Defline_Bind( DeflineData* data, const char* accession,
261 spotid_t* spotId, const char* spot_name, size_t spotname_sz,
262 const char* spot_group, size_t sgrp_sz, uint32_t* spot_len,
263 uint32_t* readId, const char* read_name, INSDC_coord_len rlabel_sz,
264 INSDC_coord_len* read_len )
265 {
266 if ( data == NULL )
267 {
268 return RC( rcExe, rcNamelist, rcExecuting, rcMemory, rcInsufficient );
269 }
270 data->values[ DefNode_Unknown ].str.s = NULL;
271 data->values[ DefNode_Text ].str.s = NULL;
272 data->values[ DefNode_Optional ].str.s = NULL;
273 data->values[ DefNode_Accession ].str.s = accession;
274 data->values[ DefNode_Accession ].str.sz = strlen( accession );
275 data->values[ DefNode_SpotId ].id = spotId;
276 data->values[ DefNode_SpotName ].str.s = spot_name;
277 data->values[ DefNode_SpotName ].str.sz = spotname_sz;
278 data->values[ DefNode_SpotGroup ].str.s = spot_group;
279 data->values[ DefNode_SpotGroup ].str.sz = sgrp_sz;
280 data->values[ DefNode_SpotLen ].u32 = spot_len;
281 data->values[ DefNode_ReadId ].u32 = readId;
282 data->values[ DefNode_ReadName ].str.s = read_name;
283 data->values[ DefNode_ReadName ].str.sz = rlabel_sz;
284 data->values[ DefNode_ReadLen ].u32 = read_len;
285 return 0;
286 }
287
288
Defline_Build(const SLList * def,DeflineData * data,char * buf,size_t buf_sz,size_t * writ)289 static rc_t Defline_Build( const SLList* def, DeflineData* data, char* buf,
290 size_t buf_sz, size_t* writ )
291 {
292 if ( data == NULL )
293 {
294 return RC( rcExe, rcNamelist, rcExecuting, rcMemory, rcInsufficient );
295 }
296
297 data->rc = 0;
298 data->optional = false;
299 data->buf = buf;
300 data->buf_sz = buf_sz;
301 data->writ = writ;
302
303 data->buf[ 0 ] = '\0';
304 *data->writ = 0;
305
306 SLListDoUntil( def, Defline_Builder, data );
307 return data->rc;
308 }
309
310
DeflineNode_Add(SLList * list,DefNode ** node,DefNodeType type,const char * text,size_t text_sz)311 static rc_t DeflineNode_Add( SLList* list, DefNode** node,
312 DefNodeType type, const char* text, size_t text_sz )
313 {
314 rc_t rc = 0;
315
316 *node = calloc( 1, sizeof( **node ) );
317 if ( *node == NULL )
318 {
319 rc = RC( rcExe, rcNamelist, rcConstructing, rcMemory, rcExhausted );
320 }
321 else if ( type == DefNode_Text && ( text == NULL || text_sz == 0 ) )
322 {
323 rc = RC( rcExe, rcNamelist, rcConstructing, rcParam, rcInvalid );
324 }
325 else
326 {
327 ( *node )->type = type;
328 if ( type == DefNode_Text )
329 {
330 ( *node )->data.text = string_dup( text, text_sz );
331 if ( ( *node )->data.text == NULL )
332 {
333 rc = RC( rcExe, rcNamelist, rcConstructing, rcMemory, rcExhausted );
334 free( *node );
335 }
336 }
337 else if ( type == DefNode_Optional )
338 {
339 ( *node )->data.optional = malloc( sizeof( SLList ) );
340 if ( ( *node )->data.optional == NULL )
341 {
342 rc = RC( rcExe, rcNamelist, rcConstructing, rcMemory, rcExhausted );
343 free( *node );
344 }
345 else
346 {
347 SLListInit( ( *node )->data.optional );
348 }
349 }
350 if ( rc == 0 )
351 {
352 SLListPushTail( list, &( *node )->node );
353 }
354 }
355 return rc;
356 }
357
358
359 static void Defline_Release( SLList* list );
360
361
DeflineNode_Whack(SLNode * node,void * data)362 static void CC DeflineNode_Whack( SLNode* node, void* data )
363 {
364 if ( node != NULL )
365 {
366 DefNode* n = (DefNode*)node;
367 if ( n->type == DefNode_Text )
368 {
369 free( n->data.text );
370 }
371 else if ( n->type == DefNode_Optional )
372 {
373 Defline_Release( n->data.optional );
374 }
375 free( node );
376 }
377 }
378
379
Defline_Release(SLList * list)380 static void Defline_Release( SLList* list )
381 {
382 if ( list != NULL )
383 {
384 SLListForEach( list, DeflineNode_Whack, NULL );
385 free( list );
386 }
387 }
388
389
390 #if _DEBUGGING
Defline_Dump(SLNode * node,void * data)391 static void CC Defline_Dump( SLNode* node, void* data )
392 {
393 DefNode* n = ( DefNode* )node;
394 const char* s = NULL, *t;
395 if ( n->type == DefNode_Text )
396 {
397 s = n->data.text;
398 }
399 switch ( n->type )
400 {
401 case DefNode_Unknown : t = "Unknown"; break;
402 case DefNode_Text : t = "Text"; break;
403 case DefNode_Optional : t = "Optional"; break;
404 case DefNode_Accession : t = "Accession"; break;
405 case DefNode_SpotId : t = "SpotId"; break;
406 case DefNode_SpotName : t = "SpotName"; break;
407 case DefNode_SpotGroup : t = "SpotGroup"; break;
408 case DefNode_SpotLen : t = "SpotLen"; break;
409 case DefNode_ReadId : t = "ReadId"; break;
410 case DefNode_ReadName : t = "ReadName"; break;
411 case DefNode_ReadLen : t = "ReadLen"; break;
412 default : t = "ERROR";
413 }
414
415 SRA_DUMP_DBG( 3, ( "%s type %s", data, t ) );
416 if ( s )
417 {
418 SRA_DUMP_DBG( 3, ( ": '%s'", s ) );
419 }
420 SRA_DUMP_DBG( 3, ( "\n" ) );
421 if ( n->type == DefNode_Optional )
422 {
423 SLListForEach( n->data.optional, Defline_Dump, "+-->" );
424 }
425 }
426 #endif
427
428
Defline_Parse(SLList ** def,const char * line)429 static rc_t Defline_Parse( SLList** def, const char* line )
430 {
431 rc_t rc = 0;
432 size_t i, sz, text = 0, opt_vars = 0;
433 DefNode* node;
434 SLList* list = NULL;
435
436 if ( def == NULL || line == NULL )
437 {
438 return RC( rcExe, rcNamelist, rcConstructing, rcParam, rcInvalid );
439 }
440 *def = malloc( sizeof( SLList ) );
441 if ( *def == NULL )
442 {
443 return RC( rcExe, rcNamelist, rcConstructing, rcMemory, rcExhausted );
444 }
445 sz = strlen( line );
446 list = *def;
447 SLListInit( list );
448
449 for ( i = 0; rc == 0 && i < sz; i++ )
450 {
451 if ( line[ i ] == '[' )
452 {
453 if ( ( i + 1 ) < sz && line[ i + 1 ] == '[' )
454 {
455 i++;
456 }
457 else
458 {
459 if ( list != *def )
460 {
461 rc = RC( rcExe, rcNamelist, rcConstructing, rcFormat, rcIncorrect );
462 }
463 else if ( i > text )
464 {
465 rc = DeflineNode_Add( list, &node, DefNode_Text, &line[text], i - text );
466 }
467 if ( rc == 0 )
468 {
469 rc = DeflineNode_Add( list, &node, DefNode_Optional, NULL, 0 );
470 if ( rc == 0 )
471 {
472 opt_vars = 0;
473 list = node->data.optional;
474 text = i + 1;
475 }
476 }
477 }
478 }
479 else if ( line[ i ] == ']' )
480 {
481 if ( ( i + 1 ) < sz && line[ i + 1 ] == ']' )
482 {
483 i++;
484 }
485 else
486 {
487 if ( list == *def )
488 {
489 rc = RC( rcExe, rcNamelist, rcConstructing, rcFormat, rcIncorrect );
490 }
491 else
492 {
493 if ( opt_vars < 1 )
494 {
495 rc = RC( rcExe, rcNamelist, rcConstructing, rcConstraint, rcEmpty );
496 }
497 else if ( i > text )
498 {
499 rc = DeflineNode_Add( list, &node, DefNode_Text, &line[text], i - text );
500 }
501 list = *def;
502 text = i + 1;
503 }
504 }
505 }
506 else if ( line[ i ] == '$' )
507 {
508 if ( ( i + 1 ) < sz && line[ i + 1 ] == '$' )
509 {
510 i++;
511 }
512 else
513 {
514 DefNodeType type = DefNode_Unknown;
515 switch ( line[ ++i] )
516 {
517 case 'a' :
518 switch( line[ ++i ] )
519 {
520 case 'c': type = DefNode_Accession; break;
521 }
522 break;
523
524 case 's' :
525 switch ( line[ ++i ] )
526 {
527 case 'i': type = DefNode_SpotId; break;
528 case 'n': type = DefNode_SpotName; break;
529 case 'g': type = DefNode_SpotGroup; break;
530 case 'l': type = DefNode_SpotLen; break;
531 }
532 break;
533 case 'r' :
534 switch( line[ ++i ] )
535 {
536 case 'i': type = DefNode_ReadId; break;
537 case 'n': type = DefNode_ReadName; break;
538 case 'l': type = DefNode_ReadLen; break;
539 }
540 break;
541 }
542 if ( type == DefNode_Unknown )
543 {
544 rc = RC( rcExe, rcNamelist, rcConstructing, rcName, rcUnrecognized );
545 }
546 else
547 {
548 if ( ( i - 2 ) > text )
549 {
550 rc = DeflineNode_Add( list, &node, DefNode_Text, &line[ text ], i - text - 2 );
551 }
552 if ( rc == 0 )
553 {
554 rc = DeflineNode_Add( list, &node, type, NULL, 0 );
555 if ( rc == 0 )
556 {
557 opt_vars++;
558 text = i + 1;
559 }
560 }
561 }
562 }
563 }
564 }
565 if ( rc == 0 )
566 {
567 if ( list != *def )
568 {
569 rc = RC( rcExe, rcNamelist, rcConstructing, rcFormat, rcInvalid );
570 }
571 else if ( i > text )
572 {
573 rc = DeflineNode_Add( list, &node, DefNode_Text, &line[ text ], i - text );
574 }
575 }
576 if ( rc != 0 )
577 {
578 i = i < sz ? i : sz;
579 PLOGERR( klogErr, ( klogErr, rc, "$(l1) -->$(c)<-- $(l2)",
580 "l1=%.*s,c=%c,l2=%.*s", i - 1, line, line[ i - 1 ], sz - i, &line[ i ] ) );
581 }
582 #if _DEBUGGING
583 SRA_DUMP_DBG( 3, ( "| defline\n" ) );
584 SLListForEach( *def, Defline_Dump, "+->" );
585 #endif
586 return rc;
587 }
588
589 /* ### ALIGNMENT_COUNT based filtering ##################################################### */
590
591 typedef struct AlignedFilter_struct
592 {
593 const SRAColumn* col;
594 uint64_t rejected_reads;
595 bool aligned;
596 bool unaligned;
597 } AlignedFilter;
598
599
AlignedFilter_GetKey(const SRASplitter * cself,const char ** key,spotid_t spot,readmask_t * readmask)600 static rc_t AlignedFilter_GetKey( const SRASplitter* cself, const char** key,
601 spotid_t spot, readmask_t* readmask )
602 {
603 rc_t rc = 0;
604 AlignedFilter* self = ( AlignedFilter* )cself;
605
606 if ( self == NULL || key == NULL )
607 {
608 rc = RC( rcSRA, rcNode, rcExecuting, rcParam, rcNull );
609 }
610 else
611 {
612 *key = "";
613 if ( self->col != NULL )
614 {
615 const uint8_t* ac = NULL;
616 bitsz_t o = 0, sz = 0;
617 rc = SRAColumnRead( self->col, spot, (const void **)&ac, &o, &sz );
618 if ( rc == 0 && sz > 0 )
619 {
620 sz = sz / 8 / sizeof( *ac );
621 for( o = 0; o < sz; o++ )
622 {
623 if ( ( self->aligned && !self->unaligned && ac[ o ] == 0 ) ||
624 ( self->unaligned && !self->aligned && ac[ o ] != 0 ) )
625 {
626 unset_readmask( readmask, o );
627 self->rejected_reads ++;
628 }
629 }
630 }
631 }
632 }
633 return rc;
634 }
635
636
AlignedFilter_Release(const SRASplitter * cself)637 static rc_t AlignedFilter_Release( const SRASplitter* cself )
638 {
639 rc_t rc = 0;
640 AlignedFilter* self = ( AlignedFilter* )cself;
641 if ( self == NULL )
642 {
643 rc = RC( rcSRA, rcNode, rcExecuting, rcParam, rcNull );
644 }
645 else
646 {
647 if ( self->rejected_reads > 0 && !g_legacy_report )
648 rc = KOutMsg( "Rejected %lu READS because of aligned/unaligned filter\n", self->rejected_reads );
649 }
650 return rc;
651 }
652
653 typedef struct AlignedFilterFactory_struct
654 {
655 const SRATable* table;
656 const SRAColumn* col;
657 bool aligned;
658 bool unaligned;
659 } AlignedFilterFactory;
660
661
AlignedFilterFactory_Init(const SRASplitterFactory * cself)662 static rc_t AlignedFilterFactory_Init( const SRASplitterFactory* cself )
663 {
664 rc_t rc = 0;
665 AlignedFilterFactory* self = ( AlignedFilterFactory* )cself;
666
667 if ( self == NULL )
668 {
669 rc = RC( rcSRA, rcType, rcConstructing, rcParam, rcNull );
670 }
671 else
672 {
673 rc = SRATableOpenColumnRead( self->table, &self->col, "ALIGNMENT_COUNT", vdb_uint8_t );
674 if ( rc != 0 )
675 {
676 if ( GetRCState( rc ) == rcNotFound )
677 {
678 LOGMSG( klogWarn, "Column ALIGNMENT_COUNT was not found, param ignored" );
679 rc = 0;
680 }
681 else if ( GetRCState( rc ) == rcExists )
682 {
683 rc = 0;
684 }
685 }
686 }
687 return rc;
688 }
689
690
AlignedFilterFactory_NewObj(const SRASplitterFactory * cself,const SRASplitter ** splitter)691 static rc_t AlignedFilterFactory_NewObj( const SRASplitterFactory* cself, const SRASplitter** splitter )
692 {
693 rc_t rc = 0;
694 AlignedFilterFactory* self = ( AlignedFilterFactory* )cself;
695
696 if ( self == NULL )
697 {
698 rc = RC( rcSRA, rcType, rcExecuting, rcParam, rcNull );
699 }
700 else
701 {
702 rc = SRASplitter_Make( splitter, sizeof( AlignedFilter ),
703 AlignedFilter_GetKey, NULL, NULL, AlignedFilter_Release );
704 if ( rc == 0 )
705 {
706 AlignedFilter * filter = ( AlignedFilter * )( * splitter );
707 filter->col = self->col;
708 filter->aligned = self->aligned;
709 filter->unaligned = self->unaligned;
710 filter->rejected_reads = 0;
711 }
712 }
713 return rc;
714 }
715
716
AlignedFilterFactory_Release(const SRASplitterFactory * cself)717 static void AlignedFilterFactory_Release( const SRASplitterFactory* cself )
718 {
719 if ( cself != NULL )
720 {
721 AlignedFilterFactory* self = ( AlignedFilterFactory* )cself;
722 SRAColumnRelease( self->col );
723 }
724 }
725
726
AlignedFilterFactory_Make(const SRASplitterFactory ** cself,const SRATable * table,bool aligned,bool unaligned)727 static rc_t AlignedFilterFactory_Make( const SRASplitterFactory** cself,
728 const SRATable* table, bool aligned, bool unaligned )
729 {
730 rc_t rc = 0;
731 AlignedFilterFactory* obj = NULL;
732
733 if ( cself == NULL || table == NULL )
734 {
735 rc = RC( rcSRA, rcType, rcAllocating, rcParam, rcNull );
736 }
737 else
738 {
739 SRASplitterFactory_Make( cself, eSplitterSpot, sizeof( *obj ),
740 AlignedFilterFactory_Init,
741 AlignedFilterFactory_NewObj,
742 AlignedFilterFactory_Release );
743 if ( rc == 0 )
744 {
745 obj = ( AlignedFilterFactory* )*cself;
746 obj->table = table;
747 obj->aligned = aligned;
748 obj->unaligned = unaligned;
749 }
750 }
751 return rc;
752 }
753
754
755 /* ### alignment region filtering ##################################################### */
756
757 typedef struct AlignRegionFilter_struct
758 {
759 const SRAColumn* col_pos;
760 const SRAColumn* col_name;
761 const SRAColumn* col_seqid;
762 const TAlignedRegion* alregion;
763 uint64_t rejected_spots;
764 uint32_t alregion_qty;
765 } AlignRegionFilter;
766
767
AlignRegionFilter_GetKey(const SRASplitter * cself,const char ** key,spotid_t spot,readmask_t * readmask)768 static rc_t AlignRegionFilter_GetKey( const SRASplitter* cself, const char** key,
769 spotid_t spot, readmask_t* readmask )
770 {
771 rc_t rc = 0;
772 AlignRegionFilter* self = ( AlignRegionFilter* )cself;
773
774 if ( self == NULL || key == NULL )
775 {
776 rc = RC( rcSRA, rcNode, rcExecuting, rcParam, rcNull );
777 }
778 else
779 {
780 *key = "";
781 reset_readmask( readmask );
782 if ( self->col_name || self->col_seqid )
783 {
784 bool match = false;
785 uint32_t i;
786 const char* nm, *si;
787 bitsz_t o, nm_len = 0, si_len = 0;
788
789 if ( self->col_name )
790 {
791 rc = SRAColumnRead( self->col_name, spot, (const void **)&nm, &o, &nm_len );
792 if ( rc == 0 && nm_len > 0 )
793 nm_len /= 8;
794 }
795 if ( rc == 0 && self->col_seqid )
796 {
797 rc = SRAColumnRead( self->col_seqid, spot, (const void **)&si, &o, &si_len );
798 if ( rc == 0 && si_len > 0 )
799 si_len /= 8;
800 }
801 for ( i = 0; rc == 0 && i < self->alregion_qty; i++ )
802 {
803 if ( ( self->alregion[i].name_len == nm_len &&
804 strncasecmp( self->alregion[ i ].name, nm, nm_len ) == 0 ) ||
805 ( self->alregion[i].name_len == si_len &&
806 strncasecmp(self->alregion[i].name, si, si_len) == 0 ) )
807 {
808 if ( self->col_pos )
809 {
810 INSDC_coord_zero* pos;
811 bitsz_t nreads, j;
812 rc = SRAColumnRead( self->col_pos, spot, ( const void ** )&pos, &o, &nreads );
813 if ( rc == 0 && nreads > 0 )
814 {
815 nreads = nreads / sizeof(*pos) / 8;
816 for ( i = 0; !match && i < self->alregion_qty; i++ )
817 {
818 for ( j = 0; j < nreads; j++ )
819 {
820 if ( self->alregion[i].from <= pos[j] &&
821 pos[j] <= self->alregion[i].to )
822 {
823 match = true;
824 break;
825 }
826 }
827 }
828 }
829 }
830 else
831 {
832 match = true;
833 }
834 break;
835 }
836 }
837 if ( !match )
838 {
839 clear_readmask( readmask );
840 self->rejected_spots ++;
841 }
842 }
843 }
844 return rc;
845 }
846
847
AlignRegionFilter_Release(const SRASplitter * cself)848 static rc_t AlignRegionFilter_Release( const SRASplitter* cself )
849 {
850 rc_t rc = 0;
851 AlignRegionFilter* self = ( AlignRegionFilter* )cself;
852
853 if ( self == NULL )
854 {
855 rc = RC( rcSRA, rcNode, rcExecuting, rcParam, rcNull );
856 }
857 else
858 {
859 if ( self->rejected_spots > 0 && !g_legacy_report )
860 rc = KOutMsg( "Rejected %lu SPOTS because of AlignRegionFilter\n", self->rejected_spots );
861 }
862 return rc;
863 }
864
865
866 typedef struct AlignRegionFilterFactory_struct
867 {
868 const SRATable* table;
869 const SRAColumn* col_pos;
870 const SRAColumn* col_name;
871 const SRAColumn* col_seqid;
872 TAlignedRegion* alregion;
873 uint32_t alregion_qty;
874 } AlignRegionFilterFactory;
875
876
AlignRegionFilterFactory_Init(const SRASplitterFactory * cself)877 static rc_t AlignRegionFilterFactory_Init( const SRASplitterFactory* cself )
878 {
879 rc_t rc = 0;
880 AlignRegionFilterFactory* self = ( AlignRegionFilterFactory* )cself;
881
882 if ( self == NULL )
883 {
884 rc = RC( rcSRA, rcType, rcConstructing, rcParam, rcNull );
885 }
886 else
887 {
888
889 #define COLNF(v,n,t) (!(rc == 0 && ((rc = SRATableOpenColumnRead(self->table, &v, n, t)) == 0 || \
890 GetRCState(rc) == rcExists)) && GetRCState(rc) == rcNotFound)
891
892 if ( COLNF( self->col_name, "REF_NAME", vdb_ascii_t ) )
893 {
894 SRAColumnRelease( self->col_name );
895 self->col_name = NULL;
896 rc = 0;
897 }
898
899 if ( COLNF( self->col_seqid, "REF_SEQ_ID", vdb_ascii_t ) )
900 {
901 SRAColumnRelease( self->col_seqid );
902 self->col_seqid = NULL;
903 rc = 0;
904 }
905
906 if ( self->col_name == NULL && self->col_seqid == NULL )
907 {
908 LOGMSG( klogWarn, "None of columns REF_NAME or REF_SEQ_ID was found, no region filtering" );
909 }
910 else
911 {
912 bool range_used = false;
913 uint32_t i;
914 /* see if there are any actual ranges set and open col only if yes */
915 for ( i = 0; i < self->alregion_qty; i++ )
916 {
917 if ( self->alregion[ i ].from != 0 || self->alregion[ i ].to != 0 )
918 {
919 range_used = true;
920 break;
921 }
922 }
923 if ( range_used )
924 {
925 if ( COLNF( self->col_pos, "REF_POS", "INSDC:coord:zero" ) )
926 {
927 LOGMSG( klogWarn, "Column REF_POS is not found, no region position filtering" );
928 rc = 0;
929 }
930 else
931 {
932 for ( i = 0; i < self->alregion_qty; i++ )
933 {
934 if ( self->alregion[ i ].from > 0 )
935 {
936 self->alregion[ i ].from--;
937 }
938 if ( self->alregion[ i ].to > 0 )
939 {
940 self->alregion[ i ].to--;
941 }
942 else if ( self->alregion[ i ].to == 0 )
943 {
944 self->alregion[ i ].to = ~0;
945 }
946 }
947 }
948 }
949 }
950 }
951 #undef COLNF
952 return rc;
953 }
954
955
AlignRegionFilterFactory_NewObj(const SRASplitterFactory * cself,const SRASplitter ** splitter)956 static rc_t AlignRegionFilterFactory_NewObj( const SRASplitterFactory* cself, const SRASplitter** splitter )
957 {
958 rc_t rc = 0;
959 AlignRegionFilterFactory* self = ( AlignRegionFilterFactory* )cself;
960
961 if ( self == NULL )
962 {
963 rc = RC( rcSRA, rcType, rcExecuting, rcParam, rcNull );
964 }
965 else
966 {
967 rc = SRASplitter_Make( splitter, sizeof( AlignRegionFilter ),
968 AlignRegionFilter_GetKey, NULL, NULL, AlignRegionFilter_Release );
969 if ( rc == 0 )
970 {
971 AlignRegionFilter * filter = ( AlignRegionFilter * )( * splitter );
972 filter->col_pos = self->col_pos;
973 filter->col_name = self->col_name;
974 filter->col_seqid = self->col_seqid;
975 filter->alregion = self->alregion;
976 filter->alregion_qty = self->alregion_qty;
977 filter->rejected_spots = 0;
978 }
979 }
980 return rc;
981 }
982
983
AlignRegionFilterFactory_Release(const SRASplitterFactory * cself)984 static void AlignRegionFilterFactory_Release( const SRASplitterFactory* cself )
985 {
986 if ( cself != NULL )
987 {
988 AlignRegionFilterFactory* self = ( AlignRegionFilterFactory* )cself;
989 SRAColumnRelease( self->col_pos );
990 SRAColumnRelease( self->col_name );
991 SRAColumnRelease( self->col_seqid );
992 }
993 }
994
995
AlignRegionFilterFactory_Make(const SRASplitterFactory ** cself,const SRATable * table,TAlignedRegion * alregion,uint32_t alregion_qty)996 static rc_t AlignRegionFilterFactory_Make( const SRASplitterFactory** cself,
997 const SRATable* table, TAlignedRegion* alregion, uint32_t alregion_qty )
998 {
999 rc_t rc = 0;
1000 AlignRegionFilterFactory* obj = NULL;
1001
1002 if ( cself == NULL || table == NULL )
1003 {
1004 rc = RC( rcSRA, rcType, rcAllocating, rcParam, rcNull );
1005 }
1006 else
1007 {
1008 rc = SRASplitterFactory_Make( cself, eSplitterSpot, sizeof( *obj ),
1009 AlignRegionFilterFactory_Init,
1010 AlignRegionFilterFactory_NewObj,
1011 AlignRegionFilterFactory_Release );
1012 if ( rc == 0 )
1013 {
1014 obj = ( AlignRegionFilterFactory* )*cself;
1015 obj->table = table;
1016 obj->alregion = alregion;
1017 obj->alregion_qty = alregion_qty;
1018 }
1019 }
1020 return rc;
1021 }
1022
1023 /* ### alignment meta-pair distance filtering ##################################################### */
1024
1025 typedef struct AlignPairDistanceFilter_struct
1026 {
1027 const SRAColumn* col_tlen;
1028 const TMatepairDistance* mp_dist;
1029 uint64_t rejected_reads;
1030 bool mp_dist_unknown;
1031 uint32_t mp_dist_qty;
1032 } AlignPairDistanceFilter;
1033
1034
AlignPairDistanceFilter_GetKey(const SRASplitter * cself,const char ** key,spotid_t spot,readmask_t * readmask)1035 static rc_t AlignPairDistanceFilter_GetKey( const SRASplitter* cself,
1036 const char** key, spotid_t spot, readmask_t* readmask )
1037 {
1038 rc_t rc = 0;
1039 AlignPairDistanceFilter* self = ( AlignPairDistanceFilter* )cself;
1040
1041 if ( self == NULL || key == NULL )
1042 {
1043 rc = RC( rcSRA, rcNode, rcExecuting, rcParam, rcNull );
1044 }
1045 else
1046 {
1047 *key = "";
1048 reset_readmask( readmask );
1049 if ( self->col_tlen )
1050 {
1051 uint32_t i, j;
1052 bitsz_t o, nreads;
1053 int32_t* tlen;
1054 rc = SRAColumnRead( self->col_tlen, spot, (const void **)&tlen, &o, &nreads );
1055 if ( rc == 0 )
1056 {
1057 nreads = nreads / sizeof(*tlen) / 8;
1058 for ( i = 0; i < self->mp_dist_qty; i++ )
1059 {
1060 for ( j = 0; j < nreads; j++ )
1061 {
1062 if ( ( tlen[ j ] == 0 && !self->mp_dist_unknown) ||
1063 ( tlen[ j ] != 0 &&
1064 ( tlen[ j ] < self->mp_dist[ i ].from || self->mp_dist[ i ].to < tlen[ j ] ) ) )
1065 {
1066 unset_readmask( readmask, j );
1067 self->rejected_reads ++;
1068 }
1069 }
1070 }
1071 }
1072 }
1073 }
1074 return rc;
1075 }
1076
1077
AlignPairDistanceFilter_Release(const SRASplitter * cself)1078 static rc_t AlignPairDistanceFilter_Release( const SRASplitter* cself )
1079 {
1080 rc_t rc = 0;
1081 AlignPairDistanceFilter* self = ( AlignPairDistanceFilter* )cself;
1082
1083 if ( self == NULL )
1084 {
1085 rc = RC( rcSRA, rcNode, rcExecuting, rcParam, rcNull );
1086 }
1087 else
1088 {
1089 if ( self->rejected_reads > 0 && !g_legacy_report )
1090 rc = KOutMsg( "Rejected %lu READS because of AlignPairDistanceFilter\n", self->rejected_reads );
1091 }
1092 return rc;
1093 }
1094
1095 typedef struct AlignPairDistanceFilterFactory_struct
1096 {
1097 const SRATable* table;
1098 const SRAColumn* col_tlen;
1099 bool mp_dist_unknown;
1100 TMatepairDistance* mp_dist;
1101 uint32_t mp_dist_qty;
1102 } AlignPairDistanceFilterFactory;
1103
1104
AlignPairDistanceFilterFactory_Init(const SRASplitterFactory * cself)1105 static rc_t AlignPairDistanceFilterFactory_Init( const SRASplitterFactory* cself )
1106 {
1107 rc_t rc = 0;
1108 AlignPairDistanceFilterFactory* self = ( AlignPairDistanceFilterFactory* )cself;
1109
1110 if ( self == NULL )
1111 {
1112 rc = RC( rcSRA, rcType, rcConstructing, rcParam, rcNull );
1113 }
1114 else
1115 {
1116 rc = SRATableOpenColumnRead( self->table, &self->col_tlen, "TEMPLATE_LEN", "I32" );
1117 if( !( rc == 0 || GetRCState( rc ) == rcExists ) && GetRCState( rc ) == rcNotFound )
1118 {
1119 LOGMSG( klogWarn, "Column TLEN was not found, no distance filtering" );
1120 self->col_tlen = NULL;
1121 rc = 0;
1122 }
1123 if ( rc == 0 )
1124 {
1125 uint32_t i;
1126 for ( i = 0; i < self->mp_dist_qty; i++ )
1127 {
1128 if ( self->mp_dist[ i ].to == 0 )
1129 {
1130 self->mp_dist[ i ].to = ~0;
1131 }
1132 }
1133 }
1134 }
1135 return rc;
1136 }
1137
1138
AlignPairDistanceFilterFactory_NewObj(const SRASplitterFactory * cself,const SRASplitter ** splitter)1139 static rc_t AlignPairDistanceFilterFactory_NewObj( const SRASplitterFactory* cself,
1140 const SRASplitter** splitter )
1141 {
1142 rc_t rc = 0;
1143 AlignPairDistanceFilterFactory* self = ( AlignPairDistanceFilterFactory* )cself;
1144
1145 if ( self == NULL )
1146 {
1147 rc = RC( rcSRA, rcType, rcExecuting, rcParam, rcNull );
1148 }
1149 else
1150 {
1151 rc = SRASplitter_Make( splitter, sizeof( AlignPairDistanceFilter ),
1152 AlignPairDistanceFilter_GetKey, NULL, NULL, AlignPairDistanceFilter_Release );
1153 if ( rc == 0 )
1154 {
1155 AlignPairDistanceFilter * filter = ( AlignPairDistanceFilter * )( * splitter );
1156 filter->col_tlen = self->col_tlen;
1157 filter->mp_dist_unknown = self->mp_dist_unknown;
1158 filter->mp_dist = self->mp_dist;
1159 filter->mp_dist_qty = self->mp_dist_qty;
1160 filter->rejected_reads = 0;
1161 }
1162 }
1163 return rc;
1164 }
1165
1166
AlignPairDistanceFilterFactory_Release(const SRASplitterFactory * cself)1167 static void AlignPairDistanceFilterFactory_Release( const SRASplitterFactory* cself )
1168 {
1169 if ( cself != NULL )
1170 {
1171 AlignPairDistanceFilterFactory* self = ( AlignPairDistanceFilterFactory* )cself;
1172 SRAColumnRelease( self->col_tlen );
1173 }
1174 }
1175
1176
AlignPairDistanceFilterFactory_Make(const SRASplitterFactory ** cself,const SRATable * table,bool mp_dist_unknown,TMatepairDistance * mp_dist,uint32_t mp_dist_qty)1177 static rc_t AlignPairDistanceFilterFactory_Make( const SRASplitterFactory** cself,
1178 const SRATable* table, bool mp_dist_unknown, TMatepairDistance* mp_dist, uint32_t mp_dist_qty )
1179 {
1180 rc_t rc = 0;
1181 AlignPairDistanceFilterFactory* obj = NULL;
1182
1183 if ( cself == NULL || table == NULL )
1184 {
1185 rc = RC( rcSRA, rcType, rcAllocating, rcParam, rcNull );
1186 }
1187 else
1188 {
1189 rc = SRASplitterFactory_Make( cself, eSplitterSpot, sizeof( *obj ),
1190 AlignPairDistanceFilterFactory_Init,
1191 AlignPairDistanceFilterFactory_NewObj,
1192 AlignPairDistanceFilterFactory_Release );
1193 if ( rc == 0 )
1194 {
1195 obj = ( AlignPairDistanceFilterFactory* ) *cself;
1196 obj->table = table;
1197 obj->mp_dist_unknown = mp_dist_unknown;
1198 obj->mp_dist = mp_dist;
1199 obj->mp_dist_qty = mp_dist_qty;
1200 }
1201 }
1202 return rc;
1203 }
1204
1205
1206 /* ============== FASTQ read type (bio/tech) filter ============================ */
1207
1208 typedef struct FastqBioFilter_struct
1209 {
1210 const FastqReader* reader;
1211 uint64_t rejected_reads;
1212 } FastqBioFilter;
1213
1214
1215 /* filter out non-bio reads */
FastqBioFilter_GetKey(const SRASplitter * cself,const char ** key,spotid_t spot,readmask_t * readmask)1216 static rc_t FastqBioFilter_GetKey( const SRASplitter* cself, const char** key,
1217 spotid_t spot, readmask_t* readmask )
1218 {
1219 rc_t rc = 0;
1220 FastqBioFilter* self = ( FastqBioFilter* )cself;
1221
1222 if ( self == NULL || key == NULL )
1223 {
1224 rc = RC( rcExe, rcNode, rcExecuting, rcParam, rcInvalid );
1225 }
1226 else
1227 {
1228 uint32_t num_reads = 0;
1229
1230 *key = NULL;
1231 rc = FastqReaderSeekSpot( self->reader, spot );
1232 if ( rc == 0 )
1233 {
1234 rc = FastqReader_SpotInfo( self->reader, NULL, NULL, NULL, NULL, NULL, &num_reads );
1235 if ( rc == 0 )
1236 {
1237 uint32_t readId;
1238 SRA_DUMP_DBG( 3, ( "%s %u row reads: ", __func__, spot ) );
1239 for (readId = 0; rc == 0 && readId < num_reads; readId++ )
1240 {
1241 if ( isset_readmask( readmask, readId ) )
1242 {
1243 SRAReadTypes read_type;
1244 rc = FastqReader_SpotReadInfo( self->reader, readId + 1, &read_type,
1245 NULL, NULL, NULL, NULL);
1246 if ( rc == 0 )
1247 {
1248 if ( !( read_type & SRA_READ_TYPE_BIOLOGICAL ) )
1249 {
1250 unset_readmask( readmask, readId );
1251 self->rejected_reads ++;
1252 }
1253 else
1254 {
1255 SRA_DUMP_DBG( 3, ( " %u", readId ) );
1256 }
1257 }
1258 }
1259 }
1260 *key = "";
1261 SRA_DUMP_DBG( 3, ( " key '%s'\n", *key ) );
1262 }
1263 }
1264 else if ( GetRCObject( rc ) == rcRow && GetRCState( rc ) == rcNotFound )
1265 {
1266 SRA_DUMP_DBG( 3, ( "%s skipped %u row\n", __func__, spot ) );
1267 rc = 0;
1268 }
1269 }
1270 return rc;
1271 }
1272
1273
FastqBioFilter_Release(const SRASplitter * cself)1274 static rc_t FastqBioFilter_Release( const SRASplitter * cself )
1275 {
1276 rc_t rc = 0;
1277 FastqBioFilter * self = ( FastqBioFilter * )cself;
1278
1279 if ( self == NULL )
1280 rc = RC( rcExe, rcNode, rcExecuting, rcParam, rcInvalid );
1281 else
1282 {
1283 if ( self->rejected_reads > 0 && !g_legacy_report )
1284 rc = KOutMsg( "Rejected %lu READS because of filtering out non-biological READS\n", self->rejected_reads );
1285 }
1286 return rc;
1287 }
1288
1289
1290 typedef struct FastqBioFilterFactory_struct
1291 {
1292 const char* accession;
1293 const SRATable* table;
1294 const FastqReader* reader;
1295 } FastqBioFilterFactory;
1296
1297
FastqBioFilterFactory_Init(const SRASplitterFactory * cself)1298 static rc_t FastqBioFilterFactory_Init( const SRASplitterFactory* cself )
1299 {
1300 rc_t rc = 0;
1301 FastqBioFilterFactory* self = ( FastqBioFilterFactory* )cself;
1302
1303 if ( self == NULL )
1304 {
1305 rc = RC( rcExe, rcType, rcConstructing, rcParam, rcNull );
1306 }
1307 else
1308 {
1309 rc = FastqReaderMake( &self->reader, self->table, self->accession,
1310 /* preserve orig spot format to save on conversions */
1311 FastqArgs.is_platform_cs_native, false, FastqArgs.fasta > 0, false,
1312 false, !FastqArgs.applyClip, FastqArgs.SuppressQualForCSKey, 0,
1313 FastqArgs.offset, '\0', 0, 0) ;
1314 }
1315 return rc;
1316 }
1317
1318
FastqBioFilterFactory_NewObj(const SRASplitterFactory * cself,const SRASplitter ** splitter)1319 static rc_t FastqBioFilterFactory_NewObj( const SRASplitterFactory* cself, const SRASplitter** splitter )
1320 {
1321 rc_t rc = 0;
1322 FastqBioFilterFactory* self = ( FastqBioFilterFactory* )cself;
1323
1324 if ( self == NULL )
1325 {
1326 rc = RC( rcExe, rcType, rcExecuting, rcParam, rcNull );
1327 }
1328 else
1329 {
1330 rc = SRASplitter_Make( splitter, sizeof( FastqBioFilter ),
1331 FastqBioFilter_GetKey, NULL, NULL, FastqBioFilter_Release );
1332 if ( rc == 0 )
1333 {
1334 FastqBioFilter * filter = ( FastqBioFilter * )( * splitter );
1335 filter->reader = self->reader;
1336 filter->rejected_reads = 0;
1337 }
1338 }
1339 return rc;
1340 }
1341
1342
FastqBioFilterFactory_Release(const SRASplitterFactory * cself)1343 static void FastqBioFilterFactory_Release( const SRASplitterFactory* cself )
1344 {
1345 if ( cself != NULL )
1346 {
1347 FastqBioFilterFactory* self = ( FastqBioFilterFactory* )cself;
1348 FastqReaderWhack( self->reader );
1349 }
1350 }
1351
1352
FastqBioFilterFactory_Make(const SRASplitterFactory ** cself,const char * accession,const SRATable * table)1353 static rc_t FastqBioFilterFactory_Make( const SRASplitterFactory** cself,
1354 const char* accession, const SRATable* table )
1355 {
1356 rc_t rc = 0;
1357 FastqBioFilterFactory* obj = NULL;
1358
1359 if ( cself == NULL || accession == NULL || table == NULL )
1360 {
1361 rc = RC( rcExe, rcType, rcConstructing, rcParam, rcNull );
1362 }
1363 else
1364 {
1365 rc = SRASplitterFactory_Make( cself, eSplitterSpot, sizeof( *obj ),
1366 FastqBioFilterFactory_Init,
1367 FastqBioFilterFactory_NewObj,
1368 FastqBioFilterFactory_Release );
1369
1370 if ( rc == 0 )
1371 {
1372 obj = ( FastqBioFilterFactory* )*cself;
1373 obj->accession = accession;
1374 obj->table = table;
1375 }
1376 }
1377 return rc;
1378 }
1379
1380
1381 /* ============== FASTQ number of reads filter ============================ */
1382
1383 typedef struct FastqRNumberFilter_struct
1384 {
1385 const FastqReader* reader;
1386 uint64_t rejected_reads;
1387 } FastqRNumberFilter;
1388
1389
FastqRNumberFilter_GetKey(const SRASplitter * cself,const char ** key,spotid_t spot,readmask_t * readmask)1390 static rc_t FastqRNumberFilter_GetKey( const SRASplitter* cself, const char** key,
1391 spotid_t spot, readmask_t* readmask )
1392 {
1393 rc_t rc = 0;
1394 FastqRNumberFilter* self = ( FastqRNumberFilter* )cself;
1395
1396 if ( self == NULL || key == NULL )
1397 {
1398 rc = RC( rcExe, rcNode, rcExecuting, rcParam, rcInvalid );
1399 }
1400 else
1401 {
1402 uint32_t num_reads = 0;
1403
1404 *key = NULL;
1405 rc = FastqReaderSeekSpot( self->reader, spot );
1406 if ( rc == 0 )
1407 {
1408 rc = FastqReader_SpotInfo( self->reader, NULL, NULL, NULL, NULL, NULL, &num_reads );
1409 if ( rc == 0 )
1410 {
1411 int readId, q;
1412
1413 SRA_DUMP_DBG( 3, ( "%s %u row reads: ", __func__, spot ) );
1414 for ( readId = 0, q = 0; readId < num_reads; readId++ )
1415 {
1416 if ( isset_readmask( readmask, readId ) )
1417 {
1418 if ( ++q > FastqArgs.maxReads )
1419 {
1420 unset_readmask( readmask, 1 << readId );
1421 }
1422 else
1423 {
1424 SRA_DUMP_DBG( 3, ( " %u", readId ) );
1425 }
1426 }
1427 }
1428 *key = "";
1429 SRA_DUMP_DBG( 3, ( " key '%s'\n", *key ) );
1430 }
1431 }
1432 else if ( GetRCObject( rc ) == rcRow && GetRCState( rc ) == rcNotFound )
1433 {
1434 SRA_DUMP_DBG( 3, ( "%s skipped %u row\n", __func__, spot ) );
1435 rc = 0;
1436 }
1437 }
1438 return rc;
1439 }
1440
1441
FastqRNumberFilter_Release(const SRASplitter * cself)1442 static rc_t FastqRNumberFilter_Release( const SRASplitter* cself )
1443 {
1444 rc_t rc = 0;
1445 FastqRNumberFilter * self = ( FastqRNumberFilter * )cself;
1446
1447 if ( self == NULL )
1448 {
1449 rc = RC( rcExe, rcNode, rcExecuting, rcParam, rcInvalid );
1450 }
1451 else
1452 {
1453 if ( self->rejected_reads > 0 && !g_legacy_report )
1454 rc = KOutMsg( "Rejected %lu READS because of max. number of READS = %u\n", self->rejected_reads, FastqArgs.maxReads );
1455 }
1456 return rc;
1457 }
1458
1459 typedef struct FastqRNumberFilterFactory_struct
1460 {
1461 const char* accession;
1462 const SRATable* table;
1463 const FastqReader* reader;
1464 } FastqRNumberFilterFactory;
1465
1466
FastqRNumberFilterFactory_Init(const SRASplitterFactory * cself)1467 static rc_t FastqRNumberFilterFactory_Init( const SRASplitterFactory* cself )
1468 {
1469 rc_t rc = 0;
1470 FastqRNumberFilterFactory* self = ( FastqRNumberFilterFactory* )cself;
1471
1472 if ( self == NULL )
1473 {
1474 rc = RC( rcExe, rcType, rcConstructing, rcParam, rcNull );
1475 }
1476 else
1477 {
1478 rc = FastqReaderMake( &self->reader, self->table, self->accession,
1479 /* preserve orig spot format to save on conversions */
1480 FastqArgs.is_platform_cs_native, false, FastqArgs.fasta > 0, false,
1481 false, !FastqArgs.applyClip, FastqArgs.SuppressQualForCSKey, 0,
1482 FastqArgs.offset, '\0', 0, 0 );
1483 }
1484 return rc;
1485 }
1486
1487
FastqRNumberFilterFactory_NewObj(const SRASplitterFactory * cself,const SRASplitter ** splitter)1488 static rc_t FastqRNumberFilterFactory_NewObj( const SRASplitterFactory* cself, const SRASplitter** splitter )
1489 {
1490 rc_t rc = 0;
1491 FastqRNumberFilterFactory* self = ( FastqRNumberFilterFactory* )cself;
1492
1493 if ( self == NULL )
1494 {
1495 rc = RC( rcExe, rcType, rcExecuting, rcParam, rcNull );
1496 }
1497 else
1498 {
1499 rc = SRASplitter_Make( splitter, sizeof( FastqRNumberFilter ),
1500 FastqRNumberFilter_GetKey, NULL, NULL, FastqRNumberFilter_Release );
1501 if ( rc == 0 )
1502 {
1503 FastqRNumberFilter * filter = ( FastqRNumberFilter * )( * splitter );
1504 filter->reader = self->reader;
1505 filter->rejected_reads = 0;
1506 }
1507 }
1508 return rc;
1509 }
1510
1511
FastqRNumberFilterFactory_Release(const SRASplitterFactory * cself)1512 static void FastqRNumberFilterFactory_Release( const SRASplitterFactory* cself )
1513 {
1514 if ( cself != NULL )
1515 {
1516 FastqRNumberFilterFactory* self = ( FastqRNumberFilterFactory* )cself;
1517 FastqReaderWhack( self->reader );
1518 }
1519 }
1520
1521
FastqRNumberFilterFactory_Make(const SRASplitterFactory ** cself,const char * accession,const SRATable * table)1522 static rc_t FastqRNumberFilterFactory_Make( const SRASplitterFactory** cself,
1523 const char* accession, const SRATable* table )
1524 {
1525 rc_t rc = 0;
1526 FastqRNumberFilterFactory* obj = NULL;
1527
1528 if ( cself == NULL || accession == NULL || table == NULL )
1529 {
1530 rc = RC( rcExe, rcType, rcConstructing, rcParam, rcNull );
1531 }
1532 else
1533 {
1534 rc = SRASplitterFactory_Make( cself, eSplitterSpot, sizeof( *obj ),
1535 FastqRNumberFilterFactory_Init,
1536 FastqRNumberFilterFactory_NewObj,
1537 FastqRNumberFilterFactory_Release );
1538 if ( rc == 0 )
1539 {
1540 obj = ( FastqRNumberFilterFactory* )*cself;
1541 obj->accession = accession;
1542 obj->table = table;
1543 }
1544 }
1545 return rc;
1546 }
1547
1548 /* ============== FASTQ quality filter ============================ */
1549
1550 typedef struct FastqQFilter_struct
1551 {
1552 const FastqReader * reader;
1553 KDataBuffer * buffer_for_read_column;
1554 KDataBuffer * buffer_for_quality;
1555 uint64_t rejected_spots;
1556 uint64_t rejected_reads;
1557 } FastqQFilter;
1558
1559
1560 #define MIN_QUAL_FOR_RULE_3 ( 33 + 2 )
1561
QFilter_Test_new(const char * read,const char * quality,uint32_t readlen,uint32_t minlen,char N_char)1562 static bool QFilter_Test_new( const char * read, const char * quality, uint32_t readlen, uint32_t minlen, char N_char )
1563 {
1564 uint32_t idx, cnt, cnt1, start;
1565 char b0;
1566
1567 /* 1st criteria : READ has to be longer than the minlen */
1568 if ( readlen < minlen ) return false;
1569
1570 /* 2nd criteria : READ does not contain any N's in the first minlen bases */
1571 for ( idx = 0; idx < minlen; ++idx )
1572 {
1573 if ( read[ idx ] == N_char ) return false;
1574 }
1575
1576 start = ( N_char == '.' ) ? 1 : 0;
1577 /* 3rd criteria : QUALITY values are all 2 or higher in the first minlen values */
1578 for ( idx = start; idx < minlen; ++idx )
1579 {
1580 if ( quality[ idx ] < MIN_QUAL_FOR_RULE_3 ) return false;
1581 }
1582
1583 /* 4th criteria : READ contains more than one type of base in the first minlen bases */
1584 b0 = read[ 0 ];
1585 for ( idx = 1, cnt = 1; idx < minlen; ++idx )
1586 {
1587 if ( read[ idx ] == b0 ) cnt++;
1588 }
1589 if ( cnt == minlen ) return false;
1590
1591 /* 5th criteria : READ does not contain more than 50% N's in its whole length */
1592 cnt = ( readlen / 2 );
1593 for ( idx = 0, cnt1 = 0; idx < readlen; ++idx )
1594 {
1595 if ( read[ idx ] == N_char ) cnt1++;
1596 }
1597 if ( cnt1 > cnt ) return false;
1598
1599 /* 6th criteria : READ does not contain values other than ACGTN in its whole length */
1600 if ( N_char != '.' )
1601 {
1602 for ( idx = 0; idx < readlen; ++idx )
1603 {
1604 if ( ! ( ( read[ idx ] == 'A' ) || ( read[ idx ] == 'C' ) || ( read[ idx ] == 'G' ) ||
1605 ( read[ idx ] == 'T' ) || ( read[ idx ] == 'N' ) ) )
1606 return false;
1607 }
1608 }
1609
1610 return true;
1611 }
1612
1613
QFilter_Test_old(const char * read,uint32_t readlen,bool cs_native)1614 static bool QFilter_Test_old( const char * read, uint32_t readlen, bool cs_native )
1615 {
1616 static char const* const baseStr = "NNNNNNNNNN";
1617 static char const* const colorStr = "..........";
1618 static const uint32_t strLen = 10;
1619 uint32_t xLen = readlen < strLen ? readlen : strLen;
1620
1621 if ( cs_native )
1622 {
1623 if ( strncmp( &read[ 1 ], colorStr, xLen ) == 0 || strcmp( &read[ readlen - xLen + 1 ], colorStr ) == 0 )
1624 return false;
1625 }
1626 else
1627 {
1628 if ( strncmp( read, baseStr, xLen ) == 0 || strcmp( &read[ readlen - xLen ], baseStr ) == 0 )
1629 return false;
1630 }
1631 return true;
1632 }
1633
1634
1635 /* filter out reads by leading/trialing quality */
FastqQFilter_GetKey(const SRASplitter * cself,const char ** key,spotid_t spot,readmask_t * readmask)1636 static rc_t FastqQFilter_GetKey( const SRASplitter* cself, const char** key,
1637 spotid_t spot, readmask_t* readmask )
1638 {
1639 rc_t rc = 0;
1640 FastqQFilter* self = ( FastqQFilter* )cself;
1641
1642 if ( self == NULL || key == NULL )
1643 {
1644 rc = RC( rcExe, rcNode, rcExecuting, rcParam, rcInvalid );
1645 }
1646 else
1647 {
1648 uint32_t num_reads = 0, spot_len;
1649
1650 *key = NULL;
1651 rc = FastqReaderSeekSpot( self->reader, spot );
1652 if ( rc == 0 )
1653 {
1654 rc = FastqReader_SpotInfo( self->reader, NULL, NULL, NULL, NULL, &spot_len, &num_reads );
1655 if ( rc == 0 )
1656 {
1657 SRA_DUMP_DBG( 3, ( "%s %u row reads: ", __func__, spot ) );
1658 if ( FastqArgs.split_spot )
1659 {
1660 uint32_t readId;
1661 INSDC_coord_len read_len;
1662 for ( readId = 0; rc == 0 && readId < num_reads; readId++ )
1663 {
1664 if ( isset_readmask( readmask, readId ) )
1665 {
1666 rc = FastqReader_SpotReadInfo( self->reader, readId + 1, NULL, NULL, NULL, NULL, &read_len );
1667 if ( rc == 0 )
1668 {
1669 /* FastqReaderBase() in libsra! libs/sra/fastq.h
1670 IF_BUF ... very, very bad macro defined in factory.h !!
1671 it combines reading from table with resizing of a KDatabuffer in a loop until rc == 0 is reached!
1672 */
1673 IF_BUF( ( FastqReaderBase( self->reader, readId + 1,
1674 self->buffer_for_read_column->base,
1675 KDataBufferBytes( self->buffer_for_read_column ), NULL) ),
1676 self->buffer_for_read_column, read_len )
1677 {
1678 const char * b = self->buffer_for_read_column->base;
1679 bool filter_passed = true;
1680
1681 if ( FastqArgs.qual_filter1 )
1682 {
1683 if ( quality_N_limit > 0 )
1684 {
1685 IF_BUF( ( FastqReaderQuality( self->reader, readId + 1,
1686 self->buffer_for_quality->base,
1687 KDataBufferBytes( self->buffer_for_quality ), NULL) ),
1688 self->buffer_for_quality, read_len )
1689 {
1690 const char * q = self->buffer_for_quality->base;
1691 char N_char = FastqArgs.is_platform_cs_native ? '.' : 'N';
1692 filter_passed = QFilter_Test_new( b, q, read_len, quality_N_limit, N_char );
1693 }
1694 }
1695 }
1696 else
1697 filter_passed = QFilter_Test_old( b, read_len, FastqArgs.is_platform_cs_native );
1698
1699 if ( filter_passed )
1700 SRA_DUMP_DBG( 3, ( " %u", readId ) );
1701 else
1702 {
1703 unset_readmask( readmask, readId );
1704 /* READ did not pass filter */
1705 self->rejected_reads ++;
1706 }
1707 }
1708 }
1709 }
1710 }
1711 *key = "";
1712 }
1713 else
1714 {
1715 IF_BUF( ( FastqReaderBase( self->reader, 0, self->buffer_for_read_column->base,
1716 KDataBufferBytes( self->buffer_for_read_column ), NULL ) ), self->buffer_for_read_column, spot_len )
1717 {
1718 const char* b = self->buffer_for_read_column->base;
1719 bool filter_passed = true;
1720
1721 if ( FastqArgs.qual_filter1 )
1722 {
1723 if ( quality_N_limit > 0 )
1724 {
1725 IF_BUF( ( FastqReaderQuality( self->reader, 0, self->buffer_for_quality->base,
1726 KDataBufferBytes( self->buffer_for_quality ), NULL ) ), self->buffer_for_quality, spot_len )
1727 {
1728 const char * q = self->buffer_for_quality->base;
1729 char N_char = FastqArgs.is_platform_cs_native ? '.' : 'N';
1730 filter_passed = QFilter_Test_new( b, q, spot_len, quality_N_limit, N_char );
1731 }
1732 }
1733 }
1734 else
1735 filter_passed = QFilter_Test_old( b, spot_len, FastqArgs.is_platform_cs_native );
1736
1737 if ( filter_passed )
1738 {
1739 *key = "";
1740 SRA_DUMP_DBG( 3, ( " whole spot" ) );
1741 }
1742 else
1743 {
1744 /* SPOT did not pass filter */
1745 self->rejected_spots ++;
1746 }
1747 }
1748 }
1749 SRA_DUMP_DBG( 3, ( " key '%s'\n", *key ) );
1750 }
1751 }
1752 else if ( GetRCObject( rc ) == rcRow && GetRCState( rc ) == rcNotFound )
1753 {
1754 SRA_DUMP_DBG( 3, ( "%s skipped %u row\n", __func__, spot ) );
1755 rc = 0;
1756 }
1757 }
1758 return rc;
1759 }
1760
1761
FastqQFilter_Release(const SRASplitter * cself)1762 static rc_t FastqQFilter_Release( const SRASplitter * cself )
1763 {
1764 rc_t rc = 0;
1765 FastqQFilter* self = ( FastqQFilter* )cself;
1766
1767 if ( self == NULL )
1768 rc = RC( rcExe, rcNode, rcExecuting, rcParam, rcInvalid );
1769 else if ( !g_legacy_report )
1770 {
1771 if ( self->rejected_reads > 0 )
1772 rc = KOutMsg( "Rejected %lu READS because of Quality-Filtering\n", self->rejected_reads );
1773 if ( rc == 0 && self->rejected_spots > 0 )
1774 rc = KOutMsg( "Rejected %lu SPOTS because of Quality-Filtering\n", self->rejected_spots );
1775
1776 }
1777 return rc;
1778 }
1779
1780
1781 typedef struct FastqQFilterFactory_struct
1782 {
1783 const char* accession;
1784 const SRATable* table;
1785 const FastqReader* reader;
1786 KDataBuffer kdbuf;
1787 KDataBuffer kdbuf2;
1788 } FastqQFilterFactory;
1789
1790
FastqQFilterFactory_Init(const SRASplitterFactory * cself)1791 static rc_t FastqQFilterFactory_Init( const SRASplitterFactory* cself )
1792 {
1793 rc_t rc = 0;
1794 FastqQFilterFactory* self = ( FastqQFilterFactory* )cself;
1795
1796 if ( self == NULL )
1797 {
1798 rc = RC( rcExe, rcType, rcConstructing, rcParam, rcNull );
1799 }
1800 else
1801 {
1802 rc = KDataBufferMakeBytes( &self->kdbuf, DATABUFFERINITSIZE );
1803 if ( rc == 0 )
1804 {
1805 rc = KDataBufferMakeBytes( &self->kdbuf2, DATABUFFERINITSIZE );
1806 if ( rc == 0 )
1807 {
1808 rc = FastqReaderMake( &self->reader, self->table, self->accession,
1809 /* preserve orig spot format to save on conversions */
1810 FastqArgs.is_platform_cs_native, false, FastqArgs.fasta > 0, false,
1811 false, !FastqArgs.applyClip, FastqArgs.SuppressQualForCSKey, 0,
1812 FastqArgs.offset, '\0', 0, 0 );
1813 }
1814 }
1815 }
1816 return rc;
1817 }
1818
1819
FastqQFilterFactory_NewObj(const SRASplitterFactory * cself,const SRASplitter ** splitter)1820 static rc_t FastqQFilterFactory_NewObj( const SRASplitterFactory* cself, const SRASplitter** splitter )
1821 {
1822 rc_t rc = 0;
1823 FastqQFilterFactory* self = ( FastqQFilterFactory* )cself;
1824
1825 if ( self == NULL )
1826 {
1827 rc = RC( rcExe, rcType, rcExecuting, rcParam, rcNull );
1828 }
1829 else
1830 {
1831 rc = SRASplitter_Make( splitter, sizeof( FastqQFilter ), FastqQFilter_GetKey, NULL, NULL, FastqQFilter_Release );
1832 if ( rc == 0 )
1833 {
1834 FastqQFilter * filter = ( FastqQFilter * )( * splitter );
1835 filter->reader = self->reader;
1836 filter->buffer_for_read_column = &self->kdbuf;
1837 filter->buffer_for_quality = &self->kdbuf2;
1838 filter->rejected_spots = 0;
1839 filter->rejected_reads = 0;
1840 }
1841 }
1842 return rc;
1843 }
1844
1845
FastqQFilterFactory_Release(const SRASplitterFactory * cself)1846 static void FastqQFilterFactory_Release( const SRASplitterFactory* cself )
1847 {
1848 if ( cself != NULL )
1849 {
1850 FastqQFilterFactory* self = ( FastqQFilterFactory* )cself;
1851 KDataBufferWhack( &self->kdbuf );
1852 KDataBufferWhack( &self->kdbuf2 );
1853 FastqReaderWhack( self->reader );
1854 }
1855 }
1856
1857
FastqQFilterFactory_Make(const SRASplitterFactory ** cself,const char * accession,const SRATable * table)1858 static rc_t FastqQFilterFactory_Make( const SRASplitterFactory** cself,
1859 const char* accession, const SRATable* table )
1860 {
1861 rc_t rc = 0;
1862 FastqQFilterFactory* obj = NULL;
1863
1864 if ( cself == NULL || accession == NULL || table == NULL )
1865 {
1866 rc = RC( rcExe, rcType, rcConstructing, rcParam, rcNull );
1867 }
1868 else
1869 {
1870 rc = SRASplitterFactory_Make( cself, eSplitterSpot, sizeof( *obj ),
1871 FastqQFilterFactory_Init,
1872 FastqQFilterFactory_NewObj,
1873 FastqQFilterFactory_Release );
1874 if ( rc == 0 )
1875 {
1876 obj = ( FastqQFilterFactory* )*cself;
1877 obj->accession = accession;
1878 obj->table = table;
1879 }
1880 }
1881 return rc;
1882 }
1883
1884
1885 /* ============== FASTQ min read length filter ============================ */
1886
1887
1888 typedef struct FastqReadLenFilter_struct
1889 {
1890 const FastqReader* reader;
1891 uint64_t rejected_spots;
1892 uint64_t rejected_reads;
1893 } FastqReadLenFilter;
1894
1895
FastqReadLenFilter_GetKey(const SRASplitter * cself,const char ** key,spotid_t spot,readmask_t * readmask)1896 static rc_t FastqReadLenFilter_GetKey( const SRASplitter* cself, const char** key,
1897 spotid_t spot, readmask_t* readmask )
1898 {
1899 rc_t rc = 0;
1900 FastqReadLenFilter* self = ( FastqReadLenFilter* )cself;
1901
1902 if ( self == NULL || key == NULL )
1903 {
1904 rc = RC( rcExe, rcNode, rcExecuting, rcParam, rcInvalid );
1905 }
1906 else
1907 {
1908 uint32_t num_reads = 0, spot_len = 0;
1909
1910 *key = NULL;
1911 rc = FastqReaderSeekSpot( self->reader, spot );
1912 if ( rc == 0 )
1913 {
1914 rc = FastqReader_SpotInfo( self->reader, NULL, NULL, NULL, NULL, &spot_len, &num_reads );
1915 if ( rc == 0 )
1916 {
1917 SRA_DUMP_DBG( 3, ( "%s %u row reads:", __func__, spot ) );
1918 if ( FastqArgs.split_spot )
1919 {
1920 uint32_t readId;
1921 INSDC_coord_len read_len = 0;
1922 for ( readId = 0; rc == 0 && readId < num_reads; readId++ )
1923 {
1924 if ( isset_readmask( readmask, readId ) )
1925 {
1926 rc = FastqReader_SpotReadInfo( self->reader, readId + 1, NULL, NULL, NULL, NULL, &read_len );
1927 if ( rc == 0 )
1928 {
1929 if ( read_len < FastqArgs.minReadLen )
1930 {
1931 unset_readmask( readmask, readId );
1932 self->rejected_reads ++;
1933 }
1934 else
1935 {
1936 SRA_DUMP_DBG( 3, ( " %u", readId ) );
1937 }
1938 }
1939 }
1940 }
1941 *key = "";
1942 }
1943 else
1944 {
1945 if ( spot_len >= FastqArgs.minReadLen )
1946 {
1947 *key = "";
1948 SRA_DUMP_DBG( 3, ( " whole spot" ) );
1949 }
1950 else
1951 self->rejected_spots ++;
1952 }
1953 SRA_DUMP_DBG( 3, ( "\n" ) );
1954 }
1955 }
1956 else if ( GetRCObject( rc ) == rcRow && GetRCState( rc ) == rcNotFound )
1957 {
1958 SRA_DUMP_DBG( 3, ( "%s skipped %u row\n", __func__, spot ) );
1959 rc = 0;
1960 }
1961 }
1962 return rc;
1963 }
1964
1965
FastqReadLenFilter_Release(const SRASplitter * cself)1966 static rc_t FastqReadLenFilter_Release( const SRASplitter * cself )
1967 {
1968 rc_t rc = 0;
1969 FastqReadLenFilter * self = ( FastqReadLenFilter* )cself;
1970
1971 if ( self == NULL )
1972 rc = RC( rcExe, rcNode, rcExecuting, rcParam, rcInvalid );
1973 else if ( !g_legacy_report )
1974 {
1975 if ( self->rejected_reads > 0 )
1976 rc = KOutMsg( "Rejected %lu READS because READLEN < %u\n", self->rejected_reads, FastqArgs.minReadLen );
1977 if ( self->rejected_spots > 0 && rc == 0 )
1978 rc = KOutMsg( "Rejected %lu SPOTS because SPOTLEN < %u\n", self->rejected_spots, FastqArgs.minReadLen );
1979 }
1980 return rc;
1981 }
1982
1983
1984 typedef struct FastqReadLenFilterFactory_struct
1985 {
1986 const char* accession;
1987 const SRATable* table;
1988 const FastqReader* reader;
1989 } FastqReadLenFilterFactory;
1990
1991
FastqReadLenFilterFactory_Init(const SRASplitterFactory * cself)1992 static rc_t FastqReadLenFilterFactory_Init( const SRASplitterFactory* cself )
1993 {
1994 rc_t rc = 0;
1995 FastqReadLenFilterFactory* self = ( FastqReadLenFilterFactory* )cself;
1996
1997 if ( self == NULL )
1998 {
1999 rc = RC( rcExe, rcType, rcConstructing, rcParam, rcNull );
2000 }
2001 else
2002 {
2003 rc = FastqReaderMake( &self->reader, self->table, self->accession,
2004 /* preserve orig spot format to save on conversions */
2005 FastqArgs.is_platform_cs_native, false, FastqArgs.fasta > 0, false,
2006 false, !FastqArgs.applyClip, FastqArgs.SuppressQualForCSKey, 0,
2007 FastqArgs.offset, '\0', 0, 0 );
2008 }
2009 return rc;
2010 }
2011
2012
FastqReadLenFilterFactory_NewObj(const SRASplitterFactory * cself,const SRASplitter ** splitter)2013 static rc_t FastqReadLenFilterFactory_NewObj( const SRASplitterFactory* cself, const SRASplitter** splitter )
2014 {
2015 rc_t rc = 0;
2016 FastqReadLenFilterFactory* self = ( FastqReadLenFilterFactory* )cself;
2017
2018 if ( self == NULL )
2019 {
2020 rc = RC( rcExe, rcType, rcExecuting, rcParam, rcNull );
2021 }
2022 else
2023 {
2024 rc = SRASplitter_Make( splitter, sizeof( FastqReadLenFilter ),
2025 FastqReadLenFilter_GetKey, NULL, NULL, FastqReadLenFilter_Release );
2026 if ( rc == 0 )
2027 {
2028 FastqReadLenFilter * filter = ( FastqReadLenFilter* )( *splitter );
2029 filter->reader = self->reader;
2030 filter->rejected_spots = 0;
2031 filter->rejected_reads = 0;
2032 }
2033 }
2034 return rc;
2035 }
2036
2037
FastqReadLenFilterFactory_Release(const SRASplitterFactory * cself)2038 static void FastqReadLenFilterFactory_Release( const SRASplitterFactory* cself )
2039 {
2040 if ( cself != NULL )
2041 {
2042 FastqReadLenFilterFactory* self = ( FastqReadLenFilterFactory* )cself;
2043 FastqReaderWhack( self->reader );
2044 }
2045 }
2046
2047
FastqReadLenFilterFactory_Make(const SRASplitterFactory ** cself,const char * accession,const SRATable * table)2048 static rc_t FastqReadLenFilterFactory_Make( const SRASplitterFactory** cself,
2049 const char* accession, const SRATable* table )
2050 {
2051 rc_t rc = 0;
2052 FastqReadLenFilterFactory* obj = NULL;
2053
2054 if ( cself == NULL || accession == NULL || table == NULL )
2055 {
2056 rc = RC( rcExe, rcType, rcConstructing, rcParam, rcNull );
2057 }
2058 else
2059 {
2060 rc = SRASplitterFactory_Make( cself, eSplitterSpot, sizeof( *obj ),
2061 FastqReadLenFilterFactory_Init,
2062 FastqReadLenFilterFactory_NewObj,
2063 FastqReadLenFilterFactory_Release );
2064 if ( rc == 0 )
2065 {
2066 obj = ( FastqReadLenFilterFactory* )*cself;
2067 obj->accession = accession;
2068 obj->table = table;
2069 }
2070 }
2071 return rc;
2072 }
2073
2074
2075 /* ============== FASTQ read splitter ============================ */
2076
2077 char* FastqReadSplitter_key_buf = NULL;
2078
2079
2080 typedef struct FastqReadSplitter_struct
2081 {
2082 const FastqReader* reader;
2083 SRASplitter_Keys* keys;
2084 uint32_t keys_max;
2085 } FastqReadSplitter;
2086
2087
FastqReadSplitter_GetKeySet(const SRASplitter * cself,const SRASplitter_Keys ** key,uint32_t * keys,spotid_t spot,const readmask_t * readmask)2088 static rc_t FastqReadSplitter_GetKeySet( const SRASplitter* cself, const SRASplitter_Keys** key,
2089 uint32_t* keys, spotid_t spot, const readmask_t* readmask )
2090 {
2091 rc_t rc = 0;
2092 FastqReadSplitter* self = ( FastqReadSplitter* )cself;
2093 const size_t key_offset = 5;
2094
2095 if ( self == NULL || key == NULL )
2096 {
2097 rc = RC( rcExe, rcNode, rcExecuting, rcParam, rcInvalid );
2098 }
2099 else
2100 {
2101 uint32_t num_reads = 0;
2102
2103 *keys = 0;
2104 if ( FastqReadSplitter_key_buf == NULL )
2105 {
2106 /* initial alloc key_buf: " 1\0 2\0...\0 9\0 10\0 11\0...\03220..\08192\0" */
2107 if ( nreads_max > 9999 )
2108 {
2109 /* key_offset and sprintf format size are insufficient for keys longer than 4 digits */
2110 rc = RC( rcExe, rcNode, rcExecuting, rcBuffer, rcInsufficient );
2111 }
2112 else
2113 {
2114 FastqReadSplitter_key_buf = malloc( nreads_max * key_offset );
2115 if ( FastqReadSplitter_key_buf == NULL )
2116 {
2117 rc = RC( rcExe, rcNode, rcExecuting, rcMemory, rcExhausted );
2118 }
2119 else
2120 {
2121 /* fill buffer w/keys */
2122 int i;
2123 char* p = FastqReadSplitter_key_buf;
2124 for ( i = 1; rc == 0 && i <= nreads_max; i++ )
2125 {
2126 if ( sprintf( p, "%4u", i ) <= 0 )
2127 {
2128 rc = RC( rcExe, rcNode, rcExecuting, rcTransfer, rcIncomplete );
2129 }
2130 p += key_offset;
2131 }
2132 }
2133 }
2134 }
2135
2136 if ( rc == 0 )
2137 {
2138 rc = FastqReaderSeekSpot( self->reader, spot );
2139 if ( rc == 0 )
2140 {
2141 rc = FastqReader_SpotInfo( self->reader, NULL, NULL, NULL, NULL, NULL, &num_reads );
2142 if ( rc == 0 )
2143 {
2144 uint32_t readId, good = 0;
2145
2146 SRA_DUMP_DBG( 3, ( "%s %u row reads:", __func__, spot ) );
2147 for ( readId = 0; rc == 0 && readId < num_reads; readId++ )
2148 {
2149 rc = FastqReader_SpotReadInfo( self->reader, readId + 1, NULL, NULL, NULL, NULL, NULL );
2150 if ( !isset_readmask( readmask, readId ) )
2151 {
2152 continue;
2153 }
2154 if ( self->keys_max < ( good + 1 ) )
2155 {
2156 void* p = realloc( self->keys, sizeof( *self->keys ) * ( good + 1 ) );
2157 if ( p == NULL )
2158 {
2159 rc = RC( rcExe, rcNode, rcExecuting, rcMemory, rcExhausted );
2160 break;
2161 }
2162 else
2163 {
2164 self->keys = p;
2165 self->keys_max = good + 1;
2166 }
2167 }
2168 self->keys[ good ].key = &FastqReadSplitter_key_buf[ readId * key_offset ];
2169 while ( self->keys[ good ].key[ 0 ] == ' ' && self->keys[ good ].key[0] != '\0' )
2170 {
2171 self->keys[ good ].key++;
2172 }
2173 clear_readmask( self->keys[ good ].readmask );
2174 set_readmask( self->keys[good].readmask, readId );
2175 SRA_DUMP_DBG( 3, ( " key['%s']+=%u", self->keys[ good ].key, readId ) );
2176 good++;
2177 }
2178 if ( rc == 0 )
2179 {
2180 *key = self->keys;
2181 *keys = good;
2182 }
2183 SRA_DUMP_DBG( 3, ( "\n" ) );
2184 }
2185 }
2186 else if ( GetRCObject( rc ) == rcRow && GetRCState( rc ) == rcNotFound )
2187 {
2188 SRA_DUMP_DBG( 3, ( "%s skipped %u row\n", __func__, spot ) );
2189 rc = 0;
2190 }
2191 }
2192 else if ( GetRCObject( rc ) == rcRow && GetRCState( rc ) == rcNotFound )
2193 {
2194 SRA_DUMP_DBG( 3, ( "%s skipped %u row\n", __func__, spot ) );
2195 rc = 0;
2196 }
2197 }
2198 return rc;
2199 }
2200
2201
FastqReadSplitter_Release(const SRASplitter * cself)2202 rc_t FastqReadSplitter_Release( const SRASplitter* cself )
2203 {
2204 if ( cself != NULL )
2205 {
2206 FastqReadSplitter* self = ( FastqReadSplitter* )cself;
2207 free( self->keys );
2208 }
2209 return 0;
2210 }
2211
2212
2213 typedef struct FastqReadSplitterFactory_struct
2214 {
2215 const char* accession;
2216 const SRATable* table;
2217 const FastqReader* reader;
2218 } FastqReadSplitterFactory;
2219
2220
FastqReadSplitterFactory_Init(const SRASplitterFactory * cself)2221 static rc_t FastqReadSplitterFactory_Init( const SRASplitterFactory* cself )
2222 {
2223 rc_t rc = 0;
2224 FastqReadSplitterFactory* self = ( FastqReadSplitterFactory* )cself;
2225
2226 if ( self == NULL )
2227 {
2228 rc = RC( rcExe, rcType, rcConstructing, rcParam, rcNull );
2229 }
2230 else
2231 {
2232 rc = FastqReaderMake( &self->reader, self->table, self->accession,
2233 /* preserve orig spot format to save on conversions */
2234 FastqArgs.is_platform_cs_native, false, FastqArgs.fasta > 0, false,
2235 false, !FastqArgs.applyClip, FastqArgs.SuppressQualForCSKey, 0,
2236 FastqArgs.offset, '\0', 0, 0 );
2237 }
2238 return rc;
2239 }
2240
2241
FastqReadSplitterFactory_NewObj(const SRASplitterFactory * cself,const SRASplitter ** splitter)2242 static rc_t FastqReadSplitterFactory_NewObj( const SRASplitterFactory* cself, const SRASplitter** splitter )
2243 {
2244 rc_t rc = 0;
2245 FastqReadSplitterFactory* self = ( FastqReadSplitterFactory* )cself;
2246
2247 if ( self == NULL )
2248 {
2249 rc = RC( rcExe, rcType, rcExecuting, rcParam, rcNull );
2250 }
2251 else
2252 {
2253 rc = SRASplitter_Make( splitter, sizeof( FastqReadSplitter ), NULL,
2254 FastqReadSplitter_GetKeySet, NULL, FastqReadSplitter_Release );
2255 if ( rc == 0 )
2256 {
2257 ( (FastqReadSplitter*)(*splitter) )->reader = self->reader;
2258 }
2259 }
2260 return rc;
2261 }
2262
2263
FastqReadSplitterFactory_Release(const SRASplitterFactory * cself)2264 static void FastqReadSplitterFactory_Release( const SRASplitterFactory* cself )
2265 {
2266 if ( cself != NULL )
2267 {
2268 FastqReadSplitterFactory* self = ( FastqReadSplitterFactory* )cself;
2269 FastqReaderWhack( self->reader );
2270 free( FastqReadSplitter_key_buf );
2271 FastqReadSplitter_key_buf = NULL;
2272 }
2273 }
2274
2275
FastqReadSplitterFactory_Make(const SRASplitterFactory ** cself,const char * accession,const SRATable * table)2276 static rc_t FastqReadSplitterFactory_Make( const SRASplitterFactory** cself,
2277 const char* accession, const SRATable* table )
2278 {
2279 rc_t rc = 0;
2280 FastqReadSplitterFactory* obj = NULL;
2281
2282 if ( cself == NULL || accession == NULL || table == NULL )
2283 {
2284 rc = RC( rcExe, rcType, rcConstructing, rcParam, rcNull );
2285 }
2286 else
2287 {
2288 rc = SRASplitterFactory_Make( cself, eSplitterRead, sizeof( *obj ),
2289 FastqReadSplitterFactory_Init,
2290 FastqReadSplitterFactory_NewObj,
2291 FastqReadSplitterFactory_Release );
2292 if ( rc == 0 )
2293 {
2294 obj = ( FastqReadSplitterFactory* )*cself;
2295 obj->accession = accession;
2296 obj->table = table;
2297 }
2298 }
2299 return rc;
2300 }
2301
2302
2303 /* ============== FASTQ 3 read splitter ============================ */
2304
2305 char* Fastq3ReadSplitter_key_buf = NULL;
2306
2307 typedef struct Fastq3ReadSplitter_struct
2308 {
2309 const FastqReader* reader;
2310 SRASplitter_Keys keys[ 2 ];
2311 } Fastq3ReadSplitter;
2312
2313
2314 /* if all active reads len >= minreadlen, reads are splitted into separate files, otherwise on single key "" is returnded for all reads */
Fastq3ReadSplitter_GetKeySet(const SRASplitter * cself,const SRASplitter_Keys ** key,uint32_t * keys,spotid_t spot,const readmask_t * readmask)2315 static rc_t Fastq3ReadSplitter_GetKeySet( const SRASplitter* cself, const SRASplitter_Keys** key,
2316 uint32_t* keys, spotid_t spot, const readmask_t* readmask )
2317 {
2318 rc_t rc = 0;
2319 Fastq3ReadSplitter* self = ( Fastq3ReadSplitter* )cself;
2320 const size_t key_offset = 5;
2321
2322 if ( self == NULL || key == NULL )
2323 {
2324 rc = RC( rcExe, rcNode, rcExecuting, rcParam, rcInvalid );
2325 }
2326 else
2327 {
2328 uint32_t num_reads = 0;
2329
2330 *keys = 0;
2331 if ( Fastq3ReadSplitter_key_buf == NULL )
2332 {
2333 /* initial alloc key_buf: " 1\0 2\0...\0 9\0 10\0 11\0...\03220..\08192\0" */
2334 if ( nreads_max > 9999 )
2335 {
2336 /* key_offset and sprintf format size are insufficient for keys longer than 4 digits */
2337 rc = RC( rcExe, rcNode, rcExecuting, rcBuffer, rcInsufficient );
2338 }
2339 else
2340 {
2341 Fastq3ReadSplitter_key_buf = malloc( nreads_max * key_offset );
2342 if ( Fastq3ReadSplitter_key_buf == NULL )
2343 {
2344 rc = RC( rcExe, rcNode, rcExecuting, rcMemory, rcExhausted );
2345 }
2346 else
2347 {
2348 /* fill buffer w/keys */
2349 int i;
2350 char* p = Fastq3ReadSplitter_key_buf;
2351 for ( i = 1; rc == 0 && i <= nreads_max; i++ )
2352 {
2353 if ( sprintf( p, "%4u", i ) <= 0 )
2354 {
2355 rc = RC( rcExe, rcNode, rcExecuting, rcTransfer, rcIncomplete );
2356 }
2357 p += key_offset;
2358 }
2359 }
2360 }
2361 }
2362
2363 if ( rc == 0 )
2364 {
2365 rc = FastqReaderSeekSpot( self->reader, spot );
2366 if ( rc == 0 )
2367 {
2368 rc = FastqReader_SpotInfo( self->reader, NULL, NULL, NULL, NULL, NULL, &num_reads );
2369 if ( rc == 0 )
2370 {
2371 uint32_t readId, good = 0;
2372 const uint32_t max_reads = sizeof( self->keys ) /sizeof( self->keys[ 0 ] );
2373
2374 SRA_DUMP_DBG( 3, ( "%s %u row reads:", __func__, spot ) );
2375 for ( readId = 0; rc == 0 && readId < num_reads && good < max_reads; readId++ )
2376 {
2377 rc = FastqReader_SpotReadInfo( self->reader, readId + 1, NULL, NULL, NULL, NULL, NULL );
2378 if ( !isset_readmask( readmask, readId ) )
2379 {
2380 continue;
2381 }
2382 self->keys[ good ].key = &Fastq3ReadSplitter_key_buf[ good * key_offset ];
2383 while ( self->keys[ good ].key[ 0 ] == ' ' && self->keys[good].key[ 0 ] != '\0' )
2384 {
2385 self->keys[ good ].key++;
2386 }
2387 clear_readmask( self->keys[ good ].readmask );
2388 set_readmask( self->keys[ good ].readmask, readId );
2389 SRA_DUMP_DBG( 3, ( " key['%s']+=%u", self->keys[ good ].key, readId ) );
2390 good++;
2391 }
2392 if ( rc == 0 )
2393 {
2394 *key = self->keys;
2395 *keys = good;
2396 if ( good != max_reads )
2397 {
2398 /* some are short -> reset keys to same value for all valid reads */
2399 /* run has just one read -> no suffix */
2400 /* or single file was requested */
2401 for ( readId = 0; readId < good; readId++ )
2402 {
2403 self->keys[ readId ].key = "";
2404 }
2405 SRA_DUMP_DBG( 3, ( " all keys joined to ''" ) );
2406 }
2407 }
2408 SRA_DUMP_DBG( 3, ( "\n" ) );
2409 }
2410 }
2411 else if ( GetRCObject( rc ) == rcRow && GetRCState( rc ) == rcNotFound )
2412 {
2413 SRA_DUMP_DBG( 3, ( "%s skipped %u row\n", __func__, spot ) );
2414 rc = 0;
2415 }
2416 }
2417 else if ( GetRCObject( rc ) == rcRow && GetRCState( rc ) == rcNotFound )
2418 {
2419 SRA_DUMP_DBG( 3, ( "%s skipped %u row\n", __func__, spot ) );
2420 rc = 0;
2421 }
2422 }
2423 return rc;
2424 }
2425
2426 typedef struct Fastq3ReadSplitterFactory_struct
2427 {
2428 const char* accession;
2429 const SRATable* table;
2430 const FastqReader* reader;
2431 } Fastq3ReadSplitterFactory;
2432
2433
Fastq3ReadSplitterFactory_Init(const SRASplitterFactory * cself)2434 static rc_t Fastq3ReadSplitterFactory_Init( const SRASplitterFactory* cself )
2435 {
2436 rc_t rc = 0;
2437 Fastq3ReadSplitterFactory* self = ( Fastq3ReadSplitterFactory* )cself;
2438
2439 if ( self == NULL )
2440 {
2441 rc = RC( rcExe, rcType, rcConstructing, rcParam, rcNull );
2442 }
2443 else
2444 {
2445 rc = FastqReaderMake( &self->reader, self->table, self->accession,
2446 /* preserve orig spot format to save on conversions */
2447 FastqArgs.is_platform_cs_native, false, FastqArgs.fasta > 0, false,
2448 false, !FastqArgs.applyClip, FastqArgs.SuppressQualForCSKey, 0,
2449 FastqArgs.offset, '\0', 0, 0 );
2450 }
2451 return rc;
2452 }
2453
2454
Fastq3ReadSplitterFactory_NewObj(const SRASplitterFactory * cself,const SRASplitter ** splitter)2455 static rc_t Fastq3ReadSplitterFactory_NewObj( const SRASplitterFactory* cself, const SRASplitter** splitter )
2456 {
2457 rc_t rc = 0;
2458 Fastq3ReadSplitterFactory* self = ( Fastq3ReadSplitterFactory* )cself;
2459
2460 if ( self == NULL )
2461 {
2462 rc = RC( rcExe, rcType, rcExecuting, rcParam, rcNull );
2463 }
2464 else
2465 {
2466 rc = SRASplitter_Make( splitter, sizeof( Fastq3ReadSplitter ),
2467 NULL, Fastq3ReadSplitter_GetKeySet, NULL, NULL );
2468 if ( rc == 0 )
2469 {
2470 ( (Fastq3ReadSplitter*)(*splitter) )->reader = self->reader;
2471 }
2472 }
2473 return rc;
2474 }
2475
2476
Fastq3ReadSplitterFactory_Release(const SRASplitterFactory * cself)2477 static void Fastq3ReadSplitterFactory_Release( const SRASplitterFactory* cself )
2478 {
2479 if ( cself != NULL )
2480 {
2481 Fastq3ReadSplitterFactory* self = ( Fastq3ReadSplitterFactory* )cself;
2482 FastqReaderWhack( self->reader );
2483 memset ( self, 0, sizeof * self );
2484
2485 free( Fastq3ReadSplitter_key_buf );
2486 Fastq3ReadSplitter_key_buf = NULL;
2487 }
2488 }
2489
2490
Fastq3ReadSplitterFactory_Make(const SRASplitterFactory ** cself,const char * accession,const SRATable * table)2491 static rc_t Fastq3ReadSplitterFactory_Make( const SRASplitterFactory** cself,
2492 const char* accession, const SRATable* table )
2493 {
2494 rc_t rc = 0;
2495 Fastq3ReadSplitterFactory* obj = NULL;
2496
2497 if ( cself == NULL || accession == NULL || table == NULL )
2498 {
2499 rc = RC( rcExe, rcType, rcConstructing, rcParam, rcNull );
2500 }
2501 else
2502 {
2503 rc = SRASplitterFactory_Make( cself, eSplitterRead, sizeof( *obj ),
2504 Fastq3ReadSplitterFactory_Init,
2505 Fastq3ReadSplitterFactory_NewObj,
2506 Fastq3ReadSplitterFactory_Release );
2507 if ( rc == 0 )
2508 {
2509 obj = ( Fastq3ReadSplitterFactory* )*cself;
2510 obj->accession = accession;
2511 obj->table = table;
2512 }
2513 }
2514 return rc;
2515 }
2516
2517
2518 /* ============== FASTQ formatter object ============================ */
2519
2520
2521 typedef struct FastqFormatterSplitter_struct
2522 {
2523 const char* accession;
2524 const FastqReader* reader;
2525 KDataBuffer* b[ 5 ];
2526 size_t bsz[ 5 ]; /* fifth is for fasta line wrap */
2527 } FastqFormatterSplitter;
2528
2529
FastqFormatterSplitter_WrapLine(const KDataBuffer * src,size_t src_sz,KDataBuffer * dst,size_t * dst_sz,const uint64_t width)2530 static rc_t FastqFormatterSplitter_WrapLine( const KDataBuffer* src, size_t src_sz,
2531 KDataBuffer* dst, size_t* dst_sz, const uint64_t width )
2532 {
2533 rc_t rc = 0;
2534 size_t sz = src_sz + ( src_sz - 1 ) / width;
2535
2536 if ( KDataBufferBytes( dst ) < sz )
2537 {
2538 SRA_DUMP_DBG( 10, ( "%s grow buffer from %u to %u\n", __func__, KDataBufferBytes( dst ), sz + 10 ) );
2539 rc = KDataBufferResize( dst, sz + 10 );
2540 }
2541 if ( rc == 0 )
2542 {
2543 const char* s = src->base;
2544 char* d = dst->base;
2545 *dst_sz = sz = src_sz;
2546 while ( sz > 0 )
2547 {
2548 memmove( d, s, sz > width ? width : sz );
2549 if ( sz <= width )
2550 {
2551 break;
2552 }
2553 d += width;
2554 *d++ = '\n';
2555 *dst_sz = *dst_sz + 1;
2556 s += width;
2557 sz -= width;
2558 }
2559 }
2560 return rc;
2561 }
2562
2563
FastqFormatterSplitter_DumpByRead(const SRASplitter * cself,spotid_t spot,const readmask_t * readmask)2564 static rc_t FastqFormatterSplitter_DumpByRead( const SRASplitter* cself, spotid_t spot, const readmask_t* readmask )
2565 {
2566 rc_t rc = 0;
2567 FastqFormatterSplitter* self = ( FastqFormatterSplitter* )cself;
2568
2569 if ( self == NULL )
2570 {
2571 rc = RC( rcExe, rcType, rcExecuting, rcParam, rcNull );
2572 }
2573 else
2574 {
2575 rc = FastqReaderSeekSpot( self->reader, spot );
2576 if ( rc == 0 )
2577 {
2578 rc = SRASplitter_FileActivate( cself, FastqArgs.file_extension );
2579 if ( rc == 0 )
2580 {
2581 DeflineData def_data;
2582 const char* spot_name = NULL, *spot_group = NULL, *read_name = NULL;
2583 uint32_t readIdx, spot_len = 0;
2584 uint32_t num_reads, readId, k;
2585 size_t sname_sz, sgrp_sz;
2586 INSDC_coord_len rlabel_sz = 0, read_len = 0;
2587
2588 if ( FastqArgs.b_defline || FastqArgs.q_defline )
2589 {
2590 rc = FastqReader_SpotInfo( self->reader, &spot_name, &sname_sz, &spot_group, &sgrp_sz, &spot_len, &num_reads );
2591 }
2592 else
2593 {
2594 rc = FastqReader_SpotInfo( self->reader, NULL, NULL, NULL, NULL, NULL, &num_reads );
2595 }
2596
2597 for ( readId = 1, readIdx = 1; rc == 0 && readId <= num_reads; readId++, readIdx++ )
2598 {
2599 if ( !isset_readmask( readmask, readId - 1 ) )
2600 {
2601 continue;
2602 }
2603 if ( FastqArgs.b_defline || FastqArgs.q_defline )
2604 {
2605 rc = FastqReader_SpotReadInfo( self->reader, readId, NULL, &read_name, &rlabel_sz, NULL, &read_len );
2606 if ( rc == 0 )
2607 {
2608 rc = Defline_Bind( &def_data, self->accession, &spot, spot_name, sname_sz, spot_group, sgrp_sz,
2609 &spot_len, &readIdx, read_name, rlabel_sz, &read_len );
2610 }
2611 }
2612 if ( rc == 0 )
2613 {
2614 if ( FastqArgs.b_defline )
2615 {
2616 IF_BUF( ( Defline_Build( FastqArgs.b_defline, &def_data, self->b[0]->base,
2617 KDataBufferBytes( self->b[0] ), &self->bsz[0] ) ), self->b[0], self->bsz[0] );
2618 }
2619 else
2620 {
2621 IF_BUF( ( FastqReaderBaseName( self->reader, readId, &FastqArgs.dumpCs, self->b[0]->base,
2622 KDataBufferBytes( self->b[0] ), &self->bsz[0] ) ), self->b[0], self->bsz[0] );
2623 }
2624 }
2625 if ( rc == 0 )
2626 {
2627 if ( FastqArgs.fasta > 0 )
2628 {
2629 IF_BUF( ( FastqReaderBase( self->reader, readId, self->b[4]->base, KDataBufferBytes( self->b[4] ),
2630 &self->bsz[4] ) ), self->b[4], self->bsz[4] )
2631 {
2632 rc = FastqFormatterSplitter_WrapLine( self->b[4], self->bsz[4], self->b[1], &self->bsz[1], FastqArgs.fasta );
2633 }
2634 }
2635 else
2636 {
2637 IF_BUF( ( FastqReaderBase( self->reader, readId, self->b[1]->base,
2638 KDataBufferBytes( self->b[1] ), &self->bsz[1] ) ), self->b[1], self->bsz[1] );
2639 }
2640 }
2641 if ( !FastqArgs.fasta && rc == 0 )
2642 {
2643 if ( FastqArgs.q_defline )
2644 {
2645 IF_BUF( ( Defline_Build( FastqArgs.q_defline, &def_data, self->b[2]->base,
2646 KDataBufferBytes( self->b[2] ), &self->bsz[2] ) ), self->b[2], self->bsz[2] );
2647 }
2648 else
2649 {
2650 IF_BUF( ( FastqReaderQualityName( self->reader, readId, &FastqArgs.dumpCs, self->b[2]->base,
2651 KDataBufferBytes( self->b[2] ), &self->bsz[2] ) ), self->b[2], self->bsz[2] );
2652 }
2653 if ( rc == 0 )
2654 {
2655 IF_BUF( ( FastqReaderQuality( self->reader, readId, self->b[3]->base,
2656 KDataBufferBytes( self->b[3] ), &self->bsz[3] ) ), self->b[3], self->bsz[3] );
2657 }
2658 }
2659 else if ( ( (char*)(self->b[0]->base))[0] == '@' )
2660 {
2661 ( (char*)(self->b[0]->base) )[0] = '>';
2662 }
2663 for ( k = 0; rc == 0 && k < ( FastqArgs.fasta ? 2 : 4); k++ )
2664 {
2665 rc = SRASplitter_FileWrite( cself, spot, self->b[ k ]->base, self->bsz[ k ] );
2666 if ( rc == 0 )
2667 {
2668 rc = SRASplitter_FileWrite( cself, spot, "\n", 1 );
2669 }
2670 }
2671 }
2672 }
2673 }
2674 else if ( GetRCObject( rc ) == rcRow && GetRCState( rc ) == rcNotFound )
2675 {
2676 SRA_DUMP_DBG( 3, ( "%s skipped %u row\n", __func__, spot ) );
2677 rc = 0;
2678 }
2679 }
2680 return rc;
2681 }
2682
2683
Fasta_dump(const SRASplitter * cself,FastqFormatterSplitter * self,spotid_t spot,uint32_t columns)2684 static rc_t Fasta_dump( const SRASplitter* cself, FastqFormatterSplitter* self, spotid_t spot, uint32_t columns )
2685 {
2686 uint32_t readId;
2687 rc_t rc = FastqFormatterSplitter_WrapLine( self->b[4], self->bsz[4], self->b[1], &self->bsz[1], columns );
2688 for ( readId = 0; rc == 0 && readId < 2; readId++ )
2689 {
2690 rc = SRASplitter_FileWrite( cself, spot, self->b[readId]->base, self->bsz[ readId ] );
2691 if ( rc == 0 )
2692 {
2693 rc = SRASplitter_FileWrite( cself, spot, "\n", 1 );
2694 }
2695 }
2696 return rc;
2697 }
2698
2699
FastqFormatterSplitter_DumpBySpot(const SRASplitter * cself,spotid_t spot,const readmask_t * readmask)2700 static rc_t FastqFormatterSplitter_DumpBySpot( const SRASplitter* cself, spotid_t spot, const readmask_t* readmask )
2701 {
2702 rc_t rc = 0;
2703 FastqFormatterSplitter* self = ( FastqFormatterSplitter* )cself;
2704
2705 if ( self == NULL )
2706 {
2707 rc = RC( rcExe, rcType, rcExecuting, rcParam, rcNull );
2708 }
2709 else
2710 {
2711 rc = FastqReaderSeekSpot( self->reader, spot );
2712 if ( rc == 0 )
2713 {
2714 uint32_t num_reads, readId;
2715
2716 rc = FastqReader_SpotInfo( self->reader, NULL, NULL, NULL, NULL, NULL, &num_reads );
2717 for ( readId = 0; rc == 0 && readId < num_reads; readId++ )
2718 {
2719 /* if any read in readmask is not set - skip whole spot */
2720 if ( !isset_readmask( readmask, readId ) )
2721 {
2722 return 0;
2723 }
2724 }
2725 if ( rc == 0 )
2726 {
2727 rc = SRASplitter_FileActivate( cself, FastqArgs.file_extension );
2728 if ( rc == 0 )
2729 {
2730 DeflineData def_data;
2731 const char* spot_name = NULL, *spot_group = NULL;
2732 uint32_t spot_len = 0;
2733 size_t writ, sname_sz, sgrp_sz;
2734 const int base_i = FastqArgs.fasta ? 4 : 1;
2735
2736 if ( FastqArgs.b_defline || FastqArgs.q_defline )
2737 {
2738 rc = FastqReader_SpotInfo( self->reader, &spot_name, &sname_sz,
2739 &spot_group, &sgrp_sz, &spot_len, NULL );
2740 if ( rc == 0 )
2741 {
2742 rc = Defline_Bind( &def_data, self->accession, &spot, spot_name,
2743 sname_sz, spot_group, sgrp_sz,
2744 &spot_len, &readId, NULL, 0, &spot_len );
2745 }
2746 }
2747
2748 if ( rc == 0 )
2749 {
2750 if ( FastqArgs.b_defline )
2751 {
2752 IF_BUF( ( Defline_Build( FastqArgs.b_defline, &def_data, self->b[0]->base,
2753 KDataBufferBytes( self->b[0] ), &self->bsz[0] ) ), self->b[0], self->bsz[0] );
2754 }
2755 else
2756 {
2757 IF_BUF( ( FastqReaderBaseName( self->reader, 0, &FastqArgs.dumpCs, self->b[0]->base,
2758 KDataBufferBytes( self->b[0] ), &self->bsz[0] ) ), self->b[0], self->bsz[0] );
2759 }
2760 self->bsz[ base_i ] = 0;
2761 if ( !FastqArgs.fasta && rc == 0 )
2762 {
2763 if ( FastqArgs.q_defline )
2764 {
2765 IF_BUF( ( Defline_Build( FastqArgs.q_defline, &def_data, self->b[2]->base,
2766 KDataBufferBytes( self->b[2] ), &self->bsz[2] ) ), self->b[2], self->bsz[2] );
2767 }
2768 else
2769 {
2770 IF_BUF( ( FastqReaderQualityName( self->reader, 0, &FastqArgs.dumpCs, self->b[2]->base,
2771 KDataBufferBytes( self->b[2] ), &self->bsz[2] ) ), self->b[2], self->bsz[2] );
2772 }
2773 self->bsz[3] = 0;
2774 }
2775 else if ( ( (char*)(self->b[0]->base) )[0] == '@' )
2776 {
2777 ( (char*)(self->b[0]->base) )[0] = '>';
2778 }
2779 }
2780
2781 for ( readId = 1; rc == 0 && readId <= num_reads; readId++ )
2782 {
2783 IF_BUF( ( FastqReaderBase( self->reader, readId, &( (char*)(self->b[base_i]->base) )[ self->bsz[ base_i ] ],
2784 KDataBufferBytes( self->b[ base_i ] ) - self->bsz[ base_i ], &writ) ), self->b[ base_i ], self->bsz[ base_i ] + writ )
2785 {
2786 self->bsz[base_i] += writ;
2787 if ( !FastqArgs.fasta )
2788 {
2789 IF_BUF( ( FastqReaderQuality( self->reader, readId, &( (char*)(self->b[3]->base) )[ self->bsz[3] ],
2790 KDataBufferBytes( self->b[3] ) - self->bsz[3], &writ ) ), self->b[3], self->bsz[3] + writ )
2791 {
2792 self->bsz[ 3 ] += writ;
2793 }
2794 }
2795 }
2796 }
2797
2798 if ( rc == 0 )
2799 {
2800 if ( FastqArgs.fasta > 0 )
2801 {
2802 /* special printint of fasta-output ( line-wrap... ) */
2803 rc = Fasta_dump( cself, self, spot, FastqArgs.fasta );
2804 }
2805 else
2806 {
2807 for ( readId = 0; rc == 0 && readId < 4; readId++ )
2808 {
2809 rc = SRASplitter_FileWrite( cself, spot, self->b[readId]->base, self->bsz[ readId ] );
2810 if ( rc == 0 )
2811 {
2812 rc = SRASplitter_FileWrite( cself, spot, "\n", 1 );
2813 }
2814 }
2815 }
2816 }
2817 }
2818 }
2819 }
2820 else if ( GetRCObject( rc ) == rcRow && GetRCState( rc ) == rcNotFound )
2821 {
2822 SRA_DUMP_DBG( 3, ( "%s skipped %u row\n", __func__, spot ) );
2823 rc = 0;
2824 }
2825 }
2826 return rc;
2827 }
2828
2829
2830 typedef struct FastqFormatterFactory_struct
2831 {
2832 const char* accession;
2833 const SRATable* table;
2834 const FastqReader* reader;
2835 KDataBuffer buf[ 5 ]; /* fifth is for fasta line wrap */
2836 } FastqFormatterFactory;
2837
2838
FastqFormatterFactory_Init(const SRASplitterFactory * cself)2839 static rc_t FastqFormatterFactory_Init( const SRASplitterFactory* cself )
2840 {
2841 rc_t rc = 0;
2842 FastqFormatterFactory* self = ( FastqFormatterFactory* )cself;
2843
2844 if ( self == NULL )
2845 {
2846 rc = RC( rcExe, rcType, rcConstructing, rcParam, rcNull );
2847 }
2848 else
2849 {
2850 rc = FastqReaderMake( &self->reader, self->table, self->accession,
2851 FastqArgs.dumpCs, FastqArgs.dumpOrigFmt, FastqArgs.fasta > 0, FastqArgs.dumpCs,
2852 FastqArgs.readIds, !FastqArgs.applyClip, FastqArgs.SuppressQualForCSKey, 0,
2853 FastqArgs.offset, FastqArgs.desiredCsKey[ 0 ], 0, 0 );
2854 if ( rc == 0 )
2855 {
2856 int i;
2857 for( i = 0; rc == 0 && i < sizeof( self->buf ) / sizeof( self->buf[ 0 ] ); i++ )
2858 {
2859 rc = KDataBufferMakeBytes( &self->buf[ i ], DATABUFFERINITSIZE );
2860 }
2861 }
2862 }
2863 return rc;
2864 }
2865
2866
FastqFormatterFactory_NewObj(const SRASplitterFactory * cself,const SRASplitter ** splitter)2867 static rc_t FastqFormatterFactory_NewObj( const SRASplitterFactory* cself, const SRASplitter** splitter )
2868 {
2869 rc_t rc = 0;
2870 FastqFormatterFactory* self = ( FastqFormatterFactory* )cself;
2871
2872 if ( self == NULL )
2873 {
2874 rc = RC( rcExe, rcType, rcExecuting, rcParam, rcNull );
2875 }
2876 else
2877 {
2878 rc = SRASplitter_Make ( splitter, sizeof( FastqFormatterSplitter ), NULL, NULL,
2879 FastqArgs.split_spot ?
2880 FastqFormatterSplitter_DumpByRead :
2881 FastqFormatterSplitter_DumpBySpot, NULL );
2882 if ( rc == 0 )
2883 {
2884 int i;
2885 ( (FastqFormatterSplitter*)(*splitter) )->accession = self->accession;
2886 ( (FastqFormatterSplitter*)(*splitter) )->reader = self->reader;
2887 for ( i = 0; i < sizeof( self->buf ) / sizeof( self->buf[ 0 ] ); i++ )
2888 {
2889 ( (FastqFormatterSplitter*)(*splitter) )->b[ i ] = &self->buf[ i ];
2890 }
2891 }
2892 }
2893 return rc;
2894 }
2895
2896
FastqFormatterFactory_Release(const SRASplitterFactory * cself)2897 static void FastqFormatterFactory_Release( const SRASplitterFactory* cself )
2898 {
2899 if ( cself != NULL )
2900 {
2901 int i;
2902 FastqFormatterFactory* self = ( FastqFormatterFactory* )cself;
2903 FastqReaderWhack( self->reader );
2904 for ( i = 0; i < sizeof( self->buf ) / sizeof( self->buf[ 0 ] ); i++ )
2905 {
2906 KDataBufferWhack( &self->buf[ i ] );
2907 }
2908 }
2909 }
2910
2911
FastqFormatterFactory_Make(const SRASplitterFactory ** cself,const char * accession,const SRATable * table)2912 static rc_t FastqFormatterFactory_Make( const SRASplitterFactory** cself,
2913 const char* accession, const SRATable* table )
2914 {
2915 rc_t rc = 0;
2916 FastqFormatterFactory* obj = NULL;
2917
2918 if ( cself == NULL || accession == NULL || table == NULL )
2919 {
2920 rc = RC( rcExe, rcType, rcConstructing, rcParam, rcNull );
2921 }
2922 else
2923 {
2924 rc = SRASplitterFactory_Make( cself, eSplitterFormat, sizeof( *obj ),
2925 FastqFormatterFactory_Init,
2926 FastqFormatterFactory_NewObj,
2927 FastqFormatterFactory_Release );
2928 if ( rc == 0 )
2929 {
2930 obj = (FastqFormatterFactory*)*cself;
2931 obj->accession = accession;
2932 obj->table = table;
2933 }
2934 }
2935 return rc;
2936 }
2937
2938
2939 /* ### External entry points ##################################################### */
2940
2941
2942 const char UsageDefaultName[] = "fastq-dump";
2943
2944
UsageSummary(const char * progname)2945 rc_t CC UsageSummary ( const char * progname )
2946 {
2947 return 0;
2948 }
2949
2950 #define H_splip_sot 0
2951 #define H_clip 1
2952 #define H_minReadLen 2
2953 #define H_qual_filter 3
2954 #define H_skip_tech 4
2955 #define H_split_files 5
2956 #define H_split_3 6
2957 #define H_dumpcs 7
2958 #define H_dumpbase 8
2959 #define H_offset 9
2960 #define H_fasta 10
2961 #define H_origfmt 11
2962 #define H_readids 12
2963 #define H_helicos 13
2964 #define H_defline_seq 14
2965 #define H_defline_qual 15
2966 #define H_aligned 16
2967 #define H_unaligned 17
2968 #define H_aligned_region 18
2969 #define H_matepair_distance 19
2970 #define H_qual_filter_1 20
2971 #define H_SuppressQualForCSKey 21
2972
FastqDumper_Usage(const SRADumperFmt * fmt,const SRADumperFmt_Arg * core_args,int first)2973 rc_t FastqDumper_Usage( const SRADumperFmt* fmt, const SRADumperFmt_Arg* core_args, int first )
2974 {
2975 int i, j;
2976 /* known core options */
2977 const SRADumperFmt_Arg* core[ 13 ];
2978
2979 memset( (void*)core, 0, sizeof( core ) );
2980 for ( i = first; core_args[i].abbr != NULL || core_args[ i ].full != NULL; i++ )
2981 {
2982 const char* nm = core_args[ i ].abbr;
2983 nm = nm ? nm : core_args[ i ].full;
2984 if ( strcmp( nm, "A" ) == 0 )
2985 {
2986 core[ 0 ] = &core_args[ i ];
2987 }
2988 else if ( strcmp( nm, "O" ) == 0 )
2989 {
2990 core[ 1 ] = &core_args[ i ];
2991 }
2992 else if ( strcmp( nm, "N" ) == 0 )
2993 {
2994 core[ 2 ] = &core_args[ i ];
2995 }
2996 else if ( strcmp( nm, "X" ) == 0 )
2997 {
2998 core[ 3 ] = &core_args[ i ];
2999 }
3000 else if ( strcmp( nm, "G" ) == 0 )
3001 {
3002 core[ 4 ] = &core_args[ i ];
3003 }
3004 else if ( strcmp( nm, "spot-groups" ) == 0 )
3005 {
3006 core[ 5 ] = &core_args[ i ];
3007 }
3008 else if ( strcmp( nm, "R" ) == 0 )
3009 {
3010 core[ 6 ] = &core_args[ i ];
3011 }
3012 else if ( strcmp( nm, "T" ) == 0 )
3013 {
3014 core[ 7 ] = &core_args[ i ];
3015 }
3016 else if ( strcmp( nm, "K" ) == 0 )
3017 {
3018 core[ 8 ] = &core_args[ i ];
3019 }
3020 else if ( strcmp( nm, "table" ) == 0 )
3021 {
3022 core[ 9 ] = &core_args[ i ];
3023 }
3024 else if ( strcmp( nm, "Z" ) == 0 )
3025 {
3026 core[ 10 ] = &core_args[ i ];
3027 }
3028 else if ( strcmp( nm, "gzip" ) == 0 )
3029 {
3030 core[ 11 ] = &core_args[ i ];
3031 }
3032 else if ( strcmp( nm, "bzip2" ) == 0 )
3033 {
3034 core[ 12 ] = &core_args[ i ];
3035 }
3036 }
3037
3038 #define OARG(arg,msg) (arg ? HelpOptionLine((arg)->abbr, (arg)->full, (arg)->param, \
3039 msg ? msg : (const char**)((arg)->descr)) : ((void)(0)))
3040
3041 OUTMSG(( "INPUT\n" ));
3042 OARG( core[ 0 ], NULL );
3043 OARG( core[ 9 ], NULL );
3044
3045 OUTMSG(( "\nPROCESSING\n" ));
3046 OUTMSG(( "\nRead Splitting Sequence data may be used in raw form or\n"));
3047 OUTMSG(( " split into individual reads\n" ));
3048 OARG( &fmt->arg_desc[ 0 ], NULL);
3049
3050 OUTMSG(( "\nFull Spot Filters Applied to the full spot independently\n" ));
3051 OUTMSG(( " of --%s\n", fmt->arg_desc[0].full ));
3052 OARG( core[ 2 ], NULL );
3053 OARG( core[ 3 ], NULL );
3054 OARG( core[ 5 ], NULL );
3055 OARG( &fmt->arg_desc[ 1 ], NULL );
3056
3057 OUTMSG(( "\nCommon Filters Applied to spots when --%s is not\n",
3058 fmt->arg_desc[0].full ));
3059 OUTMSG(( " set, otherwise - to individual reads\n" ));
3060 OARG( &fmt->arg_desc[ 2 ], NULL );
3061 OARG( core[ 6 ], NULL );
3062 OARG( &fmt->arg_desc[ 3 ], NULL );
3063 OARG( &fmt->arg_desc[ H_qual_filter_1 ], NULL );
3064
3065 OUTMSG(( "\nFilters based on alignments Filters are active when alignment\n" ));
3066 OUTMSG(( " data are present\n" ));
3067 OARG( &fmt->arg_desc[ 16 ], NULL );
3068 OARG( &fmt->arg_desc[ 17 ], NULL );
3069 OARG( &fmt->arg_desc[ 18 ], NULL );
3070 OARG( &fmt->arg_desc[ 19 ], NULL );
3071
3072 OUTMSG(( "\nFilters for individual reads Applied only with --%s set\n",
3073 fmt->arg_desc[0].full ));
3074 OARG( &fmt->arg_desc[ 4 ], NULL );
3075
3076 OUTMSG(( "\nOUTPUT\n" ));
3077 OARG( core[ 1 ], NULL );
3078 OARG( core[ 10 ], NULL );
3079 OARG( core[ 11 ], NULL );
3080 OARG( core[ 12 ], NULL);
3081
3082 OUTMSG(( "\nMultiple File Options Setting these options will produce more\n" ));
3083 OUTMSG(( " than 1 file, each of which will be suffixed\n" ));
3084 OUTMSG(( " according to splitting criteria.\n" ));
3085 OARG( &fmt->arg_desc[ 5 ], NULL );
3086 OARG( &fmt->arg_desc[ 6 ], NULL );
3087 OARG( core[ 4 ], NULL );
3088 OARG( core[ 6 ], NULL );
3089 OARG( core[ 7 ], NULL );
3090 OARG( core[ 8 ], NULL );
3091
3092 OUTMSG(( "\nFORMATTING\n" ));
3093 OUTMSG(( "\nSequence\n" ));
3094 OARG( &fmt->arg_desc[ 7 ], NULL );
3095 OARG( &fmt->arg_desc[ 8 ], NULL );
3096
3097 OUTMSG(( "\nQuality\n" ));
3098 OARG( &fmt->arg_desc[ 9 ], NULL );
3099 OARG( &fmt->arg_desc[ 10 ], NULL );
3100 OARG( &fmt->arg_desc[ H_SuppressQualForCSKey ], NULL ); /* added Jan 15th 2014 ( a new fastq-variation! ) */
3101
3102 OUTMSG(( "\nDefline\n" ));
3103 OARG( &fmt->arg_desc[ 11 ], NULL );
3104 OARG( &fmt->arg_desc[ 12 ], NULL );
3105 OARG( &fmt->arg_desc[ 13 ], NULL );
3106 OARG( &fmt->arg_desc[ 14 ], NULL );
3107 OARG( &fmt->arg_desc[ 15 ], NULL );
3108
3109 OUTMSG(( "OTHER:\n" ));
3110 for ( i = first; core_args[ i ].abbr != NULL || core_args[ i ].full != NULL; i++ )
3111 {
3112 bool print = true;
3113 for ( j = 0; j < sizeof( core ) / sizeof( core[ 0 ] ); j++ )
3114 {
3115 if ( core[ j ] == &core_args[ i ] )
3116 {
3117 print = false;
3118 break;
3119 }
3120 }
3121 if ( print )
3122 {
3123 OARG( &core_args[ i ], NULL );
3124 }
3125 }
3126 return 0;
3127 }
3128
3129
FastqDumper_Release(const SRADumperFmt * fmt)3130 rc_t FastqDumper_Release( const SRADumperFmt* fmt )
3131 {
3132 if ( fmt == NULL )
3133 {
3134 return RC( rcExe, rcFormatter, rcDestroying, rcParam, rcInvalid );
3135 }
3136 else
3137 {
3138 Defline_Release( FastqArgs.b_defline );
3139 Defline_Release( FastqArgs.q_defline );
3140 free( FastqArgs.alregion );
3141 }
3142 return 0;
3143 }
3144
3145
FastqDumper_AddArg(const SRADumperFmt * fmt,GetArg * f,int * i,int argc,char * argv[])3146 bool FastqDumper_AddArg( const SRADumperFmt* fmt, GetArg* f, int* i, int argc, char *argv[] )
3147 {
3148 const char* arg = NULL;
3149
3150 if ( fmt == NULL || f == NULL || i == NULL || argv == NULL )
3151 {
3152 LOGERR( klogErr, RC( rcExe, rcArgv, rcReading, rcParam, rcInvalid ), "null param" );
3153 return false;
3154 }
3155 else if ( f( fmt, "M", "minReadLen", i, argc, argv, &arg ) )
3156 {
3157 FastqArgs.minReadLen = AsciiToU32( arg, NULL, NULL );
3158 }
3159 else if ( f( fmt, "W", "clip", i, argc, argv, NULL ) )
3160 {
3161 FastqArgs.applyClip = true;
3162 }
3163 else if ( f( fmt, "SU", "suppress-qual-for-cskey", i, argc, argv, NULL ) )
3164 {
3165 FastqArgs.SuppressQualForCSKey = true;
3166 }
3167 else if ( f( fmt, "F", "origfmt", i, argc, argv, NULL ) )
3168 {
3169 FastqArgs.dumpOrigFmt = true;
3170 }
3171 else if ( f( fmt, "C", "dumpcs", i, argc, argv, NULL ) )
3172 {
3173 int k = ( *i ) + 1;
3174 FastqArgs.dumpCs = true;
3175 if ( k < argc && strlen( argv[ k ] ) == 1 && strchr( "acgtACGT", argv[ k ][ 0 ] ) != NULL )
3176 {
3177 FastqArgs.desiredCsKey = argv[ k ];
3178 *i = k;
3179 }
3180 }
3181 else if ( f( fmt, "B", "dumpbase", i, argc, argv, NULL ) )
3182 {
3183 FastqArgs.dumpBase = true;
3184 }
3185 else if ( f( fmt, "Q", "offset", i, argc, argv, &arg ) )
3186 {
3187 FastqArgs.offset = AsciiToU32( arg, NULL, NULL );
3188 }
3189 else if ( f( fmt, "I", "readids", i, argc, argv, NULL ) )
3190 {
3191 FastqArgs.readIds = true;
3192 }
3193 else if ( f( fmt, "E", "qual-filter", i, argc, argv, NULL ) )
3194 {
3195 FastqArgs.qual_filter = true;
3196 }
3197 else if ( f( fmt, "E1", "qual-filter-1", i, argc, argv, NULL ) )
3198 {
3199 FastqArgs.qual_filter1 = true;
3200 }
3201 else if ( f( fmt, "DB", "defline-seq", i, argc, argv, &arg ) )
3202 {
3203 FastqArgs.b_deffmt = arg;
3204 }
3205 else if ( f( fmt, "DQ", "defline-qual", i, argc, argv, &arg ) )
3206 {
3207 FastqArgs.q_deffmt = arg;
3208 }
3209 else if ( f( fmt, "TR", "skip-technical", i, argc, argv, NULL ) )
3210 {
3211 FastqArgs.skipTechnical = true;
3212 }
3213 else if ( f( fmt, "SF", "split-files", i, argc, argv, NULL ) )
3214 {
3215 SRADumperFmt * nc_fmt = ( SRADumperFmt * )fmt;
3216 FastqArgs.split_spot = true;
3217 FastqArgs.split_files = true;
3218 nc_fmt->split_files = true;
3219 }
3220 else if ( f( fmt, NULL, "split-3", i, argc, argv, NULL ) )
3221 {
3222 FastqArgs.split_3 = true;
3223 FastqArgs.split_spot = true;
3224 FastqArgs.maxReads = 2;
3225 FastqArgs.skipTechnical = true;
3226 }
3227 else if ( f( fmt, "SL", "split-spot", i, argc, argv, NULL ) )
3228 {
3229 FastqArgs.split_spot = true;
3230 }
3231 else if ( f( fmt, "HS", "helicos", i, argc, argv, NULL ) )
3232 {
3233 FastqArgs.b_deffmt = "@$sn";
3234 FastqArgs.q_deffmt = "+";
3235 }
3236 else if ( f( fmt, "FA", "fasta", i, argc, argv, NULL ) )
3237 {
3238 int k = (*i) + 1;
3239 FastqArgs.fasta = 70;
3240 if ( k < argc && isdigit( argv[ k ][ 0 ] ) )
3241 {
3242 char* endp = NULL;
3243 errno = 0;
3244 FastqArgs.fasta = strtou64( argv[ k ], &endp, 10 );
3245 if ( errno != 0 || endp == NULL || *endp != '\0' )
3246 {
3247 return false;
3248 }
3249 *i = k;
3250 if ( FastqArgs.fasta == 0 )
3251 {
3252 FastqArgs.fasta = ~0;
3253 }
3254 }
3255 FastqArgs.file_extension = ".fasta";
3256 }
3257 else if ( f( fmt, NULL, "aligned", i, argc, argv, NULL ) )
3258 {
3259 FastqArgs.aligned = true;
3260 }
3261 else if ( f( fmt, NULL, "unaligned", i, argc, argv, NULL ) )
3262 {
3263 FastqArgs.unaligned = true;
3264 }
3265 else if ( f( fmt, NULL, "aligned-region", i, argc, argv, &arg ) )
3266 {
3267 /* name[:[from][-[to]]] */
3268 TAlignedRegion* r = realloc( FastqArgs.alregion, sizeof( *FastqArgs.alregion ) * ++FastqArgs.alregion_qty );
3269 if ( r == NULL )
3270 {
3271 return false;
3272 }
3273 FastqArgs.alregion = r;
3274 r = &FastqArgs.alregion[ FastqArgs.alregion_qty - 1 ];
3275 r->name = strchr( arg, ':' );
3276 r->from = 0;
3277 r->to = 0;
3278 if ( r->name == NULL )
3279 {
3280 r->name = arg;
3281 r->name_len = strlen( arg );
3282 }
3283 else
3284 {
3285 const char* c;
3286 uint64_t* v;
3287
3288 r->name_len = r->name - arg;
3289 c = r->name;
3290 r->name = arg;
3291
3292 v = &r->from;
3293 while ( *++c != '\0')
3294 {
3295 if ( *c == '-' )
3296 {
3297 v = &r->to;
3298 }
3299 else if ( *c == '+' )
3300 {
3301 if ( *v != 0 )
3302 {
3303 return false;
3304 }
3305 }
3306 else if ( !isdigit( *c ) )
3307 {
3308 return false;
3309 }
3310 else
3311 {
3312 *v = *v * 10 + ( *c - '0' );
3313 }
3314 }
3315 if ( r->from > r->to && r->to != 0 )
3316 {
3317 uint64_t x = r->from;
3318 r->from = r->to;
3319 r->to = x;
3320 }
3321 }
3322 }
3323 else if ( f( fmt, NULL, "matepair-distance", i, argc, argv, &arg ) )
3324 {
3325 /* from[-to] | [from]-to | unknown */
3326 if ( strcmp( arg, "unknown" ) == 0 )
3327 {
3328 FastqArgs.mp_dist_unknown = true;
3329 }
3330 else
3331 {
3332 uint64_t* v;
3333 TMatepairDistance* p = realloc( FastqArgs.mp_dist, sizeof( *FastqArgs.mp_dist ) * ++FastqArgs.mp_dist_qty );
3334 if ( p == NULL )
3335 {
3336 return false;
3337 }
3338 FastqArgs.mp_dist = p;
3339 p = &FastqArgs.mp_dist[ FastqArgs.mp_dist_qty - 1 ];
3340 p->from = 0;
3341 p->to = 0;
3342
3343 v = &p->from;
3344 while ( *++arg != '\0' )
3345 {
3346 if ( *arg == '-' )
3347 {
3348 v = &p->to;
3349 }
3350 else if ( *arg == '+' )
3351 {
3352 if ( *v != 0 )
3353 {
3354 return false;
3355 }
3356 }
3357 else if ( !isdigit( *arg ) )
3358 {
3359 return false;
3360 }
3361 else
3362 {
3363 *v = *v * 10 + ( *arg - '0' );
3364 }
3365 }
3366 if ( p->from > p->to && p->to != 0 )
3367 {
3368 uint64_t x = p->from;
3369 p->from = p->to;
3370 p->to = x;
3371 }
3372 if ( p->from == 0 && p->to == 0 )
3373 {
3374 FastqArgs.mp_dist_qty--;
3375 }
3376 }
3377 }
3378 else
3379 {
3380 return false;
3381 }
3382 return true;
3383 }
3384
3385
FastqDumper_Factories(const SRADumperFmt * fmt,const SRASplitterFactory ** factory)3386 rc_t FastqDumper_Factories( const SRADumperFmt* fmt, const SRASplitterFactory** factory )
3387 {
3388 rc_t rc = 0;
3389 const SRASplitterFactory* parent = NULL, *child = NULL;
3390
3391 if ( fmt == NULL || factory == NULL )
3392 {
3393 return RC( rcExe, rcFormatter, rcReading, rcParam, rcInvalid );
3394 }
3395 *factory = NULL;
3396 {
3397 const SRAColumn* c = NULL;
3398 rc = SRATableOpenColumnRead( fmt->table, &c, "PLATFORM", sra_platform_id_t );
3399 if ( rc == 0 )
3400 {
3401 const INSDC_SRA_platform_id * platform;
3402 bitsz_t o, z;
3403 rc = SRAColumnRead( c, 1, ( const void ** )&platform, &o, &z );
3404 if ( rc == 0 )
3405 {
3406 if ( !FastqArgs.dumpCs && !FastqArgs.dumpBase )
3407 {
3408 if ( platform != NULL && *platform == SRA_PLATFORM_ABSOLID )
3409 {
3410 FastqArgs.dumpCs = true;
3411 }
3412 else
3413 {
3414 FastqArgs.dumpBase = true;
3415 }
3416 }
3417 if ( platform != NULL && *platform == SRA_PLATFORM_ABSOLID )
3418 {
3419 FastqArgs.is_platform_cs_native = true;
3420 }
3421 }
3422 SRAColumnRelease( c );
3423 }
3424 else if ( GetRCState( rc ) == rcNotFound && GetRCObject( rc ) == ( enum RCObject )rcColumn )
3425 {
3426 rc = 0;
3427 }
3428 }
3429
3430 if ( ( FastqArgs.aligned || FastqArgs.unaligned ) && !( FastqArgs.aligned && FastqArgs.unaligned ) )
3431 {
3432 rc = AlignedFilterFactory_Make( &child, fmt->table, FastqArgs.aligned, FastqArgs.unaligned );
3433 if ( rc == 0 )
3434 {
3435 if ( parent != NULL )
3436 {
3437 rc = SRASplitterFactory_AddNext( parent, child );
3438 if ( rc != 0 )
3439 {
3440 SRASplitterFactory_Release(child);
3441 }
3442 else
3443 {
3444 parent = child;
3445 }
3446 }
3447 else
3448 {
3449 parent = child;
3450 *factory = parent;
3451 }
3452 }
3453 }
3454
3455 if ( FastqArgs.alregion_qty > 0 )
3456 {
3457 rc = AlignRegionFilterFactory_Make( &child, fmt->table, FastqArgs.alregion, FastqArgs.alregion_qty );
3458 if ( rc == 0 )
3459 {
3460 if ( parent != NULL )
3461 {
3462 rc = SRASplitterFactory_AddNext( parent, child );
3463 if ( rc != 0 )
3464 {
3465 SRASplitterFactory_Release( child );
3466 }
3467 else
3468 {
3469 parent = child;
3470 }
3471 }
3472 else
3473 {
3474 parent = child;
3475 *factory = parent;
3476 }
3477 }
3478 }
3479
3480 if ( FastqArgs.mp_dist_unknown || FastqArgs.mp_dist_qty > 0 )
3481 {
3482 rc = AlignPairDistanceFilterFactory_Make( &child, fmt->table,
3483 FastqArgs.mp_dist_unknown, FastqArgs.mp_dist, FastqArgs.mp_dist_qty);
3484 if ( rc == 0 )
3485 {
3486 if ( parent != NULL )
3487 {
3488 rc = SRASplitterFactory_AddNext( parent, child );
3489 if ( rc != 0 )
3490 {
3491 SRASplitterFactory_Release(child);
3492 }
3493 else
3494 {
3495 parent = child;
3496 }
3497 }
3498 else
3499 {
3500 parent = child;
3501 *factory = parent;
3502 }
3503 }
3504 }
3505
3506 if ( rc == 0 && FastqArgs.skipTechnical && FastqArgs.split_spot )
3507 {
3508 rc = FastqBioFilterFactory_Make( &child, fmt->accession, fmt->table );
3509 if ( rc == 0 )
3510 {
3511 if ( parent != NULL )
3512 {
3513 rc = SRASplitterFactory_AddNext( parent, child );
3514 if ( rc != 0 )
3515 {
3516 SRASplitterFactory_Release( child );
3517 }
3518 else
3519 {
3520 parent = child;
3521 }
3522 }
3523 else
3524 {
3525 parent = child;
3526 *factory = parent;
3527 }
3528 }
3529 }
3530
3531 if ( rc == 0 && FastqArgs.maxReads > 0 )
3532 {
3533 rc = FastqRNumberFilterFactory_Make( &child, fmt->accession, fmt->table );
3534 if ( rc == 0 )
3535 {
3536 if ( parent != NULL )
3537 {
3538 rc = SRASplitterFactory_AddNext( parent, child );
3539 if ( rc != 0 )
3540 {
3541 SRASplitterFactory_Release( child );
3542 }
3543 else
3544 {
3545 parent = child;
3546 }
3547 }
3548 else
3549 {
3550 parent = child;
3551 *factory = parent;
3552 }
3553 }
3554 }
3555
3556 if ( rc == 0 && ( FastqArgs.qual_filter || FastqArgs.qual_filter1 ) )
3557 {
3558 rc = FastqQFilterFactory_Make( &child, fmt->accession, fmt->table );
3559 if ( rc == 0 )
3560 {
3561 if ( parent != NULL )
3562 {
3563 rc = SRASplitterFactory_AddNext( parent, child );
3564 if ( rc != 0 )
3565 {
3566 SRASplitterFactory_Release( child );
3567 }
3568 else
3569 {
3570 parent = child;
3571 }
3572 }
3573 else
3574 {
3575 parent = child;
3576 *factory = parent;
3577 }
3578 }
3579 }
3580
3581 if ( rc == 0 )
3582 {
3583 rc = FastqReadLenFilterFactory_Make( &child, fmt->accession, fmt->table );
3584 if ( rc == 0 )
3585 {
3586 if ( parent != NULL )
3587 {
3588 rc = SRASplitterFactory_AddNext( parent, child );
3589 if ( rc != 0 )
3590 {
3591 SRASplitterFactory_Release( child );
3592 }
3593 else
3594 {
3595 parent = child;
3596 }
3597 }
3598 else
3599 {
3600 parent = child;
3601 *factory = parent;
3602 }
3603 }
3604 }
3605
3606 if ( rc == 0 )
3607 {
3608 if ( FastqArgs.split_3 )
3609 {
3610 rc = Fastq3ReadSplitterFactory_Make( &child, fmt->accession, fmt->table );
3611 if ( rc == 0 )
3612 {
3613 if ( parent != NULL )
3614 {
3615 rc = SRASplitterFactory_AddNext( parent, child );
3616 if ( rc != 0 )
3617 {
3618 SRASplitterFactory_Release( child );
3619 }
3620 else
3621 {
3622 parent = child;
3623 }
3624 }
3625 else
3626 {
3627 parent = child;
3628 *factory = parent;
3629 }
3630 }
3631 }
3632 else if ( FastqArgs.split_files )
3633 {
3634 rc = FastqReadSplitterFactory_Make( &child, fmt->accession, fmt->table );
3635 if ( rc == 0 )
3636 {
3637 if ( parent != NULL )
3638 {
3639 rc = SRASplitterFactory_AddNext( parent, child );
3640 if ( rc != 0 )
3641 {
3642 SRASplitterFactory_Release(child);
3643 }
3644 else
3645 {
3646 parent = child;
3647 }
3648 }
3649 else
3650 {
3651 parent = child;
3652 *factory = parent;
3653 }
3654 }
3655 }
3656 }
3657
3658 if ( rc == 0 )
3659 {
3660 if ( FastqArgs.b_deffmt != NULL )
3661 {
3662 rc = Defline_Parse( &FastqArgs.b_defline, FastqArgs.b_deffmt );
3663 }
3664 if ( FastqArgs.q_deffmt != NULL )
3665 {
3666 rc = Defline_Parse( &FastqArgs.q_defline, FastqArgs.q_deffmt );
3667 }
3668 if ( rc == 0 )
3669 {
3670 rc = FastqFormatterFactory_Make( &child, fmt->accession, fmt->table );
3671 if ( rc == 0 )
3672 {
3673 if ( parent != NULL )
3674 {
3675 rc = SRASplitterFactory_AddNext( parent, child );
3676 if ( rc != 0 )
3677 {
3678 SRASplitterFactory_Release( child );
3679 }
3680 }
3681 else
3682 {
3683 *factory = child;
3684 }
3685 }
3686 }
3687 }
3688 return rc;
3689 }
3690
3691 /* main entry point of the file */
SRADumper_Init(SRADumperFmt * fmt)3692 rc_t SRADumper_Init( SRADumperFmt* fmt )
3693 {
3694 static const SRADumperFmt_Arg arg[] =
3695 {
3696
3697 /* DO NOT ADD IN THE MIDDLE ORDER IS IMPORTANT IN USAGE FUNCTION ABOVE!!! */
3698 {NULL, "split-spot", NULL, {"Split spots into individual reads", NULL}}, /* H_splip_sot = 0 */
3699
3700 {"W", "clip", NULL, {"Remove adapter sequences from reads", NULL}}, /* H_clip = 1 */
3701
3702 {"M", "minReadLen", "len", {"Filter by sequence length >= <len>", NULL}}, /* H_minReadLen = 2 */
3703 {"E", "qual-filter", NULL, {"Filter used in early 1000 Genomes data:", /* H_qual_filter = 3 */
3704 "no sequences starting or ending with >= 10N", NULL}},
3705
3706 {NULL, "skip-technical", NULL, {"Dump only biological reads", NULL}}, /* H_skip_tech = 4 */
3707
3708 {NULL, "split-files", NULL, {"Write reads into separate files. " /* H_split_files = 5 */
3709 "Read number will be suffixed to the file name. ",
3710 "NOTE! The `--split-3` option is recommended.",
3711 "In cases where not all spots have the same number of reads,",
3712 "this option will produce files",
3713 "that WILL CAUSE ERRORS in most programs",
3714 "which process split pair fastq files.", NULL}},
3715
3716 /* DO NOT ADD IN THE MIDDLE ORDER IS IMPORTANT IN USAGE FUNCTION ABOVE!!! */
3717 {NULL, "split-3", NULL, {"3-way splitting for mate-pairs.", /* H_split_3 = 6 */
3718 "For each spot, if there are two biological reads satisfying filter conditions,",
3719 "the first is placed in the `*_1.fastq` file,",
3720 "and the second is placed in the `*_2.fastq` file.",
3721 "If there is only one biological read satisfying the filter conditions,",
3722 "it is placed in the `*.fastq` file.All other reads in the spot are ignored.",
3723 NULL}},
3724
3725 {"C", "dumpcs", "[cskey]", {"Formats sequence using color space (default for SOLiD)," /* H_dumpcs = 7 */
3726 "\"cskey\" may be specified for translation", NULL}},
3727 {"B", "dumpbase", NULL, {"Formats sequence using base space (default for other than SOLiD).", NULL}}, /* H_dumpbase = 8 */
3728
3729 /* DO NOT ADD IN THE MIDDLE ORDER IS IMPORTANT IN USAGE FUNCTION ABOVE!!! */
3730 {"Q", "offset", "integer", {"Offset to use for quality conversion, default is 33", NULL}}, /* H_offset = 9 */
3731 {NULL, "fasta", "[line width]", {"FASTA only, no qualities, optional line wrap width (set to zero for no wrapping)", NULL}}, /* H_fasta = 10 */
3732
3733 {"F", "origfmt", NULL, {"Defline contains only original sequence name", NULL}}, /* H_origfmt = 11 */
3734 {"I", "readids", NULL, {"Append read id after spot id as 'accession.spot.readid' on defline", NULL}}, /* H_readids = 12 */
3735 {NULL, "helicos", NULL, {"Helicos style defline", NULL}}, /* H_helicos = 13 */
3736 {NULL, "defline-seq", "fmt", {"Defline format specification for sequence.", NULL}}, /* H_defline_seq = 14 */
3737 {NULL, "defline-qual", "fmt", {"Defline format specification for quality.", /* H_defline_qual = 15 */
3738 "<fmt> is a string of characters and/or variables. The variables can be one of:",
3739 "$ac - accession, $si - spot id, $sn - spot name, $sg - spot group (barcode),",
3740 "$sl - spot length in bases, $ri - read number, $rn - read name, $rl - read length in bases.",
3741 "'[]' could be used for an optional output: if all vars in [] yield empty values whole group is not printed.",
3742 "Empty value is empty string or 0 for numeric variables.",
3743 "Ex: @$sn[_$rn]/$ri - '_$rn' is omitted if name is empty\n", NULL}},
3744
3745 {NULL, "aligned", NULL, {"Dump only aligned sequences", NULL}}, /* H_aligned = 16 */
3746 {NULL, "unaligned", NULL, {"Dump only unaligned sequences", NULL}}, /* H_unaligned = 17 */
3747
3748 /* DO NOT ADD IN THE MIDDLE ORDER IS IMPORTANT IN USAGE FUNCTION ABOVE!!! */
3749 {NULL, "aligned-region", "name[:from-to]", {"Filter by position on genome.", /* H_aligned_region = 18 */
3750 "Name can either be accession.version (ex: NC_000001.10) or",
3751 "file specific name (ex: \"chr1\" or \"1\").",
3752 "\"from\" and \"to\" are 1-based coordinates", NULL}},
3753 {NULL, "matepair-distance", "from-to|unknown", {"Filter by distance between matepairs.", /* H_matepair-distance = 19 */
3754 "Use \"unknown\" to find matepairs split between the references.",
3755 "Use from-to to limit matepair distance on the same reference", NULL}},
3756
3757 {NULL, "qual-filter-1", NULL, {"Filter used in current 1000 Genomes data", NULL}}, /* H_qual_filter_1 = 20 */
3758
3759 {NULL, "suppress-qual-for-cskey", NULL, {"suppress quality-value for cskey", NULL}}, /* H_SuppressQualForCSKey = 21 */
3760
3761 {NULL, NULL, NULL, {NULL}}
3762 };
3763
3764 if ( fmt == NULL )
3765 {
3766 return RC( rcExe, rcFileFormat, rcConstructing, rcParam, rcNull );
3767 }
3768
3769 memset( &FastqArgs, 0, sizeof( FastqArgs ) );
3770 FastqArgs.is_platform_cs_native = false;
3771 FastqArgs.maxReads = 0;
3772 FastqArgs.skipTechnical = false;
3773 FastqArgs.minReadLen = 1;
3774 FastqArgs.applyClip = false;
3775 FastqArgs.SuppressQualForCSKey = false; /* added Jan 15th 2014 ( a new fastq-variation! ) */
3776 FastqArgs.dumpOrigFmt = false;
3777 FastqArgs.dumpBase = false;
3778 FastqArgs.dumpCs = false;
3779 FastqArgs.readIds = false;
3780 FastqArgs.offset = 33;
3781 FastqArgs.desiredCsKey = "";
3782 FastqArgs.qual_filter = false;
3783 FastqArgs.b_deffmt = NULL;
3784 FastqArgs.b_defline = NULL;
3785 FastqArgs.q_deffmt = NULL;
3786 FastqArgs.q_defline = NULL;
3787 FastqArgs.split_files = false;
3788 FastqArgs.split_3 = false;
3789 FastqArgs.split_spot = false;
3790 FastqArgs.fasta = 0;
3791 FastqArgs.file_extension = ".fastq";
3792 FastqArgs.aligned = false;
3793 FastqArgs.unaligned = false;
3794 FastqArgs.alregion = NULL;
3795 FastqArgs.alregion_qty = 0;
3796 FastqArgs.mp_dist = NULL;
3797 FastqArgs.mp_dist_unknown = false;
3798 FastqArgs.mp_dist_qty = 0;
3799
3800 fmt->usage = FastqDumper_Usage;
3801 fmt->release = FastqDumper_Release;
3802 fmt->arg_desc = arg;
3803 fmt->add_arg = FastqDumper_AddArg;
3804 fmt->get_factory = FastqDumper_Factories;
3805 fmt->gzip = true;
3806 fmt->bzip2 = true;
3807 fmt->split_files = false;
3808
3809 return 0;
3810 }
3811