1 /*===========================================================================
2 *
3 *                            PUBLIC DOMAIN NOTICE
4 *               National Center for Biotechnology Information
5 *
6 *  This software/database is a "United States Government Work" under the
7 *  terms of the United States Copyright Act.  It was written as part of
8 *  the author's official duties as a United States Government employee and
9 *  thus cannot be copyrighted.  This software/database is freely available
10 *  to the public for use. The National Library of Medicine and the U.S.
11 *  Government have not placed any restriction on its use or reproduction.
12 *
13 *  Although all reasonable efforts have been taken to ensure the accuracy
14 *  and reliability of the software and data, the NLM and the U.S.
15 *  Government do not and cannot warrant the performance or results that
16 *  may be obtained by using this software or data. The NLM and the U.S.
17 *  Government disclaim all warranties, express or implied, including
18 *  warranties of performance, merchantability or fitness for any particular
19 *  purpose.
20 *
21 *  Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26 #include <klib/report.h>
27 #include <klib/container.h>
28 #include <klib/log.h>
29 #include <klib/out.h>
30 #include <klib/text.h>
31 #include <klib/status.h>
32 #include <klib/rc.h>
33 #include <klib/vector.h>
34 #include <klib/printf.h>
35 #include <klib/data-buffer.h>
36 #include <vfs/manager.h>
37 #include <vfs/path.h>
38 #include <vfs/path-priv.h>
39 #include <kfs/file.h>
40 #include <kfs/buffile.h>
41 #include <kfs/gzip.h>
42 #include <kfs/bzip.h>
43 #include <kdb/meta.h>
44 #include <kdb/namelist.h>
45 #include <kapp/main.h>
46 #include <kapp/args.h>
47 #include <insdc/insdc.h>
48 #include <insdc/sra.h>
49 #include <vdb/report.h>
50 #include <vdb/manager.h>
51 #include <vdb/database.h>
52 #include <vdb/table.h>
53 #include <vdb/cursor.h>
54 #include <vdb/vdb-priv.h>
55 #include <vdb/schema.h>
56 #include <vdb/dependencies.h>
57 #include <sra/sraschema.h>
58 #include <sra/srapath.h>
59 #include <align/dna-reverse-cmpl.h>
60 #include <align/iterator.h>
61 #include <align/reference.h>
62 #include <align/quality-quantizer.h>
63 
64 #include <kfs/directory.h>
65 #include <os-native.h>
66 #include <sysalloc.h>
67 
68 #include <stdio.h>
69 #include <stdlib.h>
70 #include <stdint.h>
71 #include <limits.h>
72 #include <string.h>
73 #include <strtol.h>
74 #include <ctype.h>
75 #include <assert.h>
76 
77 #include "debug.h"
78 
79 #if _ARCH_BITS == 64
80 #define USE_MATE_CACHE 1
81 #define CURSOR_CACHE (4 * 1024 * 1024 * 1024UL)
82 #else
83 #define USE_MATE_CACHE 0
84 #define CURSOR_CACHE (256 * 1024 * 1024)
85 #endif
86 
87 rc_t cg_canonical_print_cigar( const char * cigar, size_t cigar_len);
88 
89 
90 
91 typedef struct TAlignedRegion_struct
92 {
93     char name[1024];
94     struct {
95         INSDC_coord_zero from;
96         INSDC_coord_zero to;
97     } r[10240];
98     int rq;
99     INSDC_coord_zero max_to;
100     INSDC_coord_zero min_from;
101     int printed;
102 } TAlignedRegion;
103 
104 
105 typedef struct TMatepairDistance_struct
106 {
107     uint32_t from;
108     uint32_t to;
109 } TMatepairDistance;
110 
111 
112 struct params_s
113 {
114     /* which outputs are on */
115     bool primaries;
116     bool secondaries;
117     bool unaligned;
118     bool cg_evidence;
119     bool cg_ev_dnb;
120     bool cg_sam;
121 
122     bool use_seqid;
123     bool long_cigar;
124     bool reheader;
125     bool noheader;
126     bool hide_identical;
127     bool fasta;
128     bool fastq;
129     bool spot_group_in_name;
130     bool cg_friendly_names;
131     bool reverse_unaligned;
132     bool unaligned_spots;
133 
134     bool output_gzip;
135     bool output_bz2;
136 
137     bool xi;
138     int cg_style; /* 0: raw; 1: with B's; 2: without B's, fixed up SEQ/QUAL; */
139     char const *name_prefix;
140     /* region filter data */
141     TAlignedRegion* region;
142     uint32_t region_qty;
143     /* distance filter data */
144     bool mp_dist_unknown;
145     TMatepairDistance* mp_dist;
146     uint32_t mp_dist_qty;
147     uint32_t test_rows;
148 
149     /* mate info cache */
150     int64_t mate_row_gap_cachable;
151 
152     char const **comments;
153 
154     bool quantizeQual;
155     uint8_t qualQuant[256];
156     uint8_t qualQuantSingle; /*** the quality is quantized - single value, no need to retrieve **/
157 };
158 
159 
160 struct params_s const *param;
161 ReferenceList const *gRefList;
162 
163 
164 typedef union UData_union
165 {
166     void const *v;
167     uint32_t const *u32;
168     int32_t const *i32;
169     int64_t const *i64;
170     uint64_t const *u64;
171     uint8_t const *u8;
172     char const *str;
173     bool const *tf;
174     INSDC_coord_one* coord1;
175     INSDC_coord_zero* coord0;
176     INSDC_coord_len* coord_len;
177     INSDC_coord_val* coord_val;
178     INSDC_SRA_xread_type* read_type;
179     INSDC_SRA_read_filter* read_filter;
180 } UData;
181 
182 
183 typedef struct SCol_struct
184 {
185     char const *name;
186     uint32_t idx;
187     UData base;
188     uint32_t len;
189     bool optional;
190 } SCol;
191 
192 
193 typedef struct STable_struct
194 {
195     char const *name;
196     VTable const *vtbl;
197 } STable;
198 
199 
200 typedef struct SCursCache_struct
201 {
202     KVector* cache;
203     KVector* cache_unaligned_mate; /* keeps unaligned-mate for a half-aligned spots */
204     uint32_t sam_flags;
205     INSDC_coord_zero pnext;
206     int32_t tlen;
207     ReferenceObj const *ref;
208     /* cache stats */
209     uint64_t projected;
210     uint64_t added;
211     uint64_t hit;
212     uint64_t bad;
213 } SCursCache;
214 
215 
216 typedef struct SCurs_struct
217 {
218     STable const *tbl;
219     VCursor const *vcurs;
220     SCursCache* cache;
221     SCursCache cache_local;
222     uint64_t col_reads_qty;
223 } SCurs;
224 
225 enum eDSTableType
226 {
227     edstt_NotSpecified,
228     edstt_Reference,
229     edstt_Sequence,
230     edstt_PrimaryAlignment,
231     edstt_SecondaryAlignment,
232     edstt_EvidenceAlignment,
233     edstt_EvidenceInterval
234 };
235 
236 
237 typedef struct DataSource_s {
238     STable tbl;
239     SCurs curs;
240     SCol *cols;
241     enum eDSTableType type;
242 } DataSource;
243 
244 
245 #define DATASOURCE_INIT(O, NAME) do { memset(&O, 0, sizeof(O)); O.tbl.name = NAME; O.curs.tbl = &O.tbl; } while(0)
246 
247 
248 typedef struct SAM_dump_ctx_s
249 {
250     VDatabase const *db;
251     char const *fullPath;
252     char const *accession;
253     char const *readGroup;
254 
255     DataSource ref;
256     DataSource pri;
257     DataSource sec;
258     DataSource evi;
259     DataSource eva;
260     DataSource seq;
261 } SAM_dump_ctx_t;
262 
263 
264 enum ealg_col
265 {
266     alg_SEQ_SPOT_ID = 0,
267     alg_SEQ_NAME,
268     alg_MAPQ,
269     alg_CIGAR,
270     alg_READ,
271     alg_READ_START,
272     alg_READ_LEN,
273     alg_CIGAR_LEN,
274     alg_SAM_QUALITY,
275     alg_SPOT_GROUP,
276     alg_SEQ_SPOT_GROUP,
277     alg_SEQ_READ_ID,
278     alg_REVERSED,
279     alg_ALIGNMENT_COUNT,
280     alg_EDIT_DISTANCE,
281     alg_READ_FILTER,
282     alg_MATE_ALIGN_ID,
283     alg_MATE_REF_NAME,
284     alg_SAM_FLAGS,
285     alg_REF_START,
286     alg_MATE_REF_POS,
287     alg_ALIGN_GROUP,
288     alg_EVIDENCE_ALIGNMENT_IDS,
289     alg_REF_PLOIDY,
290     alg_REF_ID,
291     alg_MATE_REF_ID,
292     alg_HAS_MISMATCH,
293     alg_REGION_FILTER,
294     alg_REF_NAME = alg_REGION_FILTER,
295     alg_REF_SEQ_ID,
296     alg_REF_POS,
297     alg_REF_LEN,
298     alg_DISTANCE_FILTER,
299     alg_TEMPLATE_LEN = alg_DISTANCE_FILTER,
300     alg_CG_TAGS_STR
301 };
302 
303 
304 SCol const g_alg_col_tmpl[] =
305 {
306     { "SEQ_SPOT_ID", 0, {NULL}, 0, true },
307     { "SEQ_NAME", 0, {NULL}, 0, false },
308     { "MAPQ", 0, {NULL}, 0, false },
309     { "?CIGAR column name?", 0, {NULL}, 0, false },
310     { "?READ column name?", 0, {NULL}, 0, false },
311     { "READ_START", 0, {NULL}, 0, false }, /* READ_START */
312     { "READ_LEN", 0, {NULL}, 0, false }, /* READ_LEN */
313     { "?CIGAR_LEN column name?", 0, {NULL}, 0, true }, /* CIGAR_LEN */
314     { "SAM_QUALITY", 0, {NULL}, 0, false },
315     { "SPOT_GROUP", 0, {NULL}, 0, true },
316     { "SEQ_SPOT_GROUP", 0, {NULL}, 0, true },
317     { "SEQ_READ_ID", 0, {NULL}, 0, true },
318     { "REF_ORIENTATION", 0, {NULL}, 0, true },
319     { "ALIGNMENT_COUNT", 0, {NULL}, 0, true },
320     { "EDIT_DISTANCE", 0, {NULL}, 0, false },
321     { "", 0, {NULL}, 0, false },
322     /* start cols used as standalone in DumpUnaligned */
323     /* MATE_ALIGN_ID col must preceeed
324        MATE_REF_NAME, MATE_REF_POS, SAM_FLAGS, TEMPLATE_LEN for cache to work */
325     { "MATE_ALIGN_ID", 0, {NULL}, 0, false },
326     { "?MATE_REF_NAME column name?", 0, {NULL}, 0, false },
327     { "SAM_FLAGS", 0, {NULL}, 0, false },
328     { "REF_START", 0, {NULL}, 0, false },  /* priming cursor cache */
329     { "MATE_REF_POS", 0, {NULL}, 0, false },
330     { "ALIGN_GROUP", 0, {NULL}, 0, true },
331     { "", 0, {NULL}, 0, true }, /* EVIDENCE_ALIGNMENT_IDS */
332     { "", 0, {NULL}, 0, true }, /* REF_PLOIDY */
333     { "REF_ID", 0, {NULL}, 0, true }, /* REF_ID */
334     { "MATE_REF_ID", 0, {NULL}, 0, true }, /* priming cursor cache */
335     { "(bool)HAS_MISMATCH", 0, {NULL}, 0, true },
336     /* these are read before any other for filtering so they must be last */
337     { "REF_NAME", 0, {NULL}, 0, false },
338     { "REF_SEQ_ID", 0, {NULL}, 0, false },
339     { "REF_POS", 0, {NULL}, 0, false },
340     /* end cols used as standalone in DumpUnaligned */
341     { "REF_LEN", 0, {NULL}, 0, false },
342     { "TEMPLATE_LEN", 0, {NULL}, 0, false },
343     { NULL, 0, {NULL}, 0, false }, /* alg_CG_TAGS_STR */
344     { NULL, 0, {NULL}, 0, false }
345 };
346 
347 enum eseq_col
348 {
349     seq_READ = 0,
350     seq_QUALITY,
351     seq_SPOT_GROUP,
352     seq_READ_START,
353     seq_READ_LEN,
354     seq_READ_TYPE,
355     seq_READ_FILTER,
356     seq_NAME,
357     seq_PRIMARY_ALIGNMENT_ID
358 };
359 
360 static
361 SCol const gSeqCol[] =
362 {
363     { "READ", 0, {NULL}, 0, false },
364     { "(INSDC:quality:text:phred_33)QUALITY", 0, {NULL}, 0, false },
365     { "SPOT_GROUP", 0, {NULL}, 0, true },
366     { "READ_START", 0, {NULL}, 0, true },
367     { "READ_LEN", 0, {NULL}, 0, true },
368     { "READ_TYPE", 0, {NULL}, 0, true },
369     { "READ_FILTER", 0, {NULL}, 0, true },
370     { "NAME", 0, {NULL}, 0, true },
371     /* must be last in list to avoid reading all columns */
372     { "PRIMARY_ALIGNMENT_ID", 0, {NULL}, 0, true },
373     { NULL, 0, {NULL}, 0, false }
374 };
375 
376 
RefSeqPrint(void)377 static rc_t RefSeqPrint( void )
378 {
379     rc_t rc = 0;
380     uint32_t i, count = 0;
381 
382     rc = ReferenceList_Count( gRefList, &count );
383     for( i = 0; rc == 0 && i < count; i++ )
384     {
385         ReferenceObj const *obj;
386         rc = ReferenceList_Get( gRefList, &obj, i );
387         if ( rc == 0 )
388         {
389             char const *seqid = NULL;
390             rc = ReferenceObj_SeqId( obj, &seqid );
391             if ( rc == 0 )
392             {
393                 char const *name = NULL;
394                 rc = ReferenceObj_Name( obj, &name );
395                 if ( rc == 0 )
396                 {
397                     INSDC_coord_len len;
398                     rc = ReferenceObj_SeqLength( obj, &len );
399                     if ( rc == 0 )
400                     {
401                         char const *nm;
402                         if ( param->use_seqid && seqid != NULL && seqid[ 0 ] != '\0' )
403                         {
404                             nm = seqid;
405                         }
406                         else
407                         {
408                             nm = name;
409                         }
410                         KOutMsg( "@SQ\tSN:%s", nm );
411                         if ( nm != seqid && seqid != NULL && seqid[ 0 ] != '\0' && strcmp( nm, seqid ) != 0)
412                         {
413                             KOutMsg( "\tAS:%s", seqid );
414                         }
415                         KOutMsg( "\tLN:%u\n", len );
416                     }
417                 }
418             }
419             ReferenceObj_Release( obj );
420         }
421     }
422     return rc;
423 }
424 
425 
426 #if USE_MATE_CACHE
Cache_Init(SCursCache * c)427 static rc_t Cache_Init( SCursCache* c )
428 {
429     if ( c != NULL )
430     {
431 	rc_t rc;
432         memset( c, 0, sizeof( *c ) );
433         rc=KVectorMake( &c->cache );
434 	if(rc == 0){
435 		rc=KVectorMake( &c->cache_unaligned_mate );
436 	}
437     }
438     return 0;
439 }
440 
441 
Cache_Close(char const * name,SCursCache * c)442 static void Cache_Close( char const *name, SCursCache* c )
443 {
444     if ( c != NULL )
445     {
446         KVectorRelease( c->cache );
447         KVectorRelease( c->cache_unaligned_mate );
448         if ( c->added > 0 )
449         {
450             SAM_DUMP_DBG( 2, ( "%s cache stats: projected %lu added of those %lu; "
451                                "hits %lu of those broken %lu;\n",
452                                name, c->projected, c->added, c->hit, c->bad ) );
453         }
454     }
455     memset( c, 0, sizeof( *c ) );
456 }
457 
458 
Cache_Add(uint64_t key,SCurs const * curs,SCol const * cols)459 static rc_t Cache_Add( uint64_t key, SCurs const *curs, SCol const *cols )
460 {
461     /* compact values for mate record to cache as:
462         pos_delta - 32bit, ref_proj - 11bit, flags - 11bit, rnext_idx - 10bit = 64bit
463     */
464     rc_t rc = 0;
465     ReferenceObj const *r = NULL;
466     uint32_t rid = 0;
467     uint64_t val = 0;
468     int64_t mate_id = cols[ alg_MATE_ALIGN_ID ].len > 0 ? cols[ alg_MATE_ALIGN_ID ].base.i64[ 0 ] : 0;
469 
470     rc = ReferenceList_Find( gRefList, &r, cols[ alg_REF_NAME ].base.str, cols[ alg_REF_NAME ].len );
471     if ( rc == 0 )
472     {
473         rc = ReferenceObj_Idx( r, &rid );
474     }
475 #if _DEBUGGING
476     {
477         char const *rname = NULL;
478         curs->cache->projected++;
479         ReferenceObj_Name( r, &rname );
480         SAM_DUMP_DBG( 10, ( "to cache row %li for mate %li: %u,%s[%hu],%u,%u,%i",
481             key, mate_id, cols[ alg_SAM_FLAGS ].base.u32[ 0 ], rname, rid,
482             cols[ alg_REF_POS ].len ? cols[ alg_REF_POS ].base.coord0[ 0 ] : 0,
483             cols[ alg_MATE_REF_POS ].len ? cols[ alg_MATE_REF_POS ].base.coord0[ 0 ] : 0,
484             cols[ alg_TEMPLATE_LEN ].len > 0 ? cols[ alg_TEMPLATE_LEN ].base.i32[ 0 ] : 0));
485     }
486 #endif
487     if ( rc == 0 && !( rid & 0xFC00 ) )
488     {
489         int64_t pos_delta64;
490         int32_t pos_delta32;
491 
492         if ( mate_id != 0 )
493         {
494             ReferenceObj const *rm = NULL;
495             uint32_t rm_id;
496 
497             rc = ReferenceList_Find( gRefList, &rm, cols[ alg_MATE_REF_NAME ].base.str, cols[ alg_MATE_REF_NAME ].len );
498             if ( rc == 0 )
499             {
500                 rc = ReferenceObj_Idx( rm, &rm_id );
501             }
502             assert( rm != NULL );
503             if ( rc == 0 && rid != rm_id )
504             {
505                 char const *rm_name = NULL;
506                 ReferenceObj_Name( rm, &rm_name );
507                 mate_id = 0;
508                 SAM_DUMP_DBG( 10, ( " mate ref differ: %s[%hu]!", rm_name, rm_id ) );
509             }
510             ReferenceObj_Release( rm );
511         }
512 
513         if ( mate_id != 0 )
514         {
515             pos_delta64 = cols[ alg_MATE_REF_POS ].base.coord0[ 0 ] - cols[ alg_REF_POS ].base.coord0[ 0 ];
516         }
517         else
518         {
519             pos_delta64 = cols[ alg_REF_POS ].base.coord0[ 0 ];
520         }
521 
522         pos_delta32 = pos_delta64;
523         if ( pos_delta64 == pos_delta32 )
524         {
525             int64_t ref_proj;
526             if ( mate_id == 0 )
527             {
528                 ref_proj = 0; /* indicates full value in delta */
529             }
530             else if ( cols[ alg_TEMPLATE_LEN ].base.i32[ 0 ] < 0 )
531             {
532                 assert( pos_delta32 <= 0 );
533                 ref_proj = pos_delta32 - cols[ alg_TEMPLATE_LEN ].base.i32[ 0 ];
534             }
535             else
536             {
537                 assert( pos_delta32 >= 0 );
538                 ref_proj = cols[ alg_TEMPLATE_LEN ].base.i32[ 0 ] - pos_delta32;
539             }
540 
541             if ( !( ref_proj & 0xFFFFF800 ) )
542             {
543                 val = ( pos_delta64 << 32 ) | ( ref_proj << 21 ) | ( cols[ alg_SAM_FLAGS ].base.u32[ 0 ] << 10 ) | rid;
544                 rc = KVectorSetU64( curs->cache->cache, key, val );
545             }
546         }
547     }
548     ReferenceObj_Release( r );
549 
550 #if _DEBUGGING
551     if ( val == 0 )
552     {
553         SAM_DUMP_DBG( 10, ( " --> out of range\n" ) );
554     }
555     else
556     {
557         SAM_DUMP_DBG( 10, ( " --> %016lX\n", val ) );
558         curs->cache->added++;
559     }
560 #endif
561     return rc;
562 }
563 
564 
Cache_Get(SCurs const * curs,uint64_t key,uint64_t * val)565 static rc_t Cache_Get( SCurs const *curs, uint64_t key, uint64_t* val )
566 {
567     rc_t rc = KVectorGetU64( curs->cache->cache, key, val );
568     if ( rc == 0 )
569     {
570         uint32_t id = ( *val & 0x3FF );
571 #if _DEBUGGING
572         curs->cache->hit++;
573 #endif
574         KVectorUnset( curs->cache->cache, key );
575         rc = ReferenceList_Get( gRefList, &curs->cache->ref, id );
576         if ( rc != 0 )
577         {
578             *val = 0;
579             curs->cache->ref = NULL;
580             rc = RC( rcExe, rcNoTarg, rcSearching, rcItem, rcNotFound );
581 #if _DEBUGGING
582             curs->cache->bad++;
583 #endif
584         }
585         else
586         {
587             SAM_DUMP_DBG( 10, ( "from cache row %li %016lX", key, *val ) );
588         }
589     }
590     return rc;
591 }
592 
593 
Cache_Unpack(uint64_t val,int64_t mate_id,SCurs const * curs,SCol * cols)594 static void Cache_Unpack( uint64_t val, int64_t mate_id, SCurs const *curs, SCol* cols )
595 {
596     int32_t pos_delta = ( val & 0xFFFFFFFF00000000 ) >> 32;
597     uint32_t ref_proj = ( val & 0x00000000FFE00000 ) >> 21;
598     uint32_t flags = ( val & 0x00000000001FFC00 ) >> 10;
599 
600     if ( mate_id != 0 )
601     {
602         /* adjust flags for mate record */
603         curs->cache->sam_flags = ( flags & 0x1 ) |
604                                  ( flags & 0x2 ) |
605                                  ( ( flags & 0x8 ) >> 1 ) |
606                                  ( ( flags & 0x4 ) << 1 ) |
607                                  ( ( flags & 0x20 ) >> 1 ) |
608                                  ( ( flags & 0x10 ) << 1 ) |
609                                  ( ( flags & 0x40 ) ? 0x80 : 0x40 ) |
610                                  ( flags & 0x700 );
611     }
612     else
613     {
614         /* preserve flags as if original records is restored */
615         curs->cache->sam_flags = flags;
616     }
617     cols[ alg_SAM_FLAGS ].base.u32 = &curs->cache->sam_flags;
618     cols[ alg_SAM_FLAGS ].len = sizeof( curs->cache->sam_flags );
619 
620     if ( param->use_seqid )
621     {
622         ReferenceObj_SeqId( curs->cache->ref, &cols[ alg_MATE_REF_NAME ].base.str );
623     }
624     else
625     {
626         ReferenceObj_Name( curs->cache->ref, &cols[ alg_MATE_REF_NAME ].base.str );
627     }
628 
629     cols[ alg_MATE_REF_NAME ].len = string_size( cols[ alg_MATE_REF_NAME ].base.str );
630 
631     if ( ref_proj == 0 )
632     {
633         curs->cache->pnext = pos_delta;
634         curs->cache->tlen = 0;
635     }
636     else if ( pos_delta > 0 )
637     {
638         curs->cache->pnext = ( cols[ alg_REF_POS ].len > 0 ? cols[ alg_REF_POS ].base.coord0[ 0 ] : 0 ) - pos_delta;
639         curs->cache->tlen = - ( ref_proj + pos_delta );
640     }
641     else
642     {
643         curs->cache->pnext = ( cols[ alg_REF_POS ].len > 0 ? cols[ alg_REF_POS ].base.coord0[ 0 ] : 0 ) - pos_delta;
644         curs->cache->tlen = ref_proj - pos_delta;
645     }
646 
647     cols[ alg_MATE_REF_POS ].base.coord0 = &curs->cache->pnext;
648     cols[ alg_MATE_REF_POS ].len = sizeof( curs->cache->pnext );
649     cols[ alg_TEMPLATE_LEN ].base.i32 = &curs->cache->tlen;
650     cols[ alg_TEMPLATE_LEN ].len = sizeof( curs->cache->tlen );
651     {
652         uint32_t id;
653         ReferenceObj_Idx( curs->cache->ref, &id );
654         SAM_DUMP_DBG( 10, ( " --> mate %li: %u,%s[%hu],%u,%i\n",
655             mate_id, curs->cache->sam_flags, cols[ alg_MATE_REF_NAME ].base.str,
656             id, curs->cache->pnext, curs->cache->tlen ) );
657     }
658 }
659 #endif /* USE_MATE_CACHE */
660 
661 #if 0
662 static rc_t OpenVTable( VDatabase const *db, STable* tbl, char const *name, bool optional )
663 {
664     rc_t rc = VDatabaseOpenTableRead( db, &tbl->vtbl, "%s", name );
665     if ( GetRCState( rc ) == rcNotFound && optional )
666     {
667         rc = 0;
668         tbl->vtbl = NULL;
669     }
670     tbl->name = name;
671     return rc;
672 }
673 #endif
674 
Cursor_Open(STable const * const tbl,SCurs * const curs,SCol cols[],SCursCache * cache)675 static rc_t Cursor_Open( STable const *const tbl, SCurs *const curs, SCol cols[], SCursCache* cache )
676 {
677     rc_t rc = 0;
678     unsigned i;
679 
680     curs->vcurs = NULL;
681     if ( tbl == NULL || tbl->vtbl == NULL )
682         return 0;
683 
684     rc = VTableCreateCachedCursorRead( tbl->vtbl, &curs->vcurs, CURSOR_CACHE );
685     if ( rc != 0 )
686         return rc;
687 
688     rc = VCursorPermitPostOpenAdd( curs->vcurs );
689     if ( rc != 0 )
690         return rc;
691 
692     if ( rc == 0 )
693     {
694         rc = VCursorOpen( curs->vcurs );
695         if ( rc == 0 )
696         {
697 #if USE_MATE_CACHE
698             if ( cache != NULL )
699             {
700                 curs->cache = cache;
701             }
702             else
703             {
704                 curs->cache = &curs->cache_local;
705                 rc = Cache_Init( &curs->cache_local );
706             }
707 #endif /* USE_MATE_CACHE */
708             curs->tbl = tbl;
709         }
710     }
711 
712     for ( i = 0; cols[ i ].name != NULL; ++i )
713     {
714         if ( cols[ i ].name[ 0 ] == 0 )
715             continue;
716         rc = VCursorAddColumn( curs->vcurs, &cols[ i ].idx, "%s", cols[ i ].name );
717         if ( GetRCObject( rc ) == ( enum RCObject ) rcColumn )
718         {
719             switch ( GetRCState( rc ) )
720             {
721             case rcNotFound:
722             case rcUndefined:
723                 if ( !cols[ i ].optional )
724                     break;
725             case rcExists:
726                 rc = 0;
727             default:
728                 break;
729             }
730         }
731         if ( rc != 0 )
732         {
733             (void)PLOGERR( klogErr, ( klogErr, rc, "table $(t) column $(c)", "t=%s,c=%s", tbl->name, cols[ i ].name ) );
734             break;
735         }
736     }
737     if ( rc != 0 )
738     {
739         VCursorRelease( curs->vcurs );
740         curs->vcurs = NULL;
741         if ( rc != KLogLastErrorCode() )
742         {
743             (void)PLOGERR( klogErr, ( klogErr, rc, "table $(t)", "t=%s", tbl->name ) );
744         }
745     }
746     else
747     {
748         SAM_DUMP_DBG( 2, ( "%s: table %s\n", __func__, curs->tbl->name ) );
749     }
750     return rc;
751 }
752 
753 
Cursor_Close(SCurs * curs)754 static void Cursor_Close( SCurs* curs )
755 {
756     if ( curs != NULL && curs->vcurs != NULL )
757     {
758         SAM_DUMP_DBG( 2, ( "%s: table %s, columns rows read %lu\n", __func__, curs->tbl->name, curs->col_reads_qty ) );
759         VCursorRelease( curs->vcurs );
760 #if USE_MATE_CACHE
761         if ( curs->cache == &curs->cache_local )
762         {
763             Cache_Close( curs->tbl->name, curs->cache );
764         }
765 #endif /* USE_MATE_CACHE */
766         memset( curs, 0, sizeof( *curs ) );
767     }
768 }
769 
770 
Cursor_Read(DataSource * ds,int64_t row_id,int firstCol,unsigned nCols)771 static rc_t Cursor_Read( DataSource *ds, int64_t row_id, int firstCol, unsigned nCols )
772 {
773     rc_t rc = 0;
774 
775     if ( ds->curs.vcurs == NULL )
776     {
777         rc = Cursor_Open( &ds->tbl, &ds->curs, ds->cols, ds->curs.cache );
778         if ( rc != 0 )
779             return rc;
780     }
781     if (1)
782     {
783         SCol *const col = &ds->cols[ firstCol ];
784         unsigned i;
785 
786         for ( i = 0; i < nCols && col[ i ].name; ++i )
787         {
788             uint32_t const idx = col[ i ].idx;
789 
790             if ( idx != 0 )
791             {
792                 uint32_t elem_bits;
793                 uint32_t bit_offset;
794                 uint32_t elem_count;
795                 void const *base;
796 
797                 rc = VCursorCellDataDirect( ds->curs.vcurs, row_id, idx, &elem_bits, &base, &bit_offset, &elem_count );
798                 if ( rc != 0 )
799                 {
800                     (void)PLOGERR( klogWarn, ( klogWarn, rc, "reading $(t) row $(r), column $(c)", "t=%s,r=%li,c=%s",
801                                                ds->tbl.name, row_id, col[ i ].name ) );
802                     return rc;
803                 }
804 
805                 assert( bit_offset == 0 );
806                 assert( elem_bits % 8 == 0 );
807 
808                 col[ i ].base.v = base;
809                 col[ i ].len = elem_count;
810             }
811         }
812     }
813     return rc;
814 }
815 
816 struct
817 {
818     KWrtWriter writer;
819     void* data;
820     KFile* kfile;
821     uint64_t pos;
822 } g_out_writer = {NULL};
823 
824 
BufferedWriter(void * const self,char const buffer[],size_t const bufsize,size_t * const pnum_writ)825 static rc_t CC BufferedWriter( void *const self, char const buffer[], size_t const bufsize, size_t *const pnum_writ )
826 {
827     rc_t rc = 0;
828     size_t written = 0;
829 
830     assert( buffer != NULL );
831 
832     while ( written < bufsize )
833     {
834         size_t n;
835 
836         rc = KFileWrite( g_out_writer.kfile, g_out_writer.pos + written, &buffer[ written ], bufsize - written, &n );
837         if ( rc != 0 )
838             break;
839         written += n;
840     }
841     g_out_writer.pos += written;
842     if ( pnum_writ != NULL )
843         *pnum_writ = written;
844     return rc;
845 }
846 
847 
BufferedWriterMake(bool gzip,bool bzip2)848 static rc_t BufferedWriterMake( bool gzip, bool bzip2 )
849 {
850     rc_t rc = 0;
851 
852     if ( gzip && bzip2 )
853     {
854         rc = RC( rcApp, rcFile, rcConstructing, rcParam, rcAmbiguous );
855     }
856     else if ( g_out_writer.writer != NULL )
857     {
858         rc = RC( rcApp, rcFile, rcConstructing, rcParam, rcAmbiguous );
859     }
860     else
861     {
862         rc = KFileMakeStdOut( &g_out_writer.kfile );
863         if ( rc == 0 )
864         {
865             g_out_writer.pos = 0;
866             if ( gzip )
867             {
868                 KFile* gz;
869                 rc = KFileMakeGzipForWrite( &gz, g_out_writer.kfile );
870                 if ( rc == 0 )
871                 {
872                     KFileRelease( g_out_writer.kfile );
873                     g_out_writer.kfile = gz;
874                 }
875             }
876             else if ( bzip2 )
877             {
878                 KFile* bz;
879                 rc = KFileMakeBzip2ForWrite( &bz, g_out_writer.kfile );
880                 if ( rc == 0 )
881                 {
882                     KFileRelease( g_out_writer.kfile );
883                     g_out_writer.kfile = bz;
884                 }
885             }
886             if ( rc == 0 )
887             {
888                 KFile* buf;
889                 rc = KBufFileMakeWrite( &buf, g_out_writer.kfile, false, 128 * 1024 );
890                 if ( rc == 0 )
891                 {
892                     KFileRelease( g_out_writer.kfile );
893                     g_out_writer.kfile = buf;
894                     g_out_writer.writer = KOutWriterGet();
895                     g_out_writer.data = KOutDataGet();
896                     rc = KOutHandlerSet( BufferedWriter, &g_out_writer );
897                 }
898             }
899         }
900     }
901     return rc;
902 }
903 
904 
BufferedWriterRelease(bool flush)905 static void BufferedWriterRelease( bool flush )
906 {
907     if ( flush )
908     {
909         /* avoid flushing buffered data after failure */
910         KFileRelease( g_out_writer.kfile );
911     }
912     if ( g_out_writer.writer != NULL )
913     {
914         KOutHandlerSet( g_out_writer.writer, g_out_writer.data );
915     }
916     g_out_writer.writer = NULL;
917 }
918 
919 
920 typedef struct ReadGroup
921 {
922     BSTNode node;
923     char name[ 1024 ];
924 } ReadGroup;
925 
926 
ReadGroup_sort(BSTNode const * item,BSTNode const * node)927 static int64_t CC ReadGroup_sort( BSTNode const *item, BSTNode const *node )
928 {
929     return strcmp( ( ( ReadGroup const * )item )->name, ( ( ReadGroup const * ) node )->name );
930 }
931 
932 
ReadGroup_print(char const * nm)933 static rc_t ReadGroup_print( char const *nm )
934 {
935     rc_t rc = 0;
936     if ( nm[ 0 ] != '\0' && strcasecmp( nm, "default" ) )
937     {
938         rc = KOutMsg( "@RG\tID:%s\n", nm );
939     }
940     return rc;
941 }
942 
943 
ReadGroup_dump(BSTNode * n,void * data)944 static void CC ReadGroup_dump( BSTNode *n, void *data )
945 {
946     ReadGroup const *g = ( ReadGroup* )n;
947     ReadGroup_print( g->name );
948 }
949 
950 
DumpReadGroupsScan(STable const * tbl)951 static rc_t CC DumpReadGroupsScan( STable const *tbl )
952 {
953     SCol cols[] =
954     {
955         { "SPOT_GROUP", 0, {NULL}, 0, false },
956         { NULL, 0, {NULL}, 0, false }
957     };
958     rc_t rc = 0;
959     BSTree tree;
960     DataSource ds;
961 
962     memset( &ds, 0, sizeof( ds ) );
963     ds.cols = cols;
964 
965     BSTreeInit( &tree );
966     rc = Cursor_Open( tbl, &ds.curs, ds.cols, NULL );
967     if ( rc == 0 )
968     {
969         int64_t start;
970         uint64_t count;
971 
972         rc = VCursorIdRange( ds.curs.vcurs, 0, &start, &count );
973         if ( rc == 0 )
974         {
975             ReadGroup* node = NULL;
976             uint32_t last_len = 0;
977 
978             while ( count > 0 && rc == 0 )
979             {
980                 rc = Cursor_Read( &ds, start, 0, ~(unsigned)0 );
981                 if ( rc == 0 && cols[ 0 ].len != 0 )
982                 {
983                     if ( node == NULL ||
984                          last_len != cols[ 0 ].len ||
985                          strncmp( cols[ 0 ].base.str, node->name, cols[ 0 ].len ) != 0 )
986                     {
987                         node = malloc( sizeof( *node ) );
988                         if ( node == NULL )
989                         {
990                             rc = RC( rcExe, rcNode, rcConstructing, rcMemory, rcExhausted );
991                         }
992                         else if ( cols[ 0 ].len > sizeof( node->name ) )
993                         {
994                             rc = RC( rcExe, rcString, rcCopying, rcBuffer, rcInsufficient );
995                         }
996                         else
997                         {
998                             last_len = cols[ 0 ].len;
999                             string_copy( node->name, ( sizeof node->name ) - 1, cols[ 0 ].base.str, last_len );
1000                             node->name[ last_len ] = '\0';
1001                             rc = BSTreeInsertUnique( &tree, &node->node, NULL, ReadGroup_sort );
1002                             if ( GetRCState( rc ) == rcExists )
1003                             {
1004                                 free( node );
1005                                 rc = 0;
1006                             }
1007                         }
1008                     }
1009                 }
1010                 else if ( GetRCState( rc ) == rcNotFound && GetRCObject( rc ) == rcRow )
1011                 {
1012                     rc = 0;
1013                 }
1014                 start++;
1015                 count--;
1016             }
1017         }
1018         Cursor_Close( &ds.curs );
1019     }
1020 
1021     if ( rc == 0 )
1022     {
1023         BSTreeForEach( &tree, false, ReadGroup_dump, NULL );
1024     }
1025     else if ( rc != KLogLastErrorCode() )
1026     {
1027         (void)PLOGERR( klogErr, ( klogErr, rc, "$(f)", "f=%s", __func__ ) );
1028     }
1029     BSTreeWhack( &tree, NULL, NULL );
1030     return rc;
1031 }
1032 
1033 
DumpReadGroups(STable const * tbl)1034 rc_t CC DumpReadGroups( STable const *tbl )
1035 {
1036     rc_t rc = 0;
1037     KMetadata const *m;
1038 
1039     /* try getting list from stats meta */
1040     rc = VTableOpenMetadataRead( tbl->vtbl, &m );
1041     if ( rc == 0 )
1042     {
1043         KMDataNode const *n;
1044         rc = KMetadataOpenNodeRead( m, &n, "/STATS/SPOT_GROUP" );
1045         if ( rc == 0 )
1046         {
1047             KNamelist* names;
1048             rc = KMDataNodeListChild( n, &names );
1049             if ( rc == 0 )
1050             {
1051                 uint32_t i, q;
1052                 rc = KNamelistCount( names, &q );
1053                 if ( rc == 0 )
1054                 {
1055                     for ( i = 0; rc == 0 && i < q; i++ )
1056                     {
1057                         char const *nm;
1058                         rc = KNamelistGet( names, i, &nm );
1059                         if ( rc == 0 )
1060                         {
1061                             rc = ReadGroup_print( nm );
1062                         }
1063                     }
1064                 }
1065                 KNamelistRelease( names );
1066             }
1067             KMDataNodeRelease( n );
1068         }
1069         KMetadataRelease( m );
1070     }
1071 
1072     if ( GetRCState( rc ) == rcNotFound )
1073     {
1074         rc = DumpReadGroupsScan( tbl );
1075     }
1076     else if ( rc != 0 && rc != KLogLastErrorCode() )
1077     {
1078         (void)PLOGERR( klogErr, ( klogErr, rc, "$(f)", "f=%s", __func__ ) );
1079     }
1080     return rc;
1081 }
1082 
Cursor_ReadAlign(SCurs const * curs,int64_t row_id,SCol * cols,uint32_t idx)1083 static rc_t Cursor_ReadAlign( SCurs const *curs, int64_t row_id, SCol* cols, uint32_t idx )
1084 {
1085     rc_t rc = 0;
1086     SCol* c = NULL;
1087     SCol* mate_id = NULL;
1088 #if USE_MATE_CACHE
1089     uint64_t cache_val = 0;
1090     bool cache_miss = false;
1091 #endif /* USE_MATE_CACHE */
1092 
1093 
1094     for( ; rc == 0 && cols[ idx ].name != NULL; idx++ )
1095     {
1096         c = &cols[ idx ];
1097         if ( c->idx != 0 )
1098         {
1099 #if USE_MATE_CACHE
1100             if ( mate_id != NULL && curs->cache != NULL && !cache_miss &&
1101                 ( idx == alg_SAM_FLAGS || idx == alg_MATE_REF_NAME || idx == alg_MATE_REF_POS || idx == alg_TEMPLATE_LEN ) &&
1102                 mate_id->idx && mate_id->len > 0 && mate_id->base.i64[ 0 ] > 0 )
1103             {
1104                 if ( cache_val != 0 )
1105                 {
1106                     continue;
1107                 }
1108                 rc = Cache_Get( curs, mate_id->base.u64[ 0 ], &cache_val );
1109                 if ( rc == 0 )
1110                 {
1111                     continue;
1112                 }
1113                 else if ( !( GetRCObject( rc ) == rcItem && GetRCState( rc ) == rcNotFound ) )
1114                 {
1115                     break;
1116                 }
1117                 else
1118                 {
1119                     /* avoid multiple lookups in cache */
1120                     cache_miss = true;
1121                 }
1122             }
1123 #endif /* USE_MATE_CACHE */
1124             rc = VCursorCellDataDirect( curs->vcurs, row_id, c->idx, NULL, &c->base.v, NULL, &c->len );
1125             if ( rc == 0 )
1126             {
1127                 if ( idx == alg_SEQ_SPOT_ID && ( c->len  ==0 || c->base.i64[ 0 ] == 0 ) )
1128                 {
1129                     return RC( rcExe, rcColumn, rcReading, rcRow, rcNotFound );
1130                 }
1131                 if ( idx == alg_MATE_ALIGN_ID )
1132                 {
1133                     mate_id = &cols[ alg_MATE_ALIGN_ID ];
1134                 }
1135 #if _DEBUGGING
1136                 ( ( SCurs* )curs )->col_reads_qty++;
1137 #endif
1138             }
1139         }
1140         else
1141         {
1142             static INSDC_coord_zero readStart;
1143             static INSDC_coord_len  readLen;
1144             static INSDC_coord_len  cigarLen;
1145 
1146             switch ( (int)idx )
1147             {
1148             case alg_READ_START:
1149                 readStart = 0;
1150                 c->base.coord0 = &readStart;
1151                 c->len = 1;
1152                 break;
1153             case alg_READ_LEN:
1154                 readLen = cols[ alg_READ ].len;
1155                 c->base.coord_len = &readLen;
1156                 c->len = 1;
1157                 break;
1158             case alg_CIGAR_LEN:
1159                 cigarLen = cols[ alg_CIGAR ].len;
1160                 c->base.coord_len = &cigarLen;
1161                 c->len = 1;
1162                 break;
1163             }
1164         }
1165     }
1166 
1167     if ( rc != 0 )
1168     {
1169         (void)PLOGERR( klogWarn, ( klogWarn, rc, "reading $(t) row $(r), column $(c)", "t=%s,r=%li,c=%s",
1170             curs->tbl->name, row_id, c ? c->name : "<none>" ) );
1171 #if USE_MATE_CACHE
1172     }
1173     else if ( curs->cache == NULL )
1174     {
1175     }
1176     else if ( cache_val == 0 )
1177     {
1178         /* this row is not from cache */
1179         int64_t mate_align_id = ( mate_id != NULL && mate_id->len > 0 ) ? mate_id->base.i64[ 0 ] : 0;
1180         if ( cols[ 0 ].idx != 0 && /* we have full cursor which means we are reading alignment table */
1181             /* no mate -> unaligned (not secondary!) */
1182             ( ( mate_align_id == 0 && param->unaligned && curs->cache != &curs->cache_local ) ||
1183             /* 2nd mate with higher rowid and more than set gap rows away */
1184               ( mate_align_id != 0 && mate_align_id > row_id && ( mate_align_id - row_id) > param->mate_row_gap_cachable ) ) )
1185         {
1186           rc = Cache_Add( row_id, curs, cols );
1187         }
1188 	if(param->unaligned == true && mate_align_id == 0 ){
1189 	  rc = KVectorSetBool( curs->cache->cache_unaligned_mate, cols[alg_SEQ_SPOT_ID].base.i64[0], true );
1190         }
1191     }
1192     else
1193     {
1194         Cache_Unpack( cache_val, row_id, curs, cols );
1195 #endif /* USE_MATE_CACHE */
1196     }
1197     return rc;
1198 }
1199 
1200 
DumpName(char const * name,size_t name_len,const char spot_group_sep,char const * spot_group,size_t spot_group_len,int64_t spot_id)1201 static rc_t DumpName( char const *name, size_t name_len,
1202                       const char spot_group_sep, char const *spot_group,
1203                       size_t spot_group_len, int64_t spot_id )
1204 {
1205     rc_t rc = 0;
1206     if ( param->cg_friendly_names )
1207     {
1208         rc = KOutMsg( "%.*s-1:%lu", spot_group_len, spot_group, spot_id );
1209     }
1210     else
1211     {
1212         if ( param->name_prefix != NULL )
1213         {
1214             rc = KOutMsg( "%s.", param->name_prefix );
1215         }
1216         rc = BufferedWriter( NULL, name, name_len, NULL );
1217         if ( rc == 0 && param->spot_group_in_name && spot_group_len > 0 )
1218         {
1219             rc = BufferedWriter( NULL, &spot_group_sep, 1, NULL );
1220             if ( rc == 0 )
1221                 rc = BufferedWriter( NULL, spot_group, spot_group_len, NULL );
1222         }
1223     }
1224     return rc;
1225 }
1226 
1227 
DumpQuality(char const quality[],unsigned const count,bool const reverse,bool const quantize)1228 static rc_t DumpQuality( char const quality[], unsigned const count, bool const reverse, bool const quantize )
1229 {
1230     rc_t rc = 0;
1231     if ( quality == NULL )
1232     {
1233         unsigned i;
1234 
1235         for ( i = 0; rc == 0 && i < count; ++i )
1236         {
1237             char const newValue = ((param->qualQuant && param->qualQuantSingle)?param->qualQuantSingle:30) + 33;
1238             rc = BufferedWriter( NULL, &newValue, 1, NULL );
1239         }
1240     }
1241     else if ( reverse || quantize )
1242     {
1243         unsigned i;
1244 
1245         for ( i = 0; rc == 0 && i < count; ++i )
1246         {
1247             char const qual = quality[ reverse ? ( count - i - 1 ) : i ];
1248             char const newValue = quantize ? param->qualQuant[ qual - 33 ] + 33 : qual;
1249 
1250             rc = BufferedWriter( NULL, &newValue, 1, NULL );
1251         }
1252     }
1253     else
1254     {
1255         rc = BufferedWriter( NULL, quality, count, NULL );
1256     }
1257     return rc;
1258 }
1259 
1260 
DumpUnalignedFastX(const SCol cols[],uint32_t read_id,INSDC_coord_zero readStart,INSDC_coord_len readLen,int64_t row_id)1261 static rc_t DumpUnalignedFastX( const SCol cols[], uint32_t read_id, INSDC_coord_zero readStart, INSDC_coord_len readLen, int64_t row_id )
1262 {
1263     /* fast[AQ] represnted in SAM fields:
1264        [@|>]QNAME unaligned
1265        SEQ
1266        +
1267        QUAL
1268     */
1269     rc_t rc = BufferedWriter( NULL, param->fastq ? "@" : ">", 1, NULL );
1270 
1271     /* QNAME: [PFX.]SEQUENCE:NAME[#SPOT_GROUP] */
1272     if ( rc == 0 )
1273         rc = DumpName( cols[ seq_NAME ].base.str, cols[ seq_NAME ].len, '#',
1274                        cols[ seq_SPOT_GROUP ].base.str, cols[ seq_SPOT_GROUP ].len, row_id );
1275     if ( rc == 0 && read_id > 0 )
1276     {
1277         rc = KOutMsg( "/%u", read_id );
1278     }
1279     if ( rc == 0 )
1280         rc = BufferedWriter( NULL, " unaligned\n", 11, NULL );
1281 
1282     /* SEQ: SEQUENCE.READ */
1283     if ( rc == 0 )
1284         rc = BufferedWriter( NULL, &cols[ seq_READ ].base.str[readStart], readLen, NULL );
1285     if ( rc == 0 && param->fastq )
1286     {
1287         /* QUAL: SEQUENCE.QUALITY */
1288         rc = BufferedWriter( NULL, "\n+\n", 3, NULL );
1289         if ( rc == 0 )
1290             rc = DumpQuality( &cols[ seq_QUALITY ].base.str[ readStart ], readLen, false, param->quantizeQual );
1291     }
1292     if ( rc == 0 )
1293         rc = BufferedWriter( NULL, "\n", 1, NULL );
1294     return rc;
1295 }
1296 
1297 
DumpAlignedFastX(const SCol cols[],int64_t const alignId,uint32_t read_id,bool primary,bool secondary)1298 static rc_t DumpAlignedFastX( const SCol cols[], int64_t const alignId, uint32_t read_id, bool primary, bool secondary )
1299 {
1300     rc_t rc = 0;
1301     size_t nm;
1302     unsigned readId;
1303     unsigned const nreads = cols[ alg_READ_LEN ].len;
1304 
1305     for ( readId = 0; readId < nreads && rc == 0 ; ++readId )
1306     {
1307         char const *qname = cols[ alg_SEQ_NAME ].base.str;
1308         size_t qname_len = cols[ alg_SEQ_NAME ].len;
1309         char synth_qname[ 40 ];
1310         int64_t const spot_id = cols[alg_SEQ_SPOT_ID].len > 0 ? cols[alg_SEQ_SPOT_ID].base.i64[0] : 0;
1311         char const *const read = cols[ alg_READ ].base.str + cols[ alg_READ_START ].base.coord0[ readId ];
1312         char const *const qual = cols[ alg_SAM_QUALITY ].base.v
1313                                ? cols[ alg_SAM_QUALITY ].base.str + cols[ alg_READ_START ].base.coord0[ readId ]
1314                                : NULL;
1315         unsigned const readlen = cols[ alg_READ_LEN ].base.coord_len[ readId ];
1316 
1317         /* fast[AQ] represnted in SAM fields:
1318            [@|>]QNAME primary|secondary ref=RNAME pos=POS mapq=MAPQ
1319            SEQ
1320            +
1321            QUAL
1322         */
1323         rc = BufferedWriter( NULL, param->fastq ? "@" : ">", 1, NULL );
1324         /* QNAME: [PFX.]SEQ_NAME[#SPOT_GROUP] */
1325         if ( qname_len == 0 || qname == NULL )
1326         {
1327             string_printf( synth_qname, sizeof( synth_qname ), &qname_len, "%li.%u", alignId, readId + 1 );
1328             qname = synth_qname;
1329         }
1330         nm = cols[ alg_SPOT_GROUP ].len ? alg_SPOT_GROUP : alg_SEQ_SPOT_GROUP;
1331         if ( rc == 0 )
1332             rc = DumpName( qname, qname_len, '.', cols[ nm ].base.str, cols[ nm ].len, spot_id);
1333 
1334         if ( rc == 0 && read_id > 0 )
1335             rc = KOutMsg( "/%u", read_id );
1336 
1337         if ( rc == 0 )
1338         {
1339             if ( primary )
1340             {
1341                 rc = BufferedWriter( NULL, " primary", 8, NULL );
1342             }
1343             else if ( secondary )
1344             {
1345                 rc = BufferedWriter( NULL, " secondary", 10, NULL );
1346             }
1347         }
1348 
1349         /* RNAME: REF_NAME or REF_SEQ_ID */
1350         if ( rc == 0 )
1351             rc = BufferedWriter( NULL, " ref=", 5, NULL );
1352         if ( rc == 0 )
1353         {
1354             if ( param->use_seqid )
1355             {
1356                 rc = BufferedWriter( NULL, cols[ alg_REF_SEQ_ID ].base.str, cols[ alg_REF_SEQ_ID ].len, NULL );
1357             }
1358             else
1359             {
1360                 rc = BufferedWriter( NULL, cols[ alg_REF_NAME ].base.str, cols[ alg_REF_NAME ].len, NULL );
1361             }
1362         }
1363 
1364         /* POS: REF_POS, MAPQ: MAPQ */
1365         if ( rc == 0 )
1366             rc = KOutMsg( " pos=%u mapq=%i\n", cols[ alg_REF_POS ].base.coord0[ 0 ] + 1, cols[ alg_MAPQ ].base.i32[ 0 ] );
1367 
1368         /* SEQ: READ */
1369         if ( rc == 0 )
1370             rc = BufferedWriter( NULL, read, readlen, NULL );
1371         if ( rc == 0 && param->fastq )
1372         {
1373             /* QUAL: SAM_QUALITY */
1374             rc = BufferedWriter( NULL, "\n+\n", 3, NULL );
1375             if ( rc == 0 )
1376                 rc = DumpQuality( qual, readlen, false, param->quantizeQual );
1377         }
1378         if ( rc == 0 )
1379             rc = BufferedWriter( NULL, "\n", 1, NULL );
1380     }
1381     return rc;
1382 }
1383 
1384 
1385 static
DumpUnalignedSAM(const SCol cols[],uint32_t flags,INSDC_coord_zero readStart,INSDC_coord_len readLen,char const * rnext,uint32_t rnext_len,INSDC_coord_zero pnext,char const readGroup[],int64_t row_id)1386 rc_t DumpUnalignedSAM( const SCol cols[], uint32_t flags, INSDC_coord_zero readStart, INSDC_coord_len readLen,
1387                        char const *rnext, uint32_t rnext_len, INSDC_coord_zero pnext, char const readGroup[], int64_t row_id )
1388 {
1389     unsigned i;
1390 
1391     /* QNAME: [PFX.]NAME[.SPOT_GROUP] */
1392     rc_t rc = DumpName( cols[ seq_NAME ].base.str, cols[ seq_NAME ].len, '.',
1393               cols[ seq_SPOT_GROUP ].base.str, cols[ seq_SPOT_GROUP ].len, row_id );
1394 
1395     /* all these fields are const text for now */
1396     if ( rc == 0 )
1397         rc = KOutMsg( "\t%u\t*\t0\t0\t*\t%.*s\t%u\t0\t",
1398              flags, rnext_len ? rnext_len : 1, rnext_len ? rnext : "*", pnext );
1399     /* SEQ: SEQUENCE.READ */
1400     if ( flags & 0x10 )
1401     {
1402         for( i = 0; i < readLen && rc == 0; i++ )
1403         {
1404             char base;
1405 
1406             DNAReverseCompliment( &cols[ seq_READ ].base.str[ readStart + readLen - 1 - i ], &base, 1 );
1407             rc = BufferedWriter( NULL, &base, 1, NULL );
1408         }
1409     }
1410     else
1411     {
1412         rc = BufferedWriter( NULL, &cols[ seq_READ ].base.str[ readStart ], readLen, NULL );
1413     }
1414 
1415     if ( rc == 0 )
1416         rc = BufferedWriter( NULL, "\t", 1, NULL );
1417     /* QUAL: SEQUENCE.QUALITY */
1418     if ( rc == 0 )
1419         rc = DumpQuality( &cols[ seq_QUALITY ].base.str[ readStart ], readLen, flags & 0x10, param->quantizeQual );
1420 
1421     /* optional fields: */
1422     if ( rc == 0 )
1423     {
1424         if ( readGroup )
1425         {
1426             rc = BufferedWriter( NULL, "\tRG:Z:", 6, NULL );
1427             if ( rc == 0 )
1428                 rc = BufferedWriter( NULL, readGroup, string_size( readGroup ), NULL );
1429         }
1430         else if ( cols[ seq_SPOT_GROUP ].len > 0 )
1431         {
1432             /* read group */
1433             rc = BufferedWriter( NULL, "\tRG:Z:", 6, NULL );
1434             if ( rc == 0 )
1435                 rc = BufferedWriter( NULL, cols[ seq_SPOT_GROUP ].base.str, cols[ seq_SPOT_GROUP ].len, NULL );
1436         }
1437     }
1438     if ( rc == 0 )
1439         rc = BufferedWriter( NULL, "\n", 1, NULL );
1440     return rc;
1441 }
1442 
1443 
1444 static
DumpAlignedSAM(SAM_dump_ctx_t * const ctx,DataSource const * ds,int64_t alignId,char const readGroup[],int type)1445 rc_t DumpAlignedSAM( SAM_dump_ctx_t *const ctx,
1446                      DataSource const *ds,
1447                      int64_t alignId,
1448                      char const readGroup[],
1449                      int type)
1450 {
1451     rc_t rc = 0;
1452     unsigned const nreads = ds->cols[ alg_READ_LEN ].len;
1453     SCol const *const cols = ds->cols;
1454     int64_t const spot_id = cols[alg_SEQ_SPOT_ID].len > 0 ? cols[alg_SEQ_SPOT_ID].base.i64[0] : 0;
1455     INSDC_coord_one const read_id = cols[alg_SEQ_READ_ID].len > 0 ? cols[alg_SEQ_READ_ID].base.coord1[0] : 0;
1456     INSDC_SRA_read_filter const *align_filter = cols[alg_READ_FILTER].len == nreads ? cols[alg_READ_FILTER].base.read_filter : NULL;
1457     INSDC_SRA_read_filter seq_filter = 0;
1458     unsigned readId;
1459     unsigned cigOffset = 0;
1460 
1461     if ( align_filter == NULL && spot_id && read_id && ctx->seq.cols )
1462     {
1463         rc_t rc;
1464 
1465         rc = Cursor_Read(&ctx->seq, spot_id, seq_READ_FILTER, 1);
1466         if (rc == 0 && ctx->seq.cols[seq_READ_FILTER].len >= read_id)
1467             seq_filter = ctx->seq.cols[seq_READ_FILTER].base.read_filter[read_id - 1];
1468     }
1469 
1470     for ( readId = 0; readId < nreads && rc == 0 ; ++readId )
1471     {
1472         char const *qname = cols[ alg_SEQ_NAME ].base.str;
1473         size_t qname_len = cols[ alg_SEQ_NAME ].len;
1474         char const *const read = cols[ alg_READ ].base.str + cols[ alg_READ_START ].base.coord0[ readId ];
1475         char const *const qual = cols[ alg_SAM_QUALITY ].base.v
1476                                ? cols[ alg_SAM_QUALITY ].base.str + cols[ alg_READ_START ].base.coord0[ readId ]
1477                                : NULL;
1478         unsigned const readlen = nreads > 1 ? cols[ alg_READ_LEN ].base.coord_len[ readId ] : cols[ alg_READ ].len;
1479         unsigned const sflags = cols[ alg_SAM_FLAGS ].base.v ? cols[ alg_SAM_FLAGS ].base.u32[ readId ] : 0;
1480         INSDC_SRA_read_filter const filt = align_filter ? align_filter[readId] : seq_filter;
1481         unsigned const flags = (sflags & ~((unsigned)0x200)) | ((filt == SRA_READ_FILTER_REJECT) ? 0x200 : 0);
1482         char const *const cigar = cols[ alg_CIGAR ].base.str + cigOffset;
1483         unsigned const cigLen = nreads > 1 ? cols[ alg_CIGAR_LEN ].base.coord_len[ readId ] : cols[ alg_CIGAR ].len;
1484         size_t nm;
1485         char synth_qname[1024];
1486 
1487         cigOffset += cigLen;
1488         if ( qname_len == 0 || qname == NULL )
1489         {
1490             string_printf( synth_qname, sizeof( synth_qname ), &qname_len, "ALLELE_%li.%u", alignId, readId + 1 );
1491             qname = synth_qname;
1492         }
1493         else if (ds->type == edstt_EvidenceAlignment) {
1494             string_printf( synth_qname, sizeof( synth_qname ), &qname_len, "%u/ALLELE_%li.%u", spot_id, cols[ alg_REF_ID ].base.i64[ readId ], cols[ alg_REF_PLOIDY ].base.u32[ readId ] );
1495             qname = synth_qname;
1496         }
1497         nm = cols[ alg_SPOT_GROUP ].len ? alg_SPOT_GROUP : alg_SEQ_SPOT_GROUP;
1498         rc = DumpName( qname, qname_len, '.', cols[ nm ].base.str, cols[ nm ].len, spot_id );
1499 
1500         /* FLAG: SAM_FLAGS */
1501         if ( rc == 0 )
1502         {
1503             if ( ds->type == edstt_EvidenceAlignment )
1504             {
1505                 bool const cmpl = cols[alg_REVERSED].base.v && readId < cols[alg_REVERSED].len ? cols[alg_REVERSED].base.tf[readId] : false;
1506                 rc = KOutMsg( "\t%u\t", 1 | (cmpl ? 0x10 : 0) | (read_id == 1 ? 0x40 : 0x80) );
1507             }
1508             else if ( !param->unaligned      /** not going to dump unaligned **/
1509                  && ( flags & 0x1 )     /** but we have sequenced multiple fragments **/
1510                  && ( flags & 0x8 ) )   /** and not all of them align **/
1511             {
1512                 /*** remove flags talking about multiple reads **/
1513                 /* turn off 0x001 0x008 0x040 0x080 */
1514                 rc = KOutMsg( "\t%u\t", flags & ~0xC9 );
1515             }
1516             else
1517             {
1518                 rc = KOutMsg( "\t%u\t", flags );
1519             }
1520         }
1521 
1522         if ( rc == 0 )
1523         {
1524             if ( ds->type == edstt_EvidenceAlignment && type == 0 )
1525             {
1526                 rc = KOutMsg( "ALLELE_%li.%u", cols[ alg_REF_ID ].base.i64[ readId ], cols[ alg_REF_PLOIDY ].base.u32[ readId ] );
1527             }
1528             else
1529             {
1530                 /* RNAME: REF_NAME or REF_SEQ_ID */
1531                 if ( param->use_seqid )
1532                     rc = BufferedWriter( NULL, cols[ alg_REF_SEQ_ID ].base.str, cols[ alg_REF_SEQ_ID ].len, NULL );
1533                 else
1534                     rc = BufferedWriter( NULL, cols[ alg_REF_NAME ].base.str, cols[ alg_REF_NAME ].len, NULL );
1535             }
1536         }
1537 
1538         if ( rc == 0 )
1539             rc = BufferedWriter( NULL, "\t", 1, NULL );
1540 
1541         /* POS: REF_POS */
1542         if ( rc == 0 )
1543             rc = KOutMsg( "%i\t", cols[ alg_REF_POS ].base.coord0[ 0 ] + 1 );
1544 
1545         /* MAPQ: MAPQ */
1546         if ( rc == 0 )
1547             rc = KOutMsg( "%i\t", cols[ alg_MAPQ ].base.i32[ 0 ] );
1548 
1549         /* CIGAR: CIGAR_* */
1550         if ( ds->type == edstt_EvidenceInterval )
1551         {
1552             unsigned i;
1553 
1554             for ( i = 0; i != cigLen && rc == 0; ++i )
1555             {
1556                 char ch = cigar[i];
1557                 if ( ch == 'S' ) ch = 'I';
1558                 rc = BufferedWriter(NULL, &ch, 1, NULL);
1559             }
1560         }
1561 	else if(ds->type == edstt_EvidenceAlignment)
1562 	{
1563 		rc = cg_canonical_print_cigar(cigar,cigLen);
1564 	}
1565         else
1566         {
1567             if ( rc == 0 )
1568                 rc = BufferedWriter( NULL, cigar, cigLen, NULL );
1569         }
1570 
1571         if ( rc == 0 )
1572             rc = BufferedWriter( NULL, "\t", 1, NULL );
1573 
1574         /* RNEXT: MATE_REF_NAME or '*' */
1575         /* PNEXT: MATE_REF_POS or 0 */
1576         if ( rc == 0 )
1577         {
1578             if ( cols[ alg_MATE_REF_NAME ].len )
1579             {
1580                 if ( cols[ alg_MATE_REF_NAME ].len == cols[ alg_REF_NAME ].len &&
1581                     memcmp( cols[ alg_MATE_REF_NAME ].base.str, cols[ alg_REF_NAME ].base.str, cols[ alg_MATE_REF_NAME ].len ) == 0 )
1582                 {
1583                     rc = BufferedWriter( NULL, "=\t", 2, NULL );
1584                 }
1585                 else
1586                 {
1587                     rc = BufferedWriter( NULL, cols[ alg_MATE_REF_NAME ].base.str, cols[ alg_MATE_REF_NAME ].len, NULL );
1588                     if ( rc == 0 )
1589                         rc = BufferedWriter( NULL, "\t", 1, NULL );
1590                 }
1591                 if ( rc == 0 )
1592                     rc = KOutMsg( "%u\t", cols[ alg_MATE_REF_POS ].base.coord0[ 0 ] + 1 );
1593             }
1594             else
1595             {
1596                 rc = BufferedWriter( NULL, "*\t0\t", 4, NULL );
1597             }
1598         }
1599 
1600         /* TLEN: TEMPLATE_LEN */
1601         if ( rc == 0 )
1602             rc = KOutMsg( "%i\t", cols[ alg_TEMPLATE_LEN ].base.v ? cols[ alg_TEMPLATE_LEN ].base.i32[ 0 ] : 0 );
1603 
1604         /* SEQ: READ */
1605         if ( rc == 0 )
1606             rc = BufferedWriter( NULL, read, readlen, NULL );
1607         if ( rc == 0 )
1608             rc = BufferedWriter( NULL, "\t", 1, NULL );
1609 
1610         /* QUAL: SAM_QUALITY */
1611         if ( rc == 0 )
1612             rc = DumpQuality( qual, readlen, false, param->quantizeQual );
1613 
1614         /* optional fields: */
1615         if ( rc == 0 && ds->type == edstt_EvidenceInterval )
1616             rc = KOutMsg( "\tRG:Z:ALLELE_%u", readId + 1 );
1617 
1618         if ( rc == 0 )
1619         {
1620             if ( readGroup )
1621             {
1622                 rc = BufferedWriter( NULL, "\tRG:Z:", 6, NULL );
1623                 if ( rc == 0 )
1624                     rc = BufferedWriter( NULL, readGroup, string_size( readGroup ), NULL );
1625             }
1626             else if ( cols[ alg_SPOT_GROUP ].len > 0 )
1627             {
1628                 /* read group */
1629                 rc = BufferedWriter( NULL, "\tRG:Z:", 6, NULL );
1630                 if ( rc == 0 )
1631                     rc = BufferedWriter( NULL, cols[ alg_SPOT_GROUP ].base.str, cols[ alg_SPOT_GROUP ].len, NULL );
1632             }
1633             else if ( cols[ alg_SEQ_SPOT_GROUP ].len > 0 )
1634             {
1635                 /* backward compatibility */
1636                 rc = BufferedWriter( NULL, "\tRG:Z:", 6, NULL );
1637                 if ( rc == 0 )
1638                     rc = BufferedWriter( NULL, cols[ alg_SEQ_SPOT_GROUP ].base.str, cols[ alg_SEQ_SPOT_GROUP ].len, NULL );
1639             }
1640         }
1641 
1642         if ( rc == 0 && param->cg_style > 0 && cols[ alg_CG_TAGS_STR ].len > 0 )
1643             rc = BufferedWriter( NULL, cols[ alg_CG_TAGS_STR ].base.str, cols[ alg_CG_TAGS_STR ].len, NULL );
1644 
1645         if ( rc == 0 )
1646         {
1647             if ( param->cg_style > 0 && cols[ alg_ALIGN_GROUP ].len > 0 )
1648             {
1649                 char const *ZI = cols[ alg_ALIGN_GROUP ].base.str;
1650                 unsigned i;
1651 
1652                 for ( i = 0; rc == 0 && i < cols[ alg_ALIGN_GROUP ].len - 1; ++i )
1653                 {
1654                     if ( ZI[ i ] == '_' )
1655                     {
1656                         rc = KOutMsg( "\tZI:i:%.*s\tZA:i:%.1s", i, ZI, ZI + i + 1 );
1657                         break;
1658                     }
1659                 }
1660             }
1661             else if ( ds->type == edstt_EvidenceAlignment && type == 1 )
1662             {
1663                 rc = KOutMsg( "\tZI:i:%li\tZA:i:%u", cols[ alg_REF_ID ].base.i64[ readId ], cols[ alg_REF_PLOIDY ].base.u32[ readId ] );
1664             }
1665         }
1666 
1667         /* align id */
1668         if ( rc == 0 && param->xi )
1669             rc = KOutMsg( "\tXI:i:%li", alignId );
1670 
1671         /* hit count */
1672         if ( rc == 0 && cols[alg_ALIGNMENT_COUNT].len )
1673             rc = KOutMsg( "\tNH:i:%i", (int)cols[ alg_ALIGNMENT_COUNT ].base.u8[ readId ] );
1674 
1675         /* edit distance */
1676         if ( rc == 0 && cols[ alg_EDIT_DISTANCE ].len )
1677             rc = KOutMsg( "\tNM:i:%i", cols[ alg_EDIT_DISTANCE ].base.i32[ readId ] );
1678 
1679         if ( rc == 0 )
1680             rc = KOutMsg( "\n" );
1681     }
1682     return rc;
1683 }
1684 
1685 
DumpUnalignedReads(SAM_dump_ctx_t * const ctx,SCol const calg_col[],int64_t row_id,uint64_t * rcount)1686 static rc_t DumpUnalignedReads( SAM_dump_ctx_t *const ctx, SCol const calg_col[], int64_t row_id, uint64_t* rcount )
1687 {
1688     rc_t rc = 0;
1689     uint32_t i, nreads = 0;
1690 
1691     if ( calg_col != NULL )
1692     {
1693         if ( calg_col[ alg_SAM_FLAGS ].base.u32[ 0 ] & 0x02 )
1694         {
1695             /* skip all aligned rows by flag */
1696             return rc;
1697         }
1698         /* get primary alignments only */
1699         rc = Cursor_Read( &ctx->seq, calg_col[ alg_SEQ_SPOT_ID ].base.i64[ 0 ], seq_PRIMARY_ALIGNMENT_ID, ~(unsigned)0 );
1700         if ( rc == 0 )
1701         {
1702             for ( i = 0; i < ctx->seq.cols[ seq_PRIMARY_ALIGNMENT_ID ].len; i++ )
1703             {
1704                 if ( ctx->seq.cols[ seq_PRIMARY_ALIGNMENT_ID ].base.i64[ i ] != 0 )
1705                 {
1706                     if ( ctx->seq.cols[ seq_PRIMARY_ALIGNMENT_ID ].base.i64[ i ] < row_id )
1707                     {
1708                         /* unaligned were printed with 1st aligment */
1709                         return rc;
1710                     }
1711                 }
1712                 else
1713                 {
1714                     nreads++;
1715                 }
1716             }
1717             if ( nreads == ctx->seq.cols[ seq_PRIMARY_ALIGNMENT_ID ].len )
1718             {
1719                 /* skip all aligned rows by actual data, if flag above is not set properly */
1720                 return rc;
1721             }
1722             row_id = calg_col[ alg_SEQ_SPOT_ID ].base.i64[ 0 ];
1723         }
1724     }
1725     if ( rc == 0 )
1726     {
1727         rc = Cursor_Read( &ctx->seq, row_id, 0, ~(unsigned)0 );
1728         if ( rc == 0 )
1729         {
1730             unsigned non_empty_reads = 0;
1731 
1732             nreads = ctx->seq.cols[ seq_READ_LEN ].idx != 0 ? ctx->seq.cols[ seq_READ_LEN ].len : 1;
1733 
1734             for( i = 0; i < nreads; i++ ) {
1735                 unsigned const readLen = ctx->seq.cols[ seq_READ_LEN ].idx
1736                                        ? ctx->seq.cols[ seq_READ_LEN ].base.coord_len[ i ]
1737                                        : 0;
1738 
1739                 if (readLen)
1740                     ++non_empty_reads;
1741             }
1742             for( i = 0; i < nreads; i++ )
1743             {
1744                 INSDC_coord_zero readStart;
1745                 INSDC_coord_len readLen;
1746 
1747                 if ( ctx->seq.cols[ seq_PRIMARY_ALIGNMENT_ID ].idx != 0 &&
1748                      ctx->seq.cols[ seq_PRIMARY_ALIGNMENT_ID ].base.i64[ i ] != 0 )
1749                 {
1750                     continue;
1751                 }
1752 
1753                 if ( ctx->seq.cols[ seq_READ_TYPE ].idx != 0 &&
1754                      !( ctx->seq.cols[ seq_READ_TYPE ].base.read_type[ i ] & SRA_READ_TYPE_BIOLOGICAL ) )
1755                 {
1756                     continue;
1757                 }
1758 
1759                 readLen = ctx->seq.cols[ seq_READ_LEN ].idx ?
1760                             ctx->seq.cols[ seq_READ_LEN ].base.coord_len[ i ] :
1761                             ctx->seq.cols[ seq_READ ].len;
1762                 if ( readLen == 0 )
1763                 {
1764                     continue;
1765                 }
1766 
1767                 readStart = ctx->seq.cols[ seq_READ_START ].idx ?
1768                                 ctx->seq.cols[ seq_READ_START ].base.coord0[ i ] :
1769                                 0;
1770                 if ( param->fasta || param->fastq )
1771                 {
1772                     rc = DumpUnalignedFastX( ctx->seq.cols, nreads > 1 ? i + 1 : 0, readStart, readLen, row_id );
1773                 }
1774                 else
1775                 {
1776                     uint32_t cflags = 0x4;
1777                     if ( param->reverse_unaligned )
1778                     {
1779                         if ( ctx->seq.cols[ seq_READ_TYPE ].base.read_type[ i ] & SRA_READ_TYPE_REVERSE )
1780                         {
1781                             cflags |= 0x10;
1782                         }
1783                         if ( ctx->seq.cols[ seq_READ_TYPE ].base.read_type[ i == nreads - 1 ? 0 : ( i + 1 ) ] & SRA_READ_TYPE_REVERSE )
1784                         {
1785                             cflags |= 0x20;
1786                         }
1787                     }
1788                     if ( ctx->seq.cols[ seq_READ_FILTER ].idx != 0 )
1789                     {
1790                         if ( ctx->seq.cols[ seq_READ_FILTER ].base.read_filter[ i ] == SRA_READ_FILTER_REJECT )
1791                         {
1792                             cflags |= 0x200;
1793                         }
1794                         else if ( ctx->seq.cols[ seq_READ_FILTER ].base.read_filter[ i ] == SRA_READ_FILTER_CRITERIA )
1795                         {
1796                             cflags |= 0x400;
1797                         }
1798                     }
1799                     if ( calg_col == NULL )
1800                     {
1801                         rc = DumpUnalignedSAM( ctx->seq.cols, cflags |
1802                                           ( non_empty_reads > 1 ? ( 0x1 | 0x8 | ( i == 0 ? 0x40 : 0x00 ) | ( i == nreads - 1 ? 0x80 : 0x00 ) ) : 0x00 ),
1803                                           readStart, readLen, NULL, 0, 0, ctx->readGroup, row_id );
1804                     }
1805                     else
1806                     {
1807                         int c = param->use_seqid ? alg_REF_SEQ_ID : alg_REF_NAME;
1808                         uint16_t flags = cflags | 0x1 |
1809                                          ( ( calg_col[ alg_SAM_FLAGS ].base.u32[ 0 ] & 0x10 ) << 1 ) |
1810                                          ( ( calg_col[ alg_SAM_FLAGS ].base.u32[ 0 ] & 0x40 ) ? 0x80 : 0x40 );
1811                         rc = DumpUnalignedSAM( ctx->seq.cols, flags, readStart, readLen,
1812                                           calg_col[ c ].base.str, calg_col[ c ].len,
1813                                           calg_col[ alg_REF_POS ].base.coord0[ 0 ] + 1, ctx->readGroup, row_id );
1814                     }
1815                 }
1816                 *rcount = *rcount + 1;
1817             }
1818         }
1819     }
1820     return rc;
1821 }
1822 
1823 
AlignRegionFilter(SCol const * cols)1824 static bool AlignRegionFilter( SCol const *cols )
1825 {
1826     if ( param->region_qty == 0 )
1827         return true;
1828     if ( cols[ alg_REF_NAME ].len != 0 || cols[ alg_REF_SEQ_ID ].len != 0 )
1829     {
1830         unsigned i;
1831 
1832         assert( cols[ alg_REF_POS ].len == cols[ alg_REF_LEN ].len );
1833 
1834         for ( i = 0; i < param->region_qty; i++ )
1835         {
1836             unsigned j;
1837 
1838             for ( j = 0; j < cols[ alg_REF_POS ].len; j++ )
1839             {
1840                 unsigned const refStart_zero = cols[ alg_REF_POS ].base.coord0[ j ];
1841                 unsigned const refEnd_exclusive = refStart_zero + cols[ alg_REF_LEN ].base.coord_len[ j ];
1842                 unsigned k;
1843 
1844                 for ( k = 0; k < param->region[ i ].rq; k++ )
1845                 {
1846                     unsigned const from_zero = param->region[ i ].r[ k ].from;
1847                     unsigned const to_inclusive = param->region[ i ].r[ k ].to;
1848 
1849                     if ( from_zero < refEnd_exclusive && refStart_zero <= to_inclusive )
1850                         return true;
1851                 }
1852             }
1853         }
1854     }
1855     return false;
1856 }
1857 
1858 
AlignDistanceFilter(SCol const * cols)1859 static bool AlignDistanceFilter( SCol const *cols )
1860 {
1861     if ( param->mp_dist_qty != 0 || param->mp_dist_unknown )
1862     {
1863         if ( cols[ alg_TEMPLATE_LEN ].len == 0 && param->mp_dist_unknown )
1864         {
1865             return true;
1866         }
1867         else
1868         {
1869             uint32_t i, j;
1870             for( i = 0; i < param->mp_dist_qty; i++ )
1871             {
1872                 for( j = 0; j < cols[ alg_TEMPLATE_LEN ].len; j++ )
1873                 {
1874                     if ( ( cols[ alg_TEMPLATE_LEN ].base.i32[ j ] == 0 && param->mp_dist_unknown ) ||
1875                          ( param->mp_dist[ i ].from <= cols[ alg_TEMPLATE_LEN ].base.i32[ j ] &&
1876                          cols[ alg_TEMPLATE_LEN ].base.i32[ j ] <= param->mp_dist[ i ].to ) )
1877                     {
1878                         return true;
1879                     }
1880                 }
1881             }
1882         }
1883         return false;
1884     }
1885     return true;
1886 }
1887 
1888 
1889 typedef struct cgOp_s
1890 {
1891     uint16_t length;
1892     uint8_t type; /* 0: match, 1: insert, 2: delete */
1893     char code;
1894 } cgOp;
1895 
1896 
print_CG_cigar(int line,cgOp const op[],unsigned const ops,unsigned const gap[3])1897 static void print_CG_cigar( int line, cgOp const op[], unsigned const ops, unsigned const gap[ 3 ] )
1898 {
1899 #if _DEBUGGING
1900     unsigned i;
1901 
1902     SAM_DUMP_DBG( 3, ( "%5i: ", line ) );
1903     for ( i = 0; i < ops; ++i )
1904     {
1905         if ( gap && ( i == gap[ 0 ] || i == gap[ 1 ] || i == gap[ 2 ] ) )
1906         {
1907             SAM_DUMP_DBG( 3, ( "%u%c", op[ i ].length, tolower( op[ i ].code ) ) );
1908         }
1909         else
1910         {
1911             SAM_DUMP_DBG( 3, ( "%u%c", op[ i ].length, toupper( op[ i ].code ) ) );
1912         }
1913     }
1914     SAM_DUMP_DBG( 3, ( "\n" ) );
1915 #endif
1916 }
1917 
1918 /* gap contains the indices of the wobbles in op
1919  * gap[0] is between read 1 and 2; it is the 'B'
1920  * gap[1] is between read 2 and 3; it is an 'N'
1921  * gap[2] is between read 3 and 4; it is an 'N'
1922  */
1923 
CIGAR_to_CG_Ops(cgOp op[],unsigned const maxOps,unsigned * const opCnt,unsigned gap[3],char const cigar[],unsigned const ciglen,unsigned * const S_adjust,unsigned * const CG_adjust,unsigned const read,bool const reversed,bool * has_ref_offset_type)1924 static rc_t CIGAR_to_CG_Ops( cgOp op[], unsigned const maxOps,
1925                              unsigned *const opCnt,
1926                              unsigned gap[ 3 ],
1927                              char const cigar[], unsigned const ciglen,
1928                              unsigned *const S_adjust,
1929                              unsigned *const CG_adjust,
1930                              unsigned const read,
1931                              bool const reversed,
1932 			     bool * has_ref_offset_type)
1933 {
1934     unsigned i;
1935     unsigned ops = 0;
1936     unsigned gapno;
1937 
1938     *opCnt = 0;
1939     *S_adjust = 0;
1940     for ( i = 0; i < ciglen; ++ops )
1941     {
1942         char opChar=0;
1943         int opLen=0;
1944         int n;
1945 
1946 	for(n=0;n+i<ciglen && isdigit(cigar[n+i]);n++){
1947 		opLen = opLen * 10 + cigar[n+i] - '0';
1948 	}
1949 	if(n+i<ciglen){
1950 		opChar = cigar[n+i];
1951 		n++;
1952 	}
1953         if ( ops + 1 >= maxOps )
1954             return RC( rcExe, rcData, rcReading, rcBuffer, rcInsufficient );
1955 
1956         i += n;
1957 
1958         op[ ops ].length = opLen;
1959         op[ ops ].code = opChar;
1960         switch ( opChar )
1961         {
1962         case 'M':
1963         case '=':
1964         case 'X':
1965             op[ ops ].type = 0;
1966             break;
1967         case 'S':
1968             *S_adjust += opLen;
1969         case 'I':
1970             op[ ops ].type = 1;
1971             op[ ops ].code = 'I';
1972             break;
1973 	case 'N':
1974 	    *has_ref_offset_type=true;
1975         case 'D':
1976             op[ ops ].type = 2;
1977             break;
1978         default:
1979             return RC( rcExe, rcData, rcReading, rcConstraint, rcViolated );
1980         }
1981     }
1982     *opCnt = ops;
1983     gap[ 0 ] = gap[ 1 ] = gap[ 2 ] = ops;
1984     print_CG_cigar( __LINE__, op, ops, NULL );
1985     if ( ops < 3 )
1986         return RC( rcExe, rcData, rcReading, rcFormat, rcNotFound ); /* CG pattern not found */
1987 
1988     {
1989         unsigned fwd = 0; /* 5 10 10 10 */
1990         unsigned rev = 0; /* 10 10 10 5 */
1991         unsigned acc; /* accumulated length */
1992 
1993         if ( ( read == 1 && !reversed ) || ( read == 2 && reversed ) )
1994         {
1995             for ( i = 0, acc = 0; i < ops  && acc <=5; ++i )
1996             {
1997                 if ( op[ i ].type != 2 )
1998                 {
1999                     acc += op[ i ].length;
2000                     if ( acc == 5 && op[ i + 1 ].type == 1 )
2001                     {
2002                         fwd = i + 1;
2003                         break;
2004                     }
2005                     else if ( acc > 5 )
2006                     {
2007                         unsigned const right = acc - 5;
2008                         unsigned const left = op[ i ].length - right;
2009 
2010                         memmove( &op[ i + 2 ], &op[ i ], ( ops - i ) * sizeof( op[ 0 ] ) );
2011                         ops += 2;
2012                         op[ i ].length = left;
2013                         op[ i + 2 ].length = right;
2014 
2015                         op[ i + 1 ].type = 1;
2016                         op[ i + 1 ].code = 'B';
2017                         op[ i + 1 ].length = 0;
2018 
2019                         fwd = i + 1;
2020                         break;
2021                     }
2022                 }
2023             }
2024         }
2025         else if ( ( read == 2 && !reversed ) || ( read == 1 && reversed ) )
2026         {
2027             for ( i = ops, acc = 0; i > 0 && acc <= 5; )
2028             {
2029                 --i;
2030                 if ( op[ i ].type != 2 )
2031                 {
2032                     acc += op[ i ].length;
2033                     if ( acc == 5 && op[ i ].type == 1 )
2034                     {
2035                         rev = i;
2036                         break;
2037                     }
2038                     else if ( acc > 5 )
2039                     {
2040                         unsigned const left = acc - 5;
2041                         unsigned const right = op[ i ].length - left;
2042 
2043                         memmove( &op[ i + 2 ], &op[ i ], ( ops - i ) * sizeof( op[ 0 ] ) );
2044                         ops += 2;
2045                         op[ i ].length = left;
2046                         op[ i + 2 ].length = right;
2047 
2048                         op[ i + 1 ].type = 1;
2049                         op[ i + 1 ].code = 'B';
2050                         op[ i + 1 ].length = 0;
2051 
2052                         rev = i + 1;
2053                         break;
2054                     }
2055                 }
2056             }
2057         }
2058         else
2059         {
2060             /* fprintf(stderr, "guessing layout\n"); */
2061             for ( i = 0, acc = 0; i < ops  && acc <= 5; ++i )
2062             {
2063                 if ( op[ i ].type != 2 )
2064                 {
2065                     acc += op[ i ].length;
2066                     if ( acc == 5 && op[ i + 1 ].type == 1 )
2067                     {
2068                         fwd = i + 1;
2069                     }
2070                 }
2071             }
2072             for ( i = ops, acc = 0; i > 0 && acc <= 5; )
2073             {
2074                 --i;
2075                 if ( op[ i ].type != 2 )
2076                 {
2077                     acc += op[ i ].length;
2078                     if ( acc == 5 && op[i].type == 1 )
2079                     {
2080                         rev = i;
2081                     }
2082                 }
2083             }
2084             if ( ( fwd == 0 && rev == 0 ) || ( fwd != 0 && rev != 0 ) )
2085             {
2086                 if(!(*has_ref_offset_type)) for ( i = 0; i < ops; ++i )
2087                 {
2088                     if ( op[ i ].type == 2 )
2089                     {
2090                         op[ i ].code = 'N';
2091                         *CG_adjust += op[ i ].length;
2092                     }
2093                 }
2094                 return RC( rcExe, rcData, rcReading, rcFormat, rcNotFound ); /* CG pattern not found */
2095             }
2096         }
2097         if ( fwd && op[ fwd ].type == 1 )
2098         {
2099             for ( i = ops, acc = 0; i > fwd + 1; )
2100             {
2101                 --i;
2102                 if ( op[ i ].type != 2 )
2103                 {
2104                     acc += op[ i ].length;
2105                     if ( acc >= 10 )
2106                     {
2107                         if ( acc > 10 )
2108                         {
2109                             unsigned const r = 10 + op[ i ].length - acc;
2110                             unsigned const l = op[ i ].length - r;
2111 
2112                             if ( ops + 2 >= maxOps )
2113                                 return RC( rcExe, rcData, rcReading, rcBuffer, rcInsufficient );
2114                             memmove( &op[ i + 2 ], &op[ i ], ( ops - i ) * sizeof( op[ 0 ] ) );
2115                             ops += 2;
2116                             op[ i + 2 ].length = r;
2117                             op[ i ].length = l;
2118 
2119                             op[ i + 1 ].length = 0;
2120                             op[ i + 1 ].type = 2;
2121                             op[ i + 1 ].code = 'N';
2122                             i += 2;
2123                         }
2124                         else if ( i - 1 > fwd )
2125                         {
2126                             if ( op[ i - 1 ].type == 2 )
2127                                  op[ i - 1 ].code = 'N';
2128                             else
2129                             {
2130                                 if ( ops + 1 >= maxOps )
2131                                     return RC( rcExe, rcData, rcReading, rcBuffer, rcInsufficient );
2132                                 memmove( &op[ i + 1 ], &op[ i ], ( ops - i ) * sizeof( op[ 0 ] ) );
2133                                 ops += 1;
2134                                 op[ i ].length = 0;
2135                                 op[ i ].type = 2;
2136                                 op[ i ].code = 'N';
2137                                 i += 1;
2138                             }
2139                         }
2140                         acc = 0;
2141                     }
2142                 }
2143             }
2144             /** change I to B+M **/
2145             op[ fwd ].code = 'B';
2146             memmove( &op[ fwd + 1 ], &op[ fwd ], ( ops - fwd ) * sizeof( op[ 0 ] ) );
2147             ops += 1;
2148             op[ fwd + 1 ].code = 'M';
2149             *opCnt = ops;
2150             /** set the gaps now **/
2151             for ( gapno = 3, i = ops; gapno > 1 && i > 0; )
2152             {
2153                 --i;
2154                 if ( op[ i ].code == 'N' )
2155                     gap[ --gapno ] = i;
2156             }
2157             gap[ 0 ] = fwd;
2158             print_CG_cigar( __LINE__, op, ops, gap );
2159             return 0;
2160         }
2161         if ( rev && op[ rev ].type == 1 )
2162         {
2163             for ( acc = i = 0; i < rev; ++i )
2164             {
2165                 if ( op[ i ].type != 2 )
2166                 {
2167                     acc += op[ i ].length;
2168                     if ( acc >= 10 )
2169                     {
2170                         if ( acc > 10 )
2171                         {
2172                             unsigned const l = 10 + op[ i ].length - acc;
2173                             unsigned const r = op[ i ].length - l;
2174 
2175                             if ( ops + 2 >= maxOps )
2176                                 return RC( rcExe, rcData, rcReading, rcBuffer, rcInsufficient );
2177                             memmove( &op[ i + 2 ], &op[ i ], ( ops - i ) * sizeof( op[ 0 ] ) );
2178                             ops += 2;
2179                             op[ i + 2 ].length = r;
2180                             op[ i ].length = l;
2181 
2182                             op[ i + 1 ].length = 0;
2183                             op[ i + 1 ].type = 2;
2184                             op[ i + 1 ].code = 'N';
2185                             rev += 2;
2186                             i += 2;
2187                         }
2188                         else if ( i + 1 < rev )
2189                         {
2190                             if ( op[ i + 1 ].type == 2)
2191                                  op[ i + 1 ].code = 'N';
2192                             else
2193                             {
2194                                 if ( ops + 1 >= maxOps )
2195                                     return RC( rcExe, rcData, rcReading, rcBuffer, rcInsufficient );
2196                                 memmove( &op[ i + 1 ], &op[ i ], ( ops - i ) * sizeof( op[ 0 ] ) );
2197                                 ops += 1;
2198                                 op[ i + 1 ].length = 0;
2199                                 op[ i + 1 ].type = 2;
2200                                 op[ i + 1 ].code = 'N';
2201                                 rev += 1;
2202                                 i += 1;
2203                             }
2204                         }
2205                         acc = 0;
2206                     }
2207                 }
2208             }
2209             for ( gapno = 3, i = 0; gapno > 1 && i < ops; ++i )
2210             {
2211                 if ( op[ i ].code == 'N' )
2212                     gap[ --gapno ] = i;
2213             }
2214             gap[ 0 ] = rev;
2215             op[ rev ].code = 'B';
2216             memmove( &op[ rev + 1 ], &op[ rev ], ( ops - rev ) * sizeof( op[ 0 ] ) );
2217             ops += 1;
2218             op[ rev + 1 ].code = 'M';
2219             *opCnt = ops;
2220             print_CG_cigar( __LINE__, op, ops, gap );
2221             return 0;
2222         }
2223     }
2224     return RC( rcExe, rcData, rcReading, rcFormat, rcNotFound ); /* CG pattern not found */
2225 }
2226 
2227 
GenerateCGData(SCol cols[],unsigned style)2228 static rc_t GenerateCGData( SCol cols[], unsigned style )
2229 {
2230     rc_t rc = 0;
2231 
2232     memset( &cols[ alg_CG_TAGS_STR], 0, sizeof( cols[ alg_CG_TAGS_STR ] ) );
2233 
2234     if ( cols[ alg_READ ].len == 35 && cols[ alg_SAM_QUALITY ].len == 35 )
2235     {
2236         static char newCIGAR[ 35 * 11 ];
2237         static int32_t newEditDistance;
2238         unsigned gap[ 3 ] = { 0, 0, 0 };
2239         cgOp cigOp[ 35 ];
2240         unsigned opCnt;
2241         unsigned i;
2242         unsigned j;
2243         size_t sz;
2244         unsigned S_adjust = 0;
2245         unsigned CG_adjust = 0;
2246         unsigned const read = cols[ alg_SEQ_READ_ID ].len && cols[ alg_REVERSED ].len ? cols[ alg_SEQ_READ_ID ].base.coord1[ 0 ] : 0;
2247         bool const reversed = cols[ alg_REVERSED ].len ? cols[ alg_REVERSED ].base.u8[ 0 ] : false;
2248 	bool has_ref_offset_type = false;
2249 
2250         rc = CIGAR_to_CG_Ops( cigOp, sizeof( cigOp ) / sizeof( cigOp[ 0 ] ), &opCnt, gap,
2251                               cols[ alg_CIGAR ].base.str, cols[ alg_CIGAR ].len,
2252                               &S_adjust, &CG_adjust, read, reversed, &has_ref_offset_type );
2253         if ( GetRCState( rc ) == rcNotFound && GetRCObject( rc ) == rcFormat )
2254         {
2255             rc = 0;
2256             if ( style == 1 )
2257                 goto CLEAN_CIGAR;
2258             else
2259                 goto PRINT_CIGAR;
2260         }
2261         if ( rc != 0 )
2262             return 0;
2263 
2264         if ( !has_ref_offset_type && CG_adjust == 0 )
2265             CG_adjust = ( gap[ 0 ] < opCnt ? cigOp[ gap[ 0 ] ].length : 0 )
2266                       + ( gap[ 1 ] < opCnt ? cigOp[ gap[ 1 ] ].length : 0 )
2267                       + ( gap[ 2 ] < opCnt ? cigOp[ gap[ 2 ] ].length : 0 );
2268 
2269         print_CG_cigar( __LINE__, cigOp, opCnt, NULL );
2270         if ( style == 1 )
2271         {
2272             static char newSeq[ 35 ];
2273             static char newQual[ 35 ];
2274             static char tags[ 35*22 + 1 ];
2275             unsigned const B_len = cigOp[ gap[ 0 ] ].length;
2276             unsigned const B_at = gap[ 0 ] < gap[ 2 ] ? 5 : 30;
2277 
2278             if ( 0 < B_len && B_len < 5 )
2279             {
2280                 memmove( newSeq, cols[ alg_READ ].base.v, 35 );
2281                 memmove( newQual, cols[ alg_SAM_QUALITY ].base.v, 35 );
2282 
2283                 cols[ alg_CG_TAGS_STR ].base.v = tags;
2284 
2285                 cols[ alg_READ ].base.v = newSeq;
2286                 cols[ alg_READ ].len = 35 - B_len;
2287 
2288                 cols[ alg_SAM_QUALITY ].base.v = newQual;
2289                 cols[ alg_SAM_QUALITY ].len = 35 - B_len;
2290 
2291                 /* nBnM -> nB0M */
2292                 cigOp[ gap[ 0 ] + 1 ].length -= B_len;
2293                 if ( gap[ 0 ] < gap[ 2 ] )
2294                 {
2295                     rc = string_printf( tags, sizeof( tags ), &sz, "\tGC:Z:%uS%uG%uS\tGS:Z:%.*s\tGQ:Z:%.*s",
2296                         5 - B_len, B_len, 30 - B_len, 2 * B_len, &newSeq[ 5 - B_len ], 2 * B_len, &newQual[ 5 - B_len ] );
2297                     if ( rc == 0 )
2298                     {
2299                         cols[ alg_CG_TAGS_STR ].len = sz;
2300                     }
2301                     memmove( &cigOp[ gap[ 0 ] ],
2302                              &cigOp[ gap[ 0 ] + 1 ],
2303                              ( opCnt - ( gap[ 0 ] + 1 ) ) * sizeof( cigOp[ 0 ] ) );
2304                     --opCnt;
2305                 }
2306                 else
2307                 {
2308                     rc = string_printf( tags, sizeof( tags ), &sz, "\tGC:Z:%uS%uG%uS\tGS:Z:%.*s\tGQ:Z:%.*s",
2309                         30 - B_len, B_len, 5 - B_len, 2 * B_len, &newSeq[ 30 - B_len ], 2 * B_len, &newQual[ 30 - B_len ] );
2310                     if ( rc == 0 )
2311                     {
2312                         cols[ alg_CG_TAGS_STR ].len = sz;
2313                     }
2314                     memmove( &cigOp[ gap[ 0 ] ],
2315                              &cigOp[ gap[ 0 ] + 1 ],
2316                              ( opCnt - ( gap[ 0 ] + 1 ) ) * sizeof( cigOp[ 0 ] ) );
2317                     --opCnt;
2318                 }
2319                 if ( rc == 0 )
2320                 {
2321                     for ( i = B_at; i < B_at + B_len; ++i )
2322                     {
2323                         int const Lq = newQual[ i - B_len ];
2324                         int const Rq = newQual[ i ];
2325 
2326                         if ( Lq <= Rq )
2327                         {
2328                             newSeq[ i - B_len ] = newSeq[ i ];
2329                             newQual[ i - B_len ] = Rq;
2330                         }
2331                         else
2332                         {
2333                             newSeq[ i ] = newSeq[ i - B_len ];
2334                             newQual[ i ] = Lq;
2335                         }
2336                     }
2337                     memmove( &newSeq [ B_at ], &newSeq [ B_at + B_len ], 35 - B_at - B_len );
2338                     memmove( &newQual[ B_at ], &newQual[ B_at + B_len ], 35 - B_at - B_len );
2339                 }
2340             }
2341             else
2342             {
2343                 int len = cigOp[ gap[ 0 ] ].length;
2344 
2345                 cigOp[ gap[ 0 ] ].code = 'I';
2346                 for (i = gap[0] + 1; i < opCnt && len > 0; ++i) {
2347                     if (cigOp[i].length <= len){
2348                         len -= cigOp[i].length;
2349                         cigOp[i].length = 0;
2350                     }
2351                     else {
2352                         cigOp[i].length -= len;
2353                         len = 0;
2354                     }
2355                 }
2356                 if(!has_ref_offset_type) CG_adjust -= cigOp[ gap[ 0 ] ].length;
2357             }
2358         }
2359         if ( rc == 0 )
2360         {
2361         PRINT_CIGAR:
2362         CLEAN_CIGAR:
2363             print_CG_cigar( __LINE__, cigOp, opCnt, NULL );
2364             /* remove zero length ops */
2365             for ( j = i = 0; i < opCnt; )
2366             {
2367                 if ( cigOp[ j ].length == 0)
2368                 {
2369                     ++j;
2370                     --opCnt;
2371                     continue;
2372                 }
2373                 cigOp[ i++ ] = cigOp[ j++ ];
2374             }
2375             print_CG_cigar( __LINE__, cigOp, opCnt, NULL );
2376             if ( cols[ alg_EDIT_DISTANCE ].len )
2377             {
2378                 int const edit_distance = cols[ alg_EDIT_DISTANCE ].base.i32[ 0 ];
2379                 int const adjusted = edit_distance + S_adjust - CG_adjust;
2380 
2381                 newEditDistance = adjusted > 0 ? adjusted : 0;
2382                 SAM_DUMP_DBG( 4, ( "NM: before: %u, after: %u(+%u-%u)\n", edit_distance, newEditDistance, S_adjust, CG_adjust ) );
2383                 cols[ alg_EDIT_DISTANCE ].base.v = &newEditDistance;
2384                 cols[ alg_EDIT_DISTANCE ].len = 1;
2385             }
2386             /* merge adjacent ops */
2387             for ( i = opCnt; i > 1; )
2388             {
2389                 --i;
2390                 if ( cigOp[ i - 1 ].code == cigOp[ i ].code )
2391                 {
2392                     cigOp[ i - 1 ].length += cigOp[ i ].length;
2393                     memmove( &cigOp[ i ], &cigOp[ i + 1 ], ( opCnt - 1 - i ) * sizeof( cigOp[ 0 ] ) );
2394                     --opCnt;
2395                 }
2396             }
2397             print_CG_cigar( __LINE__, cigOp, opCnt, NULL );
2398             for ( i = j = 0; i < opCnt && rc == 0; ++i )
2399             {
2400                 rc = string_printf( &newCIGAR[ j ], sizeof( newCIGAR ) - j, &sz, "%u%c", cigOp[ i ].length, cigOp[ i ].code);
2401                 j += sz;
2402             }
2403             cols[ alg_CIGAR ].base.v = newCIGAR;
2404             cols[ alg_CIGAR ].len = j;
2405         }
2406     }
2407     return rc;
2408 }
2409 
2410 
DumpAlignedRow(SAM_dump_ctx_t * const ctx,DataSource * const ds,int64_t row,bool const primary,int const cg_style,rc_t * prc)2411 static bool DumpAlignedRow( SAM_dump_ctx_t *const ctx, DataSource *const ds,
2412                     int64_t row,
2413                     bool const primary,
2414                     int const cg_style,
2415                     rc_t *prc )
2416 {
2417     rc_t rc = 0;
2418 
2419     /* if SEQ_SPOT_ID is open then skip rows with no SEQ_SPOT_ID */
2420     if ( ds->cols[ alg_SEQ_SPOT_ID ].idx )
2421     {
2422         rc = Cursor_Read( ds, row, alg_SEQ_SPOT_ID, 1 );
2423         if ( rc != 0 )
2424         {
2425             if ( !( GetRCObject( rc ) == rcRow && GetRCState( rc ) == rcNotFound ) )
2426                 *prc = rc;
2427             return false;
2428         }
2429         if ( ds->cols[ alg_SEQ_SPOT_ID ].len == 0 ||
2430              ds->cols[ alg_SEQ_SPOT_ID ].base.i64[ 0 ] == 0 )
2431             return false;
2432     }
2433 
2434     /* skip rows that fail mate distance filter */
2435     if ( param->mp_dist_qty != 0 || param->mp_dist_unknown )
2436     {
2437         rc = Cursor_Read( ds, row, alg_TEMPLATE_LEN, 1 );
2438         if ( rc != 0 )
2439         {
2440             if ( !( GetRCObject( rc ) == rcRow && GetRCState( rc ) == rcNotFound ) )
2441                 *prc = rc;
2442             return false;
2443         }
2444         if ( !AlignDistanceFilter( ds->cols ) )
2445             return false;
2446     }
2447 
2448     rc = Cursor_ReadAlign( &ds->curs, row, ds->cols, 0 );
2449     if ( rc != 0 )
2450     {
2451         if ( !( GetRCObject( rc ) == rcRow && GetRCState( rc ) == rcNotFound ) )
2452             *prc = rc;
2453         return false;
2454     }
2455 
2456     if ( param->fasta || param->fastq )
2457     {
2458         unsigned const read_id = ds->cols[ alg_SEQ_READ_ID ].base.v ? ds->cols[ alg_SEQ_READ_ID ].base.coord1[ 0 ] : 0;
2459 
2460         rc = DumpAlignedFastX( ctx->pri.cols, row, read_id, primary, false );
2461     }
2462     else
2463     {
2464         if ( cg_style != 0 )
2465         {
2466             rc = GenerateCGData( ds->cols, cg_style );
2467             if ( rc != 0 )
2468             {
2469                 *prc = rc;
2470                 return false;
2471             }
2472         }
2473         rc = DumpAlignedSAM(ctx, ds, row, ctx->readGroup, 0 );
2474     }
2475     *prc = rc;
2476     return true;
2477 }
2478 
2479 
DumpAlignedRowList(SAM_dump_ctx_t * const ctx,DataSource * const ds,SCol const * const ids,int64_t * rcount,bool const primary,int const cg_style,bool const noFilter)2480 static rc_t DumpAlignedRowList( SAM_dump_ctx_t *const ctx, DataSource *const ds, SCol const *const ids,
2481                         int64_t *rcount,
2482                         bool const primary,
2483                         int const cg_style,
2484                         bool const noFilter )
2485 {
2486     rc_t rc;
2487     unsigned i;
2488     unsigned const n = ids->len;
2489 
2490     for ( i = 0; ( rc = Quitting() ) == 0 && i < n; ++i )
2491     {
2492         int64_t const row = ids->base.i64[ i ];
2493 
2494         if ( ! noFilter )
2495             rc = Cursor_Read( ds, row, alg_REGION_FILTER, ~(unsigned)0 );
2496 
2497         if ( rc == 0 )
2498         {
2499             if ( noFilter || AlignRegionFilter(ds->cols) )
2500             {
2501                 if ( DumpAlignedRow( ctx, ds, row, primary, cg_style, &rc ) )
2502                     ++*rcount;
2503                 if ( rc != 0 )
2504                     break;
2505             }
2506         }
2507     }
2508     return rc;
2509 }
2510 
DumpAlignedRowListIndirect(SAM_dump_ctx_t * const ctx,DataSource * const indirectSource,DataSource * const directSource,SCol const * const directIDs,int64_t * rcount,bool const primary,int const cg_style)2511 static rc_t DumpAlignedRowListIndirect( SAM_dump_ctx_t *const ctx,
2512                                 DataSource *const indirectSource,
2513                                 DataSource *const directSource,
2514                                 SCol const *const directIDs,
2515                                 int64_t *rcount,
2516                                 bool const primary,
2517                                 int const cg_style )
2518 {
2519     unsigned i;
2520 
2521     for ( i = 0; i < directIDs->len; ++i )
2522     {
2523         int64_t const row = directIDs->base.i64[ i ];
2524         rc_t rc = Cursor_Read( directSource, row, alg_REGION_FILTER, ~(unsigned)0 );
2525 
2526         if ( rc != 0 )
2527             return rc;
2528 
2529         if ( !AlignRegionFilter( directSource->cols ) )
2530             continue;
2531 
2532         rc = Cursor_Read( directSource, row, alg_EVIDENCE_ALIGNMENT_IDS, 1 );
2533         if ( rc != 0 )
2534             return rc;
2535 
2536         rc = DumpAlignedRowList( ctx, indirectSource, &directSource->cols[ alg_EVIDENCE_ALIGNMENT_IDS ],
2537                                  rcount, primary, cg_style, true );
2538         if ( rc != 0 )
2539             return rc;
2540     }
2541     return 0;
2542 }
2543 
2544 
2545 enum e_tables
2546 {
2547     primary_alignment,
2548     secondary_alignment,
2549     evidence_interval,
2550     evidence_alignment
2551 };
2552 
2553 enum e_IDS_opts
2554 {
2555     primary_IDS             = ( 1 << primary_alignment ),
2556     secondary_IDS           = ( 1 << secondary_alignment ),
2557     evidence_interval_IDS   = ( 1 << evidence_interval ),
2558     evidence_alignment_IDS  = ( 1 << evidence_alignment )
2559 };
2560 
2561 
DumpAlignedRowList_cb(SAM_dump_ctx_t * const ctx,TAlignedRegion const * const rgn,int options,int which,int64_t * rcount,SCol const * const IDS)2562 static rc_t DumpAlignedRowList_cb( SAM_dump_ctx_t *const ctx, TAlignedRegion const *const rgn,
2563                                    int options, int which, int64_t *rcount, SCol const *const IDS )
2564 {
2565     /*SAM_DUMP_DBG(2, ("row %s index range is [%lu:%lu] pos %lu\n",
2566         param->region[r].name, start, start + count - 1, cur_pos));*/
2567     switch ( which )
2568     {
2569     case primary_IDS:
2570         return DumpAlignedRowList( ctx, &ctx->pri, IDS, rcount, true, param->cg_style, false );
2571 
2572     case secondary_IDS:
2573         return DumpAlignedRowList( ctx, &ctx->sec, IDS, rcount, false, param->cg_style, false );
2574 
2575     case evidence_interval_IDS:
2576         {
2577             rc_t rc = 0;
2578 
2579             if ( ( options & evidence_interval_IDS ) != 0 )
2580                 rc = DumpAlignedRowList( ctx, &ctx->evi, IDS, rcount, true, 0, false );
2581 
2582             if ( rc == 0 && ( options & evidence_alignment_IDS ) != 0 )
2583                 rc = DumpAlignedRowListIndirect( ctx, &ctx->eva, &ctx->evi, IDS, rcount, true, param->cg_style );
2584 
2585             return rc;
2586         }
2587     }
2588     return RC( rcExe, rcTable, rcReading, rcParam, rcUnexpected );
2589 }
2590 
2591 
2592 typedef struct CigOps{
2593 	char op;
2594 	int8_t	 ref_sign; /*** 0;+1;-1; ref_offset = ref_sign * offset ***/
2595 	int8_t	 seq_sign; /*** 0;+1;-1; seq_offset = seq_sign * offset ***/
2596 	uint32_t oplen;
2597 } CigOps;
2598 
SetCigOp(CigOps * dst,char op,uint32_t oplen)2599 static void SetCigOp(CigOps *dst,char op,uint32_t oplen)
2600 {
2601 	dst->op    = op;
2602 	dst->oplen = oplen;
2603 	switch(op) { /*MX= DN B IS PH*/
2604 	 case 'M': case 'X': case '=':
2605 		dst->ref_sign=+1;
2606 		dst->seq_sign=+1;
2607 		break;
2608 	 case 'D': case 'N':
2609 		dst->ref_sign=+1;
2610                 dst->seq_sign= 0;
2611                 break;
2612 	 case 'B':
2613 		dst->ref_sign=-1;
2614                 dst->seq_sign= 0;
2615                 break;
2616 	 case 'S': case 'I':
2617 		dst->ref_sign= 0;
2618                 dst->seq_sign=+1;
2619                 break;
2620 	 case 'P': case 'H':
2621 	  case 0: /** terminating op **/
2622 		dst->ref_sign= 0;
2623                 dst->seq_sign= 0;
2624                 break;
2625 	  default:
2626 		assert(0);
2627                 break;
2628 	}
2629 }
2630 
ExplodeCIGAR(CigOps dst[],unsigned len,char const cigar[],unsigned ciglen)2631 static unsigned ExplodeCIGAR( CigOps dst[], unsigned len, char const cigar[], unsigned ciglen )
2632 {
2633     unsigned i;
2634     unsigned j;
2635     int opLen;
2636 
2637     for ( i = j = opLen=0; i < ciglen; i++) {
2638 	if(isdigit(cigar[i])){
2639 		opLen = opLen * 10 + cigar[i] - '0';
2640 	} else {
2641 		assert(isalpha(cigar[i]));
2642 		SetCigOp(dst+j,cigar[i],opLen);
2643 		opLen=0;
2644 		j++;
2645 	}
2646     }
2647     SetCigOp(dst+j,0,0);
2648     j++;
2649     return j;
2650 }
2651 
2652 
2653 #define CG_CIGAR_STRING_MAX (35 * 11 + 1)
2654 
2655 
2656 #if 0
2657 static unsigned FormatCIGAR( char dst[], unsigned cp, unsigned oplen, int opcode )
2658 {
2659     size_t sz;
2660     string_printf( dst + cp, CG_CIGAR_STRING_MAX - cp, &sz, "%u%c", oplen, opcode );
2661     return sz;
2662 }
2663 
2664 
2665 static unsigned DeletedCIGAR( char dst[], unsigned const cp, int opcode, unsigned D,
2666                               uint32_t const Op[], int ri, unsigned len )
2667 {
2668     unsigned i;
2669     unsigned curPos = 0;
2670     int curOp = opcode;
2671     unsigned oplen = 0;
2672 
2673     for ( i = 0; i < D && ri < len; ++i, ++ri )
2674     {
2675         uint32_t const op = Op[ ri ];
2676         int const d = op >> 2;
2677         int const m = op & 1;
2678         int const nxtOp = m ? opcode : 'P';
2679 
2680         if ( d != 0 )
2681         {
2682             if ( oplen != 0 )
2683                 curPos += FormatCIGAR( dst, curPos + cp, oplen, curOp );
2684             curPos += FormatCIGAR( dst, cp, d, 'D' );
2685             oplen = 0;
2686         }
2687         if ( oplen == 0 || nxtOp == curOp )
2688         {
2689             ++oplen;
2690         }
2691         else
2692         {
2693             curPos += FormatCIGAR( dst, curPos + cp, oplen, curOp );
2694             oplen = 1;
2695         }
2696         curOp = nxtOp;
2697     }
2698 
2699     if ( oplen != 0 && curOp != opcode)
2700     {
2701         curPos += FormatCIGAR( dst, curPos + cp, oplen, curOp );
2702         oplen = 0;
2703     }
2704     if ( i < D )
2705         oplen += D - i;
2706     if ( oplen )
2707         curPos += FormatCIGAR( dst, curPos + cp, oplen, opcode );
2708     return curPos;
2709 }
2710 #endif
2711 
merge_M_type_ops(char a,char b)2712 static char merge_M_type_ops(char a,char b)
2713 { /*MX=*/
2714 	char c=0;
2715 	switch(b){
2716 	 case 'X':
2717 		switch(a){
2718 		 case '=': c='X'; break;
2719 		 case 'X': c='M'; break; /**we don't know - 2X may create '=' **/
2720 		 case 'M': c='M'; break;
2721 		}
2722 		break;
2723 	 case 'M':
2724 		c='M';
2725 		break;
2726 	 case '=':
2727 		c=a;
2728 		break;
2729 	}
2730 	assert(c!=0);
2731 	return c;
2732 }
2733 
2734 
CombineCIGAR(char dst[],CigOps const seqOp[],unsigned seq_len,int refPos,CigOps const refOp[],unsigned ref_len)2735 static unsigned CombineCIGAR( char dst[], CigOps const seqOp[], unsigned seq_len,
2736                               int refPos, CigOps const refOp[], unsigned ref_len )
2737 {
2738     bool     done=false;
2739     unsigned ciglen=0,last_ciglen=0;
2740     char     last_cig_op;
2741     uint32_t last_cig_oplen=0;
2742     int	     si=0,ri=0;
2743     CigOps   seq_cop={0,0,0,0},ref_cop={0,0,0,0};
2744     int	     seq_pos=0; /** seq_pos is tracked roughly - with every extraction from seqOp **/
2745     int      ref_pos=0; /** ref_pos is tracked precisely - with every delta and consumption in cigar **/
2746     int	     delta=refPos; /*** delta in relative positions of seq and ref **/
2747 			   /*** when delta < 0 - rewind or extend the reference ***/
2748 			   /*** wher delta > 0 - skip reference  ***/
2749 #define MACRO_BUILD_CIGAR(OP,OPLEN) \
2750 	if( last_cig_oplen > 0 && last_cig_op == OP){							\
2751                 last_cig_oplen += OPLEN;								\
2752                 ciglen = last_ciglen + sprintf(dst+last_ciglen,"%d%c",last_cig_oplen,last_cig_op);	\
2753         } else {											\
2754                 last_ciglen = ciglen;									\
2755                 last_cig_oplen = OPLEN;									\
2756                 last_cig_op    = OP;									\
2757                 ciglen = ciglen      + sprintf(dst+last_ciglen,"%d%c",last_cig_oplen,last_cig_op);	\
2758         }
2759     while(!done){
2760 	while(delta < 0){
2761 		ref_pos += delta; /** we will make it to back up this way **/
2762 		if(ri > 0){ /** backing up on ref if possible ***/
2763 			int avail_oplen = refOp[ri-1].oplen - ref_cop.oplen;
2764 			if(avail_oplen > 0 ){
2765 				if((-delta) <= avail_oplen * ref_cop.ref_sign){ /*** rewind within last operation **/
2766 					ref_cop.oplen -= delta;
2767 					delta -= delta * ref_cop.ref_sign;
2768 				} else { /*** rewind the whole ***/
2769 					ref_cop.oplen += avail_oplen;
2770 					delta += avail_oplen * ref_cop.ref_sign;
2771 				}
2772 			} else {
2773 				ri--;
2774 				/** pick the previous as used up **/
2775 				ref_cop = refOp[ri-1];
2776 				ref_cop.oplen =0;
2777 			}
2778 		} else { /** extending the reference **/
2779 			SetCigOp(&ref_cop,'=',ref_cop.oplen - delta);
2780 			delta = 0;
2781 		}
2782 		ref_pos -= delta;
2783 	}
2784 	if(ref_cop.oplen == 0){ /*** advance the reference ***/
2785 		ref_cop = refOp[ri++];
2786 		if(ref_cop.oplen == 0) { /** extending beyond the reference **/
2787 			SetCigOp(&ref_cop,'=',1000);
2788 		}
2789 		assert(ref_cop.oplen > 0 );
2790 	}
2791 	if(delta > 0){ /***  skip refOps ***/
2792 		ref_pos += delta; /** may need to back up **/
2793 		if(delta >=  ref_cop.oplen){ /** full **/
2794 			delta -= ref_cop.oplen * ref_cop.ref_sign;
2795 			ref_cop.oplen = 0;
2796 		} else { /** partial **/
2797 			ref_cop.oplen -= delta;
2798 			delta -= delta * ref_cop.ref_sign;
2799 		}
2800 		ref_pos -= delta; /** if something left - restore ***/
2801 		continue;
2802 	}
2803 	/*** seq and ref should be synchronized here **/
2804 	assert(delta == 0);
2805 	if(seq_cop.oplen == 0){ /*** advance sequence ***/
2806 		if(seq_pos < seq_len){
2807 			seq_cop = seqOp[si++];
2808 			assert(seq_cop.oplen > 0 );
2809 			seq_pos += seq_cop.oplen * seq_cop.seq_sign;
2810 		} else {
2811 			done=true;
2812 		}
2813 	}
2814         if(!done){
2815 		int seq_seq_step = seq_cop.oplen * seq_cop.seq_sign; /** sequence movement**/
2816 		int seq_ref_step = seq_cop.oplen * seq_cop.ref_sign; /** influence of sequence movement on intermediate reference **/
2817 		int ref_seq_step = ref_cop.oplen * ref_cop.seq_sign; /** movement of the intermediate reference ***/
2818 		int ref_ref_step = ref_cop.oplen * ref_cop.ref_sign; /** influence of the intermediate reference movement on final reference ***/
2819 		assert( ref_ref_step >= 0); /** no B in the reference **/
2820 		if( seq_ref_step <= 0){ /** BSIPH in the sequence against anything ***/
2821 			MACRO_BUILD_CIGAR( seq_cop.op, seq_cop.oplen );
2822 			seq_cop.oplen = 0;
2823 			delta = seq_ref_step; /** if negative - will force rewind next cycle **/
2824 		} else if(ref_ref_step <= 0){ /** MX=DN against SIPH in the reference***/
2825 			if(ref_seq_step == 0){ /** MX=DN against PH **/
2826 				MACRO_BUILD_CIGAR( ref_cop.op,ref_cop.oplen);
2827 				ref_cop.oplen = 0;
2828 			} else {
2829 				int min_len = (seq_cop.oplen<ref_cop.oplen)?seq_cop.oplen:ref_cop.oplen;
2830 				if(seq_seq_step == 0){ /** DN agains SI **/
2831 					MACRO_BUILD_CIGAR('P',min_len);
2832 				} else { /** MX= agains SI ***/
2833 					MACRO_BUILD_CIGAR( ref_cop.op,min_len);
2834 				}
2835 				seq_cop.oplen -= min_len;
2836 				ref_cop.oplen -= min_len;
2837 			}
2838 		} else { /*MX=DN  against MX=DN*/
2839 			int min_len = (seq_cop.oplen<ref_cop.oplen)?seq_cop.oplen:ref_cop.oplen;
2840 			if(seq_seq_step == 0){ /* DN against MX=DN */
2841 				if(ref_seq_step == 0){ /** padding DN against DN **/
2842 					MACRO_BUILD_CIGAR('P',min_len);
2843 					ref_cop.oplen -= min_len;
2844 					seq_cop.oplen -= min_len;
2845 				} else { /* DN against MX= **/
2846 					MACRO_BUILD_CIGAR(seq_cop.op,min_len);
2847 					seq_cop.oplen -= min_len;
2848 				}
2849 			} else if (ref_cop.seq_sign == 0){ /* MX= against DN - always wins */
2850 				MACRO_BUILD_CIGAR(ref_cop.op,min_len);
2851 				ref_cop.oplen -= min_len;
2852 			} else { /** MX= against MX= ***/
2853 				MACRO_BUILD_CIGAR(merge_M_type_ops(seq_cop.op,ref_cop.op),min_len);
2854 				ref_cop.oplen -= min_len;
2855 				seq_cop.oplen -= min_len;
2856 			}
2857 			ref_pos += min_len;
2858 		}
2859         }
2860     }
2861     return ciglen;
2862 }
2863 
2864 
DumpCGSAM(SAM_dump_ctx_t * const ctx,TAlignedRegion const * const rgn,int options,int which,int64_t * rows,SCol const * const ids)2865 static rc_t DumpCGSAM( SAM_dump_ctx_t *const ctx, TAlignedRegion const *const rgn, int options,
2866                        int which, int64_t *rows, SCol const *const ids )
2867 {
2868     if ( which == evidence_interval_IDS )
2869     {
2870         rc_t rc = 0;
2871         unsigned i;
2872 
2873         for ( i = 0; i < ids->len; ++i )
2874         {
2875             int64_t const rowInterval = ids->base.i64[ i ];
2876 
2877             rc = Cursor_Read( &ctx->evi, rowInterval, alg_REGION_FILTER, ~(0u) );
2878             if ( rc == 0 )
2879             {
2880                 if ( AlignRegionFilter( ctx->evi.cols ) )
2881                 {
2882                     rc = Cursor_ReadAlign( &ctx->evi.curs, rowInterval, ctx->evi.cols, 0 );
2883                     if ( rc == 0 )
2884                     {
2885                         unsigned const evidence = ctx->evi.cols[ alg_EVIDENCE_ALIGNMENT_IDS ].len;
2886                         if ( evidence != 0 )
2887                         {
2888                             INSDC_coord_len const *const refLen = ctx->evi.cols[ alg_READ_LEN ].base.coord_len;
2889                             INSDC_coord_len const *const cigLen = ctx->evi.cols[ alg_CIGAR_LEN ].base.coord_len;
2890                             unsigned const ploids = ctx->evi.cols[ alg_CIGAR_LEN ].len;
2891                             unsigned const totalIntervalReadLen = ctx->evi.cols[ alg_READ ].len;
2892 			    CigOps  refCigOps_stack[1024];
2893 			    CigOps *refCigOps_heap=NULL;
2894                             CigOps *refCigOps;
2895 
2896 			    if( totalIntervalReadLen > 1000){
2897 				refCigOps=refCigOps_heap = malloc( sizeof( refCigOps[ 0 ] ) * (totalIntervalReadLen+ploids) );
2898 			    } else {
2899 				refCigOps=refCigOps_stack;
2900 			    }
2901 
2902                             if ( refCigOps != NULL )
2903                             {
2904                                 unsigned j;
2905                                 unsigned start;
2906                                 unsigned cigop_starts[256];
2907 
2908                                 assert(ploids < 256);
2909 
2910                                 for ( start = j = 0, cigop_starts[0]=0; j < ploids; ++j )
2911                                 {
2912                                     unsigned len = cigLen[ j ];
2913                                     cigop_starts[j+1] = cigop_starts[j] + ExplodeCIGAR( refCigOps + cigop_starts[j], refLen[ j ],
2914                                                                                        ctx->evi.cols[ alg_CIGAR ].base.str + start, len );
2915                                     start += len;
2916                                 }
2917 
2918                                 for ( j = 0; j < evidence; ++j )
2919                                 {
2920                                     int64_t const rowAlign = ctx->evi.cols[ alg_EVIDENCE_ALIGNMENT_IDS ].base.i64[ j ];
2921 
2922                                     rc = Cursor_ReadAlign( &ctx->eva.curs, rowAlign, ctx->eva.cols, 0 );
2923                                     if ( rc == 0 )
2924                                     {
2925                                         if(param->cg_style != 0)
2926                                             rc = GenerateCGData( ctx->eva.cols, param->cg_style );
2927                                         if ( rc == 0 )
2928                                         {
2929                                             int const ploidy = ctx->eva.cols[ alg_REF_PLOIDY ].base.u32[ 0 ];
2930                                             int const readLen = ctx->eva.cols[ alg_READ ].len;
2931                                             INSDC_coord_zero refPos = ctx->eva.cols[ alg_REF_POS ].base.coord0[ 0 ];
2932                                             CigOps op[ 36 ];
2933                                             char cigbuf[ CG_CIGAR_STRING_MAX ];
2934 
2935                                             memset( cigbuf, 0, CG_CIGAR_STRING_MAX );
2936                                             ExplodeCIGAR( op, readLen, ctx->eva.cols[ alg_CIGAR ].base.str,
2937                                                          ctx->eva.cols[ alg_CIGAR ].len );
2938                                             ctx->eva.cols[ alg_CIGAR ].len = CombineCIGAR( cigbuf, op, readLen, refPos,
2939                                                                                           refCigOps + cigop_starts[ ploidy - 1 ], refLen[ ploidy - 1 ] );
2940                                             ctx->eva.cols[ alg_CIGAR ].base.str = cigbuf;
2941                                             ctx->eva.cols[ alg_REF_POS ].base.v = &refPos;
2942                                             refPos += ctx->evi.cols[ alg_REF_POS ].base.coord0[ 0 ] ;
2943 			  		    if(refPos < 0){
2944 						ReferenceObj const *r = NULL;
2945     						rc = ReferenceList_Find( gRefList, &r,
2946 									ctx->evi.cols[ alg_REF_NAME ].base.str,
2947 									ctx->evi.cols[ alg_REF_NAME ].len );
2948 						if(rc == 0){
2949 							bool circular=false;
2950 							rc=ReferenceObj_Circular(r, &circular);
2951 							if(rc == 0 && circular){
2952 								INSDC_coord_len len;
2953 								rc=ReferenceObj_SeqLength(r,&len);
2954 								if(rc == 0)
2955 									refPos += len;
2956 							}
2957 							ReferenceObj_Release(r);
2958 						}
2959 					    }
2960                                             rc = DumpAlignedSAM( ctx, &ctx->eva, rowAlign, ctx->readGroup, 1 );
2961                                             ++*rows;
2962                                         }
2963                                         else
2964                                             break;
2965                                     }
2966                                     else
2967                                         break;
2968                                 }
2969 				if(refCigOps_heap) free( refCigOps_heap );
2970                             }
2971                             else
2972                                 rc = RC( rcExe, rcTable, rcReading, rcMemory, rcExhausted );
2973                         }
2974                     }
2975                 }
2976             }
2977             else
2978                 break;
2979             rc = rc ? rc : Quitting();
2980             if ( rc != 0 )
2981                 break;
2982         }
2983         return rc;
2984     }
2985     return RC( rcExe, rcTable, rcReading, rcParam, rcUnexpected );
2986 }
2987 
2988 
ForEachAlignedRegion(SAM_dump_ctx_t * const ctx,enum e_IDS_opts const Options,rc_t (* user_func)(SAM_dump_ctx_t * ctx,TAlignedRegion const * rgn,int options,int which,int64_t * rows,SCol const * IDS))2989 static rc_t ForEachAlignedRegion( SAM_dump_ctx_t *const ctx, enum e_IDS_opts const Options,
2990     rc_t ( *user_func )( SAM_dump_ctx_t *ctx, TAlignedRegion const *rgn, int options, int which, int64_t *rows, SCol const *IDS ) )
2991 {
2992     SCol cols[] =
2993     {
2994         { "MAX_SEQ_LEN", 0, { NULL }, 0, false },
2995         { "OVERLAP_REF_POS", 0, { NULL }, 0, true },
2996         { "PRIMARY_ALIGNMENT_IDS", 0, { NULL }, 0, false },
2997         { "SECONDARY_ALIGNMENT_IDS", 0, { NULL }, 0, true },
2998         { "EVIDENCE_INTERVAL_IDS", 0, { NULL }, 0, true },
2999         { NULL, 0, { NULL }, 0, false }
3000     };
3001     enum eref_col
3002     {
3003         ref_MAX_SEQ_LEN = 0,
3004         ref_OVERLAP_REF_LEN,
3005         ref_PRIMARY_ALIGNMENT_IDS,
3006         ref_SECONDARY_ALIGNMENT_IDS,
3007         ref_EVIDENCE_INTERVAL_IDS
3008     };
3009 
3010     KIndex const *iname = NULL;
3011     int64_t rows = 0;
3012     rc_t rc = 0;
3013     int options = Options & 7;
3014 
3015     if ( Options & evidence_alignment_IDS )
3016         options |= evidence_interval_IDS;
3017 
3018     if ( ( options & primary_IDS ) == 0 )
3019         cols[ ref_PRIMARY_ALIGNMENT_IDS ].name = "";
3020     if ( ( options & secondary_IDS ) == 0 )
3021         cols[ ref_SECONDARY_ALIGNMENT_IDS ].name = "";
3022     if ( ( options & evidence_interval_IDS ) == 0 )
3023         cols[ ref_EVIDENCE_INTERVAL_IDS ].name = "";
3024 
3025     ctx->ref.cols = cols;
3026 
3027     rc = VTableOpenIndexRead( ctx->ref.tbl.vtbl, &iname, "i_name" );
3028     if ( rc == 0 )
3029     {
3030         rc = Cursor_Open( &ctx->ref.tbl, &ctx->ref.curs, ctx->ref.cols, NULL );
3031         if ( rc == 0 )
3032         {
3033             int64_t rowid = 1;
3034             uint64_t count = 0;
3035             char refname[ 1024 ];
3036             size_t sz;
3037 
3038             if ( ctx->ref.cols[ ref_PRIMARY_ALIGNMENT_IDS ].idx == 0 )
3039                 options &= ~primary_IDS;
3040             if ( ctx->ref.cols[ ref_SECONDARY_ALIGNMENT_IDS ].idx == 0 )
3041                 options &= ~secondary_IDS;
3042             if ( ctx->ref.cols[ ref_EVIDENCE_INTERVAL_IDS ].idx == 0 )
3043                 options &= ~evidence_interval_IDS;
3044 
3045             /* new: if we have a list of regions, reset the their print-flag */
3046             if ( param->region_qty > 0 )
3047             {
3048                 unsigned r;
3049                 for ( r = 0; r < param->region_qty; ++r )
3050                 {
3051                     param->region[ r ].printed = 0;
3052                 }
3053             }
3054 
3055             while ( ( rc = KIndexProjectText(iname, rowid + count, &rowid, &count, refname, sizeof( refname ), &sz ) ) == 0 )
3056             {
3057                 bool include;
3058                 unsigned r;
3059                 unsigned max_to = UINT_MAX;
3060                 unsigned min_from = 0;
3061 
3062                 for ( include = false, r = 0; r < param->region_qty; ++r )
3063                 {
3064                     if ( sz == string_size( param->region[ r ].name ) &&
3065                          memcmp( param->region[ r ].name, refname, sz ) == 0 )
3066                     {
3067                         include = true;
3068                         max_to = param->region[ r ].max_to;
3069                         min_from = param->region[ r ].min_from;
3070                         param->region[ r ].printed++; /* new: mark a region as printed */
3071                         break;
3072                     }
3073                 }
3074 
3075                 if ( param->region_qty == 0 || include )
3076                 {
3077                     int64_t const endrow_exclusive = rowid + count;
3078                     unsigned m;
3079                     unsigned k;
3080 
3081                     for ( k = 0, m = 1; rc == 0 && k < 3; ++k, m <<= 1 )
3082                     {
3083                         if ( m & options )
3084                         {
3085                             unsigned lookback = 0;
3086                             int64_t row;
3087                             int32_t row_start_offset;
3088                             unsigned pos;
3089 			    unsigned const maxseqlen = 5000; /***** TODO: this code is to be rewritten anyway ****/
3090 
3091                             if ( param->region_qty )
3092                             {
3093                                 if ( ctx->ref.cols[ ref_OVERLAP_REF_LEN ].idx )
3094                                 {
3095                                     rc = Cursor_Read( &ctx->ref, rowid, 0, ~(0u) );
3096                                     if ( rc != 0 )
3097                                         break;
3098                                     if ( ctx->ref.cols[ ref_OVERLAP_REF_LEN ].len > k )
3099                                     {
3100                                         unsigned const overlap = ctx->ref.cols[ ref_OVERLAP_REF_LEN ].base.coord0[ k ];
3101 
3102                                         if ( overlap != 0 )
3103                                         {
3104 
3105                                             assert( overlap < rowid * maxseqlen );
3106                                             lookback = ( rowid - ( overlap / maxseqlen ) );
3107                                         }
3108                                     }
3109                                 }
3110                                 else
3111                                     lookback = 1;
3112                             }
3113 			    row_start_offset = min_from/maxseqlen - lookback;
3114 			   if(row_start_offset < 0) row_start_offset=0;
3115 
3116 
3117                             for ( pos = row_start_offset*maxseqlen, row = rowid + row_start_offset; rc == 0 && row < endrow_exclusive; ++row )
3118                             {
3119                                 if ( row < 1 )
3120                                 {
3121                                     row = 0;
3122                                     continue;
3123                                 }
3124                                 rc = Cursor_Read( &ctx->ref, row, 0, ~(0u) );
3125                                 if ( rc != 0 )
3126                                     break;
3127                                 if ( ctx->ref.cols[ ref_PRIMARY_ALIGNMENT_IDS + k ].len > 0 )
3128                                     rc = user_func( ctx, &param->region[r], Options, m, &rows,
3129                                                     &ctx->ref.cols[ ref_PRIMARY_ALIGNMENT_IDS + k ] );
3130                                 pos += ctx->ref.cols[ ref_MAX_SEQ_LEN ].base.u32[ 0 ];
3131                                 if ( pos >= max_to )
3132                                     break;
3133                                 if ( param->test_rows != 0 && rows > param->test_rows )
3134                                     break;
3135                             }
3136                         }
3137                     }
3138                 }
3139                 if ( param->test_rows != 0 && rows > param->test_rows )
3140                     break;
3141                 if ( rc != 0 )
3142                     break;
3143             } /* while */
3144 
3145             /* new: walk the list of regions and report what was NOT printed ( found ) ... */
3146             if ( param->region_qty > 0 && GetRCState( rc ) != rcCanceled )
3147             {
3148                 unsigned r;
3149                 for ( r = 0; r < param->region_qty; ++r )
3150                 {
3151                     if ( param->region[ r ].printed == 0 )
3152                     {
3153                         (void)PLOGMSG( klogWarn, ( klogWarn, "reference >$(a)< not found", "a=%s", param->region[ r ].name ) );
3154                     }
3155                 }
3156             }
3157 
3158             if ( GetRCObject( rc ) == rcId && GetRCState( rc ) == rcNotFound )
3159             {
3160                 rc = 0;
3161             }
3162 
3163         }
3164     }
3165     Cursor_Close( &ctx->ref.curs );
3166     KIndexRelease( iname );
3167 
3168     return rc;
3169 }
3170 
3171 
DumpAlignedTable(SAM_dump_ctx_t * const ctx,DataSource * const ds,bool const primary,int const cg_style,unsigned * const rcount)3172 static rc_t DumpAlignedTable( SAM_dump_ctx_t *const ctx, DataSource *const ds,
3173                               bool const primary, int const cg_style, unsigned *const rcount )
3174 {
3175     unsigned i;
3176     int64_t start;
3177     uint64_t count;
3178     rc_t rc = VCursorIdRange( ds->curs.vcurs, 0, &start, &count );
3179 
3180     if ( rc != 0 )
3181         return rc;
3182     for ( i = 0; i != (unsigned)count; ++i )
3183     {
3184         if ( DumpAlignedRow(ctx, ds, start + i, primary, cg_style, &rc ) )
3185             ++*rcount;
3186         if ( rc != 0 || ( rc = Quitting() ) != 0 )
3187             return rc;
3188         if ( param->test_rows && *rcount > param->test_rows )
3189             break;
3190     }
3191     return 0;
3192 }
3193 
3194 
DumpUnsorted(SAM_dump_ctx_t * const ctx)3195 static rc_t DumpUnsorted( SAM_dump_ctx_t *const ctx )
3196 {
3197     rc_t rc = 0;
3198     unsigned rcount;
3199 
3200     if ( rc == 0 && param->cg_evidence )
3201     {
3202         SAM_DUMP_DBG( 2, ( "%s EVIDENCE_INTERVAL\n", ctx->accession ) );
3203         rcount = 0;
3204         rc = DumpAlignedTable( ctx, &ctx->evi, false, 0, &rcount );
3205         (void)PLOGMSG( klogInfo, ( klogInfo, "$(a): $(c) allele sequences", "a=%s,c=%lu", ctx->accession, rcount ) );
3206     }
3207     if ( rc == 0 && param->cg_ev_dnb )
3208     {
3209         SAM_DUMP_DBG( 2, ( "%s EVIDENCE_ALIGNMENT\n", ctx->accession ) );
3210         rcount = 0;
3211         rc = DumpAlignedTable( ctx, &ctx->eva, false, param->cg_style, &rcount );
3212         (void)PLOGMSG( klogInfo, ( klogInfo, "$(a): $(c) support sequences", "a=%s,c=%lu", ctx->accession, rcount ) );
3213     }
3214     if ( rc == 0 && ctx->pri.curs.vcurs )
3215     {
3216         SAM_DUMP_DBG( 2, ( "%s PRIMARY_ALIGNMENT\n", ctx->accession ) );
3217         rcount = 0;
3218         rc = DumpAlignedTable( ctx, &ctx->pri, true, param->cg_style, &rcount );
3219         (void)PLOGMSG( klogInfo, ( klogInfo, "$(a): $(c) primary sequences", "a=%s,c=%lu", ctx->accession, rcount ) );
3220     }
3221     if ( rc == 0 && ctx->sec.curs.vcurs )
3222     {
3223         SAM_DUMP_DBG( 2, ( "%s SECONDARY_ALIGNMENT\n", ctx->accession ) );
3224         rcount = 0;
3225         rc = DumpAlignedTable( ctx, &ctx->sec, false, param->cg_style, &rcount );
3226         (void)PLOGMSG( klogInfo, ( klogInfo, "$(a): $(c) secondary sequences", "a=%s,c=%lu", ctx->accession, rcount ) );
3227     }
3228     return rc;
3229 }
3230 
3231 
DumpUnaligned(SAM_dump_ctx_t * const ctx,bool const aligned)3232 static rc_t DumpUnaligned( SAM_dump_ctx_t *const ctx, bool const aligned )
3233 {
3234     rc_t rc = 0;
3235     int64_t start = 0;
3236     uint64_t count = 0;
3237 
3238     rc = VCursorIdRange( ctx->seq.curs.vcurs, 0, &start, &count );
3239     if ( rc == 0 )
3240     {
3241         uint64_t rcount = 0;
3242         if ( param->test_rows != 0 && count > param->test_rows )
3243         {
3244             count = param->test_rows;
3245         }
3246         SAM_DUMP_DBG( 2, ( "%s SEQUENCE table range from %ld qty %lu\n", ctx->accession, start, count ) );
3247         while ( count > 0 && rc == 0 )
3248         {
3249             uint32_t i;
3250 
3251             if ( ctx->seq.cols[ seq_PRIMARY_ALIGNMENT_ID ].idx == 0)
3252             {
3253                 rc = DumpUnalignedReads( ctx, NULL, start, &rcount );
3254             }
3255             else
3256             {
3257                 /* avoid reading whole sequence cursor data unnecessarily */
3258                 rc = Cursor_Read( &ctx->seq, start, seq_PRIMARY_ALIGNMENT_ID, ~(unsigned)0 );
3259                 if ( rc == 0 )
3260                 {
3261                     /* find if its completely unaligned */
3262                     int64_t min_prim_id = 0;
3263                     bool has_unaligned = false;
3264                     for ( i = 0; rc == 0 && i < ctx->seq.cols[ seq_PRIMARY_ALIGNMENT_ID ].len; i++ )
3265                     {
3266                         int64_t x = ctx->seq.cols[ seq_PRIMARY_ALIGNMENT_ID ].base.i64[ i ];
3267                         has_unaligned |= x == 0;
3268                         if ( ( min_prim_id == 0 && x != 0 ) || min_prim_id < x )
3269                         {
3270                             min_prim_id = x;
3271                         }
3272                     }
3273                     if ( min_prim_id == 0 )
3274                     {
3275                         rc = DumpUnalignedReads( ctx, NULL, start, &rcount );
3276                     }
3277                     else if ( has_unaligned && !param->unaligned_spots )
3278                     {
3279                         if ( rc == 0 )
3280                         {
3281 #if USE_MATE_CACHE
3282                             uint64_t val;
3283                             rc = Cache_Get( &ctx->pri.curs, min_prim_id, &val );
3284                             if ( rc == 0 )
3285                             {
3286                                 ctx->pri.cols[ alg_REF_POS ].len = 0;
3287                                 Cache_Unpack( val, 0, &ctx->pri.curs, ctx->pri.cols );
3288                                 ctx->pri.cols[ alg_SEQ_SPOT_ID ].base.i64 = &start;
3289                                 ctx->pri.cols[ alg_SEQ_SPOT_ID ].len = 1;
3290                                 memmove( &ctx->pri.cols[ alg_REF_NAME ], &ctx->pri.cols[ alg_MATE_REF_NAME ], sizeof( SCol ) );
3291                                 memmove( &ctx->pri.cols[ alg_REF_SEQ_ID ], &ctx->pri.cols[ alg_MATE_REF_NAME ], sizeof( SCol ) );
3292                                 memmove( &ctx->pri.cols[ alg_REF_POS ], &ctx->pri.cols[ alg_MATE_REF_POS ], sizeof( SCol ) );
3293                             }
3294                             else if ( !( GetRCState( rc ) == rcNotFound && GetRCObject( rc ) == rcItem ) )
3295                             {
3296                                 break;
3297                             }
3298                             else
3299                             {
3300 #endif /* USE_MATE_CACHE */
3301                                 rc = Cursor_ReadAlign( &ctx->pri.curs, min_prim_id, ctx->pri.cols, alg_MATE_ALIGN_ID );
3302 #if USE_MATE_CACHE
3303                             }
3304 #endif /* USE_MATE_CACHE */
3305                             rc = rc ? rc : DumpUnalignedReads( ctx, ctx->pri.cols, min_prim_id, &rcount );
3306                         }
3307                     }
3308                 } else if ( GetRCState( rc ) == rcNotFound && GetRCObject( rc ) == rcRow )
3309                 {
3310                     rc = 0;
3311                 }
3312             }
3313             start++;
3314             count--;
3315             rc = rc ? rc : Quitting();
3316         }
3317         (void)PLOGMSG( klogInfo, ( klogInfo, "$(a): $(c) unaligned sequences", "a=%s,c=%lu", ctx->accession, rcount ) );
3318     }
3319     return rc;
3320 }
3321 
3322 #if USE_MATE_CACHE
FlushUnalignedRead_cb(uint64_t key,bool value,void * user_data)3323 static rc_t FlushUnalignedRead_cb (uint64_t key, bool value,void *user_data)
3324 {
3325 	rc_t rc;
3326 	int64_t spot_id = (int64_t) key;
3327 	int64_t aligned_mate_id = (int64_t) value;
3328 	SAM_dump_ctx_t *const ctx = user_data;
3329 	rc = Cursor_Read( &ctx->seq, spot_id, seq_PRIMARY_ALIGNMENT_ID, ~(unsigned)0 );
3330 	if ( rc == 0 ) {
3331 		uint64_t rcount;
3332 		uint64_t val;
3333 		assert(ctx->seq.cols[ seq_PRIMARY_ALIGNMENT_ID ].len == 2);
3334 		if(ctx->seq.cols[ seq_PRIMARY_ALIGNMENT_ID ].base.i64[0]==0){
3335 			aligned_mate_id = ctx->seq.cols[ seq_PRIMARY_ALIGNMENT_ID ].base.i64[1];
3336 			assert(aligned_mate_id != 0);
3337 		} else if(ctx->seq.cols[ seq_PRIMARY_ALIGNMENT_ID ].base.i64[1] == 0){
3338 			aligned_mate_id = ctx->seq.cols[ seq_PRIMARY_ALIGNMENT_ID ].base.i64[0];
3339 			assert(aligned_mate_id != 0);
3340 		} else {
3341 			assert(0);
3342 		}
3343                 rc = Cache_Get( &ctx->pri.curs, aligned_mate_id, &val );
3344                 if ( rc == 0 ) {
3345 			ctx->pri.cols[ alg_REF_POS ].len = 0;
3346 			Cache_Unpack( val, 0, &ctx->pri.curs, ctx->pri.cols );
3347 			ctx->pri.cols[ alg_SEQ_SPOT_ID ].base.i64 = &spot_id;
3348 			ctx->pri.cols[ alg_SEQ_SPOT_ID ].len = 1;
3349 			memmove( &ctx->pri.cols[ alg_REF_NAME ], &ctx->pri.cols[ alg_MATE_REF_NAME ], sizeof( SCol ) );
3350 			memmove( &ctx->pri.cols[ alg_REF_SEQ_ID ], &ctx->pri.cols[ alg_MATE_REF_NAME ], sizeof( SCol ) );
3351 			memmove( &ctx->pri.cols[ alg_REF_POS ], &ctx->pri.cols[ alg_MATE_REF_POS ], sizeof( SCol ) );
3352 			rc =DumpUnalignedReads( ctx, ctx->pri.cols, aligned_mate_id, &rcount );
3353 		}
3354 	}
3355         return rc;
3356 }
FlushUnaligned(SAM_dump_ctx_t * const ctx,const SCursCache * cache)3357 static rc_t FlushUnaligned( SAM_dump_ctx_t *const ctx, const SCursCache * cache )
3358 {
3359     return KVectorVisitBool (cache->cache_unaligned_mate, false, FlushUnalignedRead_cb, ctx );
3360 }
3361 #endif
3362 
DumpHeader(SAM_dump_ctx_t const * ctx)3363 static rc_t DumpHeader( SAM_dump_ctx_t const *ctx )
3364 {
3365     bool reheader = param->reheader;
3366     rc_t rc = 0;
3367 
3368     if ( !reheader )
3369     {
3370         if ( ctx->db )
3371         {
3372             /* grab header from db meta node */
3373             KMetadata const *meta;
3374 
3375             rc = VDatabaseOpenMetadataRead( ctx->db, &meta );
3376             if ( rc == 0 )
3377             {
3378                 KMDataNode const *node;
3379 
3380                 rc = KMetadataOpenNodeRead( meta, &node, "BAM_HEADER" );
3381                 KMetadataRelease( meta );
3382 
3383                 if ( rc == 0 )
3384                 {
3385                     size_t offset = 0;
3386 
3387                     for ( offset = 0; ; )
3388                     {
3389                         size_t remain;
3390                         size_t nread;
3391                         char buffer[ 4096 ];
3392 
3393                         rc = KMDataNodeRead( node, offset, buffer, sizeof(buffer), &nread, &remain );
3394                         if ( rc != 0 )
3395                             break;
3396                         BufferedWriter( NULL, buffer, nread, NULL );
3397                         if ( remain == 0 )
3398                         {
3399                             if ( buffer[nread - 1] != '\n' )
3400                             {
3401                                  buffer[ 0 ] = '\n';
3402                                  rc = BufferedWriter( NULL, buffer, 1, NULL );
3403                             }
3404                             break;
3405                         }
3406                         offset += nread;
3407                     }
3408                     KMDataNodeRelease(node);
3409                 }
3410                 else if ( GetRCState( rc ) == rcNotFound )
3411                 {
3412                     reheader = true;
3413                     rc = 0;
3414                 }
3415             }
3416         }
3417         else
3418         {
3419             reheader = true;
3420             rc = 0;
3421         }
3422     }
3423 
3424     if ( rc == 0 && reheader )
3425     {
3426         rc = KOutMsg( "@HD\tVN:1.3\n" );
3427         if ( rc == 0 && gRefList )
3428             rc = RefSeqPrint();
3429         if ( rc == 0 && ctx->readGroup )
3430             rc = ReadGroup_print( ctx->readGroup );
3431         if ( rc == 0 && ctx->seq.tbl.vtbl )
3432             rc = DumpReadGroups( &ctx->seq.tbl );
3433     }
3434 
3435     if ( rc == 0 && param->comments )
3436     {
3437         unsigned i;
3438 
3439         for( i = 0; rc == 0 && param->comments[ i ]; ++i )
3440             rc = KOutMsg( "@CO\t%s\n", param->comments[ i ] );
3441     }
3442     return rc;
3443 }
3444 
3445 
DumpDB(SAM_dump_ctx_t * const ctx)3446 static rc_t DumpDB( SAM_dump_ctx_t *const ctx )
3447 {
3448     rc_t rc = 0;
3449 
3450     if ( ctx->ref.tbl.vtbl != NULL )
3451         rc = ReferenceList_MakeTable( &gRefList, ctx->ref.tbl.vtbl, 0, CURSOR_CACHE, NULL, 0 );
3452     if ( !param->noheader )
3453         rc = DumpHeader( ctx );
3454     if ( rc == 0 )
3455     {
3456         if ( param->region_qty ){
3457             rc = ForEachAlignedRegion(  ctx
3458                                       ,   ( param->primaries   ? primary_IDS : 0 )
3459                                         | ( param->secondaries ? secondary_IDS : 0 )
3460                                         | ( param->cg_evidence ? evidence_interval_IDS : 0 )
3461                                         | ( param->cg_ev_dnb   ? evidence_alignment_IDS : 0 )
3462                                       , DumpAlignedRowList_cb );
3463 #if USE_MATE_CACHE
3464 	    if ( rc == 0 && param->unaligned ){
3465                 rc = FlushUnaligned( ctx,ctx->pri.curs.cache);
3466 	    }
3467 #endif
3468 	}
3469         if ( param->cg_sam )
3470             rc = ForEachAlignedRegion( ctx, evidence_interval_IDS, DumpCGSAM );
3471 
3472         if ( param->region_qty == 0 )
3473         {
3474             rc = DumpUnsorted( ctx );
3475             if ( rc == 0 && param->unaligned )
3476                 rc = DumpUnaligned( ctx, ctx->pri.tbl.vtbl != NULL );
3477         }
3478     }
3479     ReferenceList_Release( gRefList );
3480     return rc;
3481 }
3482 
3483 
DumpTable(SAM_dump_ctx_t * ctx)3484 static rc_t DumpTable( SAM_dump_ctx_t *ctx )
3485 {
3486     rc_t rc = 0;
3487 
3488     if ( !param->noheader )
3489         rc = DumpHeader( ctx );
3490     if ( rc == 0 )
3491         rc = DumpUnaligned( ctx, false );
3492     return rc;
3493 }
3494 
3495 
ProcessTable(VDBManager const * mgr,char const fullPath[],char const accession[],char const readGroup[])3496 static rc_t ProcessTable( VDBManager const *mgr, char const fullPath[],
3497                           char const accession[], char const readGroup[] )
3498 {
3499     VTable const *tbl;
3500     rc_t rc = VDBManagerOpenTableRead( mgr, &tbl, 0, "%s", fullPath );
3501 
3502     if ( rc != 0 )
3503     {
3504         VSchema *schema;
3505 
3506         rc = VDBManagerMakeSRASchema( mgr, &schema );
3507         if ( rc == 0 )
3508         {
3509             rc = VDBManagerOpenTableRead( mgr, &tbl, schema, "%s", fullPath );
3510             VSchemaRelease( schema );
3511         }
3512     }
3513 
3514     if ( rc == 0 )
3515     {
3516         SAM_dump_ctx_t ctx;
3517         SCol seq_cols[ sizeof( gSeqCol ) / sizeof( gSeqCol[ 0 ] ) ];
3518 
3519         memset( &ctx, 0, sizeof( ctx ) );
3520 
3521         ctx.fullPath = fullPath;
3522         ctx.accession = accession;
3523         ctx.readGroup = readGroup;
3524 
3525         DATASOURCE_INIT( ctx.seq, accession );
3526         ctx.seq.tbl.vtbl = tbl;
3527         ctx.seq.type = edstt_Sequence;
3528 
3529         ReportResetTable( fullPath, tbl );
3530 
3531         ctx.seq.cols = seq_cols;
3532         memmove( ctx.seq.cols, gSeqCol, sizeof( gSeqCol ) );
3533         rc = Cursor_Open( &ctx.seq.tbl, &ctx.seq.curs, ctx.seq.cols, NULL );
3534         if ( rc == 0 ) {
3535             rc = DumpTable( &ctx );
3536             Cursor_Close(&ctx.seq.curs);
3537         }
3538         VTableRelease( tbl );
3539     }
3540     return rc;
3541 }
3542 
3543 
SetupColumns(DataSource * ds,enum eDSTableType type)3544 static void SetupColumns( DataSource *ds, enum eDSTableType type )
3545 {
3546     memmove( ds->cols, g_alg_col_tmpl, sizeof( g_alg_col_tmpl ) );
3547 
3548     ds->type = type;
3549     if ( param->use_seqid )
3550         ds->cols[ alg_MATE_REF_NAME ].name = "MATE_REF_SEQ_ID";
3551     else
3552         ds->cols[ alg_MATE_REF_NAME ].name = "MATE_REF_NAME";
3553 
3554     if ( param->fasta || param->fastq )
3555         ds->cols[ alg_READ ].name = "RAW_READ";
3556     else if ( param->hide_identical )
3557         ds->cols[ alg_READ ].name = "MISMATCH_READ";
3558     else
3559         ds->cols[ alg_READ ].name = "READ";
3560 
3561     if ( param->long_cigar )
3562     {
3563         ds->cols[ alg_CIGAR ].name = "CIGAR_LONG";
3564         ds->cols[ alg_CIGAR_LEN ].name = "CIGAR_LONG_LEN";
3565     }
3566     else
3567     {
3568         ds->cols[ alg_CIGAR ].name = "CIGAR_SHORT";
3569         ds->cols[ alg_CIGAR_LEN ].name = "CIGAR_SHORT_LEN";
3570     }
3571 
3572     if(param->qualQuant && param->qualQuantSingle){ /** we don't really need quality ***/
3573 	ds->cols[ alg_SAM_QUALITY   ].name = "";
3574     }
3575 
3576     switch ( type )
3577     {
3578     case edstt_EvidenceInterval:
3579 
3580         ds->cols[ alg_SEQ_SPOT_ID   ].name = "";
3581         ds->cols[ alg_SEQ_NAME      ].name = "";
3582         ds->cols[ alg_SAM_QUALITY   ].name = "";
3583         ds->cols[ alg_SPOT_GROUP    ].name = "";
3584         ds->cols[ alg_SEQ_SPOT_GROUP].name = "";
3585         ds->cols[ alg_SEQ_READ_ID   ].name = "";
3586         ds->cols[ alg_MATE_ALIGN_ID ].name = "";
3587         ds->cols[ alg_MATE_REF_ID   ].name = "";
3588         ds->cols[ alg_MATE_REF_NAME ].name = "";
3589         ds->cols[ alg_SAM_FLAGS     ].name = "";
3590         ds->cols[ alg_MATE_REF_POS  ].name = "";
3591         ds->cols[ alg_ALIGN_GROUP   ].name = "";
3592         ds->cols[ alg_TEMPLATE_LEN  ].name = "";
3593         if ( param->cg_sam || param->cg_ev_dnb )
3594             ds->cols[ alg_EVIDENCE_ALIGNMENT_IDS ].name = "EVIDENCE_ALIGNMENT_IDS";
3595         break;
3596 
3597     case edstt_EvidenceAlignment:
3598         ds->cols[ alg_REF_PLOIDY    ].name = "REF_PLOIDY";
3599         ds->cols[ alg_REF_ID        ].name = "REF_ID";
3600         ds->cols[ alg_MATE_ALIGN_ID ].name = "";
3601         ds->cols[ alg_MATE_REF_ID   ].name = "";
3602         ds->cols[ alg_MATE_REF_NAME ].name = "";
3603         ds->cols[ alg_SAM_FLAGS     ].name = "";
3604         ds->cols[ alg_MATE_REF_POS  ].name = "";
3605         ds->cols[ alg_ALIGN_GROUP   ].name = "";
3606         ds->cols[ alg_TEMPLATE_LEN  ].name = "";
3607         break;
3608     default:
3609         break;
3610     }
3611 }
3612 
3613 
ProcessDB(VDatabase const * db,char const fullPath[],char const accession[],char const readGroup[])3614 static rc_t ProcessDB( VDatabase const *db, char const fullPath[],
3615                        char const accession[], char const readGroup[] )
3616 {
3617     char const *const refTableName = (   param->region_qty > 0
3618                                       || param->cg_evidence
3619                                       || param->cg_ev_dnb
3620                                       || param->cg_sam
3621                                       || param->reheader
3622                                       || param->primaries
3623                                       || param->secondaries
3624                                      )
3625                                    ? "REFERENCE"
3626                                    : 0;
3627     char const *const priTableName = param->primaries   ? "PRIMARY_ALIGNMENT" : 0;
3628     char const *const secTableName = param->secondaries ? "SECONDARY_ALIGNMENT" : 0;
3629     char const *const seqTableName = ( param->unaligned || param->reheader ) ? "SEQUENCE" : 0;
3630     char const *const eviTableName = ( param->cg_evidence || ( param->cg_ev_dnb & ( param->region_qty > 0 ) ) || param->cg_sam ) ? "EVIDENCE_INTERVAL" : 0;
3631     char const *const evaTableName = ( param->cg_ev_dnb   || param->cg_sam ) ? "EVIDENCE_ALIGNMENT" : 0;
3632 
3633     SAM_dump_ctx_t ctx;
3634     SCol align_cols[ ( sizeof( g_alg_col_tmpl ) / sizeof( g_alg_col_tmpl[ 0 ] ) ) * 4 ];
3635     SCol seq_cols[ sizeof( gSeqCol ) / sizeof( gSeqCol[ 0 ] ) ];
3636     rc_t rc = 0;
3637 
3638     memset( &ctx, 0, sizeof( ctx ) );
3639     memset( align_cols, 0, sizeof( align_cols ) );
3640     memset( seq_cols, 0, sizeof( seq_cols ) );
3641 
3642     ctx.db = db;
3643     ctx.fullPath = fullPath;
3644     ctx.accession = accession;
3645     ctx.readGroup = readGroup;
3646 
3647     DATASOURCE_INIT( ctx.seq, seqTableName );
3648     DATASOURCE_INIT( ctx.ref, refTableName );
3649     DATASOURCE_INIT( ctx.pri, priTableName );
3650     DATASOURCE_INIT( ctx.sec, secTableName );
3651     DATASOURCE_INIT( ctx.evi, eviTableName );
3652     DATASOURCE_INIT( ctx.eva, evaTableName );
3653 
3654     if ( ctx.seq.tbl.name )
3655     {
3656         rc = VDatabaseOpenTableRead( db, &ctx.seq.tbl.vtbl, "%s", ctx.seq.tbl.name );
3657         if ( rc == 0 )
3658         {
3659             ctx.seq.cols = seq_cols;
3660             memmove(seq_cols, gSeqCol, sizeof(gSeqCol));
3661             rc = Cursor_Open( &ctx.seq.tbl, &ctx.seq.curs, ctx.seq.cols, NULL );
3662         }
3663     }
3664     ctx.pri.cols = &align_cols[ 0 * ( sizeof( g_alg_col_tmpl ) / sizeof( g_alg_col_tmpl[ 0 ] ) ) ];
3665     ctx.sec.cols = &align_cols[ 1 * ( sizeof( g_alg_col_tmpl ) / sizeof( g_alg_col_tmpl[ 0 ] ) ) ];
3666     ctx.evi.cols = &align_cols[ 2 * ( sizeof( g_alg_col_tmpl ) / sizeof( g_alg_col_tmpl[ 0 ] ) ) ];
3667     ctx.eva.cols = &align_cols[ 3 * ( sizeof( g_alg_col_tmpl ) / sizeof( g_alg_col_tmpl[ 0 ] ) ) ];
3668 
3669     SetupColumns( &ctx.pri, edstt_PrimaryAlignment );
3670     SetupColumns( &ctx.sec, edstt_SecondaryAlignment );
3671     SetupColumns( &ctx.evi, edstt_EvidenceInterval );
3672     SetupColumns( &ctx.eva, edstt_EvidenceAlignment );
3673 
3674     if ( ctx.pri.tbl.name )
3675         VDatabaseOpenTableRead( db, &ctx.pri.tbl.vtbl, "%s", ctx.pri.tbl.name );
3676     if ( ctx.sec.tbl.name )
3677         VDatabaseOpenTableRead( db, &ctx.sec.tbl.vtbl, "%s", ctx.sec.tbl.name );
3678     if ( ctx.evi.tbl.name )
3679         VDatabaseOpenTableRead( db, &ctx.evi.tbl.vtbl, "%s", ctx.evi.tbl.name );
3680     if ( ctx.eva.tbl.name )
3681         VDatabaseOpenTableRead( db, &ctx.eva.tbl.vtbl, "%s", ctx.eva.tbl.name );
3682 
3683     if (   ctx.pri.tbl.vtbl == NULL
3684         && ctx.sec.tbl.vtbl == NULL
3685         && ctx.evi.tbl.vtbl == NULL
3686         && ctx.eva.tbl.vtbl == NULL )
3687     {
3688         memset (&ctx.pri, 0, sizeof( ctx.pri ) * 4 );
3689         if ( ctx.seq.tbl.name == NULL )
3690         {
3691             ctx.seq.tbl.name = "SEQUENCE";
3692             rc = VDatabaseOpenTableRead( db, &ctx.seq.tbl.vtbl, "%s", ctx.seq.tbl.name );
3693         }
3694         if ( rc == 0 )
3695         {
3696             ctx.seq.cols = seq_cols;
3697             memmove(seq_cols, gSeqCol, sizeof(gSeqCol));
3698             rc = Cursor_Open(&ctx.seq.tbl, &ctx.seq.curs, ctx.seq.cols, NULL);
3699             if ( rc == 0 )
3700             {
3701                 rc = DumpTable( &ctx );
3702                 Cursor_Close(&ctx.seq.curs);
3703             }
3704             VTableRelease(ctx.seq.tbl.vtbl);
3705         }
3706         VDatabaseRelease( db );
3707         return rc;
3708     }
3709     ReportResetDatabase( fullPath, db );
3710 
3711     if ( ctx.ref.tbl.name )
3712     {
3713         rc = VDatabaseOpenTableRead( db, &ctx.ref.tbl.vtbl, "%s", ctx.ref.tbl.name );
3714         ctx.ref.type = edstt_Reference;
3715     }
3716     if ( rc == 0 )
3717         rc = Cursor_Open( &ctx.pri.tbl, &ctx.pri.curs, ctx.pri.cols, ctx.seq.curs.cache );
3718     if ( rc == 0 )
3719         rc = Cursor_Open( &ctx.sec.tbl, &ctx.sec.curs, ctx.sec.cols, NULL );
3720     if ( rc == 0 )
3721         rc = Cursor_Open( &ctx.evi.tbl, &ctx.evi.curs, ctx.evi.cols, NULL );
3722     if ( rc == 0 )
3723         rc = Cursor_Open( &ctx.eva.tbl, &ctx.eva.curs, ctx.eva.cols, NULL );
3724     if ( rc == 0 )
3725         rc = DumpDB( &ctx );
3726 
3727     Cursor_Close(&ctx.ref.curs);
3728     Cursor_Close(&ctx.pri.curs);
3729     Cursor_Close(&ctx.sec.curs);
3730     Cursor_Close(&ctx.evi.curs);
3731     Cursor_Close(&ctx.eva.curs);
3732     Cursor_Close(&ctx.seq.curs);
3733 
3734     VTableRelease( ctx.ref.tbl.vtbl );
3735     VTableRelease( ctx.pri.tbl.vtbl );
3736     VTableRelease( ctx.sec.tbl.vtbl );
3737     VTableRelease( ctx.evi.tbl.vtbl );
3738     VTableRelease( ctx.eva.tbl.vtbl );
3739     VTableRelease( ctx.seq.tbl.vtbl );
3740 
3741     return rc;
3742 }
3743 
3744 #if 0
3745 rc_t ResolvePath( char const *accession, char const ** path )
3746 {
3747     rc_t rc = 0;
3748     static char tblpath[ 4096 ];
3749 #if TOOLS_USE_SRAPATH != 0
3750     static SRAPath* pmgr = NULL;
3751 
3752     if ( accession == NULL && path == NULL )
3753     {
3754         SRAPathRelease( pmgr );
3755         return 0;
3756     }
3757     if (   pmgr != NULL
3758         || ( rc = SRAPathMake( &pmgr, NULL ) ) == 0
3759         || ( GetRCState( rc ) == rcNotFound && GetRCTarget( rc ) == rcDylib ) )
3760     {
3761         *path = tblpath;
3762         tblpath[ 0 ] = '\0';
3763         rc = 0;
3764         do {
3765             if ( pmgr != NULL && !SRAPathTest( pmgr, accession ) )
3766             {
3767                 /* try to resolve the path using mgr */
3768                 rc = SRAPathFind( pmgr, accession, tblpath, sizeof( tblpath ) );
3769                 if ( rc == 0 )
3770                     break;
3771             }
3772             if ( string_size( accession ) >= sizeof( tblpath ) )
3773             {
3774                 rc = RC( rcExe, rcPath, rcResolving, rcBuffer, rcInsufficient );
3775             }
3776             else
3777             {
3778                 string_copy( tblpath, ( sizeof tblpath ) - 1, accession, string_size( accession ) );
3779                 rc = 0;
3780             }
3781         } while( false );
3782     }
3783 #else
3784     if ( accession != NULL && path != NULL )
3785     {
3786         *path = tblpath;
3787         string_copy( tblpath, ( sizeof tblpath ) - 1, accession, string_size( accession ) );
3788     }
3789 #endif
3790 
3791     return rc;
3792 }
3793 #endif
3794 
3795 
ProcessPath(VDBManager const * mgr,char const Path[])3796 static rc_t ProcessPath( VDBManager const *mgr, char const Path[] )
3797 {
3798 #if 0
3799     unsigned pathlen = string_size( Path );
3800     char *readGroup = NULL;
3801     char *path = NULL;
3802     unsigned i;
3803     rc_t rc;
3804 
3805     /* strip trailing path seperators */
3806     for ( i = pathlen; i > 0; )
3807     {
3808         int const ch = Path[ --i ];
3809 
3810         if ( ch == '/' || ch == '\\' )
3811             --pathlen;
3812         else
3813             break;
3814     }
3815 
3816     /* find read group alias */
3817     for ( i = pathlen; i > 0; )
3818     {
3819         int const ch = Path[ --i ];
3820 
3821         if ( ch == '=' )
3822         {
3823             unsigned const pos = i + 1;
3824             unsigned const len = pathlen - pos;
3825 
3826             pathlen = i;
3827 
3828             readGroup = malloc( len + 1 );
3829             if ( readGroup == NULL )
3830                 return RC( rcExe, rcArgv, rcProcessing, rcMemory, rcExhausted );
3831             memmove( readGroup, &Path[ pos ], len );
3832             readGroup[ len ] = '\0';
3833             break;
3834         }
3835     }
3836 
3837     path = malloc( pathlen + 1 );
3838     if ( path == NULL )
3839         return RC( rcExe, rcArgv, rcProcessing, rcMemory, rcExhausted );
3840     memmove( path, Path, pathlen );
3841     path[ pathlen ] = '\0';
3842 
3843     rc = ReportResetObject( path );
3844     if ( rc == 0 )
3845     {
3846         char const *fullPath;
3847 
3848         rc = ResolvePath( path, &fullPath );
3849         if ( rc == 0 )
3850         {
3851             char const *accession = fullPath;
3852             VDatabase const *db;
3853 
3854             /* use last path element as accession */
3855             for ( i = pathlen; i > 0; )
3856             {
3857                 int const ch = path[ --i ];
3858 
3859                 if ( ch == '/' || ch == '\\' )
3860                 {
3861                     accession = &fullPath[ i + 1 ];
3862                     break;
3863                 }
3864             }
3865             rc = VDBManagerOpenDBRead( mgr, &db, NULL, "%s", fullPath );
3866             if ( rc == 0 )
3867             {
3868                 rc = ProcessDB( db, fullPath, accession, readGroup );
3869                 VDatabaseRelease( db );
3870             }
3871             else
3872                 rc = ProcessTable( mgr, fullPath, accession, readGroup );
3873         }
3874     }
3875     free( path );
3876     free( readGroup );
3877     return rc;
3878 
3879 #else
3880 
3881     char * pc;
3882     rc_t rc = 0;
3883     size_t pathlen = string_size ( Path );
3884 
3885     /* look for scheme */
3886     pc = string_chr ( Path, pathlen, ':' );
3887     if (pc == NULL)
3888     {
3889         char *readGroup;
3890         char *path;
3891         size_t i;
3892 
3893         /* no scheme found - must be simple fs path */
3894     /* old: ---unused label */
3895         readGroup = NULL;
3896         path = NULL;
3897 
3898         /* strip trailing path separators */
3899         for ( i = pathlen; i > 0; -- pathlen )
3900         {
3901             int const ch = Path[ --i ];
3902             if ( ch != '/' && ch != '\\' )
3903                 break;
3904         }
3905 
3906         /* find read group alias */
3907         for ( i = pathlen; i > 0; )
3908         {
3909             int const ch = Path[ --i ];
3910 
3911             if ( ch == '=' )
3912             {
3913                 size_t const pos = i + 1;
3914                 size_t const len = pathlen - pos;
3915 
3916                 pathlen = i;
3917 
3918                 readGroup = malloc( len + 1 );
3919                 if ( readGroup == NULL )
3920                     return RC( rcExe, rcArgv, rcProcessing, rcMemory, rcExhausted );
3921                 memmove( readGroup, &Path[ pos ], len );
3922                 readGroup[ len ] = '\0';
3923                 break;
3924             }
3925         }
3926 
3927         path = malloc( pathlen + 1 );
3928         if ( path == NULL )
3929             return RC( rcExe, rcArgv, rcProcessing, rcMemory, rcExhausted );
3930         memmove( path, Path, pathlen );
3931         path[ pathlen ] = '\0';
3932 
3933         rc = ReportResetObject( path );
3934         if ( rc == 0 )
3935         {
3936             char const *accession = path;
3937             VDatabase const *db;
3938 
3939             /* use last path element as accession */
3940             for ( i = pathlen; i > 0; )
3941             {
3942                 int const ch = path[ --i ];
3943 
3944                 if ( ch == '/' || ch == '\\' )
3945                 {
3946                     accession = &path[ i + 1 ];
3947                     break;
3948                 }
3949             }
3950             rc = VDBManagerOpenDBRead( mgr, &db, NULL, "%s", path );
3951             if ( rc == 0 )
3952             {
3953                 rc = ProcessDB( db, path, accession, readGroup );
3954                 if (UIError(rc, db, NULL)) {
3955                     UIDatabaseLOGError(rc, db, true);
3956 /*                  *error_reported = true; */
3957                 }
3958                 VDatabaseRelease( db );
3959             }
3960             else {
3961                 rc = ProcessTable( mgr, path, accession, readGroup );
3962                 if (UIError(rc, NULL, NULL)) {
3963                     UITableLOGError(rc, NULL, true);
3964 /*                  *error_reported = true; */
3965                 }
3966             }
3967         }
3968         free( path );
3969         free( readGroup );
3970         return rc;
3971     }
3972     else
3973     {
3974         VFSManager * vfs;
3975         rc = VFSManagerMake ( & vfs );
3976         if ( rc == 0 )
3977         {
3978             VPath * vpath;
3979             char * basename;
3980             size_t zz;
3981             char buffer [8193];
3982             char readgroup_ [257];
3983             char * readgroup;
3984 
3985             rc = VFSManagerMakePath (vfs, &vpath, "%s", Path);
3986             VFSManagerRelease ( vfs );
3987             if ( rc == 0 )
3988             {
3989                 char * ext;
3990 
3991                 rc = VPathReadPath (vpath, buffer, sizeof buffer - 1, &zz);
3992                 if ( rc == 0 )
3993                 {
3994                 loop:
3995                     basename = string_rchr (buffer, zz, '/');
3996                     if ( basename == NULL )
3997                         basename = buffer;
3998                     else
3999                     {
4000                         if (basename[1] == '\0')
4001                         {
4002                             basename[0] = '\0';
4003                             goto loop;
4004                         }
4005                         ++ basename;
4006                         zz -= basename - buffer;
4007                     }
4008 
4009                     /* cut off [.lite].[c]sra[.nenc||.ncbi_enc] if any */
4010                     ext = string_rchr( basename, zz, '.' );
4011                     if ( ext != NULL )
4012                     {
4013                         if ( strcasecmp( ext, ".nenc" ) == 0 || strcasecmp( ext, ".ncbi_enc" ) == 0 )
4014                         {
4015                             zz -= ext - basename;
4016                             *ext = '\0';
4017                             ext = string_rchr( basename, zz, '.' );
4018                         }
4019 
4020                         if ( ext != NULL )
4021                         {
4022                             if ( strcasecmp( ext, ".sra" ) == 0 || strcasecmp( ext, ".csra" ) == 0 )
4023                             {
4024                                 zz -= ext - basename;
4025                                 *ext = '\0';
4026                                 ext = string_rchr( basename, zz, '.' );
4027                                 if ( ext != NULL && strcasecmp( ext, ".lite" ) == 0 )
4028                                     *ext = '\0';
4029                             }
4030                         }
4031                     }
4032 
4033                     rc = VPathOption (vpath, vpopt_readgroup, readgroup_,
4034                                       sizeof readgroup_ - 1, &zz);
4035                     if ( rc == 0 )
4036                         readgroup = readgroup_;
4037                     else if ( GetRCState ( rc ) == rcNotFound )
4038                     {
4039                         rc = 0;
4040                         readgroup = NULL;
4041                     }
4042 
4043 
4044                     if ( rc == 0 )
4045                     {
4046                         VDatabase const *db;
4047 
4048                         rc = VDBManagerOpenDBRead( mgr, &db, NULL, "%s", Path );
4049                         if ( rc == 0 )
4050                         {
4051                             rc = ProcessDB( db, Path, basename, readgroup );
4052                             VDatabaseRelease( db );
4053                         }
4054                         else
4055                             rc = ProcessTable( mgr, Path, basename, readgroup );
4056                     }
4057                 }
4058 
4059                 VPathRelease (vpath);
4060             }
4061         }
4062     }
4063     return rc;
4064 #endif
4065 }
4066 
4067 /*
4068 ver_t CC KAppVersion( void )
4069 {
4070     return SAM_DUMP_VERS;
4071 }
4072 */
4073 
4074 char const *unaligned_usage[] = {"Output unaligned reads", NULL};
4075 char const *unaligned_only_usage[] = {"Only output unaligned spots", NULL};
4076 char const *primaryonly_usage[] = {"Output only primary alignments", NULL};
4077 char const *cigartype_usage[] = {"Output long version of CIGAR", NULL};
4078 char const *cigarCG_usage[] = {"Output CG version of CIGAR", NULL};
4079 char const *cigarCGMerge_usage[] = {"Apply CG fixups to CIGAR/SEQ/QUAL and outputs CG-specific columns", NULL};
4080 char const *CG_evidence[] = {"Output CG evidence aligned to reference", NULL};
4081 char const *CG_ev_dnb[] = {"Output CG evidence DNB's aligned to evidence", NULL};
4082 char const *CG_SAM[] = {"Output CG evidence DNB's aligned to reference ", NULL};
4083 char const *CG_mappings[] = {"Output CG sequences aligned to reference ", NULL};
4084 char const *header_usage[] = {"Replace original BAM header with a reconstructed one", NULL};
4085 char const *noheader_usage[] = {"Do not output headers", NULL};
4086 char const *region_usage[] = {"Filter by position on genome.",
4087                               "Name can either be file specific name (ex: \"chr1\" or \"1\").",
4088                               "\"from\" and \"to\" are 1-based coordinates", NULL};
4089 char const *distance_usage[] = {"Filter by distance between matepairs.",
4090                                 "Use \"unknown\" to find matepairs split between the references.",
4091                                 "Use from-to to limit matepair distance on the same reference", NULL};
4092 char const *seq_id_usage[] = {"Print reference SEQ_ID in RNAME instead of NAME", NULL};
4093 char const *identicalbases_usage[] = {"Output '=' if base is identical to reference", NULL};
4094 char const *gzip_usage[] = {"Compress output using gzip", NULL};
4095 char const *bzip2_usage[] = {"Compress output using bzip2", NULL};
4096 char const *qname_usage[] = {"Add .SPOT_GROUP to QNAME", NULL};
4097 char const *fasta_usage[] = {"Produce Fasta formatted output", NULL};
4098 char const *fastq_usage[] = {"Produce FastQ formatted output", NULL};
4099 char const *prefix_usage[] = {"Prefix QNAME: prefix.QNAME", NULL};
4100 char const *reverse_usage[] = {"Reverse unaligned reads according to read type", NULL};
4101 char const *comment_usage[] = {"Add comment to header. Use multiple times for several lines. Use quotes", NULL};
4102 char const *XI_usage[] = {"Output cSRA alignment id in XI column", NULL};
4103 char const *qual_quant_usage[] = {"Quality scores quantization level",
4104                                   "a string like '1:10,10:20,20:30,30:-'", NULL};
4105 char const *CG_names[] = { "Generate CG friendly read names", NULL};
4106 
4107 char const *usage_params[] =
4108 {
4109     NULL,                       /* unaligned */
4110     NULL,                       /* unaligned-only */
4111     NULL,                       /* primaryonly */
4112     NULL,                       /* cigartype */
4113     NULL,                       /* cigarCG */
4114     NULL,                       /* header */
4115     NULL,                       /* no-header */
4116     "text",                     /* comment */
4117     "name[:from-to]",           /* region */
4118     "from-to|'unknown'",        /* distance */
4119     NULL,                       /* seq-id */
4120     NULL,                       /* identical-bases */
4121     NULL,                       /* gzip */
4122     NULL,                       /* bzip2 */
4123     NULL,                       /* qname */
4124     NULL,                       /* fasta */
4125     NULL,                       /* fastq */
4126     "prefix",                   /* prefix */
4127     NULL,                       /* reverse */
4128     NULL,                       /* test-rows */
4129     NULL,                       /* mate-row-gap-cacheable */
4130     NULL,                       /* cigarCGMerge */
4131     NULL,                       /* XI */
4132     "quantization string",      /* qual-quant */
4133     NULL,                       /* cigarCGMerge */
4134     NULL,                       /* CG-evidence */
4135     NULL,                       /* CG-ev-dnb */
4136     NULL,                       /* CG-mappings */
4137     NULL,                       /* CG-SAM */
4138     NULL                        /* CG-names */
4139 };
4140 
4141 enum eArgs
4142 {
4143     earg_unaligned = 0,         /* unaligned */
4144     earg_unaligned_only,        /* unaligned */
4145     earg_prim_only,             /* primaryonly */
4146     earg_cigartype,             /* cigartype */
4147     earg_cigarCG,               /* cigarCG */
4148     earg_header,                /* header */
4149     earg_noheader,              /* no-header */
4150     earg_comment,               /* comment */
4151     earg_region,                /* region */
4152     earg_distance,              /* distance */
4153     earg_seq_id,                /* seq-id */
4154     earg_identicalbases,        /* identical-bases */
4155     earg_gzip,                  /* gzip */
4156     earg_bzip2,                 /* bzip2 */
4157     earg_qname,                 /* qname */
4158     earg_fastq,                 /* fasta */
4159     earg_fasta,                 /* fastq */
4160     earg_prefix,                /* prefix */
4161     earg_reverse,               /* reverse */
4162     earg_test_rows,             /* test-rows */
4163     earg_mate_row_gap_cachable, /* mate-row-gap-cacheable */
4164     earg_cigarCG_merge,         /* cigarCGMerge */
4165     earg_XI,                    /* XI */
4166     earg_QualQuant,             /* qual-quant */
4167     earg_CG_evidence,           /* CG-evidence */
4168     earg_CG_ev_dnb,             /* CG-ev-dnb */
4169     earg_CG_mappings,           /* CG-mappings */
4170     earg_CG_SAM,                /* CG-SAM */
4171     earg_CG_names               /* CG-names */
4172 };
4173 
4174 OptDef DumpArgs[] =
4175 {
4176     { "unaligned", "u", NULL, unaligned_usage, 0, false, false },           /* unaligned */
4177     { "unaligned-only", "U", NULL, unaligned_usage, 0, false, false },      /* unaligned-only */
4178     { "primary", "1", NULL, primaryonly_usage, 0, false, false },           /* primaryonly */
4179     { "cigar-long", "c", NULL, cigartype_usage, 0, false, false },          /* cigartype */
4180     { "cigar-CG", NULL, NULL, cigarCG_usage, 0, false, false },             /* cigarCG */
4181     { "header", "r", NULL, header_usage, 0, false, false },                 /* header */
4182     { "no-header", "n", NULL, noheader_usage, 0, false, false },            /* no-header */
4183     { "header-comment", NULL, NULL, comment_usage, 0, true, false },        /* comment */
4184     { "aligned-region", NULL, NULL, region_usage, 0, true, false },         /* region */
4185     { "matepair-distance", NULL, NULL, distance_usage, 0, true, false },    /* distance */
4186     { "seqid", "s", NULL, seq_id_usage, 0, false, false },                  /* seq-id */
4187     { "hide-identical", "=", NULL, identicalbases_usage, 0, false, false }, /* identical-bases */
4188     { "gzip", NULL, NULL, gzip_usage, 0, false, false },                    /* gzip */
4189     { "bzip2", NULL, NULL, bzip2_usage, 0, false, false },                  /* bzip2 */
4190     { "spot-group", "g", NULL, qname_usage, 0, false, false },              /* qname */
4191     { "fastq", NULL, NULL, fastq_usage, 0, false, false },                  /* fasta */
4192     { "fasta", NULL, NULL, fasta_usage, 0, false, false },                  /* fastq */
4193     { "prefix", "p", NULL, prefix_usage, 0, true, false },                  /* prefix */
4194     { "reverse", NULL, NULL, reverse_usage, 0, false, false },              /* reverse */
4195     { "test-rows", NULL, NULL, NULL, 0, true, false },                      /* test-rows */
4196     { "mate-cache-row-gap", NULL, NULL, NULL, 0, true, false },             /* mate-row-gap-cacheable */
4197     { "cigar-CG-merge", NULL, NULL, cigarCGMerge_usage, 0, false, false },  /* cigarCGMerge */
4198     { "XI", NULL, NULL, XI_usage, 0, false, false },                        /* XI */
4199     { "qual-quant", "Q", NULL, qual_quant_usage, 0, true, false },          /* qual-quant */
4200     { "CG-evidence", NULL, NULL, CG_evidence, 0, false, false },            /* CG-evidence */
4201     { "CG-ev-dnb"  , NULL, NULL, CG_ev_dnb  , 0, false, false },            /* CG-ev-dnb */
4202     { "CG-mappings", NULL, NULL, CG_mappings, 0, false, false },            /* CG-mappings */
4203     { "CG-SAM", NULL, NULL, CG_SAM, 0, false, false },                      /* CG-SAM */
4204     { "CG-names", NULL, NULL, CG_names, 0, false, false },                  /* CG-names */
4205     { "legacy", NULL, NULL, NULL, 0, false, false }
4206 };
4207 
4208 
4209 /*
4210 const char UsageDefaultName[] = "sam-dump";
4211 
4212 
4213 rc_t CC UsageSummary( char const *progname )
4214 {
4215     return KOutMsg( "Usage:\n"
4216         "\t%s [options] path-to-run[ path-to-run ...] > output.sam\n\n", progname );
4217 }
4218 
4219 
4220 rc_t CC Usage( Args const *args)
4221 {
4222     char const *progname = UsageDefaultName;
4223     char const *fullpath = UsageDefaultName;
4224     rc_t rc;
4225     unsigned i;
4226 
4227     rc = ArgsProgram( args, &fullpath, &progname );
4228 
4229     UsageSummary( progname );
4230 
4231     KOutMsg( "Options:\n" );
4232     for( i = 0; i < sizeof( DumpArgs ) / sizeof( DumpArgs[ 0 ] ); i++ )
4233     {
4234         if ( DumpArgs[ i ].help != NULL )
4235         {
4236             HelpOptionLine( DumpArgs[ i ].aliases, DumpArgs[ i ].name,
4237                             usage_params[ i ], DumpArgs[ i ].help);
4238         }
4239     }
4240     KOutMsg( "\n" );
4241     HelpOptionsStandard();
4242 
4243     HelpVersion( fullpath, KAppVersion() );
4244 
4245     return rc;
4246 }
4247 */
4248 
4249 #define ARGS_COUNT (sizeof(DumpArgs)/sizeof(DumpArgs[0]))
4250 
4251 
ParamCount(Args const * const args,rc_t * const prc)4252 static unsigned ParamCount( Args const *const args, rc_t *const prc )
4253 {
4254     uint32_t y = 0;
4255     rc_t rc = ArgsParamCount( args, &y );
4256 
4257     if ( prc != NULL )
4258         *prc = rc;
4259     return y;
4260 }
4261 
4262 
ArgCount(Args const * const args,rc_t * const prc)4263 static unsigned ArgCount( Args const *const args, rc_t *const prc )
4264 {
4265     uint32_t y = 0;
4266     rc_t rc = ArgsArgvCount( args, &y );
4267 
4268     if ( prc != NULL )
4269         *prc = rc;
4270     return y;
4271 }
4272 
4273 
CountArgs(Args const * const args,unsigned count[],char const ** const errmsg)4274 static rc_t CountArgs( Args const *const args, unsigned count[],
4275                        char const **const errmsg )
4276 {
4277     rc_t rc;
4278     unsigned const pcount = ParamCount( args, &rc );
4279 
4280     memset( count, 0, ARGS_COUNT );
4281     if ( rc != 0 || pcount == 0 )
4282     {
4283         *errmsg = "";
4284         MiniUsage( args );
4285         return ( ArgCount( args, NULL ) < 2 )
4286              ? 0
4287              : RC( rcExe, rcArgv, rcParsing, rcParam, rcInsufficient );
4288     }
4289 #define COUNT_ARG(E) do {\
4290     char const *const optname = DumpArgs[ (E) ].name; \
4291     if ( ( rc = ArgsOptionCount( args, optname, &count[ (E) ] ) ) != 0 ) { \
4292         *errmsg = optname; \
4293         return rc; \
4294     } } while( 0 )
4295 
4296 
4297     /* record source options */
4298     COUNT_ARG( earg_unaligned );
4299     COUNT_ARG( earg_unaligned_only );
4300     COUNT_ARG( earg_prim_only );
4301     COUNT_ARG( earg_CG_evidence );
4302     COUNT_ARG( earg_CG_ev_dnb );
4303     COUNT_ARG( earg_CG_mappings );
4304     COUNT_ARG( earg_CG_SAM );
4305 
4306     /* record filter options */
4307     COUNT_ARG( earg_region );
4308     COUNT_ARG( earg_distance );
4309 
4310     /* output format options */
4311     COUNT_ARG( earg_fastq );
4312     COUNT_ARG( earg_fasta );
4313 
4314     /* SAM header options */
4315     COUNT_ARG( earg_header );
4316     COUNT_ARG( earg_noheader );
4317     COUNT_ARG( earg_comment );
4318 
4319     /* SAM field options */
4320     COUNT_ARG( earg_prefix );
4321     COUNT_ARG( earg_qname );
4322     COUNT_ARG( earg_seq_id );
4323     COUNT_ARG( earg_CG_names );
4324 
4325     COUNT_ARG( earg_cigartype );
4326     COUNT_ARG( earg_cigarCG );
4327     COUNT_ARG( earg_cigarCG_merge );
4328 
4329     COUNT_ARG( earg_identicalbases );
4330     COUNT_ARG( earg_reverse );
4331     COUNT_ARG( earg_QualQuant );
4332 
4333     /* output encoding options */
4334     COUNT_ARG( earg_gzip );
4335     COUNT_ARG( earg_bzip2 );
4336 
4337     COUNT_ARG( earg_mate_row_gap_cachable );
4338 
4339     /* debug options */
4340     COUNT_ARG( earg_XI );
4341     COUNT_ARG( earg_test_rows );
4342 #undef COUNT_ARG
4343     return 0;
4344 }
4345 
4346 
GetOptValU(Args const * const args,char const * const argname,unsigned const defval,rc_t * const prc)4347 static unsigned GetOptValU( Args const *const args, char const *const argname,
4348                             unsigned const defval, rc_t *const prc )
4349 {
4350     unsigned y = defval;
4351     char const *valstr;
4352     rc_t rc = ArgsOptionValue( args, argname, 0, (const void **)&valstr );
4353 
4354     if ( rc == 0 )
4355     {
4356         char *endp;
4357 
4358         y = strtou32( valstr, &endp, 0 );
4359         if ( *endp != '\0' )
4360         {
4361             rc = RC( rcExe, rcArgv, rcProcessing, rcParam, rcInvalid );
4362             y = 0;
4363         }
4364         if ( y == 0 )
4365             y = defval;
4366     }
4367     if ( prc )
4368         *prc = rc;
4369     return y;
4370 }
4371 
4372 
GetComments(Args const * const args,struct params_s * const parms,unsigned const n)4373 static rc_t GetComments( Args const *const args, struct params_s *const parms, unsigned const n )
4374 {
4375     parms->comments = calloc( n + 1, sizeof( parms->comments[ 0 ] ) );
4376     if ( parms->comments != NULL )
4377     {
4378         unsigned i;
4379 
4380         for ( i = 0; i < n; ++i )
4381         {
4382             rc_t const rc = ArgsOptionValue( args, DumpArgs[ earg_comment ].name, i, (const void **)&parms->comments[ i ] );
4383             if ( rc != 0 )
4384             {
4385                 free( ( void * )parms->comments );
4386                 parms->comments = NULL;
4387                 return rc;
4388             }
4389         }
4390         return 0;
4391     }
4392     else
4393         return RC( rcExe, rcArgv, rcProcessing, rcMemory, rcExhausted );
4394 }
4395 
4396 
ParseFromTo(int * const pFrom,int * const pTo,char const str[])4397 static rc_t ParseFromTo( int *const pFrom, int *const pTo, char const str[] )
4398 {
4399     /* $str =~ /((\d+)-(\d+))|(-(\d+))|(\d+)/ */
4400     int n;
4401     char fr_str[ 16 ];
4402     char to_str[ 16 ];
4403     uint32_t fr = 0;
4404     uint32_t to = 0;
4405     int i = sscanf( str, "%15[0-9]-%15[0-9]%n", fr_str, to_str, &n );
4406 
4407     if ( i != 2 )
4408     {
4409         unsigned const offset = ( str[ 0 ] == '-' ) ? 1 : 0;
4410 
4411         fr = 0;
4412         i = sscanf( str + offset, "%15[0-9]%n", to_str, &n );
4413         if ( i != 1 )
4414             return RC( rcExe, rcArgv, rcProcessing, rcParam, rcOutofrange );
4415         to = strtou32( to_str, 0, 0 );
4416     }
4417     else
4418     {
4419         fr = strtou32( fr_str, 0, 0 );
4420         to = strtou32( to_str, 0, 0 );
4421 
4422         if ( to < fr )
4423         {
4424             uint32_t const tmp = to;
4425             to = fr;
4426             fr = tmp;
4427         }
4428     }
4429     /* was the entire string consumed */
4430     if ( str[ n ] != 0 )
4431         return RC( rcExe, rcArgv, rcProcessing, rcParam, rcInvalid );
4432     *pFrom = ( int )fr;
4433     *pTo = ( int )to;
4434     return 0;
4435 }
4436 
4437 
GetDistance(Args const * const args,struct params_s * const parms,unsigned const n)4438 static rc_t GetDistance( Args const *const args, struct params_s *const parms, unsigned const n )
4439 {
4440     rc_t rc;
4441     TMatepairDistance *const mpd = calloc( n, sizeof( mpd[ 0 ] ) );
4442 
4443     if ( mpd == NULL )
4444         rc = RC( rcExe, rcArgv, rcProcessing, rcMemory, rcExhausted );
4445     else
4446     {
4447         unsigned i;
4448         unsigned j;
4449 
4450         for ( j = i = 0; i < n; ++i )
4451         {
4452             char const *valstr;
4453             int from = 0;
4454             int to = 0;
4455 
4456             rc = ArgsOptionValue( args, DumpArgs[earg_distance].name, i, (const void **)&valstr );
4457             if ( rc != 0 )
4458                 break;
4459             if ( strcasecmp( valstr, "unknown" ) == 0 )
4460             {
4461                 parms->mp_dist_unknown = true;
4462                 continue;
4463             }
4464             rc = ParseFromTo( &from, &to, valstr );
4465             if ( rc != 0 )
4466                 break;
4467             if ( ( from != 0 ) || ( to != 0 ) )
4468             {
4469                 mpd[ j ].from = from;
4470                 mpd[ j ].to = to;
4471                 ++j;
4472             }
4473         }
4474         if ( rc == 0 )
4475         {
4476             parms->mp_dist = mpd;
4477             parms->mp_dist_qty = j;
4478 
4479             SAM_DUMP_DBG( 2, ( "filter by mate pair distance\n" ) );
4480             if ( parms->mp_dist_unknown )
4481                 SAM_DUMP_DBG( 2, ( "    distance: unknown\n" ) );
4482             for ( i = 0; i < j; ++i )
4483             {
4484                 SAM_DUMP_DBG( 2, ( "    distance [%lu-%lu]\n",
4485                               parms->mp_dist[ i ].from, parms->mp_dist[ i ].to ) );
4486             }
4487         }
4488         else
4489             free( mpd );
4490     }
4491     return rc;
4492 }
4493 
4494 
GetRegion(Args const * const args,struct params_s * const parms,unsigned const n)4495 static rc_t GetRegion( Args const *const args, struct params_s *const parms, unsigned const n )
4496 {
4497     rc_t rc;
4498     TAlignedRegion *const reg = calloc( n, sizeof( reg[ 0 ] ) );
4499 
4500     if ( reg == NULL )
4501         rc = RC( rcExe, rcArgv, rcProcessing, rcMemory, rcExhausted );
4502     else
4503     {
4504         unsigned i;
4505         unsigned j;
4506 
4507         for ( rc = 0, j = i = 0; i < n; ++i )
4508         {
4509             char const *valstr;
4510             char const *sep;
4511             unsigned namelen;
4512             unsigned f;
4513             unsigned e;
4514             TAlignedRegion r;
4515 
4516 	    memset(&r,0,sizeof(r));
4517 
4518             rc = ArgsOptionValue( args, DumpArgs[ earg_region ].name, i, (const void **)&valstr );
4519             if ( rc != 0 ) break;
4520 
4521             sep = strchr( valstr, ':' );
4522             if ( sep != NULL )
4523                 namelen = sep - valstr;
4524             else
4525                 namelen = string_size( valstr );
4526 
4527             if ( namelen >= sizeof( r.name ) )
4528                 return RC( rcExe, rcArgv, rcProcessing, rcParam, rcTooLong );
4529 
4530             memmove( r.name, valstr, namelen );
4531             r.name[ namelen ] = '\0';
4532 
4533             r.rq = UINT_MAX;
4534             if ( sep != NULL )
4535             {
4536                 int from = -1;
4537                 int to = -1;
4538 
4539                 rc = ParseFromTo( &from, &to, sep + 1 );
4540                 if ( rc != 0 )
4541                     break;
4542                 if ( from > 0 ) /* convert to 0-based */
4543                     --from;
4544                 else if ( to > 0 )
4545                     from = 0;
4546                 if ( to > 0 )
4547                     --to;
4548                 else if ( from >= 0 && to < 0 )
4549                     to = from;
4550                 if ( from >= 0 || to >= 0 )
4551                 {
4552                     if ( from > to )
4553                     {
4554                         int tmp = to;
4555                         to = from;
4556                         from = tmp;
4557                     }
4558                     r.r[ 0 ].from = from;
4559                     r.r[ 0 ].to = to;
4560                     r.rq = 1;
4561                 }
4562             }
4563 
4564             for ( f = 0, e = j; f < e; )
4565             {
4566                 unsigned const m = ( ( f + e ) / 2 );
4567                 int const diff = strcmp( r.name, reg[ m ].name );
4568 
4569                 if ( diff < 0 )
4570                     e = m;
4571                 else
4572                     f = m + 1;
4573             }
4574             if ( 0 < e && e < j && strcmp( r.name, reg[ e-1 ].name ) == 0 )
4575             {
4576                 TAlignedRegion *const fnd = &reg[ e - 1 ];
4577 
4578                 if ( fnd->rq != UINT_MAX )
4579                 {
4580                     for ( f = 0, e = fnd->rq; f < e; )
4581                     {
4582                         unsigned const m = ( ( f + e ) / 2 );
4583 
4584                         if ( r.r[ 0 ].from < fnd->r[ m ].from )
4585                             e = m;
4586                         else
4587                             f = m + 1;
4588                     }
4589                     if ( fnd->rq >= ( sizeof( r.r ) / sizeof( r.r[0] ) ) )
4590                     {
4591                         rc = RC( rcExe, rcArgv, rcProcessing, rcBuffer, rcInsufficient );
4592                         break;
4593                     }
4594                     memmove( &fnd->r[ e +1 ], &fnd->r[ e ], ( fnd->rq - e ) * sizeof( fnd->r[ 0 ] ) );
4595                     fnd->r[ e ] = r.r[ 0 ];
4596                     ++fnd->rq;
4597                 }
4598             }
4599             else
4600             {
4601                 memmove( &reg[ e + 1 ], &reg[ e ], ( j - e ) * sizeof( reg[ 0 ] ) );
4602                 reg[ e ] = r;
4603                 ++j;
4604             }
4605         }
4606         if ( rc == 0 )
4607         {
4608             parms->region = reg;
4609             parms->region_qty = j;
4610 
4611             for ( i = 0; i < parms->region_qty; ++i )
4612             {
4613                 TAlignedRegion *const r = &parms->region[ i ];
4614 
4615                 SAM_DUMP_DBG( 2, ( "filter by %s\n", r->name ) );
4616                 if ( r->rq == UINT_MAX )
4617                 {
4618                     r->rq = 1;
4619                     r->r[ 0 ].from = 0;
4620                     r->r[ 0 ].to = UINT_MAX;
4621                 }
4622                 for ( j = 0; j < r->rq; ++j )
4623                 {
4624                     unsigned const to = r->r[ j ].to;
4625                     unsigned const from = r->r[ j ].from;
4626 
4627                     SAM_DUMP_DBG( 2, ( "   range: [%u:%u]\n", r->r[ j ].from, to ) );
4628                     if ( r->max_to < to ) r->max_to = to;
4629                     if ( r->min_from < from ) r->min_from = from;
4630                 }
4631             }
4632         }
4633         else
4634             free( reg );
4635     }
4636     return rc;
4637 }
4638 
4639 
GetQualQuant(Args const * const args,struct params_s * const parms)4640 static rc_t GetQualQuant( Args const *const args, struct params_s *const parms )
4641 {
4642     char const *valstr;
4643     int i;
4644     rc_t rc = ArgsOptionValue( args, DumpArgs[ earg_QualQuant ].name, 0, (const void **)&valstr );
4645 
4646     if ( rc == 0 && strcmp( valstr, "0" ) != 0 )
4647         parms->quantizeQual = QualityQuantizerInitMatrix( parms->qualQuant, valstr );
4648     for(i=1; parms->qualQuant[i]==parms->qualQuant[0] && i<256;i++){}
4649     if(i<256) parms->qualQuantSingle=0;
4650     else      parms->qualQuantSingle = parms->qualQuant[0];
4651     return rc;
4652 }
4653 
4654 
GetArgs(Args const * const args,unsigned const count[],char const ** const errmsg)4655 static rc_t GetArgs( Args const *const args, unsigned const count[],
4656                      char const **const errmsg )
4657 {
4658     static struct params_s parms;
4659     bool const multipass = ParamCount( args, 0 ) > 1;
4660     rc_t rc;
4661 
4662     memset( &parms, 0, sizeof( parms ) );
4663 
4664     if ( count[ earg_comment ] )
4665     {
4666         rc = GetComments( args, &parms, count[ earg_comment ] );
4667         if ( rc != 0 )
4668         {
4669             *errmsg = DumpArgs[ earg_comment ].name;
4670             return rc;
4671         }
4672     }
4673 
4674     if ( count[ earg_region ] && count[ earg_unaligned_only ] == 0 )
4675     {
4676         rc = GetRegion( args, &parms, count[ earg_region ] );
4677         if ( rc != 0 )
4678         {
4679             *errmsg = DumpArgs[ earg_region ].name;
4680             return rc;
4681         }
4682     }
4683 
4684     if ( count[ earg_distance ] && count[ earg_unaligned_only ] == 0 )
4685     {
4686         rc = GetDistance( args, &parms, count[ earg_distance ] );
4687         if ( rc != 0 )
4688         {
4689             *errmsg = DumpArgs[ earg_distance ].name;
4690             return rc;
4691         }
4692     }
4693 
4694     if ( count[ earg_QualQuant ] )
4695     {
4696         rc = GetQualQuant( args, &parms );
4697         if ( rc != 0 )
4698         {
4699             *errmsg = DumpArgs[ earg_QualQuant ].name;
4700             return rc;
4701         }
4702     }
4703 
4704     parms.cg_sam = ( count[ earg_CG_SAM ] != 0 );
4705     parms.cg_evidence = ( count[ earg_CG_evidence ] != 0 );
4706     parms.cg_ev_dnb = ( count[ earg_CG_ev_dnb ] != 0 );
4707 
4708     if ( count[ earg_CG_mappings ] == 0 &&
4709          ( parms.cg_sam || parms.cg_evidence || parms.cg_ev_dnb ) )
4710     {
4711         parms.primaries = false;
4712         parms.secondaries = false;
4713         parms.unaligned = false;
4714     }
4715     else
4716     {
4717         parms.primaries = true;
4718         parms.secondaries = ( count[ earg_prim_only ] == 0 );
4719         parms.unaligned = ( ( count[ earg_unaligned ] != 0 ) /*&& ( parms.region_qty == 0 )*/ );
4720     }
4721 
4722     parms.long_cigar = ( count[ earg_cigartype ] != 0 );
4723     parms.use_seqid = ( ( count[ earg_seq_id ] != 0 ) || multipass );
4724     parms.hide_identical = ( count[ earg_identicalbases ] != 0 );
4725     parms.fasta = ( count[ earg_fasta ] != 0 );
4726     parms.fastq = ( count[ earg_fastq ] != 0 );
4727     parms.reverse_unaligned = ( count[ earg_reverse ] != 0 );
4728     parms.cg_friendly_names = count[ earg_CG_names ] != 0;
4729     parms.spot_group_in_name = ( count[ earg_qname ] != 0 || multipass );
4730     parms.noheader = ( ( count[ earg_noheader ] != 0 ) || parms.fasta || parms.fastq || multipass );
4731     parms.reheader = ( ( count[ earg_header ] != 0 ) && !parms.noheader );
4732     parms.xi = ( count[ earg_XI ] != 0 );
4733     if ( ( count[ earg_cigarCG ] == 0 ) && ( count[ earg_cigarCG_merge ] == 0 ) )
4734     {
4735         parms.cg_style = 0;
4736     }
4737     else if ( count[ earg_cigarCG ] == 0 )
4738     {
4739         parms.cg_style = 1;
4740         parms.long_cigar = false;
4741     }
4742     else if ( count[ earg_cigarCG_merge ] == 0 )
4743     {
4744         parms.cg_style = 2;
4745     }
4746     else
4747     {
4748         rc = RC( rcExe, rcArgv, rcProcessing, rcParam, rcInconsistent );
4749         *errmsg = "cigar-CG and CG are mutually exclusive";
4750         parms.cg_style = 0;
4751     }
4752 
4753     if (count[ earg_unaligned_only ] > 0) {
4754         parms.unaligned_spots = true;
4755 
4756         parms.primaries = false;
4757         parms.secondaries = false;
4758         parms.unaligned = true;
4759         parms.cg_ev_dnb = false;
4760         parms.cg_evidence = false;
4761         parms.cg_sam = false;
4762         parms.cg_style = 0;
4763     }
4764 
4765     parms.test_rows = GetOptValU( args, DumpArgs[ earg_test_rows ].name, 0, NULL );
4766     parms.mate_row_gap_cachable = GetOptValU( args, DumpArgs[ earg_mate_row_gap_cachable ].name, 1000000, NULL );
4767 
4768     param = &parms;
4769     return 0;
4770 }
4771 
4772 
GetParams(Args const * const args,char const ** const errmsg)4773 static rc_t GetParams( Args const *const args, char const **const errmsg )
4774 {
4775     unsigned count[ ARGS_COUNT ];
4776     rc_t rc = CountArgs( args, count, errmsg );
4777 
4778     if ( rc == 0 )
4779         rc = GetArgs( args, count, errmsg );
4780     return rc;
4781 }
4782 
4783 
SAM_Dump_Main(int argc,char * argv[])4784 rc_t CC SAM_Dump_Main( int argc, char* argv[] )
4785 {
4786     rc_t rc = 0;
4787     Args* args;
4788     char const *errmsg = "stop";
4789     bool error_reported = false;
4790 
4791     memset( &g_out_writer, 0, sizeof( g_out_writer ) );
4792     KOutHandlerSetStdOut();
4793     KStsHandlerSetStdErr();
4794     KLogHandlerSetStdErr();
4795     (void)KDbgHandlerSetStdErr();
4796 
4797     ReportBuildDate( __DATE__ );
4798 
4799     rc = ArgsMakeAndHandle( &args, argc, argv, 1, DumpArgs, ARGS_COUNT );
4800     if ( rc == 0 )
4801     {
4802         uint32_t pcount;
4803 
4804         rc = ArgsParamCount( args, &pcount );
4805         if ( rc != 0 || pcount < 1 )
4806         {
4807             errmsg = "";
4808             rc = argc < 2 ? 0 : RC( rcExe, rcArgv, rcParsing, rcParam, rcInsufficient );
4809             MiniUsage( args );
4810         }
4811         else if ( ( rc = GetParams( args, &errmsg ) ) != 0 )
4812         {
4813         }
4814         else
4815         {
4816             VDBManager const *mgr;
4817 
4818             rc = VDBManagerMakeRead( &mgr, NULL );
4819             if ( rc == 0 )
4820             {
4821                 rc = BufferedWriterMake( param->output_gzip, param->output_bz2 );
4822                 if ( rc == 0 )
4823                 {
4824                     unsigned i;
4825 
4826                     for ( i = 0; i < pcount; ++i )
4827                     {
4828                         char const *arg;
4829 
4830                         rc = ArgsParamValue( args, i, (const void **)&arg );
4831                         if ( rc != 0 ) break;
4832                         rc = ProcessPath( mgr, arg );
4833 #if _ARCH_BITS < 64
4834                         /* look for "param excessive while writing vector within container module" */
4835                         if (GetRCState(rc) == rcExcessive &&
4836                             GetRCObject(rc) == rcParam &&
4837                             GetRCContext(rc) == rcWriting &&
4838                             GetRCTarget(rc) == rcVector &&
4839                             GetRCModule(rc) == rcCont)
4840                         {
4841                             ( void )PLOGMSG( klogErr, (klogErr, "This run '$(run)' contains too many reads to be processed with a $(bits)-bit executable; please try again with a 64-bit executable.", "run=%s,bits=%u", arg, _ARCH_BITS));
4842                         }
4843 #endif
4844                         if ( rc != 0 ) break;
4845                     }
4846                     BufferedWriterRelease( rc == 0 );
4847                 }
4848                 VDBManagerRelease( mgr );
4849             }
4850         }
4851         ArgsWhack( args );
4852     }
4853 
4854     if ( rc != 0 && !error_reported )
4855     {
4856         if ( errmsg[ 0 ] )
4857         {
4858             ( void )LOGERR( klogErr, rc, errmsg );
4859         } else if ( KLogLastErrorCode() != rc )
4860         {
4861             ( void )LOGERR( klogErr, rc, "stop" );
4862         }
4863     }
4864 
4865     {
4866         /* Report execution environment if necessary */
4867         rc_t rc2 = ReportFinalize( rc );
4868         if ( rc == 0 )
4869             rc = rc2;
4870     }
4871     return rc;
4872 }
4873 
4874 
Legacy_KMain(int argc,char * argv[])4875 rc_t CC Legacy_KMain( int argc, char* argv[] )
4876 {
4877     char filename[ 4096 ];
4878     /*
4879        Try to find a option-file - parameter in the original parameters
4880        This is a hack to teach sam-dump to read it's parameters from a file/pipe instead
4881        of the commandline
4882        It is neccessary because the code above does not use libkapp to parse parameters!
4883     */
4884     rc_t rc = Args_find_option_in_argv( argc, argv, "--option-file", filename, sizeof filename );
4885     if ( rc == 0 )
4886     {
4887         int new_argc;
4888         char ** new_argv;
4889         /* we found it! Now comes the special treatment: we fake a new argc/argv-pair! */
4890         rc = Args_tokenize_file_and_progname_into_argv( filename, "sam-dump", &new_argc, &new_argv );
4891         if ( rc == 0 )
4892         {
4893             /* pass the faked input-parameters from */
4894             rc = SAM_Dump_Main( new_argc, new_argv );
4895 
4896             Args_free_token_argv( new_argc, new_argv );
4897         }
4898     }
4899     else
4900     {
4901         /* we did not found the special option, just use the commandline-parameters unchanged */
4902         rc = SAM_Dump_Main( argc, argv );
4903     }
4904     return rc;
4905 }
4906