1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26 #include <klib/report.h>
27 #include <klib/container.h>
28 #include <klib/log.h>
29 #include <klib/out.h>
30 #include <klib/text.h>
31 #include <klib/status.h>
32 #include <klib/rc.h>
33 #include <klib/vector.h>
34 #include <klib/printf.h>
35 #include <klib/data-buffer.h>
36 #include <vfs/manager.h>
37 #include <vfs/path.h>
38 #include <vfs/path-priv.h>
39 #include <kfs/file.h>
40 #include <kfs/buffile.h>
41 #include <kfs/gzip.h>
42 #include <kfs/bzip.h>
43 #include <kdb/meta.h>
44 #include <kdb/namelist.h>
45 #include <kapp/main.h>
46 #include <kapp/args.h>
47 #include <insdc/insdc.h>
48 #include <insdc/sra.h>
49 #include <vdb/report.h>
50 #include <vdb/manager.h>
51 #include <vdb/database.h>
52 #include <vdb/table.h>
53 #include <vdb/cursor.h>
54 #include <vdb/vdb-priv.h>
55 #include <vdb/schema.h>
56 #include <vdb/dependencies.h>
57 #include <sra/sraschema.h>
58 #include <sra/srapath.h>
59 #include <align/dna-reverse-cmpl.h>
60 #include <align/iterator.h>
61 #include <align/reference.h>
62 #include <align/quality-quantizer.h>
63
64 #include <kfs/directory.h>
65 #include <os-native.h>
66 #include <sysalloc.h>
67
68 #include <stdio.h>
69 #include <stdlib.h>
70 #include <stdint.h>
71 #include <limits.h>
72 #include <string.h>
73 #include <strtol.h>
74 #include <ctype.h>
75 #include <assert.h>
76
77 #include "debug.h"
78
79 #if _ARCH_BITS == 64
80 #define USE_MATE_CACHE 1
81 #define CURSOR_CACHE (4 * 1024 * 1024 * 1024UL)
82 #else
83 #define USE_MATE_CACHE 0
84 #define CURSOR_CACHE (256 * 1024 * 1024)
85 #endif
86
87 rc_t cg_canonical_print_cigar( const char * cigar, size_t cigar_len);
88
89
90
91 typedef struct TAlignedRegion_struct
92 {
93 char name[1024];
94 struct {
95 INSDC_coord_zero from;
96 INSDC_coord_zero to;
97 } r[10240];
98 int rq;
99 INSDC_coord_zero max_to;
100 INSDC_coord_zero min_from;
101 int printed;
102 } TAlignedRegion;
103
104
105 typedef struct TMatepairDistance_struct
106 {
107 uint32_t from;
108 uint32_t to;
109 } TMatepairDistance;
110
111
112 struct params_s
113 {
114 /* which outputs are on */
115 bool primaries;
116 bool secondaries;
117 bool unaligned;
118 bool cg_evidence;
119 bool cg_ev_dnb;
120 bool cg_sam;
121
122 bool use_seqid;
123 bool long_cigar;
124 bool reheader;
125 bool noheader;
126 bool hide_identical;
127 bool fasta;
128 bool fastq;
129 bool spot_group_in_name;
130 bool cg_friendly_names;
131 bool reverse_unaligned;
132 bool unaligned_spots;
133
134 bool output_gzip;
135 bool output_bz2;
136
137 bool xi;
138 int cg_style; /* 0: raw; 1: with B's; 2: without B's, fixed up SEQ/QUAL; */
139 char const *name_prefix;
140 /* region filter data */
141 TAlignedRegion* region;
142 uint32_t region_qty;
143 /* distance filter data */
144 bool mp_dist_unknown;
145 TMatepairDistance* mp_dist;
146 uint32_t mp_dist_qty;
147 uint32_t test_rows;
148
149 /* mate info cache */
150 int64_t mate_row_gap_cachable;
151
152 char const **comments;
153
154 bool quantizeQual;
155 uint8_t qualQuant[256];
156 uint8_t qualQuantSingle; /*** the quality is quantized - single value, no need to retrieve **/
157 };
158
159
160 struct params_s const *param;
161 ReferenceList const *gRefList;
162
163
164 typedef union UData_union
165 {
166 void const *v;
167 uint32_t const *u32;
168 int32_t const *i32;
169 int64_t const *i64;
170 uint64_t const *u64;
171 uint8_t const *u8;
172 char const *str;
173 bool const *tf;
174 INSDC_coord_one* coord1;
175 INSDC_coord_zero* coord0;
176 INSDC_coord_len* coord_len;
177 INSDC_coord_val* coord_val;
178 INSDC_SRA_xread_type* read_type;
179 INSDC_SRA_read_filter* read_filter;
180 } UData;
181
182
183 typedef struct SCol_struct
184 {
185 char const *name;
186 uint32_t idx;
187 UData base;
188 uint32_t len;
189 bool optional;
190 } SCol;
191
192
193 typedef struct STable_struct
194 {
195 char const *name;
196 VTable const *vtbl;
197 } STable;
198
199
200 typedef struct SCursCache_struct
201 {
202 KVector* cache;
203 KVector* cache_unaligned_mate; /* keeps unaligned-mate for a half-aligned spots */
204 uint32_t sam_flags;
205 INSDC_coord_zero pnext;
206 int32_t tlen;
207 ReferenceObj const *ref;
208 /* cache stats */
209 uint64_t projected;
210 uint64_t added;
211 uint64_t hit;
212 uint64_t bad;
213 } SCursCache;
214
215
216 typedef struct SCurs_struct
217 {
218 STable const *tbl;
219 VCursor const *vcurs;
220 SCursCache* cache;
221 SCursCache cache_local;
222 uint64_t col_reads_qty;
223 } SCurs;
224
225 enum eDSTableType
226 {
227 edstt_NotSpecified,
228 edstt_Reference,
229 edstt_Sequence,
230 edstt_PrimaryAlignment,
231 edstt_SecondaryAlignment,
232 edstt_EvidenceAlignment,
233 edstt_EvidenceInterval
234 };
235
236
237 typedef struct DataSource_s {
238 STable tbl;
239 SCurs curs;
240 SCol *cols;
241 enum eDSTableType type;
242 } DataSource;
243
244
245 #define DATASOURCE_INIT(O, NAME) do { memset(&O, 0, sizeof(O)); O.tbl.name = NAME; O.curs.tbl = &O.tbl; } while(0)
246
247
248 typedef struct SAM_dump_ctx_s
249 {
250 VDatabase const *db;
251 char const *fullPath;
252 char const *accession;
253 char const *readGroup;
254
255 DataSource ref;
256 DataSource pri;
257 DataSource sec;
258 DataSource evi;
259 DataSource eva;
260 DataSource seq;
261 } SAM_dump_ctx_t;
262
263
264 enum ealg_col
265 {
266 alg_SEQ_SPOT_ID = 0,
267 alg_SEQ_NAME,
268 alg_MAPQ,
269 alg_CIGAR,
270 alg_READ,
271 alg_READ_START,
272 alg_READ_LEN,
273 alg_CIGAR_LEN,
274 alg_SAM_QUALITY,
275 alg_SPOT_GROUP,
276 alg_SEQ_SPOT_GROUP,
277 alg_SEQ_READ_ID,
278 alg_REVERSED,
279 alg_ALIGNMENT_COUNT,
280 alg_EDIT_DISTANCE,
281 alg_READ_FILTER,
282 alg_MATE_ALIGN_ID,
283 alg_MATE_REF_NAME,
284 alg_SAM_FLAGS,
285 alg_REF_START,
286 alg_MATE_REF_POS,
287 alg_ALIGN_GROUP,
288 alg_EVIDENCE_ALIGNMENT_IDS,
289 alg_REF_PLOIDY,
290 alg_REF_ID,
291 alg_MATE_REF_ID,
292 alg_HAS_MISMATCH,
293 alg_REGION_FILTER,
294 alg_REF_NAME = alg_REGION_FILTER,
295 alg_REF_SEQ_ID,
296 alg_REF_POS,
297 alg_REF_LEN,
298 alg_DISTANCE_FILTER,
299 alg_TEMPLATE_LEN = alg_DISTANCE_FILTER,
300 alg_CG_TAGS_STR
301 };
302
303
304 SCol const g_alg_col_tmpl[] =
305 {
306 { "SEQ_SPOT_ID", 0, {NULL}, 0, true },
307 { "SEQ_NAME", 0, {NULL}, 0, false },
308 { "MAPQ", 0, {NULL}, 0, false },
309 { "?CIGAR column name?", 0, {NULL}, 0, false },
310 { "?READ column name?", 0, {NULL}, 0, false },
311 { "READ_START", 0, {NULL}, 0, false }, /* READ_START */
312 { "READ_LEN", 0, {NULL}, 0, false }, /* READ_LEN */
313 { "?CIGAR_LEN column name?", 0, {NULL}, 0, true }, /* CIGAR_LEN */
314 { "SAM_QUALITY", 0, {NULL}, 0, false },
315 { "SPOT_GROUP", 0, {NULL}, 0, true },
316 { "SEQ_SPOT_GROUP", 0, {NULL}, 0, true },
317 { "SEQ_READ_ID", 0, {NULL}, 0, true },
318 { "REF_ORIENTATION", 0, {NULL}, 0, true },
319 { "ALIGNMENT_COUNT", 0, {NULL}, 0, true },
320 { "EDIT_DISTANCE", 0, {NULL}, 0, false },
321 { "", 0, {NULL}, 0, false },
322 /* start cols used as standalone in DumpUnaligned */
323 /* MATE_ALIGN_ID col must preceeed
324 MATE_REF_NAME, MATE_REF_POS, SAM_FLAGS, TEMPLATE_LEN for cache to work */
325 { "MATE_ALIGN_ID", 0, {NULL}, 0, false },
326 { "?MATE_REF_NAME column name?", 0, {NULL}, 0, false },
327 { "SAM_FLAGS", 0, {NULL}, 0, false },
328 { "REF_START", 0, {NULL}, 0, false }, /* priming cursor cache */
329 { "MATE_REF_POS", 0, {NULL}, 0, false },
330 { "ALIGN_GROUP", 0, {NULL}, 0, true },
331 { "", 0, {NULL}, 0, true }, /* EVIDENCE_ALIGNMENT_IDS */
332 { "", 0, {NULL}, 0, true }, /* REF_PLOIDY */
333 { "REF_ID", 0, {NULL}, 0, true }, /* REF_ID */
334 { "MATE_REF_ID", 0, {NULL}, 0, true }, /* priming cursor cache */
335 { "(bool)HAS_MISMATCH", 0, {NULL}, 0, true },
336 /* these are read before any other for filtering so they must be last */
337 { "REF_NAME", 0, {NULL}, 0, false },
338 { "REF_SEQ_ID", 0, {NULL}, 0, false },
339 { "REF_POS", 0, {NULL}, 0, false },
340 /* end cols used as standalone in DumpUnaligned */
341 { "REF_LEN", 0, {NULL}, 0, false },
342 { "TEMPLATE_LEN", 0, {NULL}, 0, false },
343 { NULL, 0, {NULL}, 0, false }, /* alg_CG_TAGS_STR */
344 { NULL, 0, {NULL}, 0, false }
345 };
346
347 enum eseq_col
348 {
349 seq_READ = 0,
350 seq_QUALITY,
351 seq_SPOT_GROUP,
352 seq_READ_START,
353 seq_READ_LEN,
354 seq_READ_TYPE,
355 seq_READ_FILTER,
356 seq_NAME,
357 seq_PRIMARY_ALIGNMENT_ID
358 };
359
360 static
361 SCol const gSeqCol[] =
362 {
363 { "READ", 0, {NULL}, 0, false },
364 { "(INSDC:quality:text:phred_33)QUALITY", 0, {NULL}, 0, false },
365 { "SPOT_GROUP", 0, {NULL}, 0, true },
366 { "READ_START", 0, {NULL}, 0, true },
367 { "READ_LEN", 0, {NULL}, 0, true },
368 { "READ_TYPE", 0, {NULL}, 0, true },
369 { "READ_FILTER", 0, {NULL}, 0, true },
370 { "NAME", 0, {NULL}, 0, true },
371 /* must be last in list to avoid reading all columns */
372 { "PRIMARY_ALIGNMENT_ID", 0, {NULL}, 0, true },
373 { NULL, 0, {NULL}, 0, false }
374 };
375
376
RefSeqPrint(void)377 static rc_t RefSeqPrint( void )
378 {
379 rc_t rc = 0;
380 uint32_t i, count = 0;
381
382 rc = ReferenceList_Count( gRefList, &count );
383 for( i = 0; rc == 0 && i < count; i++ )
384 {
385 ReferenceObj const *obj;
386 rc = ReferenceList_Get( gRefList, &obj, i );
387 if ( rc == 0 )
388 {
389 char const *seqid = NULL;
390 rc = ReferenceObj_SeqId( obj, &seqid );
391 if ( rc == 0 )
392 {
393 char const *name = NULL;
394 rc = ReferenceObj_Name( obj, &name );
395 if ( rc == 0 )
396 {
397 INSDC_coord_len len;
398 rc = ReferenceObj_SeqLength( obj, &len );
399 if ( rc == 0 )
400 {
401 char const *nm;
402 if ( param->use_seqid && seqid != NULL && seqid[ 0 ] != '\0' )
403 {
404 nm = seqid;
405 }
406 else
407 {
408 nm = name;
409 }
410 KOutMsg( "@SQ\tSN:%s", nm );
411 if ( nm != seqid && seqid != NULL && seqid[ 0 ] != '\0' && strcmp( nm, seqid ) != 0)
412 {
413 KOutMsg( "\tAS:%s", seqid );
414 }
415 KOutMsg( "\tLN:%u\n", len );
416 }
417 }
418 }
419 ReferenceObj_Release( obj );
420 }
421 }
422 return rc;
423 }
424
425
426 #if USE_MATE_CACHE
Cache_Init(SCursCache * c)427 static rc_t Cache_Init( SCursCache* c )
428 {
429 if ( c != NULL )
430 {
431 rc_t rc;
432 memset( c, 0, sizeof( *c ) );
433 rc=KVectorMake( &c->cache );
434 if(rc == 0){
435 rc=KVectorMake( &c->cache_unaligned_mate );
436 }
437 }
438 return 0;
439 }
440
441
Cache_Close(char const * name,SCursCache * c)442 static void Cache_Close( char const *name, SCursCache* c )
443 {
444 if ( c != NULL )
445 {
446 KVectorRelease( c->cache );
447 KVectorRelease( c->cache_unaligned_mate );
448 if ( c->added > 0 )
449 {
450 SAM_DUMP_DBG( 2, ( "%s cache stats: projected %lu added of those %lu; "
451 "hits %lu of those broken %lu;\n",
452 name, c->projected, c->added, c->hit, c->bad ) );
453 }
454 }
455 memset( c, 0, sizeof( *c ) );
456 }
457
458
Cache_Add(uint64_t key,SCurs const * curs,SCol const * cols)459 static rc_t Cache_Add( uint64_t key, SCurs const *curs, SCol const *cols )
460 {
461 /* compact values for mate record to cache as:
462 pos_delta - 32bit, ref_proj - 11bit, flags - 11bit, rnext_idx - 10bit = 64bit
463 */
464 rc_t rc = 0;
465 ReferenceObj const *r = NULL;
466 uint32_t rid = 0;
467 uint64_t val = 0;
468 int64_t mate_id = cols[ alg_MATE_ALIGN_ID ].len > 0 ? cols[ alg_MATE_ALIGN_ID ].base.i64[ 0 ] : 0;
469
470 rc = ReferenceList_Find( gRefList, &r, cols[ alg_REF_NAME ].base.str, cols[ alg_REF_NAME ].len );
471 if ( rc == 0 )
472 {
473 rc = ReferenceObj_Idx( r, &rid );
474 }
475 #if _DEBUGGING
476 {
477 char const *rname = NULL;
478 curs->cache->projected++;
479 ReferenceObj_Name( r, &rname );
480 SAM_DUMP_DBG( 10, ( "to cache row %li for mate %li: %u,%s[%hu],%u,%u,%i",
481 key, mate_id, cols[ alg_SAM_FLAGS ].base.u32[ 0 ], rname, rid,
482 cols[ alg_REF_POS ].len ? cols[ alg_REF_POS ].base.coord0[ 0 ] : 0,
483 cols[ alg_MATE_REF_POS ].len ? cols[ alg_MATE_REF_POS ].base.coord0[ 0 ] : 0,
484 cols[ alg_TEMPLATE_LEN ].len > 0 ? cols[ alg_TEMPLATE_LEN ].base.i32[ 0 ] : 0));
485 }
486 #endif
487 if ( rc == 0 && !( rid & 0xFC00 ) )
488 {
489 int64_t pos_delta64;
490 int32_t pos_delta32;
491
492 if ( mate_id != 0 )
493 {
494 ReferenceObj const *rm = NULL;
495 uint32_t rm_id;
496
497 rc = ReferenceList_Find( gRefList, &rm, cols[ alg_MATE_REF_NAME ].base.str, cols[ alg_MATE_REF_NAME ].len );
498 if ( rc == 0 )
499 {
500 rc = ReferenceObj_Idx( rm, &rm_id );
501 }
502 assert( rm != NULL );
503 if ( rc == 0 && rid != rm_id )
504 {
505 char const *rm_name = NULL;
506 ReferenceObj_Name( rm, &rm_name );
507 mate_id = 0;
508 SAM_DUMP_DBG( 10, ( " mate ref differ: %s[%hu]!", rm_name, rm_id ) );
509 }
510 ReferenceObj_Release( rm );
511 }
512
513 if ( mate_id != 0 )
514 {
515 pos_delta64 = cols[ alg_MATE_REF_POS ].base.coord0[ 0 ] - cols[ alg_REF_POS ].base.coord0[ 0 ];
516 }
517 else
518 {
519 pos_delta64 = cols[ alg_REF_POS ].base.coord0[ 0 ];
520 }
521
522 pos_delta32 = pos_delta64;
523 if ( pos_delta64 == pos_delta32 )
524 {
525 int64_t ref_proj;
526 if ( mate_id == 0 )
527 {
528 ref_proj = 0; /* indicates full value in delta */
529 }
530 else if ( cols[ alg_TEMPLATE_LEN ].base.i32[ 0 ] < 0 )
531 {
532 assert( pos_delta32 <= 0 );
533 ref_proj = pos_delta32 - cols[ alg_TEMPLATE_LEN ].base.i32[ 0 ];
534 }
535 else
536 {
537 assert( pos_delta32 >= 0 );
538 ref_proj = cols[ alg_TEMPLATE_LEN ].base.i32[ 0 ] - pos_delta32;
539 }
540
541 if ( !( ref_proj & 0xFFFFF800 ) )
542 {
543 val = ( pos_delta64 << 32 ) | ( ref_proj << 21 ) | ( cols[ alg_SAM_FLAGS ].base.u32[ 0 ] << 10 ) | rid;
544 rc = KVectorSetU64( curs->cache->cache, key, val );
545 }
546 }
547 }
548 ReferenceObj_Release( r );
549
550 #if _DEBUGGING
551 if ( val == 0 )
552 {
553 SAM_DUMP_DBG( 10, ( " --> out of range\n" ) );
554 }
555 else
556 {
557 SAM_DUMP_DBG( 10, ( " --> %016lX\n", val ) );
558 curs->cache->added++;
559 }
560 #endif
561 return rc;
562 }
563
564
Cache_Get(SCurs const * curs,uint64_t key,uint64_t * val)565 static rc_t Cache_Get( SCurs const *curs, uint64_t key, uint64_t* val )
566 {
567 rc_t rc = KVectorGetU64( curs->cache->cache, key, val );
568 if ( rc == 0 )
569 {
570 uint32_t id = ( *val & 0x3FF );
571 #if _DEBUGGING
572 curs->cache->hit++;
573 #endif
574 KVectorUnset( curs->cache->cache, key );
575 rc = ReferenceList_Get( gRefList, &curs->cache->ref, id );
576 if ( rc != 0 )
577 {
578 *val = 0;
579 curs->cache->ref = NULL;
580 rc = RC( rcExe, rcNoTarg, rcSearching, rcItem, rcNotFound );
581 #if _DEBUGGING
582 curs->cache->bad++;
583 #endif
584 }
585 else
586 {
587 SAM_DUMP_DBG( 10, ( "from cache row %li %016lX", key, *val ) );
588 }
589 }
590 return rc;
591 }
592
593
Cache_Unpack(uint64_t val,int64_t mate_id,SCurs const * curs,SCol * cols)594 static void Cache_Unpack( uint64_t val, int64_t mate_id, SCurs const *curs, SCol* cols )
595 {
596 int32_t pos_delta = ( val & 0xFFFFFFFF00000000 ) >> 32;
597 uint32_t ref_proj = ( val & 0x00000000FFE00000 ) >> 21;
598 uint32_t flags = ( val & 0x00000000001FFC00 ) >> 10;
599
600 if ( mate_id != 0 )
601 {
602 /* adjust flags for mate record */
603 curs->cache->sam_flags = ( flags & 0x1 ) |
604 ( flags & 0x2 ) |
605 ( ( flags & 0x8 ) >> 1 ) |
606 ( ( flags & 0x4 ) << 1 ) |
607 ( ( flags & 0x20 ) >> 1 ) |
608 ( ( flags & 0x10 ) << 1 ) |
609 ( ( flags & 0x40 ) ? 0x80 : 0x40 ) |
610 ( flags & 0x700 );
611 }
612 else
613 {
614 /* preserve flags as if original records is restored */
615 curs->cache->sam_flags = flags;
616 }
617 cols[ alg_SAM_FLAGS ].base.u32 = &curs->cache->sam_flags;
618 cols[ alg_SAM_FLAGS ].len = sizeof( curs->cache->sam_flags );
619
620 if ( param->use_seqid )
621 {
622 ReferenceObj_SeqId( curs->cache->ref, &cols[ alg_MATE_REF_NAME ].base.str );
623 }
624 else
625 {
626 ReferenceObj_Name( curs->cache->ref, &cols[ alg_MATE_REF_NAME ].base.str );
627 }
628
629 cols[ alg_MATE_REF_NAME ].len = string_size( cols[ alg_MATE_REF_NAME ].base.str );
630
631 if ( ref_proj == 0 )
632 {
633 curs->cache->pnext = pos_delta;
634 curs->cache->tlen = 0;
635 }
636 else if ( pos_delta > 0 )
637 {
638 curs->cache->pnext = ( cols[ alg_REF_POS ].len > 0 ? cols[ alg_REF_POS ].base.coord0[ 0 ] : 0 ) - pos_delta;
639 curs->cache->tlen = - ( ref_proj + pos_delta );
640 }
641 else
642 {
643 curs->cache->pnext = ( cols[ alg_REF_POS ].len > 0 ? cols[ alg_REF_POS ].base.coord0[ 0 ] : 0 ) - pos_delta;
644 curs->cache->tlen = ref_proj - pos_delta;
645 }
646
647 cols[ alg_MATE_REF_POS ].base.coord0 = &curs->cache->pnext;
648 cols[ alg_MATE_REF_POS ].len = sizeof( curs->cache->pnext );
649 cols[ alg_TEMPLATE_LEN ].base.i32 = &curs->cache->tlen;
650 cols[ alg_TEMPLATE_LEN ].len = sizeof( curs->cache->tlen );
651 {
652 uint32_t id;
653 ReferenceObj_Idx( curs->cache->ref, &id );
654 SAM_DUMP_DBG( 10, ( " --> mate %li: %u,%s[%hu],%u,%i\n",
655 mate_id, curs->cache->sam_flags, cols[ alg_MATE_REF_NAME ].base.str,
656 id, curs->cache->pnext, curs->cache->tlen ) );
657 }
658 }
659 #endif /* USE_MATE_CACHE */
660
661 #if 0
662 static rc_t OpenVTable( VDatabase const *db, STable* tbl, char const *name, bool optional )
663 {
664 rc_t rc = VDatabaseOpenTableRead( db, &tbl->vtbl, "%s", name );
665 if ( GetRCState( rc ) == rcNotFound && optional )
666 {
667 rc = 0;
668 tbl->vtbl = NULL;
669 }
670 tbl->name = name;
671 return rc;
672 }
673 #endif
674
Cursor_Open(STable const * const tbl,SCurs * const curs,SCol cols[],SCursCache * cache)675 static rc_t Cursor_Open( STable const *const tbl, SCurs *const curs, SCol cols[], SCursCache* cache )
676 {
677 rc_t rc = 0;
678 unsigned i;
679
680 curs->vcurs = NULL;
681 if ( tbl == NULL || tbl->vtbl == NULL )
682 return 0;
683
684 rc = VTableCreateCachedCursorRead( tbl->vtbl, &curs->vcurs, CURSOR_CACHE );
685 if ( rc != 0 )
686 return rc;
687
688 rc = VCursorPermitPostOpenAdd( curs->vcurs );
689 if ( rc != 0 )
690 return rc;
691
692 if ( rc == 0 )
693 {
694 rc = VCursorOpen( curs->vcurs );
695 if ( rc == 0 )
696 {
697 #if USE_MATE_CACHE
698 if ( cache != NULL )
699 {
700 curs->cache = cache;
701 }
702 else
703 {
704 curs->cache = &curs->cache_local;
705 rc = Cache_Init( &curs->cache_local );
706 }
707 #endif /* USE_MATE_CACHE */
708 curs->tbl = tbl;
709 }
710 }
711
712 for ( i = 0; cols[ i ].name != NULL; ++i )
713 {
714 if ( cols[ i ].name[ 0 ] == 0 )
715 continue;
716 rc = VCursorAddColumn( curs->vcurs, &cols[ i ].idx, "%s", cols[ i ].name );
717 if ( GetRCObject( rc ) == ( enum RCObject ) rcColumn )
718 {
719 switch ( GetRCState( rc ) )
720 {
721 case rcNotFound:
722 case rcUndefined:
723 if ( !cols[ i ].optional )
724 break;
725 case rcExists:
726 rc = 0;
727 default:
728 break;
729 }
730 }
731 if ( rc != 0 )
732 {
733 (void)PLOGERR( klogErr, ( klogErr, rc, "table $(t) column $(c)", "t=%s,c=%s", tbl->name, cols[ i ].name ) );
734 break;
735 }
736 }
737 if ( rc != 0 )
738 {
739 VCursorRelease( curs->vcurs );
740 curs->vcurs = NULL;
741 if ( rc != KLogLastErrorCode() )
742 {
743 (void)PLOGERR( klogErr, ( klogErr, rc, "table $(t)", "t=%s", tbl->name ) );
744 }
745 }
746 else
747 {
748 SAM_DUMP_DBG( 2, ( "%s: table %s\n", __func__, curs->tbl->name ) );
749 }
750 return rc;
751 }
752
753
Cursor_Close(SCurs * curs)754 static void Cursor_Close( SCurs* curs )
755 {
756 if ( curs != NULL && curs->vcurs != NULL )
757 {
758 SAM_DUMP_DBG( 2, ( "%s: table %s, columns rows read %lu\n", __func__, curs->tbl->name, curs->col_reads_qty ) );
759 VCursorRelease( curs->vcurs );
760 #if USE_MATE_CACHE
761 if ( curs->cache == &curs->cache_local )
762 {
763 Cache_Close( curs->tbl->name, curs->cache );
764 }
765 #endif /* USE_MATE_CACHE */
766 memset( curs, 0, sizeof( *curs ) );
767 }
768 }
769
770
Cursor_Read(DataSource * ds,int64_t row_id,int firstCol,unsigned nCols)771 static rc_t Cursor_Read( DataSource *ds, int64_t row_id, int firstCol, unsigned nCols )
772 {
773 rc_t rc = 0;
774
775 if ( ds->curs.vcurs == NULL )
776 {
777 rc = Cursor_Open( &ds->tbl, &ds->curs, ds->cols, ds->curs.cache );
778 if ( rc != 0 )
779 return rc;
780 }
781 if (1)
782 {
783 SCol *const col = &ds->cols[ firstCol ];
784 unsigned i;
785
786 for ( i = 0; i < nCols && col[ i ].name; ++i )
787 {
788 uint32_t const idx = col[ i ].idx;
789
790 if ( idx != 0 )
791 {
792 uint32_t elem_bits;
793 uint32_t bit_offset;
794 uint32_t elem_count;
795 void const *base;
796
797 rc = VCursorCellDataDirect( ds->curs.vcurs, row_id, idx, &elem_bits, &base, &bit_offset, &elem_count );
798 if ( rc != 0 )
799 {
800 (void)PLOGERR( klogWarn, ( klogWarn, rc, "reading $(t) row $(r), column $(c)", "t=%s,r=%li,c=%s",
801 ds->tbl.name, row_id, col[ i ].name ) );
802 return rc;
803 }
804
805 assert( bit_offset == 0 );
806 assert( elem_bits % 8 == 0 );
807
808 col[ i ].base.v = base;
809 col[ i ].len = elem_count;
810 }
811 }
812 }
813 return rc;
814 }
815
816 struct
817 {
818 KWrtWriter writer;
819 void* data;
820 KFile* kfile;
821 uint64_t pos;
822 } g_out_writer = {NULL};
823
824
BufferedWriter(void * const self,char const buffer[],size_t const bufsize,size_t * const pnum_writ)825 static rc_t CC BufferedWriter( void *const self, char const buffer[], size_t const bufsize, size_t *const pnum_writ )
826 {
827 rc_t rc = 0;
828 size_t written = 0;
829
830 assert( buffer != NULL );
831
832 while ( written < bufsize )
833 {
834 size_t n;
835
836 rc = KFileWrite( g_out_writer.kfile, g_out_writer.pos + written, &buffer[ written ], bufsize - written, &n );
837 if ( rc != 0 )
838 break;
839 written += n;
840 }
841 g_out_writer.pos += written;
842 if ( pnum_writ != NULL )
843 *pnum_writ = written;
844 return rc;
845 }
846
847
BufferedWriterMake(bool gzip,bool bzip2)848 static rc_t BufferedWriterMake( bool gzip, bool bzip2 )
849 {
850 rc_t rc = 0;
851
852 if ( gzip && bzip2 )
853 {
854 rc = RC( rcApp, rcFile, rcConstructing, rcParam, rcAmbiguous );
855 }
856 else if ( g_out_writer.writer != NULL )
857 {
858 rc = RC( rcApp, rcFile, rcConstructing, rcParam, rcAmbiguous );
859 }
860 else
861 {
862 rc = KFileMakeStdOut( &g_out_writer.kfile );
863 if ( rc == 0 )
864 {
865 g_out_writer.pos = 0;
866 if ( gzip )
867 {
868 KFile* gz;
869 rc = KFileMakeGzipForWrite( &gz, g_out_writer.kfile );
870 if ( rc == 0 )
871 {
872 KFileRelease( g_out_writer.kfile );
873 g_out_writer.kfile = gz;
874 }
875 }
876 else if ( bzip2 )
877 {
878 KFile* bz;
879 rc = KFileMakeBzip2ForWrite( &bz, g_out_writer.kfile );
880 if ( rc == 0 )
881 {
882 KFileRelease( g_out_writer.kfile );
883 g_out_writer.kfile = bz;
884 }
885 }
886 if ( rc == 0 )
887 {
888 KFile* buf;
889 rc = KBufFileMakeWrite( &buf, g_out_writer.kfile, false, 128 * 1024 );
890 if ( rc == 0 )
891 {
892 KFileRelease( g_out_writer.kfile );
893 g_out_writer.kfile = buf;
894 g_out_writer.writer = KOutWriterGet();
895 g_out_writer.data = KOutDataGet();
896 rc = KOutHandlerSet( BufferedWriter, &g_out_writer );
897 }
898 }
899 }
900 }
901 return rc;
902 }
903
904
BufferedWriterRelease(bool flush)905 static void BufferedWriterRelease( bool flush )
906 {
907 if ( flush )
908 {
909 /* avoid flushing buffered data after failure */
910 KFileRelease( g_out_writer.kfile );
911 }
912 if ( g_out_writer.writer != NULL )
913 {
914 KOutHandlerSet( g_out_writer.writer, g_out_writer.data );
915 }
916 g_out_writer.writer = NULL;
917 }
918
919
920 typedef struct ReadGroup
921 {
922 BSTNode node;
923 char name[ 1024 ];
924 } ReadGroup;
925
926
ReadGroup_sort(BSTNode const * item,BSTNode const * node)927 static int64_t CC ReadGroup_sort( BSTNode const *item, BSTNode const *node )
928 {
929 return strcmp( ( ( ReadGroup const * )item )->name, ( ( ReadGroup const * ) node )->name );
930 }
931
932
ReadGroup_print(char const * nm)933 static rc_t ReadGroup_print( char const *nm )
934 {
935 rc_t rc = 0;
936 if ( nm[ 0 ] != '\0' && strcasecmp( nm, "default" ) )
937 {
938 rc = KOutMsg( "@RG\tID:%s\n", nm );
939 }
940 return rc;
941 }
942
943
ReadGroup_dump(BSTNode * n,void * data)944 static void CC ReadGroup_dump( BSTNode *n, void *data )
945 {
946 ReadGroup const *g = ( ReadGroup* )n;
947 ReadGroup_print( g->name );
948 }
949
950
DumpReadGroupsScan(STable const * tbl)951 static rc_t CC DumpReadGroupsScan( STable const *tbl )
952 {
953 SCol cols[] =
954 {
955 { "SPOT_GROUP", 0, {NULL}, 0, false },
956 { NULL, 0, {NULL}, 0, false }
957 };
958 rc_t rc = 0;
959 BSTree tree;
960 DataSource ds;
961
962 memset( &ds, 0, sizeof( ds ) );
963 ds.cols = cols;
964
965 BSTreeInit( &tree );
966 rc = Cursor_Open( tbl, &ds.curs, ds.cols, NULL );
967 if ( rc == 0 )
968 {
969 int64_t start;
970 uint64_t count;
971
972 rc = VCursorIdRange( ds.curs.vcurs, 0, &start, &count );
973 if ( rc == 0 )
974 {
975 ReadGroup* node = NULL;
976 uint32_t last_len = 0;
977
978 while ( count > 0 && rc == 0 )
979 {
980 rc = Cursor_Read( &ds, start, 0, ~(unsigned)0 );
981 if ( rc == 0 && cols[ 0 ].len != 0 )
982 {
983 if ( node == NULL ||
984 last_len != cols[ 0 ].len ||
985 strncmp( cols[ 0 ].base.str, node->name, cols[ 0 ].len ) != 0 )
986 {
987 node = malloc( sizeof( *node ) );
988 if ( node == NULL )
989 {
990 rc = RC( rcExe, rcNode, rcConstructing, rcMemory, rcExhausted );
991 }
992 else if ( cols[ 0 ].len > sizeof( node->name ) )
993 {
994 rc = RC( rcExe, rcString, rcCopying, rcBuffer, rcInsufficient );
995 }
996 else
997 {
998 last_len = cols[ 0 ].len;
999 string_copy( node->name, ( sizeof node->name ) - 1, cols[ 0 ].base.str, last_len );
1000 node->name[ last_len ] = '\0';
1001 rc = BSTreeInsertUnique( &tree, &node->node, NULL, ReadGroup_sort );
1002 if ( GetRCState( rc ) == rcExists )
1003 {
1004 free( node );
1005 rc = 0;
1006 }
1007 }
1008 }
1009 }
1010 else if ( GetRCState( rc ) == rcNotFound && GetRCObject( rc ) == rcRow )
1011 {
1012 rc = 0;
1013 }
1014 start++;
1015 count--;
1016 }
1017 }
1018 Cursor_Close( &ds.curs );
1019 }
1020
1021 if ( rc == 0 )
1022 {
1023 BSTreeForEach( &tree, false, ReadGroup_dump, NULL );
1024 }
1025 else if ( rc != KLogLastErrorCode() )
1026 {
1027 (void)PLOGERR( klogErr, ( klogErr, rc, "$(f)", "f=%s", __func__ ) );
1028 }
1029 BSTreeWhack( &tree, NULL, NULL );
1030 return rc;
1031 }
1032
1033
DumpReadGroups(STable const * tbl)1034 rc_t CC DumpReadGroups( STable const *tbl )
1035 {
1036 rc_t rc = 0;
1037 KMetadata const *m;
1038
1039 /* try getting list from stats meta */
1040 rc = VTableOpenMetadataRead( tbl->vtbl, &m );
1041 if ( rc == 0 )
1042 {
1043 KMDataNode const *n;
1044 rc = KMetadataOpenNodeRead( m, &n, "/STATS/SPOT_GROUP" );
1045 if ( rc == 0 )
1046 {
1047 KNamelist* names;
1048 rc = KMDataNodeListChild( n, &names );
1049 if ( rc == 0 )
1050 {
1051 uint32_t i, q;
1052 rc = KNamelistCount( names, &q );
1053 if ( rc == 0 )
1054 {
1055 for ( i = 0; rc == 0 && i < q; i++ )
1056 {
1057 char const *nm;
1058 rc = KNamelistGet( names, i, &nm );
1059 if ( rc == 0 )
1060 {
1061 rc = ReadGroup_print( nm );
1062 }
1063 }
1064 }
1065 KNamelistRelease( names );
1066 }
1067 KMDataNodeRelease( n );
1068 }
1069 KMetadataRelease( m );
1070 }
1071
1072 if ( GetRCState( rc ) == rcNotFound )
1073 {
1074 rc = DumpReadGroupsScan( tbl );
1075 }
1076 else if ( rc != 0 && rc != KLogLastErrorCode() )
1077 {
1078 (void)PLOGERR( klogErr, ( klogErr, rc, "$(f)", "f=%s", __func__ ) );
1079 }
1080 return rc;
1081 }
1082
Cursor_ReadAlign(SCurs const * curs,int64_t row_id,SCol * cols,uint32_t idx)1083 static rc_t Cursor_ReadAlign( SCurs const *curs, int64_t row_id, SCol* cols, uint32_t idx )
1084 {
1085 rc_t rc = 0;
1086 SCol* c = NULL;
1087 SCol* mate_id = NULL;
1088 #if USE_MATE_CACHE
1089 uint64_t cache_val = 0;
1090 bool cache_miss = false;
1091 #endif /* USE_MATE_CACHE */
1092
1093
1094 for( ; rc == 0 && cols[ idx ].name != NULL; idx++ )
1095 {
1096 c = &cols[ idx ];
1097 if ( c->idx != 0 )
1098 {
1099 #if USE_MATE_CACHE
1100 if ( mate_id != NULL && curs->cache != NULL && !cache_miss &&
1101 ( idx == alg_SAM_FLAGS || idx == alg_MATE_REF_NAME || idx == alg_MATE_REF_POS || idx == alg_TEMPLATE_LEN ) &&
1102 mate_id->idx && mate_id->len > 0 && mate_id->base.i64[ 0 ] > 0 )
1103 {
1104 if ( cache_val != 0 )
1105 {
1106 continue;
1107 }
1108 rc = Cache_Get( curs, mate_id->base.u64[ 0 ], &cache_val );
1109 if ( rc == 0 )
1110 {
1111 continue;
1112 }
1113 else if ( !( GetRCObject( rc ) == rcItem && GetRCState( rc ) == rcNotFound ) )
1114 {
1115 break;
1116 }
1117 else
1118 {
1119 /* avoid multiple lookups in cache */
1120 cache_miss = true;
1121 }
1122 }
1123 #endif /* USE_MATE_CACHE */
1124 rc = VCursorCellDataDirect( curs->vcurs, row_id, c->idx, NULL, &c->base.v, NULL, &c->len );
1125 if ( rc == 0 )
1126 {
1127 if ( idx == alg_SEQ_SPOT_ID && ( c->len ==0 || c->base.i64[ 0 ] == 0 ) )
1128 {
1129 return RC( rcExe, rcColumn, rcReading, rcRow, rcNotFound );
1130 }
1131 if ( idx == alg_MATE_ALIGN_ID )
1132 {
1133 mate_id = &cols[ alg_MATE_ALIGN_ID ];
1134 }
1135 #if _DEBUGGING
1136 ( ( SCurs* )curs )->col_reads_qty++;
1137 #endif
1138 }
1139 }
1140 else
1141 {
1142 static INSDC_coord_zero readStart;
1143 static INSDC_coord_len readLen;
1144 static INSDC_coord_len cigarLen;
1145
1146 switch ( (int)idx )
1147 {
1148 case alg_READ_START:
1149 readStart = 0;
1150 c->base.coord0 = &readStart;
1151 c->len = 1;
1152 break;
1153 case alg_READ_LEN:
1154 readLen = cols[ alg_READ ].len;
1155 c->base.coord_len = &readLen;
1156 c->len = 1;
1157 break;
1158 case alg_CIGAR_LEN:
1159 cigarLen = cols[ alg_CIGAR ].len;
1160 c->base.coord_len = &cigarLen;
1161 c->len = 1;
1162 break;
1163 }
1164 }
1165 }
1166
1167 if ( rc != 0 )
1168 {
1169 (void)PLOGERR( klogWarn, ( klogWarn, rc, "reading $(t) row $(r), column $(c)", "t=%s,r=%li,c=%s",
1170 curs->tbl->name, row_id, c ? c->name : "<none>" ) );
1171 #if USE_MATE_CACHE
1172 }
1173 else if ( curs->cache == NULL )
1174 {
1175 }
1176 else if ( cache_val == 0 )
1177 {
1178 /* this row is not from cache */
1179 int64_t mate_align_id = ( mate_id != NULL && mate_id->len > 0 ) ? mate_id->base.i64[ 0 ] : 0;
1180 if ( cols[ 0 ].idx != 0 && /* we have full cursor which means we are reading alignment table */
1181 /* no mate -> unaligned (not secondary!) */
1182 ( ( mate_align_id == 0 && param->unaligned && curs->cache != &curs->cache_local ) ||
1183 /* 2nd mate with higher rowid and more than set gap rows away */
1184 ( mate_align_id != 0 && mate_align_id > row_id && ( mate_align_id - row_id) > param->mate_row_gap_cachable ) ) )
1185 {
1186 rc = Cache_Add( row_id, curs, cols );
1187 }
1188 if(param->unaligned == true && mate_align_id == 0 ){
1189 rc = KVectorSetBool( curs->cache->cache_unaligned_mate, cols[alg_SEQ_SPOT_ID].base.i64[0], true );
1190 }
1191 }
1192 else
1193 {
1194 Cache_Unpack( cache_val, row_id, curs, cols );
1195 #endif /* USE_MATE_CACHE */
1196 }
1197 return rc;
1198 }
1199
1200
DumpName(char const * name,size_t name_len,const char spot_group_sep,char const * spot_group,size_t spot_group_len,int64_t spot_id)1201 static rc_t DumpName( char const *name, size_t name_len,
1202 const char spot_group_sep, char const *spot_group,
1203 size_t spot_group_len, int64_t spot_id )
1204 {
1205 rc_t rc = 0;
1206 if ( param->cg_friendly_names )
1207 {
1208 rc = KOutMsg( "%.*s-1:%lu", spot_group_len, spot_group, spot_id );
1209 }
1210 else
1211 {
1212 if ( param->name_prefix != NULL )
1213 {
1214 rc = KOutMsg( "%s.", param->name_prefix );
1215 }
1216 rc = BufferedWriter( NULL, name, name_len, NULL );
1217 if ( rc == 0 && param->spot_group_in_name && spot_group_len > 0 )
1218 {
1219 rc = BufferedWriter( NULL, &spot_group_sep, 1, NULL );
1220 if ( rc == 0 )
1221 rc = BufferedWriter( NULL, spot_group, spot_group_len, NULL );
1222 }
1223 }
1224 return rc;
1225 }
1226
1227
DumpQuality(char const quality[],unsigned const count,bool const reverse,bool const quantize)1228 static rc_t DumpQuality( char const quality[], unsigned const count, bool const reverse, bool const quantize )
1229 {
1230 rc_t rc = 0;
1231 if ( quality == NULL )
1232 {
1233 unsigned i;
1234
1235 for ( i = 0; rc == 0 && i < count; ++i )
1236 {
1237 char const newValue = ((param->qualQuant && param->qualQuantSingle)?param->qualQuantSingle:30) + 33;
1238 rc = BufferedWriter( NULL, &newValue, 1, NULL );
1239 }
1240 }
1241 else if ( reverse || quantize )
1242 {
1243 unsigned i;
1244
1245 for ( i = 0; rc == 0 && i < count; ++i )
1246 {
1247 char const qual = quality[ reverse ? ( count - i - 1 ) : i ];
1248 char const newValue = quantize ? param->qualQuant[ qual - 33 ] + 33 : qual;
1249
1250 rc = BufferedWriter( NULL, &newValue, 1, NULL );
1251 }
1252 }
1253 else
1254 {
1255 rc = BufferedWriter( NULL, quality, count, NULL );
1256 }
1257 return rc;
1258 }
1259
1260
DumpUnalignedFastX(const SCol cols[],uint32_t read_id,INSDC_coord_zero readStart,INSDC_coord_len readLen,int64_t row_id)1261 static rc_t DumpUnalignedFastX( const SCol cols[], uint32_t read_id, INSDC_coord_zero readStart, INSDC_coord_len readLen, int64_t row_id )
1262 {
1263 /* fast[AQ] represnted in SAM fields:
1264 [@|>]QNAME unaligned
1265 SEQ
1266 +
1267 QUAL
1268 */
1269 rc_t rc = BufferedWriter( NULL, param->fastq ? "@" : ">", 1, NULL );
1270
1271 /* QNAME: [PFX.]SEQUENCE:NAME[#SPOT_GROUP] */
1272 if ( rc == 0 )
1273 rc = DumpName( cols[ seq_NAME ].base.str, cols[ seq_NAME ].len, '#',
1274 cols[ seq_SPOT_GROUP ].base.str, cols[ seq_SPOT_GROUP ].len, row_id );
1275 if ( rc == 0 && read_id > 0 )
1276 {
1277 rc = KOutMsg( "/%u", read_id );
1278 }
1279 if ( rc == 0 )
1280 rc = BufferedWriter( NULL, " unaligned\n", 11, NULL );
1281
1282 /* SEQ: SEQUENCE.READ */
1283 if ( rc == 0 )
1284 rc = BufferedWriter( NULL, &cols[ seq_READ ].base.str[readStart], readLen, NULL );
1285 if ( rc == 0 && param->fastq )
1286 {
1287 /* QUAL: SEQUENCE.QUALITY */
1288 rc = BufferedWriter( NULL, "\n+\n", 3, NULL );
1289 if ( rc == 0 )
1290 rc = DumpQuality( &cols[ seq_QUALITY ].base.str[ readStart ], readLen, false, param->quantizeQual );
1291 }
1292 if ( rc == 0 )
1293 rc = BufferedWriter( NULL, "\n", 1, NULL );
1294 return rc;
1295 }
1296
1297
DumpAlignedFastX(const SCol cols[],int64_t const alignId,uint32_t read_id,bool primary,bool secondary)1298 static rc_t DumpAlignedFastX( const SCol cols[], int64_t const alignId, uint32_t read_id, bool primary, bool secondary )
1299 {
1300 rc_t rc = 0;
1301 size_t nm;
1302 unsigned readId;
1303 unsigned const nreads = cols[ alg_READ_LEN ].len;
1304
1305 for ( readId = 0; readId < nreads && rc == 0 ; ++readId )
1306 {
1307 char const *qname = cols[ alg_SEQ_NAME ].base.str;
1308 size_t qname_len = cols[ alg_SEQ_NAME ].len;
1309 char synth_qname[ 40 ];
1310 int64_t const spot_id = cols[alg_SEQ_SPOT_ID].len > 0 ? cols[alg_SEQ_SPOT_ID].base.i64[0] : 0;
1311 char const *const read = cols[ alg_READ ].base.str + cols[ alg_READ_START ].base.coord0[ readId ];
1312 char const *const qual = cols[ alg_SAM_QUALITY ].base.v
1313 ? cols[ alg_SAM_QUALITY ].base.str + cols[ alg_READ_START ].base.coord0[ readId ]
1314 : NULL;
1315 unsigned const readlen = cols[ alg_READ_LEN ].base.coord_len[ readId ];
1316
1317 /* fast[AQ] represnted in SAM fields:
1318 [@|>]QNAME primary|secondary ref=RNAME pos=POS mapq=MAPQ
1319 SEQ
1320 +
1321 QUAL
1322 */
1323 rc = BufferedWriter( NULL, param->fastq ? "@" : ">", 1, NULL );
1324 /* QNAME: [PFX.]SEQ_NAME[#SPOT_GROUP] */
1325 if ( qname_len == 0 || qname == NULL )
1326 {
1327 string_printf( synth_qname, sizeof( synth_qname ), &qname_len, "%li.%u", alignId, readId + 1 );
1328 qname = synth_qname;
1329 }
1330 nm = cols[ alg_SPOT_GROUP ].len ? alg_SPOT_GROUP : alg_SEQ_SPOT_GROUP;
1331 if ( rc == 0 )
1332 rc = DumpName( qname, qname_len, '.', cols[ nm ].base.str, cols[ nm ].len, spot_id);
1333
1334 if ( rc == 0 && read_id > 0 )
1335 rc = KOutMsg( "/%u", read_id );
1336
1337 if ( rc == 0 )
1338 {
1339 if ( primary )
1340 {
1341 rc = BufferedWriter( NULL, " primary", 8, NULL );
1342 }
1343 else if ( secondary )
1344 {
1345 rc = BufferedWriter( NULL, " secondary", 10, NULL );
1346 }
1347 }
1348
1349 /* RNAME: REF_NAME or REF_SEQ_ID */
1350 if ( rc == 0 )
1351 rc = BufferedWriter( NULL, " ref=", 5, NULL );
1352 if ( rc == 0 )
1353 {
1354 if ( param->use_seqid )
1355 {
1356 rc = BufferedWriter( NULL, cols[ alg_REF_SEQ_ID ].base.str, cols[ alg_REF_SEQ_ID ].len, NULL );
1357 }
1358 else
1359 {
1360 rc = BufferedWriter( NULL, cols[ alg_REF_NAME ].base.str, cols[ alg_REF_NAME ].len, NULL );
1361 }
1362 }
1363
1364 /* POS: REF_POS, MAPQ: MAPQ */
1365 if ( rc == 0 )
1366 rc = KOutMsg( " pos=%u mapq=%i\n", cols[ alg_REF_POS ].base.coord0[ 0 ] + 1, cols[ alg_MAPQ ].base.i32[ 0 ] );
1367
1368 /* SEQ: READ */
1369 if ( rc == 0 )
1370 rc = BufferedWriter( NULL, read, readlen, NULL );
1371 if ( rc == 0 && param->fastq )
1372 {
1373 /* QUAL: SAM_QUALITY */
1374 rc = BufferedWriter( NULL, "\n+\n", 3, NULL );
1375 if ( rc == 0 )
1376 rc = DumpQuality( qual, readlen, false, param->quantizeQual );
1377 }
1378 if ( rc == 0 )
1379 rc = BufferedWriter( NULL, "\n", 1, NULL );
1380 }
1381 return rc;
1382 }
1383
1384
1385 static
DumpUnalignedSAM(const SCol cols[],uint32_t flags,INSDC_coord_zero readStart,INSDC_coord_len readLen,char const * rnext,uint32_t rnext_len,INSDC_coord_zero pnext,char const readGroup[],int64_t row_id)1386 rc_t DumpUnalignedSAM( const SCol cols[], uint32_t flags, INSDC_coord_zero readStart, INSDC_coord_len readLen,
1387 char const *rnext, uint32_t rnext_len, INSDC_coord_zero pnext, char const readGroup[], int64_t row_id )
1388 {
1389 unsigned i;
1390
1391 /* QNAME: [PFX.]NAME[.SPOT_GROUP] */
1392 rc_t rc = DumpName( cols[ seq_NAME ].base.str, cols[ seq_NAME ].len, '.',
1393 cols[ seq_SPOT_GROUP ].base.str, cols[ seq_SPOT_GROUP ].len, row_id );
1394
1395 /* all these fields are const text for now */
1396 if ( rc == 0 )
1397 rc = KOutMsg( "\t%u\t*\t0\t0\t*\t%.*s\t%u\t0\t",
1398 flags, rnext_len ? rnext_len : 1, rnext_len ? rnext : "*", pnext );
1399 /* SEQ: SEQUENCE.READ */
1400 if ( flags & 0x10 )
1401 {
1402 for( i = 0; i < readLen && rc == 0; i++ )
1403 {
1404 char base;
1405
1406 DNAReverseCompliment( &cols[ seq_READ ].base.str[ readStart + readLen - 1 - i ], &base, 1 );
1407 rc = BufferedWriter( NULL, &base, 1, NULL );
1408 }
1409 }
1410 else
1411 {
1412 rc = BufferedWriter( NULL, &cols[ seq_READ ].base.str[ readStart ], readLen, NULL );
1413 }
1414
1415 if ( rc == 0 )
1416 rc = BufferedWriter( NULL, "\t", 1, NULL );
1417 /* QUAL: SEQUENCE.QUALITY */
1418 if ( rc == 0 )
1419 rc = DumpQuality( &cols[ seq_QUALITY ].base.str[ readStart ], readLen, flags & 0x10, param->quantizeQual );
1420
1421 /* optional fields: */
1422 if ( rc == 0 )
1423 {
1424 if ( readGroup )
1425 {
1426 rc = BufferedWriter( NULL, "\tRG:Z:", 6, NULL );
1427 if ( rc == 0 )
1428 rc = BufferedWriter( NULL, readGroup, string_size( readGroup ), NULL );
1429 }
1430 else if ( cols[ seq_SPOT_GROUP ].len > 0 )
1431 {
1432 /* read group */
1433 rc = BufferedWriter( NULL, "\tRG:Z:", 6, NULL );
1434 if ( rc == 0 )
1435 rc = BufferedWriter( NULL, cols[ seq_SPOT_GROUP ].base.str, cols[ seq_SPOT_GROUP ].len, NULL );
1436 }
1437 }
1438 if ( rc == 0 )
1439 rc = BufferedWriter( NULL, "\n", 1, NULL );
1440 return rc;
1441 }
1442
1443
1444 static
DumpAlignedSAM(SAM_dump_ctx_t * const ctx,DataSource const * ds,int64_t alignId,char const readGroup[],int type)1445 rc_t DumpAlignedSAM( SAM_dump_ctx_t *const ctx,
1446 DataSource const *ds,
1447 int64_t alignId,
1448 char const readGroup[],
1449 int type)
1450 {
1451 rc_t rc = 0;
1452 unsigned const nreads = ds->cols[ alg_READ_LEN ].len;
1453 SCol const *const cols = ds->cols;
1454 int64_t const spot_id = cols[alg_SEQ_SPOT_ID].len > 0 ? cols[alg_SEQ_SPOT_ID].base.i64[0] : 0;
1455 INSDC_coord_one const read_id = cols[alg_SEQ_READ_ID].len > 0 ? cols[alg_SEQ_READ_ID].base.coord1[0] : 0;
1456 INSDC_SRA_read_filter const *align_filter = cols[alg_READ_FILTER].len == nreads ? cols[alg_READ_FILTER].base.read_filter : NULL;
1457 INSDC_SRA_read_filter seq_filter = 0;
1458 unsigned readId;
1459 unsigned cigOffset = 0;
1460
1461 if ( align_filter == NULL && spot_id && read_id && ctx->seq.cols )
1462 {
1463 rc_t rc;
1464
1465 rc = Cursor_Read(&ctx->seq, spot_id, seq_READ_FILTER, 1);
1466 if (rc == 0 && ctx->seq.cols[seq_READ_FILTER].len >= read_id)
1467 seq_filter = ctx->seq.cols[seq_READ_FILTER].base.read_filter[read_id - 1];
1468 }
1469
1470 for ( readId = 0; readId < nreads && rc == 0 ; ++readId )
1471 {
1472 char const *qname = cols[ alg_SEQ_NAME ].base.str;
1473 size_t qname_len = cols[ alg_SEQ_NAME ].len;
1474 char const *const read = cols[ alg_READ ].base.str + cols[ alg_READ_START ].base.coord0[ readId ];
1475 char const *const qual = cols[ alg_SAM_QUALITY ].base.v
1476 ? cols[ alg_SAM_QUALITY ].base.str + cols[ alg_READ_START ].base.coord0[ readId ]
1477 : NULL;
1478 unsigned const readlen = nreads > 1 ? cols[ alg_READ_LEN ].base.coord_len[ readId ] : cols[ alg_READ ].len;
1479 unsigned const sflags = cols[ alg_SAM_FLAGS ].base.v ? cols[ alg_SAM_FLAGS ].base.u32[ readId ] : 0;
1480 INSDC_SRA_read_filter const filt = align_filter ? align_filter[readId] : seq_filter;
1481 unsigned const flags = (sflags & ~((unsigned)0x200)) | ((filt == SRA_READ_FILTER_REJECT) ? 0x200 : 0);
1482 char const *const cigar = cols[ alg_CIGAR ].base.str + cigOffset;
1483 unsigned const cigLen = nreads > 1 ? cols[ alg_CIGAR_LEN ].base.coord_len[ readId ] : cols[ alg_CIGAR ].len;
1484 size_t nm;
1485 char synth_qname[1024];
1486
1487 cigOffset += cigLen;
1488 if ( qname_len == 0 || qname == NULL )
1489 {
1490 string_printf( synth_qname, sizeof( synth_qname ), &qname_len, "ALLELE_%li.%u", alignId, readId + 1 );
1491 qname = synth_qname;
1492 }
1493 else if (ds->type == edstt_EvidenceAlignment) {
1494 string_printf( synth_qname, sizeof( synth_qname ), &qname_len, "%u/ALLELE_%li.%u", spot_id, cols[ alg_REF_ID ].base.i64[ readId ], cols[ alg_REF_PLOIDY ].base.u32[ readId ] );
1495 qname = synth_qname;
1496 }
1497 nm = cols[ alg_SPOT_GROUP ].len ? alg_SPOT_GROUP : alg_SEQ_SPOT_GROUP;
1498 rc = DumpName( qname, qname_len, '.', cols[ nm ].base.str, cols[ nm ].len, spot_id );
1499
1500 /* FLAG: SAM_FLAGS */
1501 if ( rc == 0 )
1502 {
1503 if ( ds->type == edstt_EvidenceAlignment )
1504 {
1505 bool const cmpl = cols[alg_REVERSED].base.v && readId < cols[alg_REVERSED].len ? cols[alg_REVERSED].base.tf[readId] : false;
1506 rc = KOutMsg( "\t%u\t", 1 | (cmpl ? 0x10 : 0) | (read_id == 1 ? 0x40 : 0x80) );
1507 }
1508 else if ( !param->unaligned /** not going to dump unaligned **/
1509 && ( flags & 0x1 ) /** but we have sequenced multiple fragments **/
1510 && ( flags & 0x8 ) ) /** and not all of them align **/
1511 {
1512 /*** remove flags talking about multiple reads **/
1513 /* turn off 0x001 0x008 0x040 0x080 */
1514 rc = KOutMsg( "\t%u\t", flags & ~0xC9 );
1515 }
1516 else
1517 {
1518 rc = KOutMsg( "\t%u\t", flags );
1519 }
1520 }
1521
1522 if ( rc == 0 )
1523 {
1524 if ( ds->type == edstt_EvidenceAlignment && type == 0 )
1525 {
1526 rc = KOutMsg( "ALLELE_%li.%u", cols[ alg_REF_ID ].base.i64[ readId ], cols[ alg_REF_PLOIDY ].base.u32[ readId ] );
1527 }
1528 else
1529 {
1530 /* RNAME: REF_NAME or REF_SEQ_ID */
1531 if ( param->use_seqid )
1532 rc = BufferedWriter( NULL, cols[ alg_REF_SEQ_ID ].base.str, cols[ alg_REF_SEQ_ID ].len, NULL );
1533 else
1534 rc = BufferedWriter( NULL, cols[ alg_REF_NAME ].base.str, cols[ alg_REF_NAME ].len, NULL );
1535 }
1536 }
1537
1538 if ( rc == 0 )
1539 rc = BufferedWriter( NULL, "\t", 1, NULL );
1540
1541 /* POS: REF_POS */
1542 if ( rc == 0 )
1543 rc = KOutMsg( "%i\t", cols[ alg_REF_POS ].base.coord0[ 0 ] + 1 );
1544
1545 /* MAPQ: MAPQ */
1546 if ( rc == 0 )
1547 rc = KOutMsg( "%i\t", cols[ alg_MAPQ ].base.i32[ 0 ] );
1548
1549 /* CIGAR: CIGAR_* */
1550 if ( ds->type == edstt_EvidenceInterval )
1551 {
1552 unsigned i;
1553
1554 for ( i = 0; i != cigLen && rc == 0; ++i )
1555 {
1556 char ch = cigar[i];
1557 if ( ch == 'S' ) ch = 'I';
1558 rc = BufferedWriter(NULL, &ch, 1, NULL);
1559 }
1560 }
1561 else if(ds->type == edstt_EvidenceAlignment)
1562 {
1563 rc = cg_canonical_print_cigar(cigar,cigLen);
1564 }
1565 else
1566 {
1567 if ( rc == 0 )
1568 rc = BufferedWriter( NULL, cigar, cigLen, NULL );
1569 }
1570
1571 if ( rc == 0 )
1572 rc = BufferedWriter( NULL, "\t", 1, NULL );
1573
1574 /* RNEXT: MATE_REF_NAME or '*' */
1575 /* PNEXT: MATE_REF_POS or 0 */
1576 if ( rc == 0 )
1577 {
1578 if ( cols[ alg_MATE_REF_NAME ].len )
1579 {
1580 if ( cols[ alg_MATE_REF_NAME ].len == cols[ alg_REF_NAME ].len &&
1581 memcmp( cols[ alg_MATE_REF_NAME ].base.str, cols[ alg_REF_NAME ].base.str, cols[ alg_MATE_REF_NAME ].len ) == 0 )
1582 {
1583 rc = BufferedWriter( NULL, "=\t", 2, NULL );
1584 }
1585 else
1586 {
1587 rc = BufferedWriter( NULL, cols[ alg_MATE_REF_NAME ].base.str, cols[ alg_MATE_REF_NAME ].len, NULL );
1588 if ( rc == 0 )
1589 rc = BufferedWriter( NULL, "\t", 1, NULL );
1590 }
1591 if ( rc == 0 )
1592 rc = KOutMsg( "%u\t", cols[ alg_MATE_REF_POS ].base.coord0[ 0 ] + 1 );
1593 }
1594 else
1595 {
1596 rc = BufferedWriter( NULL, "*\t0\t", 4, NULL );
1597 }
1598 }
1599
1600 /* TLEN: TEMPLATE_LEN */
1601 if ( rc == 0 )
1602 rc = KOutMsg( "%i\t", cols[ alg_TEMPLATE_LEN ].base.v ? cols[ alg_TEMPLATE_LEN ].base.i32[ 0 ] : 0 );
1603
1604 /* SEQ: READ */
1605 if ( rc == 0 )
1606 rc = BufferedWriter( NULL, read, readlen, NULL );
1607 if ( rc == 0 )
1608 rc = BufferedWriter( NULL, "\t", 1, NULL );
1609
1610 /* QUAL: SAM_QUALITY */
1611 if ( rc == 0 )
1612 rc = DumpQuality( qual, readlen, false, param->quantizeQual );
1613
1614 /* optional fields: */
1615 if ( rc == 0 && ds->type == edstt_EvidenceInterval )
1616 rc = KOutMsg( "\tRG:Z:ALLELE_%u", readId + 1 );
1617
1618 if ( rc == 0 )
1619 {
1620 if ( readGroup )
1621 {
1622 rc = BufferedWriter( NULL, "\tRG:Z:", 6, NULL );
1623 if ( rc == 0 )
1624 rc = BufferedWriter( NULL, readGroup, string_size( readGroup ), NULL );
1625 }
1626 else if ( cols[ alg_SPOT_GROUP ].len > 0 )
1627 {
1628 /* read group */
1629 rc = BufferedWriter( NULL, "\tRG:Z:", 6, NULL );
1630 if ( rc == 0 )
1631 rc = BufferedWriter( NULL, cols[ alg_SPOT_GROUP ].base.str, cols[ alg_SPOT_GROUP ].len, NULL );
1632 }
1633 else if ( cols[ alg_SEQ_SPOT_GROUP ].len > 0 )
1634 {
1635 /* backward compatibility */
1636 rc = BufferedWriter( NULL, "\tRG:Z:", 6, NULL );
1637 if ( rc == 0 )
1638 rc = BufferedWriter( NULL, cols[ alg_SEQ_SPOT_GROUP ].base.str, cols[ alg_SEQ_SPOT_GROUP ].len, NULL );
1639 }
1640 }
1641
1642 if ( rc == 0 && param->cg_style > 0 && cols[ alg_CG_TAGS_STR ].len > 0 )
1643 rc = BufferedWriter( NULL, cols[ alg_CG_TAGS_STR ].base.str, cols[ alg_CG_TAGS_STR ].len, NULL );
1644
1645 if ( rc == 0 )
1646 {
1647 if ( param->cg_style > 0 && cols[ alg_ALIGN_GROUP ].len > 0 )
1648 {
1649 char const *ZI = cols[ alg_ALIGN_GROUP ].base.str;
1650 unsigned i;
1651
1652 for ( i = 0; rc == 0 && i < cols[ alg_ALIGN_GROUP ].len - 1; ++i )
1653 {
1654 if ( ZI[ i ] == '_' )
1655 {
1656 rc = KOutMsg( "\tZI:i:%.*s\tZA:i:%.1s", i, ZI, ZI + i + 1 );
1657 break;
1658 }
1659 }
1660 }
1661 else if ( ds->type == edstt_EvidenceAlignment && type == 1 )
1662 {
1663 rc = KOutMsg( "\tZI:i:%li\tZA:i:%u", cols[ alg_REF_ID ].base.i64[ readId ], cols[ alg_REF_PLOIDY ].base.u32[ readId ] );
1664 }
1665 }
1666
1667 /* align id */
1668 if ( rc == 0 && param->xi )
1669 rc = KOutMsg( "\tXI:i:%li", alignId );
1670
1671 /* hit count */
1672 if ( rc == 0 && cols[alg_ALIGNMENT_COUNT].len )
1673 rc = KOutMsg( "\tNH:i:%i", (int)cols[ alg_ALIGNMENT_COUNT ].base.u8[ readId ] );
1674
1675 /* edit distance */
1676 if ( rc == 0 && cols[ alg_EDIT_DISTANCE ].len )
1677 rc = KOutMsg( "\tNM:i:%i", cols[ alg_EDIT_DISTANCE ].base.i32[ readId ] );
1678
1679 if ( rc == 0 )
1680 rc = KOutMsg( "\n" );
1681 }
1682 return rc;
1683 }
1684
1685
DumpUnalignedReads(SAM_dump_ctx_t * const ctx,SCol const calg_col[],int64_t row_id,uint64_t * rcount)1686 static rc_t DumpUnalignedReads( SAM_dump_ctx_t *const ctx, SCol const calg_col[], int64_t row_id, uint64_t* rcount )
1687 {
1688 rc_t rc = 0;
1689 uint32_t i, nreads = 0;
1690
1691 if ( calg_col != NULL )
1692 {
1693 if ( calg_col[ alg_SAM_FLAGS ].base.u32[ 0 ] & 0x02 )
1694 {
1695 /* skip all aligned rows by flag */
1696 return rc;
1697 }
1698 /* get primary alignments only */
1699 rc = Cursor_Read( &ctx->seq, calg_col[ alg_SEQ_SPOT_ID ].base.i64[ 0 ], seq_PRIMARY_ALIGNMENT_ID, ~(unsigned)0 );
1700 if ( rc == 0 )
1701 {
1702 for ( i = 0; i < ctx->seq.cols[ seq_PRIMARY_ALIGNMENT_ID ].len; i++ )
1703 {
1704 if ( ctx->seq.cols[ seq_PRIMARY_ALIGNMENT_ID ].base.i64[ i ] != 0 )
1705 {
1706 if ( ctx->seq.cols[ seq_PRIMARY_ALIGNMENT_ID ].base.i64[ i ] < row_id )
1707 {
1708 /* unaligned were printed with 1st aligment */
1709 return rc;
1710 }
1711 }
1712 else
1713 {
1714 nreads++;
1715 }
1716 }
1717 if ( nreads == ctx->seq.cols[ seq_PRIMARY_ALIGNMENT_ID ].len )
1718 {
1719 /* skip all aligned rows by actual data, if flag above is not set properly */
1720 return rc;
1721 }
1722 row_id = calg_col[ alg_SEQ_SPOT_ID ].base.i64[ 0 ];
1723 }
1724 }
1725 if ( rc == 0 )
1726 {
1727 rc = Cursor_Read( &ctx->seq, row_id, 0, ~(unsigned)0 );
1728 if ( rc == 0 )
1729 {
1730 unsigned non_empty_reads = 0;
1731
1732 nreads = ctx->seq.cols[ seq_READ_LEN ].idx != 0 ? ctx->seq.cols[ seq_READ_LEN ].len : 1;
1733
1734 for( i = 0; i < nreads; i++ ) {
1735 unsigned const readLen = ctx->seq.cols[ seq_READ_LEN ].idx
1736 ? ctx->seq.cols[ seq_READ_LEN ].base.coord_len[ i ]
1737 : 0;
1738
1739 if (readLen)
1740 ++non_empty_reads;
1741 }
1742 for( i = 0; i < nreads; i++ )
1743 {
1744 INSDC_coord_zero readStart;
1745 INSDC_coord_len readLen;
1746
1747 if ( ctx->seq.cols[ seq_PRIMARY_ALIGNMENT_ID ].idx != 0 &&
1748 ctx->seq.cols[ seq_PRIMARY_ALIGNMENT_ID ].base.i64[ i ] != 0 )
1749 {
1750 continue;
1751 }
1752
1753 if ( ctx->seq.cols[ seq_READ_TYPE ].idx != 0 &&
1754 !( ctx->seq.cols[ seq_READ_TYPE ].base.read_type[ i ] & SRA_READ_TYPE_BIOLOGICAL ) )
1755 {
1756 continue;
1757 }
1758
1759 readLen = ctx->seq.cols[ seq_READ_LEN ].idx ?
1760 ctx->seq.cols[ seq_READ_LEN ].base.coord_len[ i ] :
1761 ctx->seq.cols[ seq_READ ].len;
1762 if ( readLen == 0 )
1763 {
1764 continue;
1765 }
1766
1767 readStart = ctx->seq.cols[ seq_READ_START ].idx ?
1768 ctx->seq.cols[ seq_READ_START ].base.coord0[ i ] :
1769 0;
1770 if ( param->fasta || param->fastq )
1771 {
1772 rc = DumpUnalignedFastX( ctx->seq.cols, nreads > 1 ? i + 1 : 0, readStart, readLen, row_id );
1773 }
1774 else
1775 {
1776 uint32_t cflags = 0x4;
1777 if ( param->reverse_unaligned )
1778 {
1779 if ( ctx->seq.cols[ seq_READ_TYPE ].base.read_type[ i ] & SRA_READ_TYPE_REVERSE )
1780 {
1781 cflags |= 0x10;
1782 }
1783 if ( ctx->seq.cols[ seq_READ_TYPE ].base.read_type[ i == nreads - 1 ? 0 : ( i + 1 ) ] & SRA_READ_TYPE_REVERSE )
1784 {
1785 cflags |= 0x20;
1786 }
1787 }
1788 if ( ctx->seq.cols[ seq_READ_FILTER ].idx != 0 )
1789 {
1790 if ( ctx->seq.cols[ seq_READ_FILTER ].base.read_filter[ i ] == SRA_READ_FILTER_REJECT )
1791 {
1792 cflags |= 0x200;
1793 }
1794 else if ( ctx->seq.cols[ seq_READ_FILTER ].base.read_filter[ i ] == SRA_READ_FILTER_CRITERIA )
1795 {
1796 cflags |= 0x400;
1797 }
1798 }
1799 if ( calg_col == NULL )
1800 {
1801 rc = DumpUnalignedSAM( ctx->seq.cols, cflags |
1802 ( non_empty_reads > 1 ? ( 0x1 | 0x8 | ( i == 0 ? 0x40 : 0x00 ) | ( i == nreads - 1 ? 0x80 : 0x00 ) ) : 0x00 ),
1803 readStart, readLen, NULL, 0, 0, ctx->readGroup, row_id );
1804 }
1805 else
1806 {
1807 int c = param->use_seqid ? alg_REF_SEQ_ID : alg_REF_NAME;
1808 uint16_t flags = cflags | 0x1 |
1809 ( ( calg_col[ alg_SAM_FLAGS ].base.u32[ 0 ] & 0x10 ) << 1 ) |
1810 ( ( calg_col[ alg_SAM_FLAGS ].base.u32[ 0 ] & 0x40 ) ? 0x80 : 0x40 );
1811 rc = DumpUnalignedSAM( ctx->seq.cols, flags, readStart, readLen,
1812 calg_col[ c ].base.str, calg_col[ c ].len,
1813 calg_col[ alg_REF_POS ].base.coord0[ 0 ] + 1, ctx->readGroup, row_id );
1814 }
1815 }
1816 *rcount = *rcount + 1;
1817 }
1818 }
1819 }
1820 return rc;
1821 }
1822
1823
AlignRegionFilter(SCol const * cols)1824 static bool AlignRegionFilter( SCol const *cols )
1825 {
1826 if ( param->region_qty == 0 )
1827 return true;
1828 if ( cols[ alg_REF_NAME ].len != 0 || cols[ alg_REF_SEQ_ID ].len != 0 )
1829 {
1830 unsigned i;
1831
1832 assert( cols[ alg_REF_POS ].len == cols[ alg_REF_LEN ].len );
1833
1834 for ( i = 0; i < param->region_qty; i++ )
1835 {
1836 unsigned j;
1837
1838 for ( j = 0; j < cols[ alg_REF_POS ].len; j++ )
1839 {
1840 unsigned const refStart_zero = cols[ alg_REF_POS ].base.coord0[ j ];
1841 unsigned const refEnd_exclusive = refStart_zero + cols[ alg_REF_LEN ].base.coord_len[ j ];
1842 unsigned k;
1843
1844 for ( k = 0; k < param->region[ i ].rq; k++ )
1845 {
1846 unsigned const from_zero = param->region[ i ].r[ k ].from;
1847 unsigned const to_inclusive = param->region[ i ].r[ k ].to;
1848
1849 if ( from_zero < refEnd_exclusive && refStart_zero <= to_inclusive )
1850 return true;
1851 }
1852 }
1853 }
1854 }
1855 return false;
1856 }
1857
1858
AlignDistanceFilter(SCol const * cols)1859 static bool AlignDistanceFilter( SCol const *cols )
1860 {
1861 if ( param->mp_dist_qty != 0 || param->mp_dist_unknown )
1862 {
1863 if ( cols[ alg_TEMPLATE_LEN ].len == 0 && param->mp_dist_unknown )
1864 {
1865 return true;
1866 }
1867 else
1868 {
1869 uint32_t i, j;
1870 for( i = 0; i < param->mp_dist_qty; i++ )
1871 {
1872 for( j = 0; j < cols[ alg_TEMPLATE_LEN ].len; j++ )
1873 {
1874 if ( ( cols[ alg_TEMPLATE_LEN ].base.i32[ j ] == 0 && param->mp_dist_unknown ) ||
1875 ( param->mp_dist[ i ].from <= cols[ alg_TEMPLATE_LEN ].base.i32[ j ] &&
1876 cols[ alg_TEMPLATE_LEN ].base.i32[ j ] <= param->mp_dist[ i ].to ) )
1877 {
1878 return true;
1879 }
1880 }
1881 }
1882 }
1883 return false;
1884 }
1885 return true;
1886 }
1887
1888
1889 typedef struct cgOp_s
1890 {
1891 uint16_t length;
1892 uint8_t type; /* 0: match, 1: insert, 2: delete */
1893 char code;
1894 } cgOp;
1895
1896
print_CG_cigar(int line,cgOp const op[],unsigned const ops,unsigned const gap[3])1897 static void print_CG_cigar( int line, cgOp const op[], unsigned const ops, unsigned const gap[ 3 ] )
1898 {
1899 #if _DEBUGGING
1900 unsigned i;
1901
1902 SAM_DUMP_DBG( 3, ( "%5i: ", line ) );
1903 for ( i = 0; i < ops; ++i )
1904 {
1905 if ( gap && ( i == gap[ 0 ] || i == gap[ 1 ] || i == gap[ 2 ] ) )
1906 {
1907 SAM_DUMP_DBG( 3, ( "%u%c", op[ i ].length, tolower( op[ i ].code ) ) );
1908 }
1909 else
1910 {
1911 SAM_DUMP_DBG( 3, ( "%u%c", op[ i ].length, toupper( op[ i ].code ) ) );
1912 }
1913 }
1914 SAM_DUMP_DBG( 3, ( "\n" ) );
1915 #endif
1916 }
1917
1918 /* gap contains the indices of the wobbles in op
1919 * gap[0] is between read 1 and 2; it is the 'B'
1920 * gap[1] is between read 2 and 3; it is an 'N'
1921 * gap[2] is between read 3 and 4; it is an 'N'
1922 */
1923
CIGAR_to_CG_Ops(cgOp op[],unsigned const maxOps,unsigned * const opCnt,unsigned gap[3],char const cigar[],unsigned const ciglen,unsigned * const S_adjust,unsigned * const CG_adjust,unsigned const read,bool const reversed,bool * has_ref_offset_type)1924 static rc_t CIGAR_to_CG_Ops( cgOp op[], unsigned const maxOps,
1925 unsigned *const opCnt,
1926 unsigned gap[ 3 ],
1927 char const cigar[], unsigned const ciglen,
1928 unsigned *const S_adjust,
1929 unsigned *const CG_adjust,
1930 unsigned const read,
1931 bool const reversed,
1932 bool * has_ref_offset_type)
1933 {
1934 unsigned i;
1935 unsigned ops = 0;
1936 unsigned gapno;
1937
1938 *opCnt = 0;
1939 *S_adjust = 0;
1940 for ( i = 0; i < ciglen; ++ops )
1941 {
1942 char opChar=0;
1943 int opLen=0;
1944 int n;
1945
1946 for(n=0;n+i<ciglen && isdigit(cigar[n+i]);n++){
1947 opLen = opLen * 10 + cigar[n+i] - '0';
1948 }
1949 if(n+i<ciglen){
1950 opChar = cigar[n+i];
1951 n++;
1952 }
1953 if ( ops + 1 >= maxOps )
1954 return RC( rcExe, rcData, rcReading, rcBuffer, rcInsufficient );
1955
1956 i += n;
1957
1958 op[ ops ].length = opLen;
1959 op[ ops ].code = opChar;
1960 switch ( opChar )
1961 {
1962 case 'M':
1963 case '=':
1964 case 'X':
1965 op[ ops ].type = 0;
1966 break;
1967 case 'S':
1968 *S_adjust += opLen;
1969 case 'I':
1970 op[ ops ].type = 1;
1971 op[ ops ].code = 'I';
1972 break;
1973 case 'N':
1974 *has_ref_offset_type=true;
1975 case 'D':
1976 op[ ops ].type = 2;
1977 break;
1978 default:
1979 return RC( rcExe, rcData, rcReading, rcConstraint, rcViolated );
1980 }
1981 }
1982 *opCnt = ops;
1983 gap[ 0 ] = gap[ 1 ] = gap[ 2 ] = ops;
1984 print_CG_cigar( __LINE__, op, ops, NULL );
1985 if ( ops < 3 )
1986 return RC( rcExe, rcData, rcReading, rcFormat, rcNotFound ); /* CG pattern not found */
1987
1988 {
1989 unsigned fwd = 0; /* 5 10 10 10 */
1990 unsigned rev = 0; /* 10 10 10 5 */
1991 unsigned acc; /* accumulated length */
1992
1993 if ( ( read == 1 && !reversed ) || ( read == 2 && reversed ) )
1994 {
1995 for ( i = 0, acc = 0; i < ops && acc <=5; ++i )
1996 {
1997 if ( op[ i ].type != 2 )
1998 {
1999 acc += op[ i ].length;
2000 if ( acc == 5 && op[ i + 1 ].type == 1 )
2001 {
2002 fwd = i + 1;
2003 break;
2004 }
2005 else if ( acc > 5 )
2006 {
2007 unsigned const right = acc - 5;
2008 unsigned const left = op[ i ].length - right;
2009
2010 memmove( &op[ i + 2 ], &op[ i ], ( ops - i ) * sizeof( op[ 0 ] ) );
2011 ops += 2;
2012 op[ i ].length = left;
2013 op[ i + 2 ].length = right;
2014
2015 op[ i + 1 ].type = 1;
2016 op[ i + 1 ].code = 'B';
2017 op[ i + 1 ].length = 0;
2018
2019 fwd = i + 1;
2020 break;
2021 }
2022 }
2023 }
2024 }
2025 else if ( ( read == 2 && !reversed ) || ( read == 1 && reversed ) )
2026 {
2027 for ( i = ops, acc = 0; i > 0 && acc <= 5; )
2028 {
2029 --i;
2030 if ( op[ i ].type != 2 )
2031 {
2032 acc += op[ i ].length;
2033 if ( acc == 5 && op[ i ].type == 1 )
2034 {
2035 rev = i;
2036 break;
2037 }
2038 else if ( acc > 5 )
2039 {
2040 unsigned const left = acc - 5;
2041 unsigned const right = op[ i ].length - left;
2042
2043 memmove( &op[ i + 2 ], &op[ i ], ( ops - i ) * sizeof( op[ 0 ] ) );
2044 ops += 2;
2045 op[ i ].length = left;
2046 op[ i + 2 ].length = right;
2047
2048 op[ i + 1 ].type = 1;
2049 op[ i + 1 ].code = 'B';
2050 op[ i + 1 ].length = 0;
2051
2052 rev = i + 1;
2053 break;
2054 }
2055 }
2056 }
2057 }
2058 else
2059 {
2060 /* fprintf(stderr, "guessing layout\n"); */
2061 for ( i = 0, acc = 0; i < ops && acc <= 5; ++i )
2062 {
2063 if ( op[ i ].type != 2 )
2064 {
2065 acc += op[ i ].length;
2066 if ( acc == 5 && op[ i + 1 ].type == 1 )
2067 {
2068 fwd = i + 1;
2069 }
2070 }
2071 }
2072 for ( i = ops, acc = 0; i > 0 && acc <= 5; )
2073 {
2074 --i;
2075 if ( op[ i ].type != 2 )
2076 {
2077 acc += op[ i ].length;
2078 if ( acc == 5 && op[i].type == 1 )
2079 {
2080 rev = i;
2081 }
2082 }
2083 }
2084 if ( ( fwd == 0 && rev == 0 ) || ( fwd != 0 && rev != 0 ) )
2085 {
2086 if(!(*has_ref_offset_type)) for ( i = 0; i < ops; ++i )
2087 {
2088 if ( op[ i ].type == 2 )
2089 {
2090 op[ i ].code = 'N';
2091 *CG_adjust += op[ i ].length;
2092 }
2093 }
2094 return RC( rcExe, rcData, rcReading, rcFormat, rcNotFound ); /* CG pattern not found */
2095 }
2096 }
2097 if ( fwd && op[ fwd ].type == 1 )
2098 {
2099 for ( i = ops, acc = 0; i > fwd + 1; )
2100 {
2101 --i;
2102 if ( op[ i ].type != 2 )
2103 {
2104 acc += op[ i ].length;
2105 if ( acc >= 10 )
2106 {
2107 if ( acc > 10 )
2108 {
2109 unsigned const r = 10 + op[ i ].length - acc;
2110 unsigned const l = op[ i ].length - r;
2111
2112 if ( ops + 2 >= maxOps )
2113 return RC( rcExe, rcData, rcReading, rcBuffer, rcInsufficient );
2114 memmove( &op[ i + 2 ], &op[ i ], ( ops - i ) * sizeof( op[ 0 ] ) );
2115 ops += 2;
2116 op[ i + 2 ].length = r;
2117 op[ i ].length = l;
2118
2119 op[ i + 1 ].length = 0;
2120 op[ i + 1 ].type = 2;
2121 op[ i + 1 ].code = 'N';
2122 i += 2;
2123 }
2124 else if ( i - 1 > fwd )
2125 {
2126 if ( op[ i - 1 ].type == 2 )
2127 op[ i - 1 ].code = 'N';
2128 else
2129 {
2130 if ( ops + 1 >= maxOps )
2131 return RC( rcExe, rcData, rcReading, rcBuffer, rcInsufficient );
2132 memmove( &op[ i + 1 ], &op[ i ], ( ops - i ) * sizeof( op[ 0 ] ) );
2133 ops += 1;
2134 op[ i ].length = 0;
2135 op[ i ].type = 2;
2136 op[ i ].code = 'N';
2137 i += 1;
2138 }
2139 }
2140 acc = 0;
2141 }
2142 }
2143 }
2144 /** change I to B+M **/
2145 op[ fwd ].code = 'B';
2146 memmove( &op[ fwd + 1 ], &op[ fwd ], ( ops - fwd ) * sizeof( op[ 0 ] ) );
2147 ops += 1;
2148 op[ fwd + 1 ].code = 'M';
2149 *opCnt = ops;
2150 /** set the gaps now **/
2151 for ( gapno = 3, i = ops; gapno > 1 && i > 0; )
2152 {
2153 --i;
2154 if ( op[ i ].code == 'N' )
2155 gap[ --gapno ] = i;
2156 }
2157 gap[ 0 ] = fwd;
2158 print_CG_cigar( __LINE__, op, ops, gap );
2159 return 0;
2160 }
2161 if ( rev && op[ rev ].type == 1 )
2162 {
2163 for ( acc = i = 0; i < rev; ++i )
2164 {
2165 if ( op[ i ].type != 2 )
2166 {
2167 acc += op[ i ].length;
2168 if ( acc >= 10 )
2169 {
2170 if ( acc > 10 )
2171 {
2172 unsigned const l = 10 + op[ i ].length - acc;
2173 unsigned const r = op[ i ].length - l;
2174
2175 if ( ops + 2 >= maxOps )
2176 return RC( rcExe, rcData, rcReading, rcBuffer, rcInsufficient );
2177 memmove( &op[ i + 2 ], &op[ i ], ( ops - i ) * sizeof( op[ 0 ] ) );
2178 ops += 2;
2179 op[ i + 2 ].length = r;
2180 op[ i ].length = l;
2181
2182 op[ i + 1 ].length = 0;
2183 op[ i + 1 ].type = 2;
2184 op[ i + 1 ].code = 'N';
2185 rev += 2;
2186 i += 2;
2187 }
2188 else if ( i + 1 < rev )
2189 {
2190 if ( op[ i + 1 ].type == 2)
2191 op[ i + 1 ].code = 'N';
2192 else
2193 {
2194 if ( ops + 1 >= maxOps )
2195 return RC( rcExe, rcData, rcReading, rcBuffer, rcInsufficient );
2196 memmove( &op[ i + 1 ], &op[ i ], ( ops - i ) * sizeof( op[ 0 ] ) );
2197 ops += 1;
2198 op[ i + 1 ].length = 0;
2199 op[ i + 1 ].type = 2;
2200 op[ i + 1 ].code = 'N';
2201 rev += 1;
2202 i += 1;
2203 }
2204 }
2205 acc = 0;
2206 }
2207 }
2208 }
2209 for ( gapno = 3, i = 0; gapno > 1 && i < ops; ++i )
2210 {
2211 if ( op[ i ].code == 'N' )
2212 gap[ --gapno ] = i;
2213 }
2214 gap[ 0 ] = rev;
2215 op[ rev ].code = 'B';
2216 memmove( &op[ rev + 1 ], &op[ rev ], ( ops - rev ) * sizeof( op[ 0 ] ) );
2217 ops += 1;
2218 op[ rev + 1 ].code = 'M';
2219 *opCnt = ops;
2220 print_CG_cigar( __LINE__, op, ops, gap );
2221 return 0;
2222 }
2223 }
2224 return RC( rcExe, rcData, rcReading, rcFormat, rcNotFound ); /* CG pattern not found */
2225 }
2226
2227
GenerateCGData(SCol cols[],unsigned style)2228 static rc_t GenerateCGData( SCol cols[], unsigned style )
2229 {
2230 rc_t rc = 0;
2231
2232 memset( &cols[ alg_CG_TAGS_STR], 0, sizeof( cols[ alg_CG_TAGS_STR ] ) );
2233
2234 if ( cols[ alg_READ ].len == 35 && cols[ alg_SAM_QUALITY ].len == 35 )
2235 {
2236 static char newCIGAR[ 35 * 11 ];
2237 static int32_t newEditDistance;
2238 unsigned gap[ 3 ] = { 0, 0, 0 };
2239 cgOp cigOp[ 35 ];
2240 unsigned opCnt;
2241 unsigned i;
2242 unsigned j;
2243 size_t sz;
2244 unsigned S_adjust = 0;
2245 unsigned CG_adjust = 0;
2246 unsigned const read = cols[ alg_SEQ_READ_ID ].len && cols[ alg_REVERSED ].len ? cols[ alg_SEQ_READ_ID ].base.coord1[ 0 ] : 0;
2247 bool const reversed = cols[ alg_REVERSED ].len ? cols[ alg_REVERSED ].base.u8[ 0 ] : false;
2248 bool has_ref_offset_type = false;
2249
2250 rc = CIGAR_to_CG_Ops( cigOp, sizeof( cigOp ) / sizeof( cigOp[ 0 ] ), &opCnt, gap,
2251 cols[ alg_CIGAR ].base.str, cols[ alg_CIGAR ].len,
2252 &S_adjust, &CG_adjust, read, reversed, &has_ref_offset_type );
2253 if ( GetRCState( rc ) == rcNotFound && GetRCObject( rc ) == rcFormat )
2254 {
2255 rc = 0;
2256 if ( style == 1 )
2257 goto CLEAN_CIGAR;
2258 else
2259 goto PRINT_CIGAR;
2260 }
2261 if ( rc != 0 )
2262 return 0;
2263
2264 if ( !has_ref_offset_type && CG_adjust == 0 )
2265 CG_adjust = ( gap[ 0 ] < opCnt ? cigOp[ gap[ 0 ] ].length : 0 )
2266 + ( gap[ 1 ] < opCnt ? cigOp[ gap[ 1 ] ].length : 0 )
2267 + ( gap[ 2 ] < opCnt ? cigOp[ gap[ 2 ] ].length : 0 );
2268
2269 print_CG_cigar( __LINE__, cigOp, opCnt, NULL );
2270 if ( style == 1 )
2271 {
2272 static char newSeq[ 35 ];
2273 static char newQual[ 35 ];
2274 static char tags[ 35*22 + 1 ];
2275 unsigned const B_len = cigOp[ gap[ 0 ] ].length;
2276 unsigned const B_at = gap[ 0 ] < gap[ 2 ] ? 5 : 30;
2277
2278 if ( 0 < B_len && B_len < 5 )
2279 {
2280 memmove( newSeq, cols[ alg_READ ].base.v, 35 );
2281 memmove( newQual, cols[ alg_SAM_QUALITY ].base.v, 35 );
2282
2283 cols[ alg_CG_TAGS_STR ].base.v = tags;
2284
2285 cols[ alg_READ ].base.v = newSeq;
2286 cols[ alg_READ ].len = 35 - B_len;
2287
2288 cols[ alg_SAM_QUALITY ].base.v = newQual;
2289 cols[ alg_SAM_QUALITY ].len = 35 - B_len;
2290
2291 /* nBnM -> nB0M */
2292 cigOp[ gap[ 0 ] + 1 ].length -= B_len;
2293 if ( gap[ 0 ] < gap[ 2 ] )
2294 {
2295 rc = string_printf( tags, sizeof( tags ), &sz, "\tGC:Z:%uS%uG%uS\tGS:Z:%.*s\tGQ:Z:%.*s",
2296 5 - B_len, B_len, 30 - B_len, 2 * B_len, &newSeq[ 5 - B_len ], 2 * B_len, &newQual[ 5 - B_len ] );
2297 if ( rc == 0 )
2298 {
2299 cols[ alg_CG_TAGS_STR ].len = sz;
2300 }
2301 memmove( &cigOp[ gap[ 0 ] ],
2302 &cigOp[ gap[ 0 ] + 1 ],
2303 ( opCnt - ( gap[ 0 ] + 1 ) ) * sizeof( cigOp[ 0 ] ) );
2304 --opCnt;
2305 }
2306 else
2307 {
2308 rc = string_printf( tags, sizeof( tags ), &sz, "\tGC:Z:%uS%uG%uS\tGS:Z:%.*s\tGQ:Z:%.*s",
2309 30 - B_len, B_len, 5 - B_len, 2 * B_len, &newSeq[ 30 - B_len ], 2 * B_len, &newQual[ 30 - B_len ] );
2310 if ( rc == 0 )
2311 {
2312 cols[ alg_CG_TAGS_STR ].len = sz;
2313 }
2314 memmove( &cigOp[ gap[ 0 ] ],
2315 &cigOp[ gap[ 0 ] + 1 ],
2316 ( opCnt - ( gap[ 0 ] + 1 ) ) * sizeof( cigOp[ 0 ] ) );
2317 --opCnt;
2318 }
2319 if ( rc == 0 )
2320 {
2321 for ( i = B_at; i < B_at + B_len; ++i )
2322 {
2323 int const Lq = newQual[ i - B_len ];
2324 int const Rq = newQual[ i ];
2325
2326 if ( Lq <= Rq )
2327 {
2328 newSeq[ i - B_len ] = newSeq[ i ];
2329 newQual[ i - B_len ] = Rq;
2330 }
2331 else
2332 {
2333 newSeq[ i ] = newSeq[ i - B_len ];
2334 newQual[ i ] = Lq;
2335 }
2336 }
2337 memmove( &newSeq [ B_at ], &newSeq [ B_at + B_len ], 35 - B_at - B_len );
2338 memmove( &newQual[ B_at ], &newQual[ B_at + B_len ], 35 - B_at - B_len );
2339 }
2340 }
2341 else
2342 {
2343 int len = cigOp[ gap[ 0 ] ].length;
2344
2345 cigOp[ gap[ 0 ] ].code = 'I';
2346 for (i = gap[0] + 1; i < opCnt && len > 0; ++i) {
2347 if (cigOp[i].length <= len){
2348 len -= cigOp[i].length;
2349 cigOp[i].length = 0;
2350 }
2351 else {
2352 cigOp[i].length -= len;
2353 len = 0;
2354 }
2355 }
2356 if(!has_ref_offset_type) CG_adjust -= cigOp[ gap[ 0 ] ].length;
2357 }
2358 }
2359 if ( rc == 0 )
2360 {
2361 PRINT_CIGAR:
2362 CLEAN_CIGAR:
2363 print_CG_cigar( __LINE__, cigOp, opCnt, NULL );
2364 /* remove zero length ops */
2365 for ( j = i = 0; i < opCnt; )
2366 {
2367 if ( cigOp[ j ].length == 0)
2368 {
2369 ++j;
2370 --opCnt;
2371 continue;
2372 }
2373 cigOp[ i++ ] = cigOp[ j++ ];
2374 }
2375 print_CG_cigar( __LINE__, cigOp, opCnt, NULL );
2376 if ( cols[ alg_EDIT_DISTANCE ].len )
2377 {
2378 int const edit_distance = cols[ alg_EDIT_DISTANCE ].base.i32[ 0 ];
2379 int const adjusted = edit_distance + S_adjust - CG_adjust;
2380
2381 newEditDistance = adjusted > 0 ? adjusted : 0;
2382 SAM_DUMP_DBG( 4, ( "NM: before: %u, after: %u(+%u-%u)\n", edit_distance, newEditDistance, S_adjust, CG_adjust ) );
2383 cols[ alg_EDIT_DISTANCE ].base.v = &newEditDistance;
2384 cols[ alg_EDIT_DISTANCE ].len = 1;
2385 }
2386 /* merge adjacent ops */
2387 for ( i = opCnt; i > 1; )
2388 {
2389 --i;
2390 if ( cigOp[ i - 1 ].code == cigOp[ i ].code )
2391 {
2392 cigOp[ i - 1 ].length += cigOp[ i ].length;
2393 memmove( &cigOp[ i ], &cigOp[ i + 1 ], ( opCnt - 1 - i ) * sizeof( cigOp[ 0 ] ) );
2394 --opCnt;
2395 }
2396 }
2397 print_CG_cigar( __LINE__, cigOp, opCnt, NULL );
2398 for ( i = j = 0; i < opCnt && rc == 0; ++i )
2399 {
2400 rc = string_printf( &newCIGAR[ j ], sizeof( newCIGAR ) - j, &sz, "%u%c", cigOp[ i ].length, cigOp[ i ].code);
2401 j += sz;
2402 }
2403 cols[ alg_CIGAR ].base.v = newCIGAR;
2404 cols[ alg_CIGAR ].len = j;
2405 }
2406 }
2407 return rc;
2408 }
2409
2410
DumpAlignedRow(SAM_dump_ctx_t * const ctx,DataSource * const ds,int64_t row,bool const primary,int const cg_style,rc_t * prc)2411 static bool DumpAlignedRow( SAM_dump_ctx_t *const ctx, DataSource *const ds,
2412 int64_t row,
2413 bool const primary,
2414 int const cg_style,
2415 rc_t *prc )
2416 {
2417 rc_t rc = 0;
2418
2419 /* if SEQ_SPOT_ID is open then skip rows with no SEQ_SPOT_ID */
2420 if ( ds->cols[ alg_SEQ_SPOT_ID ].idx )
2421 {
2422 rc = Cursor_Read( ds, row, alg_SEQ_SPOT_ID, 1 );
2423 if ( rc != 0 )
2424 {
2425 if ( !( GetRCObject( rc ) == rcRow && GetRCState( rc ) == rcNotFound ) )
2426 *prc = rc;
2427 return false;
2428 }
2429 if ( ds->cols[ alg_SEQ_SPOT_ID ].len == 0 ||
2430 ds->cols[ alg_SEQ_SPOT_ID ].base.i64[ 0 ] == 0 )
2431 return false;
2432 }
2433
2434 /* skip rows that fail mate distance filter */
2435 if ( param->mp_dist_qty != 0 || param->mp_dist_unknown )
2436 {
2437 rc = Cursor_Read( ds, row, alg_TEMPLATE_LEN, 1 );
2438 if ( rc != 0 )
2439 {
2440 if ( !( GetRCObject( rc ) == rcRow && GetRCState( rc ) == rcNotFound ) )
2441 *prc = rc;
2442 return false;
2443 }
2444 if ( !AlignDistanceFilter( ds->cols ) )
2445 return false;
2446 }
2447
2448 rc = Cursor_ReadAlign( &ds->curs, row, ds->cols, 0 );
2449 if ( rc != 0 )
2450 {
2451 if ( !( GetRCObject( rc ) == rcRow && GetRCState( rc ) == rcNotFound ) )
2452 *prc = rc;
2453 return false;
2454 }
2455
2456 if ( param->fasta || param->fastq )
2457 {
2458 unsigned const read_id = ds->cols[ alg_SEQ_READ_ID ].base.v ? ds->cols[ alg_SEQ_READ_ID ].base.coord1[ 0 ] : 0;
2459
2460 rc = DumpAlignedFastX( ctx->pri.cols, row, read_id, primary, false );
2461 }
2462 else
2463 {
2464 if ( cg_style != 0 )
2465 {
2466 rc = GenerateCGData( ds->cols, cg_style );
2467 if ( rc != 0 )
2468 {
2469 *prc = rc;
2470 return false;
2471 }
2472 }
2473 rc = DumpAlignedSAM(ctx, ds, row, ctx->readGroup, 0 );
2474 }
2475 *prc = rc;
2476 return true;
2477 }
2478
2479
DumpAlignedRowList(SAM_dump_ctx_t * const ctx,DataSource * const ds,SCol const * const ids,int64_t * rcount,bool const primary,int const cg_style,bool const noFilter)2480 static rc_t DumpAlignedRowList( SAM_dump_ctx_t *const ctx, DataSource *const ds, SCol const *const ids,
2481 int64_t *rcount,
2482 bool const primary,
2483 int const cg_style,
2484 bool const noFilter )
2485 {
2486 rc_t rc;
2487 unsigned i;
2488 unsigned const n = ids->len;
2489
2490 for ( i = 0; ( rc = Quitting() ) == 0 && i < n; ++i )
2491 {
2492 int64_t const row = ids->base.i64[ i ];
2493
2494 if ( ! noFilter )
2495 rc = Cursor_Read( ds, row, alg_REGION_FILTER, ~(unsigned)0 );
2496
2497 if ( rc == 0 )
2498 {
2499 if ( noFilter || AlignRegionFilter(ds->cols) )
2500 {
2501 if ( DumpAlignedRow( ctx, ds, row, primary, cg_style, &rc ) )
2502 ++*rcount;
2503 if ( rc != 0 )
2504 break;
2505 }
2506 }
2507 }
2508 return rc;
2509 }
2510
DumpAlignedRowListIndirect(SAM_dump_ctx_t * const ctx,DataSource * const indirectSource,DataSource * const directSource,SCol const * const directIDs,int64_t * rcount,bool const primary,int const cg_style)2511 static rc_t DumpAlignedRowListIndirect( SAM_dump_ctx_t *const ctx,
2512 DataSource *const indirectSource,
2513 DataSource *const directSource,
2514 SCol const *const directIDs,
2515 int64_t *rcount,
2516 bool const primary,
2517 int const cg_style )
2518 {
2519 unsigned i;
2520
2521 for ( i = 0; i < directIDs->len; ++i )
2522 {
2523 int64_t const row = directIDs->base.i64[ i ];
2524 rc_t rc = Cursor_Read( directSource, row, alg_REGION_FILTER, ~(unsigned)0 );
2525
2526 if ( rc != 0 )
2527 return rc;
2528
2529 if ( !AlignRegionFilter( directSource->cols ) )
2530 continue;
2531
2532 rc = Cursor_Read( directSource, row, alg_EVIDENCE_ALIGNMENT_IDS, 1 );
2533 if ( rc != 0 )
2534 return rc;
2535
2536 rc = DumpAlignedRowList( ctx, indirectSource, &directSource->cols[ alg_EVIDENCE_ALIGNMENT_IDS ],
2537 rcount, primary, cg_style, true );
2538 if ( rc != 0 )
2539 return rc;
2540 }
2541 return 0;
2542 }
2543
2544
2545 enum e_tables
2546 {
2547 primary_alignment,
2548 secondary_alignment,
2549 evidence_interval,
2550 evidence_alignment
2551 };
2552
2553 enum e_IDS_opts
2554 {
2555 primary_IDS = ( 1 << primary_alignment ),
2556 secondary_IDS = ( 1 << secondary_alignment ),
2557 evidence_interval_IDS = ( 1 << evidence_interval ),
2558 evidence_alignment_IDS = ( 1 << evidence_alignment )
2559 };
2560
2561
DumpAlignedRowList_cb(SAM_dump_ctx_t * const ctx,TAlignedRegion const * const rgn,int options,int which,int64_t * rcount,SCol const * const IDS)2562 static rc_t DumpAlignedRowList_cb( SAM_dump_ctx_t *const ctx, TAlignedRegion const *const rgn,
2563 int options, int which, int64_t *rcount, SCol const *const IDS )
2564 {
2565 /*SAM_DUMP_DBG(2, ("row %s index range is [%lu:%lu] pos %lu\n",
2566 param->region[r].name, start, start + count - 1, cur_pos));*/
2567 switch ( which )
2568 {
2569 case primary_IDS:
2570 return DumpAlignedRowList( ctx, &ctx->pri, IDS, rcount, true, param->cg_style, false );
2571
2572 case secondary_IDS:
2573 return DumpAlignedRowList( ctx, &ctx->sec, IDS, rcount, false, param->cg_style, false );
2574
2575 case evidence_interval_IDS:
2576 {
2577 rc_t rc = 0;
2578
2579 if ( ( options & evidence_interval_IDS ) != 0 )
2580 rc = DumpAlignedRowList( ctx, &ctx->evi, IDS, rcount, true, 0, false );
2581
2582 if ( rc == 0 && ( options & evidence_alignment_IDS ) != 0 )
2583 rc = DumpAlignedRowListIndirect( ctx, &ctx->eva, &ctx->evi, IDS, rcount, true, param->cg_style );
2584
2585 return rc;
2586 }
2587 }
2588 return RC( rcExe, rcTable, rcReading, rcParam, rcUnexpected );
2589 }
2590
2591
2592 typedef struct CigOps{
2593 char op;
2594 int8_t ref_sign; /*** 0;+1;-1; ref_offset = ref_sign * offset ***/
2595 int8_t seq_sign; /*** 0;+1;-1; seq_offset = seq_sign * offset ***/
2596 uint32_t oplen;
2597 } CigOps;
2598
SetCigOp(CigOps * dst,char op,uint32_t oplen)2599 static void SetCigOp(CigOps *dst,char op,uint32_t oplen)
2600 {
2601 dst->op = op;
2602 dst->oplen = oplen;
2603 switch(op) { /*MX= DN B IS PH*/
2604 case 'M': case 'X': case '=':
2605 dst->ref_sign=+1;
2606 dst->seq_sign=+1;
2607 break;
2608 case 'D': case 'N':
2609 dst->ref_sign=+1;
2610 dst->seq_sign= 0;
2611 break;
2612 case 'B':
2613 dst->ref_sign=-1;
2614 dst->seq_sign= 0;
2615 break;
2616 case 'S': case 'I':
2617 dst->ref_sign= 0;
2618 dst->seq_sign=+1;
2619 break;
2620 case 'P': case 'H':
2621 case 0: /** terminating op **/
2622 dst->ref_sign= 0;
2623 dst->seq_sign= 0;
2624 break;
2625 default:
2626 assert(0);
2627 break;
2628 }
2629 }
2630
ExplodeCIGAR(CigOps dst[],unsigned len,char const cigar[],unsigned ciglen)2631 static unsigned ExplodeCIGAR( CigOps dst[], unsigned len, char const cigar[], unsigned ciglen )
2632 {
2633 unsigned i;
2634 unsigned j;
2635 int opLen;
2636
2637 for ( i = j = opLen=0; i < ciglen; i++) {
2638 if(isdigit(cigar[i])){
2639 opLen = opLen * 10 + cigar[i] - '0';
2640 } else {
2641 assert(isalpha(cigar[i]));
2642 SetCigOp(dst+j,cigar[i],opLen);
2643 opLen=0;
2644 j++;
2645 }
2646 }
2647 SetCigOp(dst+j,0,0);
2648 j++;
2649 return j;
2650 }
2651
2652
2653 #define CG_CIGAR_STRING_MAX (35 * 11 + 1)
2654
2655
2656 #if 0
2657 static unsigned FormatCIGAR( char dst[], unsigned cp, unsigned oplen, int opcode )
2658 {
2659 size_t sz;
2660 string_printf( dst + cp, CG_CIGAR_STRING_MAX - cp, &sz, "%u%c", oplen, opcode );
2661 return sz;
2662 }
2663
2664
2665 static unsigned DeletedCIGAR( char dst[], unsigned const cp, int opcode, unsigned D,
2666 uint32_t const Op[], int ri, unsigned len )
2667 {
2668 unsigned i;
2669 unsigned curPos = 0;
2670 int curOp = opcode;
2671 unsigned oplen = 0;
2672
2673 for ( i = 0; i < D && ri < len; ++i, ++ri )
2674 {
2675 uint32_t const op = Op[ ri ];
2676 int const d = op >> 2;
2677 int const m = op & 1;
2678 int const nxtOp = m ? opcode : 'P';
2679
2680 if ( d != 0 )
2681 {
2682 if ( oplen != 0 )
2683 curPos += FormatCIGAR( dst, curPos + cp, oplen, curOp );
2684 curPos += FormatCIGAR( dst, cp, d, 'D' );
2685 oplen = 0;
2686 }
2687 if ( oplen == 0 || nxtOp == curOp )
2688 {
2689 ++oplen;
2690 }
2691 else
2692 {
2693 curPos += FormatCIGAR( dst, curPos + cp, oplen, curOp );
2694 oplen = 1;
2695 }
2696 curOp = nxtOp;
2697 }
2698
2699 if ( oplen != 0 && curOp != opcode)
2700 {
2701 curPos += FormatCIGAR( dst, curPos + cp, oplen, curOp );
2702 oplen = 0;
2703 }
2704 if ( i < D )
2705 oplen += D - i;
2706 if ( oplen )
2707 curPos += FormatCIGAR( dst, curPos + cp, oplen, opcode );
2708 return curPos;
2709 }
2710 #endif
2711
merge_M_type_ops(char a,char b)2712 static char merge_M_type_ops(char a,char b)
2713 { /*MX=*/
2714 char c=0;
2715 switch(b){
2716 case 'X':
2717 switch(a){
2718 case '=': c='X'; break;
2719 case 'X': c='M'; break; /**we don't know - 2X may create '=' **/
2720 case 'M': c='M'; break;
2721 }
2722 break;
2723 case 'M':
2724 c='M';
2725 break;
2726 case '=':
2727 c=a;
2728 break;
2729 }
2730 assert(c!=0);
2731 return c;
2732 }
2733
2734
CombineCIGAR(char dst[],CigOps const seqOp[],unsigned seq_len,int refPos,CigOps const refOp[],unsigned ref_len)2735 static unsigned CombineCIGAR( char dst[], CigOps const seqOp[], unsigned seq_len,
2736 int refPos, CigOps const refOp[], unsigned ref_len )
2737 {
2738 bool done=false;
2739 unsigned ciglen=0,last_ciglen=0;
2740 char last_cig_op;
2741 uint32_t last_cig_oplen=0;
2742 int si=0,ri=0;
2743 CigOps seq_cop={0,0,0,0},ref_cop={0,0,0,0};
2744 int seq_pos=0; /** seq_pos is tracked roughly - with every extraction from seqOp **/
2745 int ref_pos=0; /** ref_pos is tracked precisely - with every delta and consumption in cigar **/
2746 int delta=refPos; /*** delta in relative positions of seq and ref **/
2747 /*** when delta < 0 - rewind or extend the reference ***/
2748 /*** wher delta > 0 - skip reference ***/
2749 #define MACRO_BUILD_CIGAR(OP,OPLEN) \
2750 if( last_cig_oplen > 0 && last_cig_op == OP){ \
2751 last_cig_oplen += OPLEN; \
2752 ciglen = last_ciglen + sprintf(dst+last_ciglen,"%d%c",last_cig_oplen,last_cig_op); \
2753 } else { \
2754 last_ciglen = ciglen; \
2755 last_cig_oplen = OPLEN; \
2756 last_cig_op = OP; \
2757 ciglen = ciglen + sprintf(dst+last_ciglen,"%d%c",last_cig_oplen,last_cig_op); \
2758 }
2759 while(!done){
2760 while(delta < 0){
2761 ref_pos += delta; /** we will make it to back up this way **/
2762 if(ri > 0){ /** backing up on ref if possible ***/
2763 int avail_oplen = refOp[ri-1].oplen - ref_cop.oplen;
2764 if(avail_oplen > 0 ){
2765 if((-delta) <= avail_oplen * ref_cop.ref_sign){ /*** rewind within last operation **/
2766 ref_cop.oplen -= delta;
2767 delta -= delta * ref_cop.ref_sign;
2768 } else { /*** rewind the whole ***/
2769 ref_cop.oplen += avail_oplen;
2770 delta += avail_oplen * ref_cop.ref_sign;
2771 }
2772 } else {
2773 ri--;
2774 /** pick the previous as used up **/
2775 ref_cop = refOp[ri-1];
2776 ref_cop.oplen =0;
2777 }
2778 } else { /** extending the reference **/
2779 SetCigOp(&ref_cop,'=',ref_cop.oplen - delta);
2780 delta = 0;
2781 }
2782 ref_pos -= delta;
2783 }
2784 if(ref_cop.oplen == 0){ /*** advance the reference ***/
2785 ref_cop = refOp[ri++];
2786 if(ref_cop.oplen == 0) { /** extending beyond the reference **/
2787 SetCigOp(&ref_cop,'=',1000);
2788 }
2789 assert(ref_cop.oplen > 0 );
2790 }
2791 if(delta > 0){ /*** skip refOps ***/
2792 ref_pos += delta; /** may need to back up **/
2793 if(delta >= ref_cop.oplen){ /** full **/
2794 delta -= ref_cop.oplen * ref_cop.ref_sign;
2795 ref_cop.oplen = 0;
2796 } else { /** partial **/
2797 ref_cop.oplen -= delta;
2798 delta -= delta * ref_cop.ref_sign;
2799 }
2800 ref_pos -= delta; /** if something left - restore ***/
2801 continue;
2802 }
2803 /*** seq and ref should be synchronized here **/
2804 assert(delta == 0);
2805 if(seq_cop.oplen == 0){ /*** advance sequence ***/
2806 if(seq_pos < seq_len){
2807 seq_cop = seqOp[si++];
2808 assert(seq_cop.oplen > 0 );
2809 seq_pos += seq_cop.oplen * seq_cop.seq_sign;
2810 } else {
2811 done=true;
2812 }
2813 }
2814 if(!done){
2815 int seq_seq_step = seq_cop.oplen * seq_cop.seq_sign; /** sequence movement**/
2816 int seq_ref_step = seq_cop.oplen * seq_cop.ref_sign; /** influence of sequence movement on intermediate reference **/
2817 int ref_seq_step = ref_cop.oplen * ref_cop.seq_sign; /** movement of the intermediate reference ***/
2818 int ref_ref_step = ref_cop.oplen * ref_cop.ref_sign; /** influence of the intermediate reference movement on final reference ***/
2819 assert( ref_ref_step >= 0); /** no B in the reference **/
2820 if( seq_ref_step <= 0){ /** BSIPH in the sequence against anything ***/
2821 MACRO_BUILD_CIGAR( seq_cop.op, seq_cop.oplen );
2822 seq_cop.oplen = 0;
2823 delta = seq_ref_step; /** if negative - will force rewind next cycle **/
2824 } else if(ref_ref_step <= 0){ /** MX=DN against SIPH in the reference***/
2825 if(ref_seq_step == 0){ /** MX=DN against PH **/
2826 MACRO_BUILD_CIGAR( ref_cop.op,ref_cop.oplen);
2827 ref_cop.oplen = 0;
2828 } else {
2829 int min_len = (seq_cop.oplen<ref_cop.oplen)?seq_cop.oplen:ref_cop.oplen;
2830 if(seq_seq_step == 0){ /** DN agains SI **/
2831 MACRO_BUILD_CIGAR('P',min_len);
2832 } else { /** MX= agains SI ***/
2833 MACRO_BUILD_CIGAR( ref_cop.op,min_len);
2834 }
2835 seq_cop.oplen -= min_len;
2836 ref_cop.oplen -= min_len;
2837 }
2838 } else { /*MX=DN against MX=DN*/
2839 int min_len = (seq_cop.oplen<ref_cop.oplen)?seq_cop.oplen:ref_cop.oplen;
2840 if(seq_seq_step == 0){ /* DN against MX=DN */
2841 if(ref_seq_step == 0){ /** padding DN against DN **/
2842 MACRO_BUILD_CIGAR('P',min_len);
2843 ref_cop.oplen -= min_len;
2844 seq_cop.oplen -= min_len;
2845 } else { /* DN against MX= **/
2846 MACRO_BUILD_CIGAR(seq_cop.op,min_len);
2847 seq_cop.oplen -= min_len;
2848 }
2849 } else if (ref_cop.seq_sign == 0){ /* MX= against DN - always wins */
2850 MACRO_BUILD_CIGAR(ref_cop.op,min_len);
2851 ref_cop.oplen -= min_len;
2852 } else { /** MX= against MX= ***/
2853 MACRO_BUILD_CIGAR(merge_M_type_ops(seq_cop.op,ref_cop.op),min_len);
2854 ref_cop.oplen -= min_len;
2855 seq_cop.oplen -= min_len;
2856 }
2857 ref_pos += min_len;
2858 }
2859 }
2860 }
2861 return ciglen;
2862 }
2863
2864
DumpCGSAM(SAM_dump_ctx_t * const ctx,TAlignedRegion const * const rgn,int options,int which,int64_t * rows,SCol const * const ids)2865 static rc_t DumpCGSAM( SAM_dump_ctx_t *const ctx, TAlignedRegion const *const rgn, int options,
2866 int which, int64_t *rows, SCol const *const ids )
2867 {
2868 if ( which == evidence_interval_IDS )
2869 {
2870 rc_t rc = 0;
2871 unsigned i;
2872
2873 for ( i = 0; i < ids->len; ++i )
2874 {
2875 int64_t const rowInterval = ids->base.i64[ i ];
2876
2877 rc = Cursor_Read( &ctx->evi, rowInterval, alg_REGION_FILTER, ~(0u) );
2878 if ( rc == 0 )
2879 {
2880 if ( AlignRegionFilter( ctx->evi.cols ) )
2881 {
2882 rc = Cursor_ReadAlign( &ctx->evi.curs, rowInterval, ctx->evi.cols, 0 );
2883 if ( rc == 0 )
2884 {
2885 unsigned const evidence = ctx->evi.cols[ alg_EVIDENCE_ALIGNMENT_IDS ].len;
2886 if ( evidence != 0 )
2887 {
2888 INSDC_coord_len const *const refLen = ctx->evi.cols[ alg_READ_LEN ].base.coord_len;
2889 INSDC_coord_len const *const cigLen = ctx->evi.cols[ alg_CIGAR_LEN ].base.coord_len;
2890 unsigned const ploids = ctx->evi.cols[ alg_CIGAR_LEN ].len;
2891 unsigned const totalIntervalReadLen = ctx->evi.cols[ alg_READ ].len;
2892 CigOps refCigOps_stack[1024];
2893 CigOps *refCigOps_heap=NULL;
2894 CigOps *refCigOps;
2895
2896 if( totalIntervalReadLen > 1000){
2897 refCigOps=refCigOps_heap = malloc( sizeof( refCigOps[ 0 ] ) * (totalIntervalReadLen+ploids) );
2898 } else {
2899 refCigOps=refCigOps_stack;
2900 }
2901
2902 if ( refCigOps != NULL )
2903 {
2904 unsigned j;
2905 unsigned start;
2906 unsigned cigop_starts[256];
2907
2908 assert(ploids < 256);
2909
2910 for ( start = j = 0, cigop_starts[0]=0; j < ploids; ++j )
2911 {
2912 unsigned len = cigLen[ j ];
2913 cigop_starts[j+1] = cigop_starts[j] + ExplodeCIGAR( refCigOps + cigop_starts[j], refLen[ j ],
2914 ctx->evi.cols[ alg_CIGAR ].base.str + start, len );
2915 start += len;
2916 }
2917
2918 for ( j = 0; j < evidence; ++j )
2919 {
2920 int64_t const rowAlign = ctx->evi.cols[ alg_EVIDENCE_ALIGNMENT_IDS ].base.i64[ j ];
2921
2922 rc = Cursor_ReadAlign( &ctx->eva.curs, rowAlign, ctx->eva.cols, 0 );
2923 if ( rc == 0 )
2924 {
2925 if(param->cg_style != 0)
2926 rc = GenerateCGData( ctx->eva.cols, param->cg_style );
2927 if ( rc == 0 )
2928 {
2929 int const ploidy = ctx->eva.cols[ alg_REF_PLOIDY ].base.u32[ 0 ];
2930 int const readLen = ctx->eva.cols[ alg_READ ].len;
2931 INSDC_coord_zero refPos = ctx->eva.cols[ alg_REF_POS ].base.coord0[ 0 ];
2932 CigOps op[ 36 ];
2933 char cigbuf[ CG_CIGAR_STRING_MAX ];
2934
2935 memset( cigbuf, 0, CG_CIGAR_STRING_MAX );
2936 ExplodeCIGAR( op, readLen, ctx->eva.cols[ alg_CIGAR ].base.str,
2937 ctx->eva.cols[ alg_CIGAR ].len );
2938 ctx->eva.cols[ alg_CIGAR ].len = CombineCIGAR( cigbuf, op, readLen, refPos,
2939 refCigOps + cigop_starts[ ploidy - 1 ], refLen[ ploidy - 1 ] );
2940 ctx->eva.cols[ alg_CIGAR ].base.str = cigbuf;
2941 ctx->eva.cols[ alg_REF_POS ].base.v = &refPos;
2942 refPos += ctx->evi.cols[ alg_REF_POS ].base.coord0[ 0 ] ;
2943 if(refPos < 0){
2944 ReferenceObj const *r = NULL;
2945 rc = ReferenceList_Find( gRefList, &r,
2946 ctx->evi.cols[ alg_REF_NAME ].base.str,
2947 ctx->evi.cols[ alg_REF_NAME ].len );
2948 if(rc == 0){
2949 bool circular=false;
2950 rc=ReferenceObj_Circular(r, &circular);
2951 if(rc == 0 && circular){
2952 INSDC_coord_len len;
2953 rc=ReferenceObj_SeqLength(r,&len);
2954 if(rc == 0)
2955 refPos += len;
2956 }
2957 ReferenceObj_Release(r);
2958 }
2959 }
2960 rc = DumpAlignedSAM( ctx, &ctx->eva, rowAlign, ctx->readGroup, 1 );
2961 ++*rows;
2962 }
2963 else
2964 break;
2965 }
2966 else
2967 break;
2968 }
2969 if(refCigOps_heap) free( refCigOps_heap );
2970 }
2971 else
2972 rc = RC( rcExe, rcTable, rcReading, rcMemory, rcExhausted );
2973 }
2974 }
2975 }
2976 }
2977 else
2978 break;
2979 rc = rc ? rc : Quitting();
2980 if ( rc != 0 )
2981 break;
2982 }
2983 return rc;
2984 }
2985 return RC( rcExe, rcTable, rcReading, rcParam, rcUnexpected );
2986 }
2987
2988
ForEachAlignedRegion(SAM_dump_ctx_t * const ctx,enum e_IDS_opts const Options,rc_t (* user_func)(SAM_dump_ctx_t * ctx,TAlignedRegion const * rgn,int options,int which,int64_t * rows,SCol const * IDS))2989 static rc_t ForEachAlignedRegion( SAM_dump_ctx_t *const ctx, enum e_IDS_opts const Options,
2990 rc_t ( *user_func )( SAM_dump_ctx_t *ctx, TAlignedRegion const *rgn, int options, int which, int64_t *rows, SCol const *IDS ) )
2991 {
2992 SCol cols[] =
2993 {
2994 { "MAX_SEQ_LEN", 0, { NULL }, 0, false },
2995 { "OVERLAP_REF_POS", 0, { NULL }, 0, true },
2996 { "PRIMARY_ALIGNMENT_IDS", 0, { NULL }, 0, false },
2997 { "SECONDARY_ALIGNMENT_IDS", 0, { NULL }, 0, true },
2998 { "EVIDENCE_INTERVAL_IDS", 0, { NULL }, 0, true },
2999 { NULL, 0, { NULL }, 0, false }
3000 };
3001 enum eref_col
3002 {
3003 ref_MAX_SEQ_LEN = 0,
3004 ref_OVERLAP_REF_LEN,
3005 ref_PRIMARY_ALIGNMENT_IDS,
3006 ref_SECONDARY_ALIGNMENT_IDS,
3007 ref_EVIDENCE_INTERVAL_IDS
3008 };
3009
3010 KIndex const *iname = NULL;
3011 int64_t rows = 0;
3012 rc_t rc = 0;
3013 int options = Options & 7;
3014
3015 if ( Options & evidence_alignment_IDS )
3016 options |= evidence_interval_IDS;
3017
3018 if ( ( options & primary_IDS ) == 0 )
3019 cols[ ref_PRIMARY_ALIGNMENT_IDS ].name = "";
3020 if ( ( options & secondary_IDS ) == 0 )
3021 cols[ ref_SECONDARY_ALIGNMENT_IDS ].name = "";
3022 if ( ( options & evidence_interval_IDS ) == 0 )
3023 cols[ ref_EVIDENCE_INTERVAL_IDS ].name = "";
3024
3025 ctx->ref.cols = cols;
3026
3027 rc = VTableOpenIndexRead( ctx->ref.tbl.vtbl, &iname, "i_name" );
3028 if ( rc == 0 )
3029 {
3030 rc = Cursor_Open( &ctx->ref.tbl, &ctx->ref.curs, ctx->ref.cols, NULL );
3031 if ( rc == 0 )
3032 {
3033 int64_t rowid = 1;
3034 uint64_t count = 0;
3035 char refname[ 1024 ];
3036 size_t sz;
3037
3038 if ( ctx->ref.cols[ ref_PRIMARY_ALIGNMENT_IDS ].idx == 0 )
3039 options &= ~primary_IDS;
3040 if ( ctx->ref.cols[ ref_SECONDARY_ALIGNMENT_IDS ].idx == 0 )
3041 options &= ~secondary_IDS;
3042 if ( ctx->ref.cols[ ref_EVIDENCE_INTERVAL_IDS ].idx == 0 )
3043 options &= ~evidence_interval_IDS;
3044
3045 /* new: if we have a list of regions, reset the their print-flag */
3046 if ( param->region_qty > 0 )
3047 {
3048 unsigned r;
3049 for ( r = 0; r < param->region_qty; ++r )
3050 {
3051 param->region[ r ].printed = 0;
3052 }
3053 }
3054
3055 while ( ( rc = KIndexProjectText(iname, rowid + count, &rowid, &count, refname, sizeof( refname ), &sz ) ) == 0 )
3056 {
3057 bool include;
3058 unsigned r;
3059 unsigned max_to = UINT_MAX;
3060 unsigned min_from = 0;
3061
3062 for ( include = false, r = 0; r < param->region_qty; ++r )
3063 {
3064 if ( sz == string_size( param->region[ r ].name ) &&
3065 memcmp( param->region[ r ].name, refname, sz ) == 0 )
3066 {
3067 include = true;
3068 max_to = param->region[ r ].max_to;
3069 min_from = param->region[ r ].min_from;
3070 param->region[ r ].printed++; /* new: mark a region as printed */
3071 break;
3072 }
3073 }
3074
3075 if ( param->region_qty == 0 || include )
3076 {
3077 int64_t const endrow_exclusive = rowid + count;
3078 unsigned m;
3079 unsigned k;
3080
3081 for ( k = 0, m = 1; rc == 0 && k < 3; ++k, m <<= 1 )
3082 {
3083 if ( m & options )
3084 {
3085 unsigned lookback = 0;
3086 int64_t row;
3087 int32_t row_start_offset;
3088 unsigned pos;
3089 unsigned const maxseqlen = 5000; /***** TODO: this code is to be rewritten anyway ****/
3090
3091 if ( param->region_qty )
3092 {
3093 if ( ctx->ref.cols[ ref_OVERLAP_REF_LEN ].idx )
3094 {
3095 rc = Cursor_Read( &ctx->ref, rowid, 0, ~(0u) );
3096 if ( rc != 0 )
3097 break;
3098 if ( ctx->ref.cols[ ref_OVERLAP_REF_LEN ].len > k )
3099 {
3100 unsigned const overlap = ctx->ref.cols[ ref_OVERLAP_REF_LEN ].base.coord0[ k ];
3101
3102 if ( overlap != 0 )
3103 {
3104
3105 assert( overlap < rowid * maxseqlen );
3106 lookback = ( rowid - ( overlap / maxseqlen ) );
3107 }
3108 }
3109 }
3110 else
3111 lookback = 1;
3112 }
3113 row_start_offset = min_from/maxseqlen - lookback;
3114 if(row_start_offset < 0) row_start_offset=0;
3115
3116
3117 for ( pos = row_start_offset*maxseqlen, row = rowid + row_start_offset; rc == 0 && row < endrow_exclusive; ++row )
3118 {
3119 if ( row < 1 )
3120 {
3121 row = 0;
3122 continue;
3123 }
3124 rc = Cursor_Read( &ctx->ref, row, 0, ~(0u) );
3125 if ( rc != 0 )
3126 break;
3127 if ( ctx->ref.cols[ ref_PRIMARY_ALIGNMENT_IDS + k ].len > 0 )
3128 rc = user_func( ctx, ¶m->region[r], Options, m, &rows,
3129 &ctx->ref.cols[ ref_PRIMARY_ALIGNMENT_IDS + k ] );
3130 pos += ctx->ref.cols[ ref_MAX_SEQ_LEN ].base.u32[ 0 ];
3131 if ( pos >= max_to )
3132 break;
3133 if ( param->test_rows != 0 && rows > param->test_rows )
3134 break;
3135 }
3136 }
3137 }
3138 }
3139 if ( param->test_rows != 0 && rows > param->test_rows )
3140 break;
3141 if ( rc != 0 )
3142 break;
3143 } /* while */
3144
3145 /* new: walk the list of regions and report what was NOT printed ( found ) ... */
3146 if ( param->region_qty > 0 && GetRCState( rc ) != rcCanceled )
3147 {
3148 unsigned r;
3149 for ( r = 0; r < param->region_qty; ++r )
3150 {
3151 if ( param->region[ r ].printed == 0 )
3152 {
3153 (void)PLOGMSG( klogWarn, ( klogWarn, "reference >$(a)< not found", "a=%s", param->region[ r ].name ) );
3154 }
3155 }
3156 }
3157
3158 if ( GetRCObject( rc ) == rcId && GetRCState( rc ) == rcNotFound )
3159 {
3160 rc = 0;
3161 }
3162
3163 }
3164 }
3165 Cursor_Close( &ctx->ref.curs );
3166 KIndexRelease( iname );
3167
3168 return rc;
3169 }
3170
3171
DumpAlignedTable(SAM_dump_ctx_t * const ctx,DataSource * const ds,bool const primary,int const cg_style,unsigned * const rcount)3172 static rc_t DumpAlignedTable( SAM_dump_ctx_t *const ctx, DataSource *const ds,
3173 bool const primary, int const cg_style, unsigned *const rcount )
3174 {
3175 unsigned i;
3176 int64_t start;
3177 uint64_t count;
3178 rc_t rc = VCursorIdRange( ds->curs.vcurs, 0, &start, &count );
3179
3180 if ( rc != 0 )
3181 return rc;
3182 for ( i = 0; i != (unsigned)count; ++i )
3183 {
3184 if ( DumpAlignedRow(ctx, ds, start + i, primary, cg_style, &rc ) )
3185 ++*rcount;
3186 if ( rc != 0 || ( rc = Quitting() ) != 0 )
3187 return rc;
3188 if ( param->test_rows && *rcount > param->test_rows )
3189 break;
3190 }
3191 return 0;
3192 }
3193
3194
DumpUnsorted(SAM_dump_ctx_t * const ctx)3195 static rc_t DumpUnsorted( SAM_dump_ctx_t *const ctx )
3196 {
3197 rc_t rc = 0;
3198 unsigned rcount;
3199
3200 if ( rc == 0 && param->cg_evidence )
3201 {
3202 SAM_DUMP_DBG( 2, ( "%s EVIDENCE_INTERVAL\n", ctx->accession ) );
3203 rcount = 0;
3204 rc = DumpAlignedTable( ctx, &ctx->evi, false, 0, &rcount );
3205 (void)PLOGMSG( klogInfo, ( klogInfo, "$(a): $(c) allele sequences", "a=%s,c=%lu", ctx->accession, rcount ) );
3206 }
3207 if ( rc == 0 && param->cg_ev_dnb )
3208 {
3209 SAM_DUMP_DBG( 2, ( "%s EVIDENCE_ALIGNMENT\n", ctx->accession ) );
3210 rcount = 0;
3211 rc = DumpAlignedTable( ctx, &ctx->eva, false, param->cg_style, &rcount );
3212 (void)PLOGMSG( klogInfo, ( klogInfo, "$(a): $(c) support sequences", "a=%s,c=%lu", ctx->accession, rcount ) );
3213 }
3214 if ( rc == 0 && ctx->pri.curs.vcurs )
3215 {
3216 SAM_DUMP_DBG( 2, ( "%s PRIMARY_ALIGNMENT\n", ctx->accession ) );
3217 rcount = 0;
3218 rc = DumpAlignedTable( ctx, &ctx->pri, true, param->cg_style, &rcount );
3219 (void)PLOGMSG( klogInfo, ( klogInfo, "$(a): $(c) primary sequences", "a=%s,c=%lu", ctx->accession, rcount ) );
3220 }
3221 if ( rc == 0 && ctx->sec.curs.vcurs )
3222 {
3223 SAM_DUMP_DBG( 2, ( "%s SECONDARY_ALIGNMENT\n", ctx->accession ) );
3224 rcount = 0;
3225 rc = DumpAlignedTable( ctx, &ctx->sec, false, param->cg_style, &rcount );
3226 (void)PLOGMSG( klogInfo, ( klogInfo, "$(a): $(c) secondary sequences", "a=%s,c=%lu", ctx->accession, rcount ) );
3227 }
3228 return rc;
3229 }
3230
3231
DumpUnaligned(SAM_dump_ctx_t * const ctx,bool const aligned)3232 static rc_t DumpUnaligned( SAM_dump_ctx_t *const ctx, bool const aligned )
3233 {
3234 rc_t rc = 0;
3235 int64_t start = 0;
3236 uint64_t count = 0;
3237
3238 rc = VCursorIdRange( ctx->seq.curs.vcurs, 0, &start, &count );
3239 if ( rc == 0 )
3240 {
3241 uint64_t rcount = 0;
3242 if ( param->test_rows != 0 && count > param->test_rows )
3243 {
3244 count = param->test_rows;
3245 }
3246 SAM_DUMP_DBG( 2, ( "%s SEQUENCE table range from %ld qty %lu\n", ctx->accession, start, count ) );
3247 while ( count > 0 && rc == 0 )
3248 {
3249 uint32_t i;
3250
3251 if ( ctx->seq.cols[ seq_PRIMARY_ALIGNMENT_ID ].idx == 0)
3252 {
3253 rc = DumpUnalignedReads( ctx, NULL, start, &rcount );
3254 }
3255 else
3256 {
3257 /* avoid reading whole sequence cursor data unnecessarily */
3258 rc = Cursor_Read( &ctx->seq, start, seq_PRIMARY_ALIGNMENT_ID, ~(unsigned)0 );
3259 if ( rc == 0 )
3260 {
3261 /* find if its completely unaligned */
3262 int64_t min_prim_id = 0;
3263 bool has_unaligned = false;
3264 for ( i = 0; rc == 0 && i < ctx->seq.cols[ seq_PRIMARY_ALIGNMENT_ID ].len; i++ )
3265 {
3266 int64_t x = ctx->seq.cols[ seq_PRIMARY_ALIGNMENT_ID ].base.i64[ i ];
3267 has_unaligned |= x == 0;
3268 if ( ( min_prim_id == 0 && x != 0 ) || min_prim_id < x )
3269 {
3270 min_prim_id = x;
3271 }
3272 }
3273 if ( min_prim_id == 0 )
3274 {
3275 rc = DumpUnalignedReads( ctx, NULL, start, &rcount );
3276 }
3277 else if ( has_unaligned && !param->unaligned_spots )
3278 {
3279 if ( rc == 0 )
3280 {
3281 #if USE_MATE_CACHE
3282 uint64_t val;
3283 rc = Cache_Get( &ctx->pri.curs, min_prim_id, &val );
3284 if ( rc == 0 )
3285 {
3286 ctx->pri.cols[ alg_REF_POS ].len = 0;
3287 Cache_Unpack( val, 0, &ctx->pri.curs, ctx->pri.cols );
3288 ctx->pri.cols[ alg_SEQ_SPOT_ID ].base.i64 = &start;
3289 ctx->pri.cols[ alg_SEQ_SPOT_ID ].len = 1;
3290 memmove( &ctx->pri.cols[ alg_REF_NAME ], &ctx->pri.cols[ alg_MATE_REF_NAME ], sizeof( SCol ) );
3291 memmove( &ctx->pri.cols[ alg_REF_SEQ_ID ], &ctx->pri.cols[ alg_MATE_REF_NAME ], sizeof( SCol ) );
3292 memmove( &ctx->pri.cols[ alg_REF_POS ], &ctx->pri.cols[ alg_MATE_REF_POS ], sizeof( SCol ) );
3293 }
3294 else if ( !( GetRCState( rc ) == rcNotFound && GetRCObject( rc ) == rcItem ) )
3295 {
3296 break;
3297 }
3298 else
3299 {
3300 #endif /* USE_MATE_CACHE */
3301 rc = Cursor_ReadAlign( &ctx->pri.curs, min_prim_id, ctx->pri.cols, alg_MATE_ALIGN_ID );
3302 #if USE_MATE_CACHE
3303 }
3304 #endif /* USE_MATE_CACHE */
3305 rc = rc ? rc : DumpUnalignedReads( ctx, ctx->pri.cols, min_prim_id, &rcount );
3306 }
3307 }
3308 } else if ( GetRCState( rc ) == rcNotFound && GetRCObject( rc ) == rcRow )
3309 {
3310 rc = 0;
3311 }
3312 }
3313 start++;
3314 count--;
3315 rc = rc ? rc : Quitting();
3316 }
3317 (void)PLOGMSG( klogInfo, ( klogInfo, "$(a): $(c) unaligned sequences", "a=%s,c=%lu", ctx->accession, rcount ) );
3318 }
3319 return rc;
3320 }
3321
3322 #if USE_MATE_CACHE
FlushUnalignedRead_cb(uint64_t key,bool value,void * user_data)3323 static rc_t FlushUnalignedRead_cb (uint64_t key, bool value,void *user_data)
3324 {
3325 rc_t rc;
3326 int64_t spot_id = (int64_t) key;
3327 int64_t aligned_mate_id = (int64_t) value;
3328 SAM_dump_ctx_t *const ctx = user_data;
3329 rc = Cursor_Read( &ctx->seq, spot_id, seq_PRIMARY_ALIGNMENT_ID, ~(unsigned)0 );
3330 if ( rc == 0 ) {
3331 uint64_t rcount;
3332 uint64_t val;
3333 assert(ctx->seq.cols[ seq_PRIMARY_ALIGNMENT_ID ].len == 2);
3334 if(ctx->seq.cols[ seq_PRIMARY_ALIGNMENT_ID ].base.i64[0]==0){
3335 aligned_mate_id = ctx->seq.cols[ seq_PRIMARY_ALIGNMENT_ID ].base.i64[1];
3336 assert(aligned_mate_id != 0);
3337 } else if(ctx->seq.cols[ seq_PRIMARY_ALIGNMENT_ID ].base.i64[1] == 0){
3338 aligned_mate_id = ctx->seq.cols[ seq_PRIMARY_ALIGNMENT_ID ].base.i64[0];
3339 assert(aligned_mate_id != 0);
3340 } else {
3341 assert(0);
3342 }
3343 rc = Cache_Get( &ctx->pri.curs, aligned_mate_id, &val );
3344 if ( rc == 0 ) {
3345 ctx->pri.cols[ alg_REF_POS ].len = 0;
3346 Cache_Unpack( val, 0, &ctx->pri.curs, ctx->pri.cols );
3347 ctx->pri.cols[ alg_SEQ_SPOT_ID ].base.i64 = &spot_id;
3348 ctx->pri.cols[ alg_SEQ_SPOT_ID ].len = 1;
3349 memmove( &ctx->pri.cols[ alg_REF_NAME ], &ctx->pri.cols[ alg_MATE_REF_NAME ], sizeof( SCol ) );
3350 memmove( &ctx->pri.cols[ alg_REF_SEQ_ID ], &ctx->pri.cols[ alg_MATE_REF_NAME ], sizeof( SCol ) );
3351 memmove( &ctx->pri.cols[ alg_REF_POS ], &ctx->pri.cols[ alg_MATE_REF_POS ], sizeof( SCol ) );
3352 rc =DumpUnalignedReads( ctx, ctx->pri.cols, aligned_mate_id, &rcount );
3353 }
3354 }
3355 return rc;
3356 }
FlushUnaligned(SAM_dump_ctx_t * const ctx,const SCursCache * cache)3357 static rc_t FlushUnaligned( SAM_dump_ctx_t *const ctx, const SCursCache * cache )
3358 {
3359 return KVectorVisitBool (cache->cache_unaligned_mate, false, FlushUnalignedRead_cb, ctx );
3360 }
3361 #endif
3362
DumpHeader(SAM_dump_ctx_t const * ctx)3363 static rc_t DumpHeader( SAM_dump_ctx_t const *ctx )
3364 {
3365 bool reheader = param->reheader;
3366 rc_t rc = 0;
3367
3368 if ( !reheader )
3369 {
3370 if ( ctx->db )
3371 {
3372 /* grab header from db meta node */
3373 KMetadata const *meta;
3374
3375 rc = VDatabaseOpenMetadataRead( ctx->db, &meta );
3376 if ( rc == 0 )
3377 {
3378 KMDataNode const *node;
3379
3380 rc = KMetadataOpenNodeRead( meta, &node, "BAM_HEADER" );
3381 KMetadataRelease( meta );
3382
3383 if ( rc == 0 )
3384 {
3385 size_t offset = 0;
3386
3387 for ( offset = 0; ; )
3388 {
3389 size_t remain;
3390 size_t nread;
3391 char buffer[ 4096 ];
3392
3393 rc = KMDataNodeRead( node, offset, buffer, sizeof(buffer), &nread, &remain );
3394 if ( rc != 0 )
3395 break;
3396 BufferedWriter( NULL, buffer, nread, NULL );
3397 if ( remain == 0 )
3398 {
3399 if ( buffer[nread - 1] != '\n' )
3400 {
3401 buffer[ 0 ] = '\n';
3402 rc = BufferedWriter( NULL, buffer, 1, NULL );
3403 }
3404 break;
3405 }
3406 offset += nread;
3407 }
3408 KMDataNodeRelease(node);
3409 }
3410 else if ( GetRCState( rc ) == rcNotFound )
3411 {
3412 reheader = true;
3413 rc = 0;
3414 }
3415 }
3416 }
3417 else
3418 {
3419 reheader = true;
3420 rc = 0;
3421 }
3422 }
3423
3424 if ( rc == 0 && reheader )
3425 {
3426 rc = KOutMsg( "@HD\tVN:1.3\n" );
3427 if ( rc == 0 && gRefList )
3428 rc = RefSeqPrint();
3429 if ( rc == 0 && ctx->readGroup )
3430 rc = ReadGroup_print( ctx->readGroup );
3431 if ( rc == 0 && ctx->seq.tbl.vtbl )
3432 rc = DumpReadGroups( &ctx->seq.tbl );
3433 }
3434
3435 if ( rc == 0 && param->comments )
3436 {
3437 unsigned i;
3438
3439 for( i = 0; rc == 0 && param->comments[ i ]; ++i )
3440 rc = KOutMsg( "@CO\t%s\n", param->comments[ i ] );
3441 }
3442 return rc;
3443 }
3444
3445
DumpDB(SAM_dump_ctx_t * const ctx)3446 static rc_t DumpDB( SAM_dump_ctx_t *const ctx )
3447 {
3448 rc_t rc = 0;
3449
3450 if ( ctx->ref.tbl.vtbl != NULL )
3451 rc = ReferenceList_MakeTable( &gRefList, ctx->ref.tbl.vtbl, 0, CURSOR_CACHE, NULL, 0 );
3452 if ( !param->noheader )
3453 rc = DumpHeader( ctx );
3454 if ( rc == 0 )
3455 {
3456 if ( param->region_qty ){
3457 rc = ForEachAlignedRegion( ctx
3458 , ( param->primaries ? primary_IDS : 0 )
3459 | ( param->secondaries ? secondary_IDS : 0 )
3460 | ( param->cg_evidence ? evidence_interval_IDS : 0 )
3461 | ( param->cg_ev_dnb ? evidence_alignment_IDS : 0 )
3462 , DumpAlignedRowList_cb );
3463 #if USE_MATE_CACHE
3464 if ( rc == 0 && param->unaligned ){
3465 rc = FlushUnaligned( ctx,ctx->pri.curs.cache);
3466 }
3467 #endif
3468 }
3469 if ( param->cg_sam )
3470 rc = ForEachAlignedRegion( ctx, evidence_interval_IDS, DumpCGSAM );
3471
3472 if ( param->region_qty == 0 )
3473 {
3474 rc = DumpUnsorted( ctx );
3475 if ( rc == 0 && param->unaligned )
3476 rc = DumpUnaligned( ctx, ctx->pri.tbl.vtbl != NULL );
3477 }
3478 }
3479 ReferenceList_Release( gRefList );
3480 return rc;
3481 }
3482
3483
DumpTable(SAM_dump_ctx_t * ctx)3484 static rc_t DumpTable( SAM_dump_ctx_t *ctx )
3485 {
3486 rc_t rc = 0;
3487
3488 if ( !param->noheader )
3489 rc = DumpHeader( ctx );
3490 if ( rc == 0 )
3491 rc = DumpUnaligned( ctx, false );
3492 return rc;
3493 }
3494
3495
ProcessTable(VDBManager const * mgr,char const fullPath[],char const accession[],char const readGroup[])3496 static rc_t ProcessTable( VDBManager const *mgr, char const fullPath[],
3497 char const accession[], char const readGroup[] )
3498 {
3499 VTable const *tbl;
3500 rc_t rc = VDBManagerOpenTableRead( mgr, &tbl, 0, "%s", fullPath );
3501
3502 if ( rc != 0 )
3503 {
3504 VSchema *schema;
3505
3506 rc = VDBManagerMakeSRASchema( mgr, &schema );
3507 if ( rc == 0 )
3508 {
3509 rc = VDBManagerOpenTableRead( mgr, &tbl, schema, "%s", fullPath );
3510 VSchemaRelease( schema );
3511 }
3512 }
3513
3514 if ( rc == 0 )
3515 {
3516 SAM_dump_ctx_t ctx;
3517 SCol seq_cols[ sizeof( gSeqCol ) / sizeof( gSeqCol[ 0 ] ) ];
3518
3519 memset( &ctx, 0, sizeof( ctx ) );
3520
3521 ctx.fullPath = fullPath;
3522 ctx.accession = accession;
3523 ctx.readGroup = readGroup;
3524
3525 DATASOURCE_INIT( ctx.seq, accession );
3526 ctx.seq.tbl.vtbl = tbl;
3527 ctx.seq.type = edstt_Sequence;
3528
3529 ReportResetTable( fullPath, tbl );
3530
3531 ctx.seq.cols = seq_cols;
3532 memmove( ctx.seq.cols, gSeqCol, sizeof( gSeqCol ) );
3533 rc = Cursor_Open( &ctx.seq.tbl, &ctx.seq.curs, ctx.seq.cols, NULL );
3534 if ( rc == 0 ) {
3535 rc = DumpTable( &ctx );
3536 Cursor_Close(&ctx.seq.curs);
3537 }
3538 VTableRelease( tbl );
3539 }
3540 return rc;
3541 }
3542
3543
SetupColumns(DataSource * ds,enum eDSTableType type)3544 static void SetupColumns( DataSource *ds, enum eDSTableType type )
3545 {
3546 memmove( ds->cols, g_alg_col_tmpl, sizeof( g_alg_col_tmpl ) );
3547
3548 ds->type = type;
3549 if ( param->use_seqid )
3550 ds->cols[ alg_MATE_REF_NAME ].name = "MATE_REF_SEQ_ID";
3551 else
3552 ds->cols[ alg_MATE_REF_NAME ].name = "MATE_REF_NAME";
3553
3554 if ( param->fasta || param->fastq )
3555 ds->cols[ alg_READ ].name = "RAW_READ";
3556 else if ( param->hide_identical )
3557 ds->cols[ alg_READ ].name = "MISMATCH_READ";
3558 else
3559 ds->cols[ alg_READ ].name = "READ";
3560
3561 if ( param->long_cigar )
3562 {
3563 ds->cols[ alg_CIGAR ].name = "CIGAR_LONG";
3564 ds->cols[ alg_CIGAR_LEN ].name = "CIGAR_LONG_LEN";
3565 }
3566 else
3567 {
3568 ds->cols[ alg_CIGAR ].name = "CIGAR_SHORT";
3569 ds->cols[ alg_CIGAR_LEN ].name = "CIGAR_SHORT_LEN";
3570 }
3571
3572 if(param->qualQuant && param->qualQuantSingle){ /** we don't really need quality ***/
3573 ds->cols[ alg_SAM_QUALITY ].name = "";
3574 }
3575
3576 switch ( type )
3577 {
3578 case edstt_EvidenceInterval:
3579
3580 ds->cols[ alg_SEQ_SPOT_ID ].name = "";
3581 ds->cols[ alg_SEQ_NAME ].name = "";
3582 ds->cols[ alg_SAM_QUALITY ].name = "";
3583 ds->cols[ alg_SPOT_GROUP ].name = "";
3584 ds->cols[ alg_SEQ_SPOT_GROUP].name = "";
3585 ds->cols[ alg_SEQ_READ_ID ].name = "";
3586 ds->cols[ alg_MATE_ALIGN_ID ].name = "";
3587 ds->cols[ alg_MATE_REF_ID ].name = "";
3588 ds->cols[ alg_MATE_REF_NAME ].name = "";
3589 ds->cols[ alg_SAM_FLAGS ].name = "";
3590 ds->cols[ alg_MATE_REF_POS ].name = "";
3591 ds->cols[ alg_ALIGN_GROUP ].name = "";
3592 ds->cols[ alg_TEMPLATE_LEN ].name = "";
3593 if ( param->cg_sam || param->cg_ev_dnb )
3594 ds->cols[ alg_EVIDENCE_ALIGNMENT_IDS ].name = "EVIDENCE_ALIGNMENT_IDS";
3595 break;
3596
3597 case edstt_EvidenceAlignment:
3598 ds->cols[ alg_REF_PLOIDY ].name = "REF_PLOIDY";
3599 ds->cols[ alg_REF_ID ].name = "REF_ID";
3600 ds->cols[ alg_MATE_ALIGN_ID ].name = "";
3601 ds->cols[ alg_MATE_REF_ID ].name = "";
3602 ds->cols[ alg_MATE_REF_NAME ].name = "";
3603 ds->cols[ alg_SAM_FLAGS ].name = "";
3604 ds->cols[ alg_MATE_REF_POS ].name = "";
3605 ds->cols[ alg_ALIGN_GROUP ].name = "";
3606 ds->cols[ alg_TEMPLATE_LEN ].name = "";
3607 break;
3608 default:
3609 break;
3610 }
3611 }
3612
3613
ProcessDB(VDatabase const * db,char const fullPath[],char const accession[],char const readGroup[])3614 static rc_t ProcessDB( VDatabase const *db, char const fullPath[],
3615 char const accession[], char const readGroup[] )
3616 {
3617 char const *const refTableName = ( param->region_qty > 0
3618 || param->cg_evidence
3619 || param->cg_ev_dnb
3620 || param->cg_sam
3621 || param->reheader
3622 || param->primaries
3623 || param->secondaries
3624 )
3625 ? "REFERENCE"
3626 : 0;
3627 char const *const priTableName = param->primaries ? "PRIMARY_ALIGNMENT" : 0;
3628 char const *const secTableName = param->secondaries ? "SECONDARY_ALIGNMENT" : 0;
3629 char const *const seqTableName = ( param->unaligned || param->reheader ) ? "SEQUENCE" : 0;
3630 char const *const eviTableName = ( param->cg_evidence || ( param->cg_ev_dnb & ( param->region_qty > 0 ) ) || param->cg_sam ) ? "EVIDENCE_INTERVAL" : 0;
3631 char const *const evaTableName = ( param->cg_ev_dnb || param->cg_sam ) ? "EVIDENCE_ALIGNMENT" : 0;
3632
3633 SAM_dump_ctx_t ctx;
3634 SCol align_cols[ ( sizeof( g_alg_col_tmpl ) / sizeof( g_alg_col_tmpl[ 0 ] ) ) * 4 ];
3635 SCol seq_cols[ sizeof( gSeqCol ) / sizeof( gSeqCol[ 0 ] ) ];
3636 rc_t rc = 0;
3637
3638 memset( &ctx, 0, sizeof( ctx ) );
3639 memset( align_cols, 0, sizeof( align_cols ) );
3640 memset( seq_cols, 0, sizeof( seq_cols ) );
3641
3642 ctx.db = db;
3643 ctx.fullPath = fullPath;
3644 ctx.accession = accession;
3645 ctx.readGroup = readGroup;
3646
3647 DATASOURCE_INIT( ctx.seq, seqTableName );
3648 DATASOURCE_INIT( ctx.ref, refTableName );
3649 DATASOURCE_INIT( ctx.pri, priTableName );
3650 DATASOURCE_INIT( ctx.sec, secTableName );
3651 DATASOURCE_INIT( ctx.evi, eviTableName );
3652 DATASOURCE_INIT( ctx.eva, evaTableName );
3653
3654 if ( ctx.seq.tbl.name )
3655 {
3656 rc = VDatabaseOpenTableRead( db, &ctx.seq.tbl.vtbl, "%s", ctx.seq.tbl.name );
3657 if ( rc == 0 )
3658 {
3659 ctx.seq.cols = seq_cols;
3660 memmove(seq_cols, gSeqCol, sizeof(gSeqCol));
3661 rc = Cursor_Open( &ctx.seq.tbl, &ctx.seq.curs, ctx.seq.cols, NULL );
3662 }
3663 }
3664 ctx.pri.cols = &align_cols[ 0 * ( sizeof( g_alg_col_tmpl ) / sizeof( g_alg_col_tmpl[ 0 ] ) ) ];
3665 ctx.sec.cols = &align_cols[ 1 * ( sizeof( g_alg_col_tmpl ) / sizeof( g_alg_col_tmpl[ 0 ] ) ) ];
3666 ctx.evi.cols = &align_cols[ 2 * ( sizeof( g_alg_col_tmpl ) / sizeof( g_alg_col_tmpl[ 0 ] ) ) ];
3667 ctx.eva.cols = &align_cols[ 3 * ( sizeof( g_alg_col_tmpl ) / sizeof( g_alg_col_tmpl[ 0 ] ) ) ];
3668
3669 SetupColumns( &ctx.pri, edstt_PrimaryAlignment );
3670 SetupColumns( &ctx.sec, edstt_SecondaryAlignment );
3671 SetupColumns( &ctx.evi, edstt_EvidenceInterval );
3672 SetupColumns( &ctx.eva, edstt_EvidenceAlignment );
3673
3674 if ( ctx.pri.tbl.name )
3675 VDatabaseOpenTableRead( db, &ctx.pri.tbl.vtbl, "%s", ctx.pri.tbl.name );
3676 if ( ctx.sec.tbl.name )
3677 VDatabaseOpenTableRead( db, &ctx.sec.tbl.vtbl, "%s", ctx.sec.tbl.name );
3678 if ( ctx.evi.tbl.name )
3679 VDatabaseOpenTableRead( db, &ctx.evi.tbl.vtbl, "%s", ctx.evi.tbl.name );
3680 if ( ctx.eva.tbl.name )
3681 VDatabaseOpenTableRead( db, &ctx.eva.tbl.vtbl, "%s", ctx.eva.tbl.name );
3682
3683 if ( ctx.pri.tbl.vtbl == NULL
3684 && ctx.sec.tbl.vtbl == NULL
3685 && ctx.evi.tbl.vtbl == NULL
3686 && ctx.eva.tbl.vtbl == NULL )
3687 {
3688 memset (&ctx.pri, 0, sizeof( ctx.pri ) * 4 );
3689 if ( ctx.seq.tbl.name == NULL )
3690 {
3691 ctx.seq.tbl.name = "SEQUENCE";
3692 rc = VDatabaseOpenTableRead( db, &ctx.seq.tbl.vtbl, "%s", ctx.seq.tbl.name );
3693 }
3694 if ( rc == 0 )
3695 {
3696 ctx.seq.cols = seq_cols;
3697 memmove(seq_cols, gSeqCol, sizeof(gSeqCol));
3698 rc = Cursor_Open(&ctx.seq.tbl, &ctx.seq.curs, ctx.seq.cols, NULL);
3699 if ( rc == 0 )
3700 {
3701 rc = DumpTable( &ctx );
3702 Cursor_Close(&ctx.seq.curs);
3703 }
3704 VTableRelease(ctx.seq.tbl.vtbl);
3705 }
3706 VDatabaseRelease( db );
3707 return rc;
3708 }
3709 ReportResetDatabase( fullPath, db );
3710
3711 if ( ctx.ref.tbl.name )
3712 {
3713 rc = VDatabaseOpenTableRead( db, &ctx.ref.tbl.vtbl, "%s", ctx.ref.tbl.name );
3714 ctx.ref.type = edstt_Reference;
3715 }
3716 if ( rc == 0 )
3717 rc = Cursor_Open( &ctx.pri.tbl, &ctx.pri.curs, ctx.pri.cols, ctx.seq.curs.cache );
3718 if ( rc == 0 )
3719 rc = Cursor_Open( &ctx.sec.tbl, &ctx.sec.curs, ctx.sec.cols, NULL );
3720 if ( rc == 0 )
3721 rc = Cursor_Open( &ctx.evi.tbl, &ctx.evi.curs, ctx.evi.cols, NULL );
3722 if ( rc == 0 )
3723 rc = Cursor_Open( &ctx.eva.tbl, &ctx.eva.curs, ctx.eva.cols, NULL );
3724 if ( rc == 0 )
3725 rc = DumpDB( &ctx );
3726
3727 Cursor_Close(&ctx.ref.curs);
3728 Cursor_Close(&ctx.pri.curs);
3729 Cursor_Close(&ctx.sec.curs);
3730 Cursor_Close(&ctx.evi.curs);
3731 Cursor_Close(&ctx.eva.curs);
3732 Cursor_Close(&ctx.seq.curs);
3733
3734 VTableRelease( ctx.ref.tbl.vtbl );
3735 VTableRelease( ctx.pri.tbl.vtbl );
3736 VTableRelease( ctx.sec.tbl.vtbl );
3737 VTableRelease( ctx.evi.tbl.vtbl );
3738 VTableRelease( ctx.eva.tbl.vtbl );
3739 VTableRelease( ctx.seq.tbl.vtbl );
3740
3741 return rc;
3742 }
3743
3744 #if 0
3745 rc_t ResolvePath( char const *accession, char const ** path )
3746 {
3747 rc_t rc = 0;
3748 static char tblpath[ 4096 ];
3749 #if TOOLS_USE_SRAPATH != 0
3750 static SRAPath* pmgr = NULL;
3751
3752 if ( accession == NULL && path == NULL )
3753 {
3754 SRAPathRelease( pmgr );
3755 return 0;
3756 }
3757 if ( pmgr != NULL
3758 || ( rc = SRAPathMake( &pmgr, NULL ) ) == 0
3759 || ( GetRCState( rc ) == rcNotFound && GetRCTarget( rc ) == rcDylib ) )
3760 {
3761 *path = tblpath;
3762 tblpath[ 0 ] = '\0';
3763 rc = 0;
3764 do {
3765 if ( pmgr != NULL && !SRAPathTest( pmgr, accession ) )
3766 {
3767 /* try to resolve the path using mgr */
3768 rc = SRAPathFind( pmgr, accession, tblpath, sizeof( tblpath ) );
3769 if ( rc == 0 )
3770 break;
3771 }
3772 if ( string_size( accession ) >= sizeof( tblpath ) )
3773 {
3774 rc = RC( rcExe, rcPath, rcResolving, rcBuffer, rcInsufficient );
3775 }
3776 else
3777 {
3778 string_copy( tblpath, ( sizeof tblpath ) - 1, accession, string_size( accession ) );
3779 rc = 0;
3780 }
3781 } while( false );
3782 }
3783 #else
3784 if ( accession != NULL && path != NULL )
3785 {
3786 *path = tblpath;
3787 string_copy( tblpath, ( sizeof tblpath ) - 1, accession, string_size( accession ) );
3788 }
3789 #endif
3790
3791 return rc;
3792 }
3793 #endif
3794
3795
ProcessPath(VDBManager const * mgr,char const Path[])3796 static rc_t ProcessPath( VDBManager const *mgr, char const Path[] )
3797 {
3798 #if 0
3799 unsigned pathlen = string_size( Path );
3800 char *readGroup = NULL;
3801 char *path = NULL;
3802 unsigned i;
3803 rc_t rc;
3804
3805 /* strip trailing path seperators */
3806 for ( i = pathlen; i > 0; )
3807 {
3808 int const ch = Path[ --i ];
3809
3810 if ( ch == '/' || ch == '\\' )
3811 --pathlen;
3812 else
3813 break;
3814 }
3815
3816 /* find read group alias */
3817 for ( i = pathlen; i > 0; )
3818 {
3819 int const ch = Path[ --i ];
3820
3821 if ( ch == '=' )
3822 {
3823 unsigned const pos = i + 1;
3824 unsigned const len = pathlen - pos;
3825
3826 pathlen = i;
3827
3828 readGroup = malloc( len + 1 );
3829 if ( readGroup == NULL )
3830 return RC( rcExe, rcArgv, rcProcessing, rcMemory, rcExhausted );
3831 memmove( readGroup, &Path[ pos ], len );
3832 readGroup[ len ] = '\0';
3833 break;
3834 }
3835 }
3836
3837 path = malloc( pathlen + 1 );
3838 if ( path == NULL )
3839 return RC( rcExe, rcArgv, rcProcessing, rcMemory, rcExhausted );
3840 memmove( path, Path, pathlen );
3841 path[ pathlen ] = '\0';
3842
3843 rc = ReportResetObject( path );
3844 if ( rc == 0 )
3845 {
3846 char const *fullPath;
3847
3848 rc = ResolvePath( path, &fullPath );
3849 if ( rc == 0 )
3850 {
3851 char const *accession = fullPath;
3852 VDatabase const *db;
3853
3854 /* use last path element as accession */
3855 for ( i = pathlen; i > 0; )
3856 {
3857 int const ch = path[ --i ];
3858
3859 if ( ch == '/' || ch == '\\' )
3860 {
3861 accession = &fullPath[ i + 1 ];
3862 break;
3863 }
3864 }
3865 rc = VDBManagerOpenDBRead( mgr, &db, NULL, "%s", fullPath );
3866 if ( rc == 0 )
3867 {
3868 rc = ProcessDB( db, fullPath, accession, readGroup );
3869 VDatabaseRelease( db );
3870 }
3871 else
3872 rc = ProcessTable( mgr, fullPath, accession, readGroup );
3873 }
3874 }
3875 free( path );
3876 free( readGroup );
3877 return rc;
3878
3879 #else
3880
3881 char * pc;
3882 rc_t rc = 0;
3883 size_t pathlen = string_size ( Path );
3884
3885 /* look for scheme */
3886 pc = string_chr ( Path, pathlen, ':' );
3887 if (pc == NULL)
3888 {
3889 char *readGroup;
3890 char *path;
3891 size_t i;
3892
3893 /* no scheme found - must be simple fs path */
3894 /* old: ---unused label */
3895 readGroup = NULL;
3896 path = NULL;
3897
3898 /* strip trailing path separators */
3899 for ( i = pathlen; i > 0; -- pathlen )
3900 {
3901 int const ch = Path[ --i ];
3902 if ( ch != '/' && ch != '\\' )
3903 break;
3904 }
3905
3906 /* find read group alias */
3907 for ( i = pathlen; i > 0; )
3908 {
3909 int const ch = Path[ --i ];
3910
3911 if ( ch == '=' )
3912 {
3913 size_t const pos = i + 1;
3914 size_t const len = pathlen - pos;
3915
3916 pathlen = i;
3917
3918 readGroup = malloc( len + 1 );
3919 if ( readGroup == NULL )
3920 return RC( rcExe, rcArgv, rcProcessing, rcMemory, rcExhausted );
3921 memmove( readGroup, &Path[ pos ], len );
3922 readGroup[ len ] = '\0';
3923 break;
3924 }
3925 }
3926
3927 path = malloc( pathlen + 1 );
3928 if ( path == NULL )
3929 return RC( rcExe, rcArgv, rcProcessing, rcMemory, rcExhausted );
3930 memmove( path, Path, pathlen );
3931 path[ pathlen ] = '\0';
3932
3933 rc = ReportResetObject( path );
3934 if ( rc == 0 )
3935 {
3936 char const *accession = path;
3937 VDatabase const *db;
3938
3939 /* use last path element as accession */
3940 for ( i = pathlen; i > 0; )
3941 {
3942 int const ch = path[ --i ];
3943
3944 if ( ch == '/' || ch == '\\' )
3945 {
3946 accession = &path[ i + 1 ];
3947 break;
3948 }
3949 }
3950 rc = VDBManagerOpenDBRead( mgr, &db, NULL, "%s", path );
3951 if ( rc == 0 )
3952 {
3953 rc = ProcessDB( db, path, accession, readGroup );
3954 if (UIError(rc, db, NULL)) {
3955 UIDatabaseLOGError(rc, db, true);
3956 /* *error_reported = true; */
3957 }
3958 VDatabaseRelease( db );
3959 }
3960 else {
3961 rc = ProcessTable( mgr, path, accession, readGroup );
3962 if (UIError(rc, NULL, NULL)) {
3963 UITableLOGError(rc, NULL, true);
3964 /* *error_reported = true; */
3965 }
3966 }
3967 }
3968 free( path );
3969 free( readGroup );
3970 return rc;
3971 }
3972 else
3973 {
3974 VFSManager * vfs;
3975 rc = VFSManagerMake ( & vfs );
3976 if ( rc == 0 )
3977 {
3978 VPath * vpath;
3979 char * basename;
3980 size_t zz;
3981 char buffer [8193];
3982 char readgroup_ [257];
3983 char * readgroup;
3984
3985 rc = VFSManagerMakePath (vfs, &vpath, "%s", Path);
3986 VFSManagerRelease ( vfs );
3987 if ( rc == 0 )
3988 {
3989 char * ext;
3990
3991 rc = VPathReadPath (vpath, buffer, sizeof buffer - 1, &zz);
3992 if ( rc == 0 )
3993 {
3994 loop:
3995 basename = string_rchr (buffer, zz, '/');
3996 if ( basename == NULL )
3997 basename = buffer;
3998 else
3999 {
4000 if (basename[1] == '\0')
4001 {
4002 basename[0] = '\0';
4003 goto loop;
4004 }
4005 ++ basename;
4006 zz -= basename - buffer;
4007 }
4008
4009 /* cut off [.lite].[c]sra[.nenc||.ncbi_enc] if any */
4010 ext = string_rchr( basename, zz, '.' );
4011 if ( ext != NULL )
4012 {
4013 if ( strcasecmp( ext, ".nenc" ) == 0 || strcasecmp( ext, ".ncbi_enc" ) == 0 )
4014 {
4015 zz -= ext - basename;
4016 *ext = '\0';
4017 ext = string_rchr( basename, zz, '.' );
4018 }
4019
4020 if ( ext != NULL )
4021 {
4022 if ( strcasecmp( ext, ".sra" ) == 0 || strcasecmp( ext, ".csra" ) == 0 )
4023 {
4024 zz -= ext - basename;
4025 *ext = '\0';
4026 ext = string_rchr( basename, zz, '.' );
4027 if ( ext != NULL && strcasecmp( ext, ".lite" ) == 0 )
4028 *ext = '\0';
4029 }
4030 }
4031 }
4032
4033 rc = VPathOption (vpath, vpopt_readgroup, readgroup_,
4034 sizeof readgroup_ - 1, &zz);
4035 if ( rc == 0 )
4036 readgroup = readgroup_;
4037 else if ( GetRCState ( rc ) == rcNotFound )
4038 {
4039 rc = 0;
4040 readgroup = NULL;
4041 }
4042
4043
4044 if ( rc == 0 )
4045 {
4046 VDatabase const *db;
4047
4048 rc = VDBManagerOpenDBRead( mgr, &db, NULL, "%s", Path );
4049 if ( rc == 0 )
4050 {
4051 rc = ProcessDB( db, Path, basename, readgroup );
4052 VDatabaseRelease( db );
4053 }
4054 else
4055 rc = ProcessTable( mgr, Path, basename, readgroup );
4056 }
4057 }
4058
4059 VPathRelease (vpath);
4060 }
4061 }
4062 }
4063 return rc;
4064 #endif
4065 }
4066
4067 /*
4068 ver_t CC KAppVersion( void )
4069 {
4070 return SAM_DUMP_VERS;
4071 }
4072 */
4073
4074 char const *unaligned_usage[] = {"Output unaligned reads", NULL};
4075 char const *unaligned_only_usage[] = {"Only output unaligned spots", NULL};
4076 char const *primaryonly_usage[] = {"Output only primary alignments", NULL};
4077 char const *cigartype_usage[] = {"Output long version of CIGAR", NULL};
4078 char const *cigarCG_usage[] = {"Output CG version of CIGAR", NULL};
4079 char const *cigarCGMerge_usage[] = {"Apply CG fixups to CIGAR/SEQ/QUAL and outputs CG-specific columns", NULL};
4080 char const *CG_evidence[] = {"Output CG evidence aligned to reference", NULL};
4081 char const *CG_ev_dnb[] = {"Output CG evidence DNB's aligned to evidence", NULL};
4082 char const *CG_SAM[] = {"Output CG evidence DNB's aligned to reference ", NULL};
4083 char const *CG_mappings[] = {"Output CG sequences aligned to reference ", NULL};
4084 char const *header_usage[] = {"Replace original BAM header with a reconstructed one", NULL};
4085 char const *noheader_usage[] = {"Do not output headers", NULL};
4086 char const *region_usage[] = {"Filter by position on genome.",
4087 "Name can either be file specific name (ex: \"chr1\" or \"1\").",
4088 "\"from\" and \"to\" are 1-based coordinates", NULL};
4089 char const *distance_usage[] = {"Filter by distance between matepairs.",
4090 "Use \"unknown\" to find matepairs split between the references.",
4091 "Use from-to to limit matepair distance on the same reference", NULL};
4092 char const *seq_id_usage[] = {"Print reference SEQ_ID in RNAME instead of NAME", NULL};
4093 char const *identicalbases_usage[] = {"Output '=' if base is identical to reference", NULL};
4094 char const *gzip_usage[] = {"Compress output using gzip", NULL};
4095 char const *bzip2_usage[] = {"Compress output using bzip2", NULL};
4096 char const *qname_usage[] = {"Add .SPOT_GROUP to QNAME", NULL};
4097 char const *fasta_usage[] = {"Produce Fasta formatted output", NULL};
4098 char const *fastq_usage[] = {"Produce FastQ formatted output", NULL};
4099 char const *prefix_usage[] = {"Prefix QNAME: prefix.QNAME", NULL};
4100 char const *reverse_usage[] = {"Reverse unaligned reads according to read type", NULL};
4101 char const *comment_usage[] = {"Add comment to header. Use multiple times for several lines. Use quotes", NULL};
4102 char const *XI_usage[] = {"Output cSRA alignment id in XI column", NULL};
4103 char const *qual_quant_usage[] = {"Quality scores quantization level",
4104 "a string like '1:10,10:20,20:30,30:-'", NULL};
4105 char const *CG_names[] = { "Generate CG friendly read names", NULL};
4106
4107 char const *usage_params[] =
4108 {
4109 NULL, /* unaligned */
4110 NULL, /* unaligned-only */
4111 NULL, /* primaryonly */
4112 NULL, /* cigartype */
4113 NULL, /* cigarCG */
4114 NULL, /* header */
4115 NULL, /* no-header */
4116 "text", /* comment */
4117 "name[:from-to]", /* region */
4118 "from-to|'unknown'", /* distance */
4119 NULL, /* seq-id */
4120 NULL, /* identical-bases */
4121 NULL, /* gzip */
4122 NULL, /* bzip2 */
4123 NULL, /* qname */
4124 NULL, /* fasta */
4125 NULL, /* fastq */
4126 "prefix", /* prefix */
4127 NULL, /* reverse */
4128 NULL, /* test-rows */
4129 NULL, /* mate-row-gap-cacheable */
4130 NULL, /* cigarCGMerge */
4131 NULL, /* XI */
4132 "quantization string", /* qual-quant */
4133 NULL, /* cigarCGMerge */
4134 NULL, /* CG-evidence */
4135 NULL, /* CG-ev-dnb */
4136 NULL, /* CG-mappings */
4137 NULL, /* CG-SAM */
4138 NULL /* CG-names */
4139 };
4140
4141 enum eArgs
4142 {
4143 earg_unaligned = 0, /* unaligned */
4144 earg_unaligned_only, /* unaligned */
4145 earg_prim_only, /* primaryonly */
4146 earg_cigartype, /* cigartype */
4147 earg_cigarCG, /* cigarCG */
4148 earg_header, /* header */
4149 earg_noheader, /* no-header */
4150 earg_comment, /* comment */
4151 earg_region, /* region */
4152 earg_distance, /* distance */
4153 earg_seq_id, /* seq-id */
4154 earg_identicalbases, /* identical-bases */
4155 earg_gzip, /* gzip */
4156 earg_bzip2, /* bzip2 */
4157 earg_qname, /* qname */
4158 earg_fastq, /* fasta */
4159 earg_fasta, /* fastq */
4160 earg_prefix, /* prefix */
4161 earg_reverse, /* reverse */
4162 earg_test_rows, /* test-rows */
4163 earg_mate_row_gap_cachable, /* mate-row-gap-cacheable */
4164 earg_cigarCG_merge, /* cigarCGMerge */
4165 earg_XI, /* XI */
4166 earg_QualQuant, /* qual-quant */
4167 earg_CG_evidence, /* CG-evidence */
4168 earg_CG_ev_dnb, /* CG-ev-dnb */
4169 earg_CG_mappings, /* CG-mappings */
4170 earg_CG_SAM, /* CG-SAM */
4171 earg_CG_names /* CG-names */
4172 };
4173
4174 OptDef DumpArgs[] =
4175 {
4176 { "unaligned", "u", NULL, unaligned_usage, 0, false, false }, /* unaligned */
4177 { "unaligned-only", "U", NULL, unaligned_usage, 0, false, false }, /* unaligned-only */
4178 { "primary", "1", NULL, primaryonly_usage, 0, false, false }, /* primaryonly */
4179 { "cigar-long", "c", NULL, cigartype_usage, 0, false, false }, /* cigartype */
4180 { "cigar-CG", NULL, NULL, cigarCG_usage, 0, false, false }, /* cigarCG */
4181 { "header", "r", NULL, header_usage, 0, false, false }, /* header */
4182 { "no-header", "n", NULL, noheader_usage, 0, false, false }, /* no-header */
4183 { "header-comment", NULL, NULL, comment_usage, 0, true, false }, /* comment */
4184 { "aligned-region", NULL, NULL, region_usage, 0, true, false }, /* region */
4185 { "matepair-distance", NULL, NULL, distance_usage, 0, true, false }, /* distance */
4186 { "seqid", "s", NULL, seq_id_usage, 0, false, false }, /* seq-id */
4187 { "hide-identical", "=", NULL, identicalbases_usage, 0, false, false }, /* identical-bases */
4188 { "gzip", NULL, NULL, gzip_usage, 0, false, false }, /* gzip */
4189 { "bzip2", NULL, NULL, bzip2_usage, 0, false, false }, /* bzip2 */
4190 { "spot-group", "g", NULL, qname_usage, 0, false, false }, /* qname */
4191 { "fastq", NULL, NULL, fastq_usage, 0, false, false }, /* fasta */
4192 { "fasta", NULL, NULL, fasta_usage, 0, false, false }, /* fastq */
4193 { "prefix", "p", NULL, prefix_usage, 0, true, false }, /* prefix */
4194 { "reverse", NULL, NULL, reverse_usage, 0, false, false }, /* reverse */
4195 { "test-rows", NULL, NULL, NULL, 0, true, false }, /* test-rows */
4196 { "mate-cache-row-gap", NULL, NULL, NULL, 0, true, false }, /* mate-row-gap-cacheable */
4197 { "cigar-CG-merge", NULL, NULL, cigarCGMerge_usage, 0, false, false }, /* cigarCGMerge */
4198 { "XI", NULL, NULL, XI_usage, 0, false, false }, /* XI */
4199 { "qual-quant", "Q", NULL, qual_quant_usage, 0, true, false }, /* qual-quant */
4200 { "CG-evidence", NULL, NULL, CG_evidence, 0, false, false }, /* CG-evidence */
4201 { "CG-ev-dnb" , NULL, NULL, CG_ev_dnb , 0, false, false }, /* CG-ev-dnb */
4202 { "CG-mappings", NULL, NULL, CG_mappings, 0, false, false }, /* CG-mappings */
4203 { "CG-SAM", NULL, NULL, CG_SAM, 0, false, false }, /* CG-SAM */
4204 { "CG-names", NULL, NULL, CG_names, 0, false, false }, /* CG-names */
4205 { "legacy", NULL, NULL, NULL, 0, false, false }
4206 };
4207
4208
4209 /*
4210 const char UsageDefaultName[] = "sam-dump";
4211
4212
4213 rc_t CC UsageSummary( char const *progname )
4214 {
4215 return KOutMsg( "Usage:\n"
4216 "\t%s [options] path-to-run[ path-to-run ...] > output.sam\n\n", progname );
4217 }
4218
4219
4220 rc_t CC Usage( Args const *args)
4221 {
4222 char const *progname = UsageDefaultName;
4223 char const *fullpath = UsageDefaultName;
4224 rc_t rc;
4225 unsigned i;
4226
4227 rc = ArgsProgram( args, &fullpath, &progname );
4228
4229 UsageSummary( progname );
4230
4231 KOutMsg( "Options:\n" );
4232 for( i = 0; i < sizeof( DumpArgs ) / sizeof( DumpArgs[ 0 ] ); i++ )
4233 {
4234 if ( DumpArgs[ i ].help != NULL )
4235 {
4236 HelpOptionLine( DumpArgs[ i ].aliases, DumpArgs[ i ].name,
4237 usage_params[ i ], DumpArgs[ i ].help);
4238 }
4239 }
4240 KOutMsg( "\n" );
4241 HelpOptionsStandard();
4242
4243 HelpVersion( fullpath, KAppVersion() );
4244
4245 return rc;
4246 }
4247 */
4248
4249 #define ARGS_COUNT (sizeof(DumpArgs)/sizeof(DumpArgs[0]))
4250
4251
ParamCount(Args const * const args,rc_t * const prc)4252 static unsigned ParamCount( Args const *const args, rc_t *const prc )
4253 {
4254 uint32_t y = 0;
4255 rc_t rc = ArgsParamCount( args, &y );
4256
4257 if ( prc != NULL )
4258 *prc = rc;
4259 return y;
4260 }
4261
4262
ArgCount(Args const * const args,rc_t * const prc)4263 static unsigned ArgCount( Args const *const args, rc_t *const prc )
4264 {
4265 uint32_t y = 0;
4266 rc_t rc = ArgsArgvCount( args, &y );
4267
4268 if ( prc != NULL )
4269 *prc = rc;
4270 return y;
4271 }
4272
4273
CountArgs(Args const * const args,unsigned count[],char const ** const errmsg)4274 static rc_t CountArgs( Args const *const args, unsigned count[],
4275 char const **const errmsg )
4276 {
4277 rc_t rc;
4278 unsigned const pcount = ParamCount( args, &rc );
4279
4280 memset( count, 0, ARGS_COUNT );
4281 if ( rc != 0 || pcount == 0 )
4282 {
4283 *errmsg = "";
4284 MiniUsage( args );
4285 return ( ArgCount( args, NULL ) < 2 )
4286 ? 0
4287 : RC( rcExe, rcArgv, rcParsing, rcParam, rcInsufficient );
4288 }
4289 #define COUNT_ARG(E) do {\
4290 char const *const optname = DumpArgs[ (E) ].name; \
4291 if ( ( rc = ArgsOptionCount( args, optname, &count[ (E) ] ) ) != 0 ) { \
4292 *errmsg = optname; \
4293 return rc; \
4294 } } while( 0 )
4295
4296
4297 /* record source options */
4298 COUNT_ARG( earg_unaligned );
4299 COUNT_ARG( earg_unaligned_only );
4300 COUNT_ARG( earg_prim_only );
4301 COUNT_ARG( earg_CG_evidence );
4302 COUNT_ARG( earg_CG_ev_dnb );
4303 COUNT_ARG( earg_CG_mappings );
4304 COUNT_ARG( earg_CG_SAM );
4305
4306 /* record filter options */
4307 COUNT_ARG( earg_region );
4308 COUNT_ARG( earg_distance );
4309
4310 /* output format options */
4311 COUNT_ARG( earg_fastq );
4312 COUNT_ARG( earg_fasta );
4313
4314 /* SAM header options */
4315 COUNT_ARG( earg_header );
4316 COUNT_ARG( earg_noheader );
4317 COUNT_ARG( earg_comment );
4318
4319 /* SAM field options */
4320 COUNT_ARG( earg_prefix );
4321 COUNT_ARG( earg_qname );
4322 COUNT_ARG( earg_seq_id );
4323 COUNT_ARG( earg_CG_names );
4324
4325 COUNT_ARG( earg_cigartype );
4326 COUNT_ARG( earg_cigarCG );
4327 COUNT_ARG( earg_cigarCG_merge );
4328
4329 COUNT_ARG( earg_identicalbases );
4330 COUNT_ARG( earg_reverse );
4331 COUNT_ARG( earg_QualQuant );
4332
4333 /* output encoding options */
4334 COUNT_ARG( earg_gzip );
4335 COUNT_ARG( earg_bzip2 );
4336
4337 COUNT_ARG( earg_mate_row_gap_cachable );
4338
4339 /* debug options */
4340 COUNT_ARG( earg_XI );
4341 COUNT_ARG( earg_test_rows );
4342 #undef COUNT_ARG
4343 return 0;
4344 }
4345
4346
GetOptValU(Args const * const args,char const * const argname,unsigned const defval,rc_t * const prc)4347 static unsigned GetOptValU( Args const *const args, char const *const argname,
4348 unsigned const defval, rc_t *const prc )
4349 {
4350 unsigned y = defval;
4351 char const *valstr;
4352 rc_t rc = ArgsOptionValue( args, argname, 0, (const void **)&valstr );
4353
4354 if ( rc == 0 )
4355 {
4356 char *endp;
4357
4358 y = strtou32( valstr, &endp, 0 );
4359 if ( *endp != '\0' )
4360 {
4361 rc = RC( rcExe, rcArgv, rcProcessing, rcParam, rcInvalid );
4362 y = 0;
4363 }
4364 if ( y == 0 )
4365 y = defval;
4366 }
4367 if ( prc )
4368 *prc = rc;
4369 return y;
4370 }
4371
4372
GetComments(Args const * const args,struct params_s * const parms,unsigned const n)4373 static rc_t GetComments( Args const *const args, struct params_s *const parms, unsigned const n )
4374 {
4375 parms->comments = calloc( n + 1, sizeof( parms->comments[ 0 ] ) );
4376 if ( parms->comments != NULL )
4377 {
4378 unsigned i;
4379
4380 for ( i = 0; i < n; ++i )
4381 {
4382 rc_t const rc = ArgsOptionValue( args, DumpArgs[ earg_comment ].name, i, (const void **)&parms->comments[ i ] );
4383 if ( rc != 0 )
4384 {
4385 free( ( void * )parms->comments );
4386 parms->comments = NULL;
4387 return rc;
4388 }
4389 }
4390 return 0;
4391 }
4392 else
4393 return RC( rcExe, rcArgv, rcProcessing, rcMemory, rcExhausted );
4394 }
4395
4396
ParseFromTo(int * const pFrom,int * const pTo,char const str[])4397 static rc_t ParseFromTo( int *const pFrom, int *const pTo, char const str[] )
4398 {
4399 /* $str =~ /((\d+)-(\d+))|(-(\d+))|(\d+)/ */
4400 int n;
4401 char fr_str[ 16 ];
4402 char to_str[ 16 ];
4403 uint32_t fr = 0;
4404 uint32_t to = 0;
4405 int i = sscanf( str, "%15[0-9]-%15[0-9]%n", fr_str, to_str, &n );
4406
4407 if ( i != 2 )
4408 {
4409 unsigned const offset = ( str[ 0 ] == '-' ) ? 1 : 0;
4410
4411 fr = 0;
4412 i = sscanf( str + offset, "%15[0-9]%n", to_str, &n );
4413 if ( i != 1 )
4414 return RC( rcExe, rcArgv, rcProcessing, rcParam, rcOutofrange );
4415 to = strtou32( to_str, 0, 0 );
4416 }
4417 else
4418 {
4419 fr = strtou32( fr_str, 0, 0 );
4420 to = strtou32( to_str, 0, 0 );
4421
4422 if ( to < fr )
4423 {
4424 uint32_t const tmp = to;
4425 to = fr;
4426 fr = tmp;
4427 }
4428 }
4429 /* was the entire string consumed */
4430 if ( str[ n ] != 0 )
4431 return RC( rcExe, rcArgv, rcProcessing, rcParam, rcInvalid );
4432 *pFrom = ( int )fr;
4433 *pTo = ( int )to;
4434 return 0;
4435 }
4436
4437
GetDistance(Args const * const args,struct params_s * const parms,unsigned const n)4438 static rc_t GetDistance( Args const *const args, struct params_s *const parms, unsigned const n )
4439 {
4440 rc_t rc;
4441 TMatepairDistance *const mpd = calloc( n, sizeof( mpd[ 0 ] ) );
4442
4443 if ( mpd == NULL )
4444 rc = RC( rcExe, rcArgv, rcProcessing, rcMemory, rcExhausted );
4445 else
4446 {
4447 unsigned i;
4448 unsigned j;
4449
4450 for ( j = i = 0; i < n; ++i )
4451 {
4452 char const *valstr;
4453 int from = 0;
4454 int to = 0;
4455
4456 rc = ArgsOptionValue( args, DumpArgs[earg_distance].name, i, (const void **)&valstr );
4457 if ( rc != 0 )
4458 break;
4459 if ( strcasecmp( valstr, "unknown" ) == 0 )
4460 {
4461 parms->mp_dist_unknown = true;
4462 continue;
4463 }
4464 rc = ParseFromTo( &from, &to, valstr );
4465 if ( rc != 0 )
4466 break;
4467 if ( ( from != 0 ) || ( to != 0 ) )
4468 {
4469 mpd[ j ].from = from;
4470 mpd[ j ].to = to;
4471 ++j;
4472 }
4473 }
4474 if ( rc == 0 )
4475 {
4476 parms->mp_dist = mpd;
4477 parms->mp_dist_qty = j;
4478
4479 SAM_DUMP_DBG( 2, ( "filter by mate pair distance\n" ) );
4480 if ( parms->mp_dist_unknown )
4481 SAM_DUMP_DBG( 2, ( " distance: unknown\n" ) );
4482 for ( i = 0; i < j; ++i )
4483 {
4484 SAM_DUMP_DBG( 2, ( " distance [%lu-%lu]\n",
4485 parms->mp_dist[ i ].from, parms->mp_dist[ i ].to ) );
4486 }
4487 }
4488 else
4489 free( mpd );
4490 }
4491 return rc;
4492 }
4493
4494
GetRegion(Args const * const args,struct params_s * const parms,unsigned const n)4495 static rc_t GetRegion( Args const *const args, struct params_s *const parms, unsigned const n )
4496 {
4497 rc_t rc;
4498 TAlignedRegion *const reg = calloc( n, sizeof( reg[ 0 ] ) );
4499
4500 if ( reg == NULL )
4501 rc = RC( rcExe, rcArgv, rcProcessing, rcMemory, rcExhausted );
4502 else
4503 {
4504 unsigned i;
4505 unsigned j;
4506
4507 for ( rc = 0, j = i = 0; i < n; ++i )
4508 {
4509 char const *valstr;
4510 char const *sep;
4511 unsigned namelen;
4512 unsigned f;
4513 unsigned e;
4514 TAlignedRegion r;
4515
4516 memset(&r,0,sizeof(r));
4517
4518 rc = ArgsOptionValue( args, DumpArgs[ earg_region ].name, i, (const void **)&valstr );
4519 if ( rc != 0 ) break;
4520
4521 sep = strchr( valstr, ':' );
4522 if ( sep != NULL )
4523 namelen = sep - valstr;
4524 else
4525 namelen = string_size( valstr );
4526
4527 if ( namelen >= sizeof( r.name ) )
4528 return RC( rcExe, rcArgv, rcProcessing, rcParam, rcTooLong );
4529
4530 memmove( r.name, valstr, namelen );
4531 r.name[ namelen ] = '\0';
4532
4533 r.rq = UINT_MAX;
4534 if ( sep != NULL )
4535 {
4536 int from = -1;
4537 int to = -1;
4538
4539 rc = ParseFromTo( &from, &to, sep + 1 );
4540 if ( rc != 0 )
4541 break;
4542 if ( from > 0 ) /* convert to 0-based */
4543 --from;
4544 else if ( to > 0 )
4545 from = 0;
4546 if ( to > 0 )
4547 --to;
4548 else if ( from >= 0 && to < 0 )
4549 to = from;
4550 if ( from >= 0 || to >= 0 )
4551 {
4552 if ( from > to )
4553 {
4554 int tmp = to;
4555 to = from;
4556 from = tmp;
4557 }
4558 r.r[ 0 ].from = from;
4559 r.r[ 0 ].to = to;
4560 r.rq = 1;
4561 }
4562 }
4563
4564 for ( f = 0, e = j; f < e; )
4565 {
4566 unsigned const m = ( ( f + e ) / 2 );
4567 int const diff = strcmp( r.name, reg[ m ].name );
4568
4569 if ( diff < 0 )
4570 e = m;
4571 else
4572 f = m + 1;
4573 }
4574 if ( 0 < e && e < j && strcmp( r.name, reg[ e-1 ].name ) == 0 )
4575 {
4576 TAlignedRegion *const fnd = ®[ e - 1 ];
4577
4578 if ( fnd->rq != UINT_MAX )
4579 {
4580 for ( f = 0, e = fnd->rq; f < e; )
4581 {
4582 unsigned const m = ( ( f + e ) / 2 );
4583
4584 if ( r.r[ 0 ].from < fnd->r[ m ].from )
4585 e = m;
4586 else
4587 f = m + 1;
4588 }
4589 if ( fnd->rq >= ( sizeof( r.r ) / sizeof( r.r[0] ) ) )
4590 {
4591 rc = RC( rcExe, rcArgv, rcProcessing, rcBuffer, rcInsufficient );
4592 break;
4593 }
4594 memmove( &fnd->r[ e +1 ], &fnd->r[ e ], ( fnd->rq - e ) * sizeof( fnd->r[ 0 ] ) );
4595 fnd->r[ e ] = r.r[ 0 ];
4596 ++fnd->rq;
4597 }
4598 }
4599 else
4600 {
4601 memmove( ®[ e + 1 ], ®[ e ], ( j - e ) * sizeof( reg[ 0 ] ) );
4602 reg[ e ] = r;
4603 ++j;
4604 }
4605 }
4606 if ( rc == 0 )
4607 {
4608 parms->region = reg;
4609 parms->region_qty = j;
4610
4611 for ( i = 0; i < parms->region_qty; ++i )
4612 {
4613 TAlignedRegion *const r = &parms->region[ i ];
4614
4615 SAM_DUMP_DBG( 2, ( "filter by %s\n", r->name ) );
4616 if ( r->rq == UINT_MAX )
4617 {
4618 r->rq = 1;
4619 r->r[ 0 ].from = 0;
4620 r->r[ 0 ].to = UINT_MAX;
4621 }
4622 for ( j = 0; j < r->rq; ++j )
4623 {
4624 unsigned const to = r->r[ j ].to;
4625 unsigned const from = r->r[ j ].from;
4626
4627 SAM_DUMP_DBG( 2, ( " range: [%u:%u]\n", r->r[ j ].from, to ) );
4628 if ( r->max_to < to ) r->max_to = to;
4629 if ( r->min_from < from ) r->min_from = from;
4630 }
4631 }
4632 }
4633 else
4634 free( reg );
4635 }
4636 return rc;
4637 }
4638
4639
GetQualQuant(Args const * const args,struct params_s * const parms)4640 static rc_t GetQualQuant( Args const *const args, struct params_s *const parms )
4641 {
4642 char const *valstr;
4643 int i;
4644 rc_t rc = ArgsOptionValue( args, DumpArgs[ earg_QualQuant ].name, 0, (const void **)&valstr );
4645
4646 if ( rc == 0 && strcmp( valstr, "0" ) != 0 )
4647 parms->quantizeQual = QualityQuantizerInitMatrix( parms->qualQuant, valstr );
4648 for(i=1; parms->qualQuant[i]==parms->qualQuant[0] && i<256;i++){}
4649 if(i<256) parms->qualQuantSingle=0;
4650 else parms->qualQuantSingle = parms->qualQuant[0];
4651 return rc;
4652 }
4653
4654
GetArgs(Args const * const args,unsigned const count[],char const ** const errmsg)4655 static rc_t GetArgs( Args const *const args, unsigned const count[],
4656 char const **const errmsg )
4657 {
4658 static struct params_s parms;
4659 bool const multipass = ParamCount( args, 0 ) > 1;
4660 rc_t rc;
4661
4662 memset( &parms, 0, sizeof( parms ) );
4663
4664 if ( count[ earg_comment ] )
4665 {
4666 rc = GetComments( args, &parms, count[ earg_comment ] );
4667 if ( rc != 0 )
4668 {
4669 *errmsg = DumpArgs[ earg_comment ].name;
4670 return rc;
4671 }
4672 }
4673
4674 if ( count[ earg_region ] && count[ earg_unaligned_only ] == 0 )
4675 {
4676 rc = GetRegion( args, &parms, count[ earg_region ] );
4677 if ( rc != 0 )
4678 {
4679 *errmsg = DumpArgs[ earg_region ].name;
4680 return rc;
4681 }
4682 }
4683
4684 if ( count[ earg_distance ] && count[ earg_unaligned_only ] == 0 )
4685 {
4686 rc = GetDistance( args, &parms, count[ earg_distance ] );
4687 if ( rc != 0 )
4688 {
4689 *errmsg = DumpArgs[ earg_distance ].name;
4690 return rc;
4691 }
4692 }
4693
4694 if ( count[ earg_QualQuant ] )
4695 {
4696 rc = GetQualQuant( args, &parms );
4697 if ( rc != 0 )
4698 {
4699 *errmsg = DumpArgs[ earg_QualQuant ].name;
4700 return rc;
4701 }
4702 }
4703
4704 parms.cg_sam = ( count[ earg_CG_SAM ] != 0 );
4705 parms.cg_evidence = ( count[ earg_CG_evidence ] != 0 );
4706 parms.cg_ev_dnb = ( count[ earg_CG_ev_dnb ] != 0 );
4707
4708 if ( count[ earg_CG_mappings ] == 0 &&
4709 ( parms.cg_sam || parms.cg_evidence || parms.cg_ev_dnb ) )
4710 {
4711 parms.primaries = false;
4712 parms.secondaries = false;
4713 parms.unaligned = false;
4714 }
4715 else
4716 {
4717 parms.primaries = true;
4718 parms.secondaries = ( count[ earg_prim_only ] == 0 );
4719 parms.unaligned = ( ( count[ earg_unaligned ] != 0 ) /*&& ( parms.region_qty == 0 )*/ );
4720 }
4721
4722 parms.long_cigar = ( count[ earg_cigartype ] != 0 );
4723 parms.use_seqid = ( ( count[ earg_seq_id ] != 0 ) || multipass );
4724 parms.hide_identical = ( count[ earg_identicalbases ] != 0 );
4725 parms.fasta = ( count[ earg_fasta ] != 0 );
4726 parms.fastq = ( count[ earg_fastq ] != 0 );
4727 parms.reverse_unaligned = ( count[ earg_reverse ] != 0 );
4728 parms.cg_friendly_names = count[ earg_CG_names ] != 0;
4729 parms.spot_group_in_name = ( count[ earg_qname ] != 0 || multipass );
4730 parms.noheader = ( ( count[ earg_noheader ] != 0 ) || parms.fasta || parms.fastq || multipass );
4731 parms.reheader = ( ( count[ earg_header ] != 0 ) && !parms.noheader );
4732 parms.xi = ( count[ earg_XI ] != 0 );
4733 if ( ( count[ earg_cigarCG ] == 0 ) && ( count[ earg_cigarCG_merge ] == 0 ) )
4734 {
4735 parms.cg_style = 0;
4736 }
4737 else if ( count[ earg_cigarCG ] == 0 )
4738 {
4739 parms.cg_style = 1;
4740 parms.long_cigar = false;
4741 }
4742 else if ( count[ earg_cigarCG_merge ] == 0 )
4743 {
4744 parms.cg_style = 2;
4745 }
4746 else
4747 {
4748 rc = RC( rcExe, rcArgv, rcProcessing, rcParam, rcInconsistent );
4749 *errmsg = "cigar-CG and CG are mutually exclusive";
4750 parms.cg_style = 0;
4751 }
4752
4753 if (count[ earg_unaligned_only ] > 0) {
4754 parms.unaligned_spots = true;
4755
4756 parms.primaries = false;
4757 parms.secondaries = false;
4758 parms.unaligned = true;
4759 parms.cg_ev_dnb = false;
4760 parms.cg_evidence = false;
4761 parms.cg_sam = false;
4762 parms.cg_style = 0;
4763 }
4764
4765 parms.test_rows = GetOptValU( args, DumpArgs[ earg_test_rows ].name, 0, NULL );
4766 parms.mate_row_gap_cachable = GetOptValU( args, DumpArgs[ earg_mate_row_gap_cachable ].name, 1000000, NULL );
4767
4768 param = &parms;
4769 return 0;
4770 }
4771
4772
GetParams(Args const * const args,char const ** const errmsg)4773 static rc_t GetParams( Args const *const args, char const **const errmsg )
4774 {
4775 unsigned count[ ARGS_COUNT ];
4776 rc_t rc = CountArgs( args, count, errmsg );
4777
4778 if ( rc == 0 )
4779 rc = GetArgs( args, count, errmsg );
4780 return rc;
4781 }
4782
4783
SAM_Dump_Main(int argc,char * argv[])4784 rc_t CC SAM_Dump_Main( int argc, char* argv[] )
4785 {
4786 rc_t rc = 0;
4787 Args* args;
4788 char const *errmsg = "stop";
4789 bool error_reported = false;
4790
4791 memset( &g_out_writer, 0, sizeof( g_out_writer ) );
4792 KOutHandlerSetStdOut();
4793 KStsHandlerSetStdErr();
4794 KLogHandlerSetStdErr();
4795 (void)KDbgHandlerSetStdErr();
4796
4797 ReportBuildDate( __DATE__ );
4798
4799 rc = ArgsMakeAndHandle( &args, argc, argv, 1, DumpArgs, ARGS_COUNT );
4800 if ( rc == 0 )
4801 {
4802 uint32_t pcount;
4803
4804 rc = ArgsParamCount( args, &pcount );
4805 if ( rc != 0 || pcount < 1 )
4806 {
4807 errmsg = "";
4808 rc = argc < 2 ? 0 : RC( rcExe, rcArgv, rcParsing, rcParam, rcInsufficient );
4809 MiniUsage( args );
4810 }
4811 else if ( ( rc = GetParams( args, &errmsg ) ) != 0 )
4812 {
4813 }
4814 else
4815 {
4816 VDBManager const *mgr;
4817
4818 rc = VDBManagerMakeRead( &mgr, NULL );
4819 if ( rc == 0 )
4820 {
4821 rc = BufferedWriterMake( param->output_gzip, param->output_bz2 );
4822 if ( rc == 0 )
4823 {
4824 unsigned i;
4825
4826 for ( i = 0; i < pcount; ++i )
4827 {
4828 char const *arg;
4829
4830 rc = ArgsParamValue( args, i, (const void **)&arg );
4831 if ( rc != 0 ) break;
4832 rc = ProcessPath( mgr, arg );
4833 #if _ARCH_BITS < 64
4834 /* look for "param excessive while writing vector within container module" */
4835 if (GetRCState(rc) == rcExcessive &&
4836 GetRCObject(rc) == rcParam &&
4837 GetRCContext(rc) == rcWriting &&
4838 GetRCTarget(rc) == rcVector &&
4839 GetRCModule(rc) == rcCont)
4840 {
4841 ( void )PLOGMSG( klogErr, (klogErr, "This run '$(run)' contains too many reads to be processed with a $(bits)-bit executable; please try again with a 64-bit executable.", "run=%s,bits=%u", arg, _ARCH_BITS));
4842 }
4843 #endif
4844 if ( rc != 0 ) break;
4845 }
4846 BufferedWriterRelease( rc == 0 );
4847 }
4848 VDBManagerRelease( mgr );
4849 }
4850 }
4851 ArgsWhack( args );
4852 }
4853
4854 if ( rc != 0 && !error_reported )
4855 {
4856 if ( errmsg[ 0 ] )
4857 {
4858 ( void )LOGERR( klogErr, rc, errmsg );
4859 } else if ( KLogLastErrorCode() != rc )
4860 {
4861 ( void )LOGERR( klogErr, rc, "stop" );
4862 }
4863 }
4864
4865 {
4866 /* Report execution environment if necessary */
4867 rc_t rc2 = ReportFinalize( rc );
4868 if ( rc == 0 )
4869 rc = rc2;
4870 }
4871 return rc;
4872 }
4873
4874
Legacy_KMain(int argc,char * argv[])4875 rc_t CC Legacy_KMain( int argc, char* argv[] )
4876 {
4877 char filename[ 4096 ];
4878 /*
4879 Try to find a option-file - parameter in the original parameters
4880 This is a hack to teach sam-dump to read it's parameters from a file/pipe instead
4881 of the commandline
4882 It is neccessary because the code above does not use libkapp to parse parameters!
4883 */
4884 rc_t rc = Args_find_option_in_argv( argc, argv, "--option-file", filename, sizeof filename );
4885 if ( rc == 0 )
4886 {
4887 int new_argc;
4888 char ** new_argv;
4889 /* we found it! Now comes the special treatment: we fake a new argc/argv-pair! */
4890 rc = Args_tokenize_file_and_progname_into_argv( filename, "sam-dump", &new_argc, &new_argv );
4891 if ( rc == 0 )
4892 {
4893 /* pass the faked input-parameters from */
4894 rc = SAM_Dump_Main( new_argc, new_argv );
4895
4896 Args_free_token_argv( new_argc, new_argv );
4897 }
4898 }
4899 else
4900 {
4901 /* we did not found the special option, just use the commandline-parameters unchanged */
4902 rc = SAM_Dump_Main( argc, argv );
4903 }
4904 return rc;
4905 }
4906