1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26
27 #include "read_fkt.h"
28 #include "sam-unaligned.h"
29 #include <kapp/main.h>
30 #include <sysalloc.h>
31 #include <ctype.h>
32
33 #define COL_READ "(INSDC:dna:text)READ"
34 #define COL_REF_NAME "(ascii)REF_NAME"
35 #define COL_REF_SEQ_ID "(ascii)REF_SEQ_ID"
36 #define COL_REF_POS "(INSDC:coord:zero)REF_POS"
37
38 typedef struct prim_table_ctx
39 {
40 const VCursor * cursor;
41
42 uint32_t ref_name_idx;
43 uint32_t ref_seq_id_idx;
44 uint32_t ref_pos_idx;
45 } prim_table_ctx;
46
47
prepare_prim_table_ctx(const samdump_opts * const opts,const input_table * const itab,prim_table_ctx * const ptx)48 static rc_t prepare_prim_table_ctx( const samdump_opts * const opts,
49 const input_table * const itab,
50 prim_table_ctx * const ptx )
51 {
52 rc_t rc;
53
54 if ( opts->cursor_cache_size == 0 )
55 rc = VTableCreateCursorRead( itab->tab, &ptx->cursor );
56 else
57 rc = VTableCreateCachedCursorRead( itab->tab, &ptx->cursor, opts->cursor_cache_size );
58 if ( rc != 0 )
59 {
60 (void)PLOGERR( klogInt, ( klogInt, rc, "VTableCreateCursorRead( PRIMARY_ALIGNMENT ) for $(tn) failed", "tn=%s", itab->path ) );
61 }
62 else
63 {
64 rc = add_column( ptx->cursor, COL_REF_NAME, &ptx->ref_name_idx );
65 if ( rc == 0 )
66 rc = add_column( ptx->cursor, COL_REF_SEQ_ID, &ptx->ref_seq_id_idx );
67 if ( rc == 0 )
68 rc = add_column( ptx->cursor, COL_REF_POS, &ptx->ref_pos_idx );
69 if ( rc == 0 )
70 {
71 rc = VCursorOpen( ptx->cursor );
72 if ( rc != 0 )
73 {
74 (void)PLOGERR( klogInt, ( klogInt, rc, "VCursorOpen( PRIMARY_ALIGNMENT ) for $(tn) failed", "tn=%s", itab->path ) );
75 }
76 }
77 }
78 return rc;
79 }
80
81
82 #define COL_ALIGN_COUNT "(U8)ALIGNMENT_COUNT"
83 #define COL_SPOT_ID "(INSDC:SRA:spotid_t)SPOT_ID"
84 #define COL_PRIM_AL_ID "(I64)PRIMARY_ALIGNMENT_ID"
85 #define COL_READ_TYPE "(INSDC:SRA:xread_type)READ_TYPE"
86 #define COL_READ_FILTER "(INSDC:SRA:read_filter)READ_FILTER"
87 #define COL_READ_LEN "(INSDC:coord:len)READ_LEN"
88 #define COL_READ_START "(INSDC:coord:zero)READ_START"
89 /* #define COL_QUALITY "(INSDC:quality:text:phred_33)QUALITY" */
90 #define COL_QUALITY "(INSDC:quality:phred)QUALITY"
91 #define COL_SPOT_GROUP "(ascii)SPOT_GROUP"
92 #define COL_NAME "(ascii)NAME"
93 #define COL_LNK_GROUP "(ascii)LINKAGE_GROUP"
94
95 typedef struct seq_table_ctx
96 {
97 const VCursor * cursor;
98
99 uint32_t align_count_idx;
100 uint32_t prim_al_id_idx;
101 uint32_t read_type_idx;
102 uint32_t read_filter_idx;
103 uint32_t read_len_idx;
104 uint32_t read_start_idx;
105 uint32_t read_idx;
106 uint32_t quality_idx;
107 uint32_t spot_group_idx;
108 uint32_t name_idx;
109 uint32_t lnk_group_idx;
110 } seq_table_ctx;
111
112
prepare_seq_table_ctx(const samdump_opts * const opts,const input_table * const itab,seq_table_ctx * const stx)113 static rc_t prepare_seq_table_ctx( const samdump_opts * const opts,
114 const input_table * const itab,
115 seq_table_ctx * const stx )
116 {
117 struct KNamelist * available_columns;
118 rc_t rc = VTableListReadableColumns ( itab->tab, &available_columns );
119 if ( rc != 0 )
120 {
121 (void)PLOGERR( klogInt, ( klogInt, rc,
122 "VTableListReadableColumns( SEQUENCE ) for $(tn) failed", "tn=%s", itab->path ) );
123 }
124 else
125 {
126 if ( opts->cursor_cache_size == 0 )
127 rc = VTableCreateCursorRead( itab->tab, &stx->cursor );
128 else
129 rc = VTableCreateCachedCursorRead( itab->tab, &stx->cursor, opts->cursor_cache_size );
130 if ( rc != 0 )
131 {
132 (void)PLOGERR( klogInt, ( klogInt, rc, "VTableCreateCursorRead( SEQUENCE ) for $(tn) failed", "tn=%s", itab->path ) );
133 }
134 else
135 {
136 add_opt_column( stx->cursor, available_columns, COL_ALIGN_COUNT, &stx->align_count_idx ); /* read_fkt.c */
137 add_opt_column( stx->cursor, available_columns, COL_PRIM_AL_ID, &stx->prim_al_id_idx );
138 add_opt_column( stx->cursor, available_columns, COL_NAME, &stx->name_idx );
139 add_opt_column( stx->cursor, available_columns, COL_LNK_GROUP, &stx->lnk_group_idx );
140
141 if ( rc == 0 )
142 rc = add_column( stx->cursor, COL_READ_TYPE, &stx->read_type_idx ); /* read_fkt.c */
143 if ( rc == 0 )
144 rc = add_column( stx->cursor, COL_READ_FILTER, &stx->read_filter_idx );
145 if ( rc == 0 )
146 rc = add_column( stx->cursor, COL_READ_LEN, &stx->read_len_idx );
147 if ( rc == 0 )
148 rc = add_column( stx->cursor, COL_READ_START, &stx->read_start_idx );
149 if ( rc == 0 )
150 rc = add_column( stx->cursor, COL_READ, &stx->read_idx );
151 if ( rc == 0 )
152 rc = add_column( stx->cursor, COL_QUALITY, &stx->quality_idx );
153 if ( rc == 0 )
154 rc = add_column( stx->cursor, COL_SPOT_GROUP, &stx->spot_group_idx );
155 }
156 KNamelistRelease( available_columns );
157 }
158 return rc;
159 }
160
161
162 typedef struct seq_row
163 {
164 uint32_t nreads;
165
166 bool fully_unaligned;
167 bool partly_unaligned;
168 bool filtered_out;
169 } seq_row;
170
171
complain_size_diff(int64_t row_id,const char * txt)172 static rc_t complain_size_diff( int64_t row_id, const char * txt )
173 {
174 rc_t rc = RC( rcExe, rcNoTarg, rcConstructing, rcSize, rcInvalid );
175 (void)PLOGERR( klogInt, ( klogInt, rc, "in row $(rn) of SEQUENCE.$(tx)", "rn=%ld,tx=%s", row_id, txt ) );
176 return rc;
177 }
178
179
read_seq_row(const samdump_opts * const opts,const seq_table_ctx * const stx,const int64_t row_id,seq_row * const row)180 static rc_t read_seq_row( const samdump_opts * const opts,
181 const seq_table_ctx * const stx,
182 const int64_t row_id,
183 seq_row * const row )
184 {
185 rc_t rc = 0;
186
187 if ( stx->align_count_idx == INVALID_COLUMN )
188 {
189 const INSDC_read_type *ptr;
190
191 row->fully_unaligned = true;
192 row->partly_unaligned = false;
193 row->filtered_out = false;
194 rc = read_INSDC_read_type_ptr( row_id, stx->cursor, stx->read_type_idx, &ptr, &row->nreads, "READ_TYPE" );
195 }
196 else
197 {
198 const uint8_t * u8ptr;
199 uint32_t len;
200
201 /* ALIGNMENT_COUNT ... only to detect if reads are unaligned, could be done with PRIMARY_ALINGMENT too,
202 but this one is only 8 bit instead of 64 bit, that means faster */
203 rc = read_uint8_ptr( row_id, stx->cursor, stx->align_count_idx, &u8ptr, &len, "ALIGN_COUNT" );
204 if ( rc == 0 )
205 {
206 uint32_t i, n;
207
208 row->nreads = len;
209 for ( i = 0, n = 0; i < len; ++i )
210 if ( u8ptr[ i ] != 0 )
211 n++;
212 row->fully_unaligned = ( n == 0 );
213 row->partly_unaligned = ( n < len && n > 0 );
214
215 if ( row->partly_unaligned )
216 row->filtered_out = !opts->print_half_unaligned_reads;
217 else if ( row->fully_unaligned )
218 row->filtered_out = !opts->print_fully_unaligned_reads;
219 else
220 row->filtered_out = true;
221 }
222 }
223 return rc;
224 }
225
226
227 /**********************************************************************************
228
229 0x001 template having multiple fragments in sequencing
230 (row->nreads > 1 )
231
232 0x002 each fragment properly aligned according to the aligner
233 ( never the case in an unaligned read )
234
235 0x004 fragment unmapped
236 ( always true in an unaligned read )
237
238 0x008 next fragment in the template unmapped
239 ( is the mate aligned? next: read_idx+1, if read_idx==row->nreads-1 then read_idx-1 )
240
241 0x010 SEQ being reverse complemented
242 ( column 'READ_TYPE' has bit READ_TYPE_REVERSE set )
243
244 0x020 SEQ of the next fragment in the template being reversed
245 ( is the mate reversed? )
246
247 0x040 the first fragment in the template
248 ( read_idx == 0 )
249
250 0x080 the last fragment in the template
251 ( read_idx == ( row->nreads - 1 ) )
252
253 0x100 secondary alignment
254 ( never the case in an unaligned read )
255
256 0x200 not passing quality controls
257 ( column 'READ_FILTER' has bit READ_FILTER_REJECT set )
258
259 0x400 PCR or optical duplicate
260 ( column 'READ_FILTER' has bit READ_FILTER_CRITERIA set )
261
262 **********************************************************************************/
calculate_unaligned_sam_flags_db(uint32_t nreads,uint32_t read_idx,uint32_t mate_idx,int64_t mate_id,const INSDC_read_type * read_type,bool reverse_flag,const INSDC_read_filter * read_filter)263 static uint32_t calculate_unaligned_sam_flags_db( uint32_t nreads,
264 uint32_t read_idx,
265 uint32_t mate_idx,
266 int64_t mate_id,
267 const INSDC_read_type * read_type,
268 bool reverse_flag,
269 const INSDC_read_filter * read_filter )
270 {
271 uint32_t res = 0x04;
272
273 if ( nreads > 1 )
274 {
275 res |= 0x01;
276 /* the following test only make sense if there is a mate */
277
278 if ( mate_id == 0 )
279 res |= 0x08;
280 if ( ( read_type[ mate_idx ] & READ_TYPE_REVERSE ) == READ_TYPE_REVERSE )
281 res |= 0x020;
282 if ( read_idx == 0 )
283 res |= 0x040;
284 if ( read_idx == ( nreads - 1 ) )
285 res |= 0x080;
286 }
287
288 if ( reverse_flag )
289 res |= 0x010;
290 if ( ( read_filter[ read_idx ] & READ_FILTER_REJECT ) == READ_FILTER_REJECT )
291 res |= 0x200;
292 if ( ( read_filter[ read_idx ] & READ_FILTER_CRITERIA ) == READ_FILTER_CRITERIA )
293 res |= 0x400;
294 return res;
295 }
296
297
298 const char * ref_name_star = "*";
299
get_mate_info(const prim_table_ctx * const ptx,const matecache * const mc,const input_database * const ids,const int64_t row_id,const int64_t mate_id,const uint32_t nreads,const char ** mate_ref_name,uint32_t * const mate_ref_name_len,INSDC_coord_zero * const mate_ref_pos)300 static rc_t get_mate_info( const prim_table_ctx * const ptx,
301 const matecache * const mc,
302 const input_database * const ids,
303 const int64_t row_id,
304 const int64_t mate_id,
305 const uint32_t nreads,
306 const char ** mate_ref_name,
307 uint32_t * const mate_ref_name_len,
308 INSDC_coord_zero * const mate_ref_pos )
309 {
310 rc_t rc = 0;
311
312 *mate_ref_name = ref_name_star;
313 *mate_ref_name_len = 1;
314 *mate_ref_pos = 0;
315 if ( nreads < 2 )
316 return rc;
317
318 if ( mate_id != 0 )
319 {
320 uint32_t ref_idx;
321 int64_t seq_spot_id;
322 rc = matecache_lookup_unaligned( mc, ids->db_idx, mate_id, mate_ref_pos, &ref_idx, &seq_spot_id );
323 if ( rc == 0 )
324 {
325 const ReferenceObj* refobj;
326 *mate_ref_pos += 1;
327 /* now we have to lookup the reference-name based on it's index into the ids->reflist */
328 rc = ReferenceList_Get( ids->reflist, &refobj, ref_idx );
329 if ( rc != 0 )
330 {
331 (void)PLOGERR( klogInt, ( klogInt, rc, "in row $(rn) of SEQUENCE.$(rx)", "rn=%ld,rx=%ld", mate_id, row_id ) );
332 }
333 else
334 {
335 rc = ReferenceObj_Name( refobj, mate_ref_name );
336 if ( rc != 0 )
337 {
338 (void)PLOGERR( klogInt, ( klogInt, rc, "in row $(rn) of SEQUENCE.$(rx)", "rn=%ld,rx=%ld", mate_id, row_id ) );
339 }
340 else
341 {
342 *mate_ref_name_len = string_size( *mate_ref_name );
343 }
344 }
345 }
346 else if ( GetRCState( rc ) == rcNotFound )
347 {
348 /* we have a mate and it is aligned! ---> look it up in the PRIMARY_ALIGNMENT - table*/
349 const char * ptr;
350 uint32_t len;
351
352 rc = read_char_ptr( mate_id, ptx->cursor, ptx->ref_name_idx, &ptr, &len, "REF_NAME" );
353 if ( rc == 0 && len > 0 )
354 {
355 *mate_ref_name = ptr;
356 *mate_ref_name_len = len;
357 }
358
359 if ( rc == 0 )
360 {
361 INSDC_coord_zero pos;
362 rc = read_INSDC_coord_zero( mate_id, ptx->cursor, ptx->ref_pos_idx, &pos, 0, "REF_POS" );
363 if ( rc == 0 )
364 {
365 *mate_ref_pos = ( pos + 1 );
366 }
367 }
368 }
369 }
370 return rc;
371 }
372
373
print_sliced_read(const INSDC_dna_text * read,uint32_t read_idx,bool reverse,const INSDC_coord_zero * read_start,const INSDC_coord_len * read_len)374 static rc_t print_sliced_read( const INSDC_dna_text * read,
375 uint32_t read_idx,
376 bool reverse,
377 const INSDC_coord_zero * read_start,
378 const INSDC_coord_len * read_len )
379 {
380 rc_t rc = 0;
381 const INSDC_dna_text * ptr = read + read_start[ read_idx ];
382 if ( !reverse )
383 {
384 rc = KOutMsg( "%.*s", read_len[ read_idx ], ptr );
385 }
386 else
387 {
388 const char cmp_tbl [] =
389 {
390 'T', 'V', 'G', 'H', 'E', 'F', 'C', 'D',
391 'I', 'J', 'M', 'L', 'K', 'N', 'O', 'P',
392 'Q', 'Y', 'S', 'A', 'U', 'B', 'W', 'X',
393 'R', 'Z'
394 };
395
396 int32_t i = ( read_len[ read_idx ] - 1 );
397 while( i >= 0 && rc == 0 )
398 {
399 int c = ptr [ i ];
400 if ( isalpha ( c ) )
401 {
402 if ( islower ( c ) )
403 c = tolower ( cmp_tbl [ toupper ( c ) - 'A' ] );
404 else
405 c = cmp_tbl [ c - 'A' ];
406 }
407
408 rc = KOutMsg( "%c", ( char ) c );
409 i--;
410 }
411 }
412 return rc;
413 }
414
415
print_sliced_quality(const samdump_opts * const opts,const char * quality,uint32_t read_idx,bool reverse,const INSDC_coord_zero * read_start,const INSDC_coord_len * read_len)416 static rc_t print_sliced_quality( const samdump_opts * const opts,
417 const char * quality,
418 uint32_t read_idx,
419 bool reverse,
420 const INSDC_coord_zero * read_start,
421 const INSDC_coord_len * read_len )
422 {
423 const char * ptr = quality + read_start[ read_idx ];
424 return dump_quality( opts, ptr, read_len[ read_idx ], reverse ); /* sam-dump-opts.c */
425 }
426
427
dump_the_other_read(const seq_table_ctx * const stx,const prim_table_ctx * const ptx,const int64_t row_id,const uint32_t mate_idx)428 static rc_t dump_the_other_read( const seq_table_ctx * const stx,
429 const prim_table_ctx * const ptx,
430 const int64_t row_id,
431 const uint32_t mate_idx )
432 {
433 uint32_t row_len;
434 const int64_t *prim_al_id_ptr;
435
436 /* read from the SEQUENCE-table the value of the colum "PRIMARY_ALIGNMENT_ID"[ mate_idx ] */
437 rc_t rc = read_int64_ptr( row_id, stx->cursor, stx->prim_al_id_idx, &prim_al_id_ptr, &row_len, "PRIM_AL_ID" );
438 if ( rc == 0 )
439 {
440 if ( row_len == 0 )
441 {
442 (void)PLOGERR( klogInt, ( klogInt, rc, "rowlen zero in row $(rn) of SEQUENCE.PRIMARY_ALIGNMENT_ID", "rn=%ld", row_id ) );
443 }
444 else if ( mate_idx >= row_len )
445 {
446 (void)PLOGERR( klogInt, ( klogInt, rc, "mate_idx invalid in row $(rn) of SEQUENCE.PRIMARY_ALIGNMENT_ID", "rn=%ld", row_id ) );
447 }
448 else
449 {
450 /* read from the PRIMARY_ALIGNMENT_TABLE the value of the columns "REF_NAME" and "REF_POS" */
451 int64_t a_row_id = prim_al_id_ptr[ mate_idx ];
452 if ( a_row_id == 0 )
453 {
454 rc = KOutMsg( "*\t0\t" );
455 }
456 else
457 {
458 const char * ref_name;
459 uint32_t ref_name_len;
460 rc = read_char_ptr( a_row_id, ptx->cursor, ptx->ref_name_idx, &ref_name, &ref_name_len, "REF_NAME" );
461 if ( rc == 0 )
462 {
463 if ( ref_name_len == 0 )
464 {
465 (void)PLOGERR( klogInt, ( klogInt, rc, "rowlen zero in row $(rn) of PRIM.REF_NAME", "rn=%ld", a_row_id ) );
466 }
467 else
468 {
469 const INSDC_coord_zero *ref_pos;
470 rc = read_INSDC_coord_zero_ptr( a_row_id, ptx->cursor, ptx->ref_pos_idx, &ref_pos, &row_len, "REF_POS" );
471 if ( rc == 0 )
472 {
473 rc = KOutMsg( "%.*s\t%i\t", ref_name_len, ref_name, ref_pos[ 0 ] + 1 );
474 }
475 }
476 }
477 }
478 }
479 }
480 return rc;
481 }
482
483
calc_mate_idx(const uint32_t n_reads,const uint32_t read_idx)484 static uint32_t calc_mate_idx( const uint32_t n_reads, const uint32_t read_idx )
485 {
486 if ( read_idx == ( n_reads - 1 ) )
487 return ( read_idx - 1 );
488 else
489 return ( read_idx + 1 );
490 }
491
492
read_quality(const seq_table_ctx * const stx,const int64_t row_id,const char ** quality,const uint32_t read_len)493 static rc_t read_quality( const seq_table_ctx * const stx,
494 const int64_t row_id,
495 const char **quality,
496 const uint32_t read_len )
497 {
498 uint32_t quality_len;
499 rc_t rc = read_char_ptr( row_id, stx->cursor, stx->quality_idx, quality, &quality_len, "QUALITY" );
500 if ( rc == 0 && read_len != quality_len )
501 rc = complain_size_diff( row_id, "QUALITY" );
502 return rc;
503 }
504
505
read_read_type(const seq_table_ctx * const stx,const int64_t row_id,const INSDC_read_type ** read_type,const uint32_t read_len)506 static rc_t read_read_type( const seq_table_ctx * const stx,
507 const int64_t row_id,
508 const INSDC_read_type **read_type,
509 const uint32_t read_len )
510 {
511 uint32_t read_type_len;
512 rc_t rc = read_INSDC_read_type_ptr( row_id, stx->cursor, stx->read_type_idx, read_type, &read_type_len, "READ_TYPE" );
513 if ( rc == 0 && read_type_len != read_len )
514 rc = complain_size_diff( row_id, "READ_TYPE" );
515 return rc;
516 }
517
518
read_read_filter(const seq_table_ctx * const stx,const int64_t row_id,const INSDC_read_filter ** read_filter,const uint32_t read_len)519 static rc_t read_read_filter( const seq_table_ctx * const stx,
520 const int64_t row_id,
521 const INSDC_read_filter **read_filter,
522 const uint32_t read_len )
523 {
524 uint32_t read_filter_len;
525 rc_t rc = read_INSDC_read_filter_ptr( row_id, stx->cursor, stx->read_filter_idx, read_filter, &read_filter_len, "READ_FILTER" );
526 if ( rc == 0 && read_filter_len != read_len )
527 rc = complain_size_diff( row_id, "READ_FILTER" );
528 return rc;
529 }
530
531
read_read_start(const seq_table_ctx * const stx,const int64_t row_id,const INSDC_coord_zero ** read_start,const uint32_t nreads)532 static rc_t read_read_start( const seq_table_ctx * const stx,
533 const int64_t row_id,
534 const INSDC_coord_zero **read_start,
535 const uint32_t nreads )
536 {
537 uint32_t read_start_len;
538 rc_t rc = read_INSDC_coord_zero_ptr( row_id, stx->cursor, stx->read_start_idx, read_start, &read_start_len, "READ_START" );
539 if ( rc == 0 && nreads != read_start_len )
540 rc = complain_size_diff( row_id, "READ_START" );
541 return rc;
542 }
543
544
read_read_len(const seq_table_ctx * const stx,const int64_t row_id,const INSDC_coord_len ** read_len,const uint32_t nreads)545 static rc_t read_read_len( const seq_table_ctx * const stx,
546 const int64_t row_id,
547 const INSDC_coord_len **read_len,
548 const uint32_t nreads )
549 {
550 uint32_t read_len_len;
551 rc_t rc = read_INSDC_coord_len_ptr( row_id, stx->cursor, stx->read_len_idx, read_len, &read_len_len, "READ_LEN" );
552 if ( rc == 0 && nreads != read_len_len )
553 rc = complain_size_diff( row_id, "READ_LEN" );
554 return rc;
555 }
556
557
calc_reverse_flag(const samdump_opts * const opts,const uint32_t read_idx,const INSDC_read_type * read_type)558 static bool calc_reverse_flag( const samdump_opts * const opts,
559 const uint32_t read_idx,
560 const INSDC_read_type * read_type )
561 {
562 bool res = ( ( read_type[ read_idx ] & READ_TYPE_REVERSE ) == READ_TYPE_REVERSE );
563 if ( res )
564 res = opts -> reverse_unaligned_reads;
565 return res;
566 }
567
568
opt_field_spot_group(const seq_table_ctx * const stx,int64_t row_id)569 static rc_t opt_field_spot_group( const seq_table_ctx * const stx, int64_t row_id )
570 {
571 const char * spot_group = NULL;
572 uint32_t spot_group_len;
573 rc_t rc = read_char_ptr( row_id, stx->cursor, stx->spot_group_idx, &spot_group, &spot_group_len, "SPOT_GROUP" );
574 if ( rc == 0 && spot_group_len > 0 )
575 rc = KOutMsg( "\tRG:Z:%.*s", spot_group_len, spot_group );
576 return rc;
577 }
578
opt_field_lnk_group(const seq_table_ctx * const stx,int64_t row_id)579 static rc_t opt_field_lnk_group( const seq_table_ctx * const stx, int64_t row_id )
580 {
581 const char * lnk_grp;
582 uint32_t lnk_grp_len;
583 rc_t rc = read_char_ptr( row_id, stx->cursor, stx->lnk_group_idx, &lnk_grp, &lnk_grp_len, "LINKAGE_GROUP" );
584 if ( rc == 0 && lnk_grp_len > 0 )
585 rc = KOutMsg( "\tBX:Z:%.*s", lnk_grp_len, lnk_grp );
586 return rc;
587 }
588
dump_seq_row_sam_filtered(const samdump_opts * const opts,const seq_table_ctx * const stx,const prim_table_ctx * const ptx,const matecache * const mc,const input_database * const ids,const int64_t row_id,const uint32_t nreads)589 static rc_t dump_seq_row_sam_filtered( const samdump_opts * const opts,
590 const seq_table_ctx * const stx,
591 const prim_table_ctx * const ptx,
592 const matecache * const mc,
593 const input_database * const ids,
594 const int64_t row_id,
595 const uint32_t nreads )
596 {
597 uint32_t read_idx, rd_len, prim_align_ids_len;
598 const int64_t * prim_align_ids;
599 const char * quality = NULL;
600 const INSDC_dna_text * read = NULL;
601 const INSDC_read_type * read_type = NULL;
602 const INSDC_read_filter * read_filter = NULL;
603 const INSDC_coord_zero * read_start = NULL;
604 const INSDC_coord_len * read_len;
605
606 rc_t rc = read_int64_ptr( row_id, stx->cursor, stx->prim_al_id_idx, &prim_align_ids, &prim_align_ids_len, "PRIM_AL_ID" );
607 if ( rc == 0 && nreads != prim_align_ids_len )
608 rc = complain_size_diff( row_id, "PRIMARY_ALIGNMENT_ID" );
609 if ( rc == 0 )
610 rc = read_read_len( stx, row_id, &read_len, nreads );
611
612 for ( read_idx = 0; ( read_idx < nreads ) && ( rc == 0 ); ++read_idx )
613 {
614 int64_t align_id = prim_align_ids[ read_idx ];
615 if ( align_id == 0 && /* read is NOT aligned! */
616 read_len[ read_idx ] > 0 ) /* and has a length! */
617 {
618 uint32_t mate_idx = calc_mate_idx( nreads, read_idx );
619
620 /* here we have to find out if the read is actually requested:
621 -----------------------------------------------------------
622 read from the SEQ-table the ID of the mate in PRIM-table
623 look if we have the requested ID in the mate-cache, if not here it ends...
624 */
625 align_id = prim_align_ids[ mate_idx ];
626 if ( align_id != 0 )
627 {
628 uint32_t ref_idx;
629 INSDC_coord_zero mate_ref_pos;
630 int64_t seq_spot_id;
631 /* this is the filter: we look up in the mate-cache! */
632 rc = matecache_lookup_unaligned( mc, ids->db_idx, align_id, &mate_ref_pos, &ref_idx, &seq_spot_id );
633 if ( rc == 0 )
634 {
635 const ReferenceObj* refobj;
636 /* now we have to lookup the reference-name based on it's index into the ids->reflist */
637 rc = ReferenceList_Get( ids->reflist, &refobj, ref_idx );
638 if ( rc != 0 )
639 {
640 (void)PLOGERR( klogInt, ( klogInt, rc, "in row $(rn) of SEQUENCE.$(rx)", "rn=%ld,rx=%ld", mate_idx, row_id ) );
641 }
642 else
643 {
644 const char * mate_ref_name;
645 rc = ReferenceObj_Name( refobj, &mate_ref_name );
646 if ( rc != 0 )
647 {
648 (void)PLOGERR( klogInt, ( klogInt, rc, "in row $(rn) of SEQUENCE.$(rx)", "rn=%ld,rx=%ld", mate_idx, row_id ) );
649 }
650 else
651 {
652 bool reverse = false;
653
654 /* SAM-FIELD: QNAME SRA-column: SPOT_ID ( int64 ) */
655 if ( rc == 0 )
656 {
657 bool print_just_seq_spot_id = true;
658
659 if ( opts->print_spot_group_in_name )
660 {
661 const char * spot_group;
662 uint32_t spot_group_len;
663 rc = read_char_ptr( row_id, stx->cursor, stx->spot_group_idx, &spot_group, &spot_group_len, "SPOT_GROUP" );
664 if ( rc == 0 && spot_group_len > 0 )
665 {
666 rc = KOutMsg( "%ld.%.*s\t", seq_spot_id, spot_group_len, spot_group );
667 print_just_seq_spot_id = false;
668 }
669 }
670 if ( print_just_seq_spot_id )
671 {
672 rc = KOutMsg( "%ld\t", seq_spot_id );
673 }
674 }
675
676 if ( rc == 0 && read_type == NULL )
677 rc = read_read_type( stx, row_id, &read_type, nreads );
678
679 if ( rc == 0 )
680 reverse = calc_reverse_flag( opts, read_idx, read_type );
681
682 if ( rc == 0 && read_filter == NULL )
683 rc = read_read_filter( stx, row_id, &read_filter, nreads );
684
685 /* SAM-FIELD: FLAG SRA-column: calculated from READ_TYPE, READ_FILTER etc. */
686 if ( rc == 0 )
687 {
688 uint32_t sam_flags = calculate_unaligned_sam_flags_db( nreads, read_idx, mate_idx,
689 align_id, read_type, reverse, read_filter );
690 rc = KOutMsg( "%u\t", sam_flags );
691 }
692
693 /* SAM-FIELD: RNAME SRA-column: none, fix '*' */
694 /* SAM-FIELD: POS SRA-column: none, fix '0' */
695 /* SAM-FIELD: MAPQ SRA-column: none, fix '0' */
696 /* SAM-FIELD: CIGAR SRA-column: none, fix '*' */
697 if ( rc == 0 )
698 rc = KOutMsg( "*\t0\t0\t*\t" );
699
700 /* SAM-FIELD: RNEXT SRA-column: found in cache */
701 /* SAM-FIELD: POS SRA-column: found in cache */
702 if ( rc == 0 )
703 rc = KOutMsg( "%s\t%li\t", mate_ref_name, mate_ref_pos + 1 );
704
705 /* SAM-FIELD: TLEN SRA-column: none, fix '0' */
706 if ( rc == 0 )
707 rc = KOutMsg( "0\t" );
708
709 if ( rc == 0 && read == NULL )
710 rc = read_INSDC_dna_text_ptr( row_id, stx->cursor, stx->read_idx, &read, &rd_len, "READ" );
711 if ( rc == 0 && quality == NULL )
712 rc = read_quality( stx, row_id, &quality, rd_len );
713 if ( rc == 0 && read_start == NULL )
714 rc = read_read_start( stx, row_id, &read_start, nreads );
715
716 /* SAM-FIELD: SEQ SRA-column: READ, sliced by READ_START/READ_LEN */
717 if ( rc == 0 )
718 rc = print_sliced_read( read, read_idx, reverse, read_start, read_len );
719 if ( rc == 0 )
720 rc = KOutMsg( "\t" );
721
722 /* SAM-FIELD: QUAL SRA-column: QUALITY, sliced by READ_START/READ_LEN */
723 if ( rc == 0 )
724 rc = print_sliced_quality( opts, quality, read_idx, reverse, read_start, read_len );
725
726 /* OPT SAM-FIELD: SRA-column: ALIGN_ID */
727 if ( rc == 0 && opts->print_alignment_id_in_column_xi )
728 rc = KOutMsg( "\tXI:i:%u", row_id );
729
730 /* OPT SAM-FIELD: SRA-column: SPOT_GROUP */
731 if ( rc == 0 && stx->spot_group_idx != COL_NOT_AVAILABLE )
732 rc = opt_field_spot_group( stx, row_id );
733
734 /* OPT SAM-FIELD: SRA-column: LINKAGE_GROUP */
735 if ( rc == 0 && stx->lnk_group_idx != COL_NOT_AVAILABLE )
736 rc = opt_field_lnk_group( stx, row_id );
737
738 if ( rc == 0 )
739 rc = KOutMsg( "\n" );
740 }
741 }
742 }
743 else
744 {
745 rc = 0; /* it is OK if an alignment is not found in the cache, do not terminate! */
746 }
747 }
748 }
749 }
750 return rc;
751 }
752
753
dump_seq_prim_row_sam(const samdump_opts * const opts,const seq_table_ctx * const stx,const prim_table_ctx * const ptx,const matecache * const mc,const input_database * const ids,const int64_t row_id,const uint32_t nreads)754 static rc_t dump_seq_prim_row_sam( const samdump_opts * const opts,
755 const seq_table_ctx * const stx,
756 const prim_table_ctx * const ptx,
757 const matecache * const mc,
758 const input_database * const ids,
759 const int64_t row_id,
760 const uint32_t nreads )
761 {
762 uint32_t read_idx, rd_len, prim_align_ids_len;
763 const int64_t * prim_align_ids;
764 const char * quality = NULL;
765 const INSDC_dna_text * read = NULL;
766 const INSDC_read_type * read_type = NULL;
767 const INSDC_read_filter * read_filter = NULL;
768 const INSDC_coord_zero * read_start = NULL;
769 const INSDC_coord_len * read_len;
770
771 rc_t rc = read_int64_ptr( row_id, stx->cursor, stx->prim_al_id_idx, &prim_align_ids, &prim_align_ids_len, "PRIM_AL_ID" );
772 if ( rc == 0 && nreads != prim_align_ids_len )
773 rc = complain_size_diff( row_id, "PRIMARY_ALIGNMENT_ID" );
774 if ( rc == 0 )
775 rc = read_read_len( stx, row_id, &read_len, nreads );
776
777 for ( read_idx = 0; ( read_idx < nreads ) && ( rc == 0 ); ++read_idx )
778 {
779 if ( prim_align_ids[ read_idx ] == 0 && /* read is NOT aligned! */
780 read_len[ read_idx ] > 0 ) /* and has a length! */
781 {
782 bool reverse = false, mate_available = false;
783 uint32_t mate_idx = 0;
784 int64_t mate_id = 0;
785
786 if ( nreads > 1 )
787 {
788 if ( read_idx == ( nreads - 1 ) )
789 mate_idx = read_idx - 1;
790 else
791 mate_idx = read_idx + 1;
792 mate_available = true;
793 }
794
795 if ( mate_available && mate_idx < prim_align_ids_len )
796 mate_id = prim_align_ids[ mate_idx ];
797
798 if ( rc == 0 && read_type == NULL )
799 rc = read_read_type( stx, row_id, &read_type, nreads );
800 if ( rc == 0 )
801 reverse = calc_reverse_flag( opts, read_idx, read_type );
802
803 if ( rc == 0 && read_filter == NULL )
804 rc = read_read_filter( stx, row_id, &read_filter, nreads );
805
806 /* SAM-FIELD: QNAME SRA-column: SPOT_ID ( int64 ) */
807 if ( rc == 0 )
808 {
809 bool print_just_seq_spot_id = true;
810
811 if ( opts->print_spot_group_in_name )
812 {
813 const char * spot_group;
814 uint32_t spot_group_len;
815 rc = read_char_ptr( row_id, stx->cursor, stx->spot_group_idx, &spot_group, &spot_group_len, "SPOT_GROUP" );
816 if ( rc == 0 && spot_group_len > 0 )
817 {
818 rc = KOutMsg( "%ld.%.*s\t", row_id, spot_group_len, spot_group );
819 print_just_seq_spot_id = false;
820 }
821 }
822 if ( print_just_seq_spot_id )
823 {
824 rc = KOutMsg( "%ld\t", row_id );
825 }
826 }
827
828 /* SAM-FIELD: FLAG SRA-column: calculated from READ_TYPE, READ_FILTER etc. */
829 if ( rc == 0 )
830 {
831 uint32_t sam_flags;
832 if ( stx->prim_al_id_idx != INVALID_COLUMN )
833 {
834 uint32_t temp_nreads = nreads;
835 if ( mate_id == 0 && read_len[ mate_idx ] == 0 ) temp_nreads--;
836 sam_flags = calculate_unaligned_sam_flags_db( temp_nreads, read_idx, mate_idx,
837 mate_id, read_type, reverse, read_filter );
838 }
839 else
840 {
841 if ( reverse )
842 sam_flags = ( 0x04 | 0x10 );
843 else
844 sam_flags = 0x04;
845 }
846 rc = KOutMsg( "%u\t", sam_flags );
847 }
848
849 /* SAM-FIELD: RNAME SRA-column: none, fix '*' */
850 /* SAM-FIELD: POS SRA-column: none, fix '0' */
851 /* SAM-FIELD: MAPQ SRA-column: none, fix '0' */
852 /* SAM-FIELD: CIGAR SRA-column: none, fix '*' */
853 if ( rc == 0 )
854 rc = KOutMsg( "*\t0\t0\t*\t" );
855
856 /* SAM-FIELD: RNEXT SRA-column: look up in cache, or none */
857 /* SAM-FIELD: POS SRA-column: look up in cache, or none */
858 if ( rc == 0 )
859 {
860 if ( ptx == NULL || !mate_available )
861 {
862 rc = KOutMsg( "0\t0\t" ); /* no way to get that without PRIM_ALIGN-table */
863 }
864 else
865 {
866 if ( opts->use_mate_cache && mc != NULL && ids != NULL )
867 {
868 const char * mate_ref_name;
869 uint32_t mate_ref_name_len;
870 INSDC_coord_zero mate_ref_pos;
871
872 rc = get_mate_info( ptx, mc, ids, row_id, mate_id, nreads, &mate_ref_name, &mate_ref_name_len, &mate_ref_pos );
873 if ( rc == 0 )
874 rc = KOutMsg( "%.*s\t%li\t", mate_ref_name_len, mate_ref_name, mate_ref_pos );
875 }
876 else
877 {
878 /* print the mate info */
879 rc = dump_the_other_read( stx, ptx, row_id, mate_idx );
880 }
881 }
882 }
883
884
885 /* SAM-FIELD: TLEN SRA-column: none, fix '0' */
886 if ( rc == 0 )
887 rc = KOutMsg( "0\t" );
888
889 if ( rc == 0 && read == NULL )
890 rc = read_INSDC_dna_text_ptr( row_id, stx->cursor, stx->read_idx, &read, &rd_len, "READ" );
891 if ( rc == 0 && read_start == NULL )
892 rc = read_read_start( stx, row_id, &read_start, nreads );
893
894 /* SAM-FIELD: SEQ SRA-column: READ, sliced by READ_START/READ_LEN */
895 if ( rc == 0 )
896 rc = print_sliced_read( read, read_idx, reverse, read_start, read_len );
897 if ( rc == 0 )
898 rc = KOutMsg( "\t" );
899
900 if ( rc == 0 && quality == NULL )
901 rc = read_quality( stx, row_id, &quality, rd_len );
902
903 /* SAM-FIELD: QUAL SRA-column: QUALITY, sliced by READ_START/READ_LEN */
904 if ( rc == 0 )
905 rc = print_sliced_quality( opts, quality, read_idx, reverse, read_start, read_len );
906
907 /* OPT SAM-FIIELD: SRA-column: ALIGN_ID */
908 if ( rc == 0 && opts->print_alignment_id_in_column_xi )
909 rc = KOutMsg( "\tXI:i:%u", row_id );
910
911 /* OPT SAM-FIIELD: SRA-column: SPOT_GROUP */
912 if ( rc == 0 && stx->spot_group_idx != COL_NOT_AVAILABLE )
913 rc = opt_field_spot_group( stx, row_id );
914
915 /* OPT SAM-FIELD: SRA-column: LINKAGE_GROUP */
916 if ( rc == 0 && stx->lnk_group_idx != COL_NOT_AVAILABLE )
917 rc = opt_field_lnk_group( stx, row_id );
918
919 if ( rc == 0 )
920 rc = KOutMsg( "\n" );
921 }
922 }
923 return rc;
924 }
925
926
927 /* called if we are dumping from a legacy table instead from a database */
dump_seq_row_sam(const samdump_opts * const opts,const seq_table_ctx * const stx,const int64_t row_id,const uint32_t nreads)928 static rc_t dump_seq_row_sam( const samdump_opts * const opts,
929 const seq_table_ctx * const stx,
930 const int64_t row_id,
931 const uint32_t nreads )
932 {
933 uint32_t read_idx, rd_len, name_len;
934 const char * quality = NULL;
935 const char * name = NULL;
936 const INSDC_dna_text * read = NULL;
937 const INSDC_read_type * read_type;
938 const INSDC_read_filter * read_filter = NULL;
939 const INSDC_coord_zero * read_start = NULL;
940 const INSDC_coord_len * read_len;
941
942 rc_t rc = read_read_len( stx, row_id, &read_len, nreads );
943 if ( rc == 0 )
944 {
945 if ( stx->name_idx != COL_NOT_AVAILABLE )
946 rc = read_char_ptr( row_id, stx->cursor, stx->name_idx, &name, &name_len, "NAME" );
947 else
948 name_len = 0;
949 }
950 if ( rc == 0 )
951 rc = read_read_type( stx, row_id, &read_type, nreads );
952
953 for ( read_idx = 0; ( read_idx < nreads ) && ( rc == 0 ); ++read_idx )
954 {
955 if ( ( read_len[ read_idx ] > 0 ) && /* has a length! */
956 ( ( read_type[ read_idx ] & READ_TYPE_BIOLOGICAL ) == READ_TYPE_BIOLOGICAL ) )
957 {
958 bool reverse = false;
959 uint32_t mate_idx = 0;
960
961 if ( nreads > 1 )
962 {
963 if ( read_idx == ( nreads - 1 ) )
964 mate_idx = read_idx - 1;
965 else
966 mate_idx = read_idx + 1;
967 }
968
969 if ( rc == 0 ) /* types in interfaces/insdc/insdc.h */
970 reverse = calc_reverse_flag( opts, read_idx, read_type );
971
972 if ( rc == 0 && read_filter == NULL )
973 rc = read_read_filter( stx, row_id, &read_filter, nreads );
974
975 /* SAM-FIELD: QNAME SRA-column: SPOT_ID ( int64 ) */
976 if ( rc == 0 )
977 {
978 bool print_just_seq_spot_id = true;
979
980 if ( opts->print_spot_group_in_name )
981 {
982 const char * spot_group;
983 uint32_t spot_group_len;
984 rc = read_char_ptr( row_id, stx->cursor, stx->spot_group_idx, &spot_group, &spot_group_len, "SPOT_GROUP" );
985 if ( rc == 0 && spot_group_len > 0 )
986 {
987 if ( name != NULL && name_len > 0 )
988 rc = KOutMsg( "%.*s.%.*s\t", name_len, name, spot_group_len, spot_group );
989 else
990 rc = KOutMsg( "%ld.%.*s\t", row_id, spot_group_len, spot_group );
991 print_just_seq_spot_id = false;
992 }
993 }
994
995 if ( print_just_seq_spot_id )
996 {
997 if ( name != NULL && name_len > 0 )
998 rc = KOutMsg( "%.*s\t", name_len, name );
999 else
1000 rc = KOutMsg( "%lu\t", row_id );
1001 }
1002 }
1003
1004 /* SAM-FIELD: FLAG SRA-column: calculated from READ_TYPE, READ_FILTER etc. */
1005 if ( rc == 0 )
1006 {
1007 uint32_t sam_flags = calculate_unaligned_sam_flags_db( nreads, read_idx, mate_idx,
1008 0, read_type, reverse, read_filter );
1009 rc = KOutMsg( "%u\t", sam_flags );
1010 }
1011
1012 /* SAM-FIELD: RNAME SRA-column: none, fix '*' */
1013 /* SAM-FIELD: POS SRA-column: none, fix '0' */
1014 /* SAM-FIELD: MAPQ SRA-column: none, fix '0' */
1015 /* SAM-FIELD: CIGAR SRA-column: none, fix '*' */
1016 /* SAM-FIELD: RNEXT SRA-column: none, fix '*' */
1017 /* SAM-FIELD: POS SRA-column: none, fix '0' */
1018 /* SAM-FIELD: TLEN SRA-column: none, fix '0' */
1019
1020 if ( rc == 0 )
1021 rc = KOutMsg( "*\t0\t0\t*\t*\t0\t0\t" );
1022
1023 if ( rc == 0 && read == NULL )
1024 rc = read_INSDC_dna_text_ptr( row_id, stx->cursor, stx->read_idx, &read, &rd_len, "READ" );
1025 if ( rc == 0 && read_start == NULL )
1026 rc = read_read_start( stx, row_id, &read_start, nreads );
1027
1028 /* SAM-FIELD: SEQ SRA-column: READ, sliced by READ_START/READ_LEN */
1029 if ( rc == 0 )
1030 rc = print_sliced_read( read, read_idx, reverse, read_start, read_len );
1031 if ( rc == 0 )
1032 rc = KOutMsg( "\t" );
1033
1034 if ( rc == 0 && quality == NULL )
1035 rc = read_quality( stx, row_id, &quality, rd_len );
1036
1037 /* SAM-FIELD: QUAL SRA-column: QUALITY, sliced by READ_START/READ_LEN */
1038 if ( rc == 0 )
1039 rc = print_sliced_quality( opts, quality, read_idx, reverse, read_start, read_len );
1040
1041 /* OPT SAM-FIIELD: SRA-column: ALIGN_ID */
1042 if ( rc == 0 && opts->print_alignment_id_in_column_xi )
1043 rc = KOutMsg( "\tXI:i:%u", row_id );
1044
1045 /* OPT SAM-FIIELD: SRA-column: SPOT_GROUP */
1046 if ( rc == 0 && stx->spot_group_idx != COL_NOT_AVAILABLE )
1047 rc = opt_field_spot_group( stx, row_id );
1048
1049 /* OPT SAM-FIELD: SRA-column: LINKAGE_GROUP */
1050 if ( rc == 0 && stx->lnk_group_idx != COL_NOT_AVAILABLE )
1051 rc = opt_field_lnk_group( stx, row_id );
1052
1053 if ( rc == 0 )
1054 rc = KOutMsg( "\n" );
1055 }
1056 }
1057 return rc;
1058 }
1059
1060
dump_seq_row_fastx_filtered(const samdump_opts * const opts,const seq_table_ctx * const stx,const prim_table_ctx * const ptx,const matecache * const mc,const input_database * const ids,const int64_t row_id,const uint32_t nreads)1061 static rc_t dump_seq_row_fastx_filtered( const samdump_opts * const opts,
1062 const seq_table_ctx * const stx,
1063 const prim_table_ctx * const ptx,
1064 const matecache * const mc,
1065 const input_database * const ids,
1066 const int64_t row_id,
1067 const uint32_t nreads )
1068 {
1069 uint32_t read_idx, rd_len, prim_align_ids_len, spot_group_len = 0;
1070 const int64_t * prim_align_ids;
1071 const char * quality = NULL;
1072 const INSDC_dna_text * read = NULL;
1073 const char * spot_group = NULL;
1074 const INSDC_read_type * read_type = NULL;
1075 const INSDC_coord_zero * read_start = NULL;
1076 const INSDC_coord_len * read_len;
1077
1078 rc_t rc = read_int64_ptr( row_id, stx->cursor, stx->prim_al_id_idx, &prim_align_ids, &prim_align_ids_len, "PRIM_AL_ID" );
1079 if ( rc == 0 && nreads != prim_align_ids_len )
1080 rc = complain_size_diff( row_id, "PRIMARY_ALIGNMENT_ID" );
1081 if ( rc == 0 )
1082 rc = read_read_len( stx, row_id, &read_len, nreads );
1083
1084 for ( read_idx = 0; ( read_idx < nreads ) && ( rc == 0 ); ++read_idx )
1085 {
1086 if ( prim_align_ids[ read_idx ] == 0 && /* read is NOT aligned! */
1087 read_len[ read_idx ] > 0 ) /* and has a length! */
1088 {
1089 uint32_t mate_idx = calc_mate_idx( nreads, read_idx );
1090 int64_t align_id = prim_align_ids[ mate_idx ];
1091 if ( align_id != 0 )
1092 {
1093 uint32_t ref_idx;
1094 INSDC_coord_zero mate_ref_pos;
1095 int64_t seq_spot_id;
1096 rc = matecache_lookup_unaligned( mc, ids->db_idx, align_id, &mate_ref_pos, &ref_idx, &seq_spot_id );
1097 if ( rc == 0 )
1098 {
1099 /* bool reverse; */
1100
1101 /* the NAME */
1102 if ( opts->output_format == of_fastq )
1103 rc = KOutMsg( "@" );
1104 else
1105 rc = KOutMsg( ">" );
1106
1107 if ( rc == 0 )
1108 {
1109 if ( opts->print_spot_group_in_name && spot_group == NULL )
1110 rc = read_char_ptr( row_id, stx->cursor, stx->spot_group_idx, &spot_group, &spot_group_len, "SPOT_GROUP" );
1111 if ( rc == 0 )
1112 rc = dump_name( opts, seq_spot_id, spot_group, spot_group_len ); /* sam-dump-opts.c */
1113 if ( rc == 0 )
1114 rc = KOutMsg( "/%u unaligned\n", read_idx + 1 );
1115 }
1116
1117 if ( rc == 0 && read == NULL )
1118 rc = read_INSDC_dna_text_ptr( row_id, stx->cursor, stx->read_idx, &read, &rd_len, "READ" );
1119 if ( rc == 0 && quality == NULL )
1120 rc = read_quality( stx, row_id, &quality, rd_len );
1121 if ( rc == 0 && read_type == NULL )
1122 rc = read_read_type( stx, row_id, &read_type, nreads );
1123 /*
1124 if ( rc == 0 )
1125 reverse = ( ( read_type[ read_idx ] & READ_TYPE_REVERSE ) == READ_TYPE_REVERSE );
1126 */
1127 if ( rc == 0 && read_start == NULL )
1128 rc = read_read_start( stx, row_id, &read_start, nreads );
1129
1130 /* the READ */
1131 if ( rc == 0 )
1132 rc = print_sliced_read( read, read_idx, /* reverse */ false, read_start, read_len );
1133 if ( rc == 0 )
1134 rc = KOutMsg( "\n" );
1135
1136 /* in case of fastq : the QUALITY-line */
1137 if ( rc == 0 && opts->output_format == of_fastq )
1138 {
1139 rc = KOutMsg( "+\n" );
1140 if ( rc == 0 )
1141 rc = print_sliced_quality( opts, quality, read_idx, /* reverse */ false, read_start, read_len );
1142 if ( rc == 0 )
1143 rc = KOutMsg( "\n" );
1144 }
1145 }
1146 else
1147 {
1148 rc = 0;
1149 }
1150 }
1151 }
1152 }
1153 return rc;
1154 }
1155
1156
dump_seq_row_fastx(const samdump_opts * const opts,const seq_table_ctx * const stx,const int64_t row_id,const uint32_t nreads)1157 static rc_t dump_seq_row_fastx( const samdump_opts * const opts,
1158 const seq_table_ctx * const stx,
1159 const int64_t row_id,
1160 const uint32_t nreads )
1161 {
1162 uint32_t read_idx, rd_len, prim_align_ids_len, spot_group_len = 0;
1163 const int64_t * prim_align_ids;
1164 const char * quality = NULL;
1165 const INSDC_dna_text * read = NULL ;
1166 const char * spot_group = NULL;
1167 const INSDC_read_type * read_type = NULL;
1168 const INSDC_coord_zero * read_start = NULL;
1169 const INSDC_coord_len * read_len;
1170
1171 rc_t rc = read_int64_ptr( row_id, stx->cursor, stx->prim_al_id_idx, &prim_align_ids, &prim_align_ids_len, "PRIM_AL_IDS" );
1172 if ( rc == 0 && nreads != prim_align_ids_len )
1173 rc = complain_size_diff( row_id, "PRIMARY_ALIGNMENT_ID" );
1174 if ( rc == 0 )
1175 rc = read_read_len( stx, row_id, &read_len, nreads );
1176
1177 for ( read_idx = 0; ( read_idx < nreads ) && ( rc == 0 ); ++read_idx )
1178 {
1179 if ( prim_align_ids[ read_idx ] == 0 && /* read is NOT aligned! */
1180 read_len[ read_idx ] > 0 ) /* and has a length! */
1181 {
1182 /* bool reverse; */
1183
1184 /* the NAME */
1185 if ( opts->output_format == of_fastq )
1186 rc = KOutMsg( "@" );
1187 else
1188 rc = KOutMsg( ">" );
1189 if ( rc == 0 )
1190 {
1191 if ( opts->print_spot_group_in_name && spot_group == NULL )
1192 rc = read_char_ptr( row_id, stx->cursor, stx->spot_group_idx, &spot_group, &spot_group_len, "SPOT_GROUP" );
1193 if ( rc == 0 )
1194 rc = dump_name( opts, row_id, spot_group, spot_group_len ); /* sam-dump-opts.c */
1195 if ( rc == 0 )
1196 rc = KOutMsg( "/%u unaligned\n", read_idx + 1 );
1197 }
1198
1199 if ( rc == 0 && read == NULL )
1200 rc = read_INSDC_dna_text_ptr( row_id, stx->cursor, stx->read_idx, &read, &rd_len, "READ" );
1201 if ( rc == 0 && quality == NULL )
1202 rc = read_quality( stx, row_id, &quality, rd_len );
1203 if ( rc == 0 && read_type == NULL )
1204 rc = read_read_type( stx, row_id, &read_type, nreads );
1205 /*
1206 if ( rc == 0 )
1207 reverse = ( ( read_type[ read_idx ] & READ_TYPE_REVERSE ) == READ_TYPE_REVERSE );
1208 */
1209 if ( rc == 0 && read_start == NULL )
1210 rc = read_read_start( stx, row_id, &read_start, nreads );
1211
1212 /* the READ */
1213 if ( rc == 0 )
1214 rc = print_sliced_read( read, read_idx, /*reverse*/ false, read_start, read_len );
1215 if ( rc == 0 )
1216 rc = KOutMsg( "\n" );
1217
1218 /* in case of fastq : the QUALITY-line */
1219 if ( rc == 0 && opts->output_format == of_fastq )
1220 {
1221 rc = KOutMsg( "+\n" );
1222 if ( rc == 0 )
1223 rc = print_sliced_quality( opts, quality, read_idx, /*reverse*/ false, read_start, read_len );
1224 if ( rc == 0 )
1225 rc = KOutMsg( "\n" );
1226 }
1227 }
1228 }
1229 return rc;
1230 }
1231
1232
dump_seq_tab_row_fastx(const samdump_opts * const opts,const seq_table_ctx * const stx,const int64_t row_id,const uint32_t nreads)1233 static rc_t dump_seq_tab_row_fastx( const samdump_opts * const opts,
1234 const seq_table_ctx * const stx,
1235 const int64_t row_id,
1236 const uint32_t nreads )
1237 {
1238 uint32_t read_idx, rd_len, name_len, spot_group_len = 0;
1239 const char * quality = NULL;
1240 const char * name;
1241 const INSDC_dna_text * read = NULL ;
1242 const char * spot_group = NULL;
1243 const INSDC_read_type * read_type;
1244 const INSDC_coord_zero * read_start = NULL;
1245 const INSDC_coord_len * read_len;
1246
1247 rc_t rc = read_read_len( stx, row_id, &read_len, nreads );
1248 if ( rc == 0 )
1249 rc = read_char_ptr( row_id, stx->cursor, stx->name_idx, &name, &name_len, "NAME" );
1250 if ( rc == 0 )
1251 rc = read_read_type( stx, row_id, &read_type, nreads );
1252
1253 for ( read_idx = 0; ( read_idx < nreads ) && ( rc == 0 ); ++read_idx )
1254 {
1255 if ( ( read_len[ read_idx ] > 0 ) && /* has a length! */
1256 ( ( read_type[ read_idx ] & READ_TYPE_BIOLOGICAL ) == READ_TYPE_BIOLOGICAL ) )
1257 {
1258 /* bool reverse; */
1259
1260 /* the NAME */
1261 if ( opts->output_format == of_fastq )
1262 rc = KOutMsg( "@" );
1263 else
1264 rc = KOutMsg( ">" );
1265 if ( rc == 0 )
1266 {
1267 if ( opts->print_spot_group_in_name && spot_group == NULL )
1268 rc = read_char_ptr( row_id, stx->cursor, stx->spot_group_idx, &spot_group, &spot_group_len, "SPOT_GROUP" );
1269
1270 if ( rc == 0 )
1271 rc = dump_name_legacy( opts, name, name_len, spot_group, spot_group_len ); /* sam-dump-opts.c */
1272
1273 if ( rc == 0 )
1274 rc = KOutMsg( "/%u unaligned\n", read_idx + 1 );
1275 }
1276
1277 if ( rc == 0 && read == NULL )
1278 rc = read_INSDC_dna_text_ptr( row_id, stx->cursor, stx->read_idx, &read, &rd_len, "READ" );
1279 if ( rc == 0 && quality == NULL )
1280 rc = read_quality( stx, row_id, &quality, rd_len );
1281 /*
1282 if ( rc == 0 )
1283 reverse = ( ( read_type[ read_idx ] & READ_TYPE_REVERSE ) == READ_TYPE_REVERSE );
1284 */
1285 if ( rc == 0 && read_start == NULL )
1286 rc = read_read_start( stx, row_id, &read_start, nreads );
1287
1288 /* the READ */
1289 if ( rc == 0 )
1290 rc = print_sliced_read( read, read_idx, /* reverse */ false, read_start, read_len );
1291 if ( rc == 0 )
1292 rc = KOutMsg( "\n" );
1293
1294 /* in case of fastq : the QUALITY-line */
1295 if ( rc == 0 && opts->output_format == of_fastq )
1296 {
1297 if ( quality == NULL )
1298 rc = read_quality( stx, row_id, &quality, rd_len );
1299 if ( rc == 0 )
1300 rc = KOutMsg( "+\n" );
1301 if ( rc == 0 )
1302 rc = print_sliced_quality( opts, quality, read_idx, /* reverse */ false, read_start, read_len );
1303 if ( rc == 0 )
1304 rc = KOutMsg( "\n" );
1305 }
1306 }
1307 }
1308 return rc;
1309 }
1310
1311
1312 typedef struct unaligned_callback_ctx
1313 {
1314 const samdump_opts * opts;
1315 const input_database * ids;
1316 const matecache * mc;
1317 seq_table_ctx * stx;
1318 prim_table_ctx * ptx;
1319 } unaligned_callback_ctx;
1320
1321
on_unaligned_seq_id(int64_t seq_id,int64_t al_id,void * user_data)1322 static rc_t CC on_unaligned_seq_id( int64_t seq_id, int64_t al_id, void * user_data )
1323 {
1324 rc_t rc = Quitting();
1325 if ( rc == 0 )
1326 {
1327 seq_row row;
1328 unaligned_callback_ctx * u_ctx = user_data;
1329 rc = read_seq_row( u_ctx->opts, u_ctx->stx, seq_id, &row );
1330 if ( rc == 0 && !row.filtered_out )
1331 {
1332 switch( u_ctx->opts->output_format )
1333 {
1334 case of_sam : rc = dump_seq_row_sam_filtered( u_ctx->opts, u_ctx->stx, u_ctx->ptx, u_ctx->mc, u_ctx->ids, seq_id, row.nreads ); break;
1335 case of_fasta : /* fall through intended ! */
1336 case of_fastq : rc = dump_seq_row_fastx_filtered( u_ctx->opts, u_ctx->stx, u_ctx->ptx, u_ctx->mc, u_ctx->ids, seq_id, row.nreads ); break;
1337 }
1338 }
1339 }
1340 return rc;
1341 }
1342
1343
print_unaligned_database_filtered_2(const samdump_opts * const opts,const input_table * const seq,const input_table * const prim,const matecache * const mc,const input_database * const ids)1344 static rc_t print_unaligned_database_filtered_2( const samdump_opts * const opts,
1345 const input_table * const seq,
1346 const input_table * const prim,
1347 const matecache * const mc,
1348 const input_database * const ids )
1349 {
1350 seq_table_ctx stx;
1351 rc_t rc = prepare_seq_table_ctx( opts, seq, &stx );
1352 if ( rc == 0 )
1353 {
1354 rc = VCursorOpen( stx.cursor );
1355 if ( rc != 0 )
1356 {
1357 (void)PLOGERR( klogInt, ( klogInt, rc, "VCursorOpen( SEQUENCE ) for $(tn) failed", "tn=%s", seq->path ) );
1358 }
1359 else
1360 {
1361 prim_table_ctx ptx;
1362 rc = prepare_prim_table_ctx( opts, prim, &ptx );
1363 if ( rc == 0 )
1364 {
1365 unaligned_callback_ctx u_ctx;
1366 u_ctx.opts = opts;
1367 u_ctx.ids = ids;
1368 u_ctx.mc = mc;
1369 u_ctx.stx = &stx;
1370 u_ctx.ptx = &ptx;
1371 rc = foreach_unaligned_entry( mc, ids->db_idx, on_unaligned_seq_id, &u_ctx );
1372 VCursorRelease( ptx.cursor );
1373 }
1374 }
1375 }
1376 return rc;
1377 }
1378
1379
1380 /* we are printing from a sra-database, we print half aligned reads only if we find them in the mate-cache */
print_unaligned_database_filtered(const samdump_opts * const opts,const input_table * const seq,const input_table * const prim,const matecache * const mc,const input_database * const ids)1381 static rc_t print_unaligned_database_filtered( const samdump_opts * const opts,
1382 const input_table * const seq,
1383 const input_table * const prim,
1384 const matecache * const mc,
1385 const input_database * const ids )
1386 {
1387 return print_unaligned_database_filtered_2( opts, seq, prim, mc, ids );
1388 #if 0
1389 seq_table_ctx stx;
1390 rc_t rc = prepare_seq_table_ctx( opts, seq, &stx );
1391 if ( rc == 0 )
1392 {
1393 rc = VCursorOpen( stx.cursor );
1394 if ( rc != 0 )
1395 {
1396 (void)PLOGERR( klogInt, ( klogInt, rc, "VCursorOpen( SEQUENCE ) for $(tn) failed", "tn=%s", seq->path ) );
1397 }
1398 else
1399 {
1400 int64_t first_row;
1401 uint64_t row_count;
1402 rc = VCursorIdRange( stx.cursor, stx.prim_al_id_idx, &first_row, &row_count );
1403 if ( rc != 0 )
1404 {
1405 (void)PLOGERR( klogInt, ( klogInt, rc, "VCursorIdRange( SEQUENCE ) for $(tn) failed", "tn=%s", seq->path ) );
1406 }
1407 else
1408 {
1409 prim_table_ctx ptx;
1410 rc = prepare_prim_table_ctx( opts, prim, &ptx );
1411 if ( rc == 0 )
1412 {
1413 int64_t row_id;
1414 seq_row row;
1415
1416 for ( row_id = first_row; ( ( row_id - first_row ) < row_count ) && rc == 0; ++row_id )
1417 {
1418 rc = Quitting();
1419 if ( rc == 0 )
1420 {
1421 rc = read_seq_row( opts, &stx, row_id, &row );
1422 if ( rc == 0 && !row.filtered_out )
1423 {
1424 switch( opts->output_format )
1425 {
1426 case of_sam : rc = dump_seq_row_sam_filtered( opts, &stx, &ptx, mc, ids, row_id, row.nreads ); break;
1427 case of_fasta : /* fall through intended ! */
1428 case of_fastq : rc = dump_seq_row_fastx_filtered( opts, &stx, &ptx, mc, ids, row_id, row.nreads ); break;
1429 }
1430 }
1431 }
1432 }
1433 VCursorRelease( ptx.cursor );
1434 }
1435 }
1436 }
1437 VCursorRelease( stx.cursor );
1438 }
1439 return rc;
1440 #endif
1441 }
1442
1443
1444 /* we are printing from a sra-database, we print all unaligned read we can find */
print_unaligned_database_full(const samdump_opts * const opts,const input_table * const seq,const input_table * const prim,const matecache * const mc,const input_database * const ids)1445 static rc_t print_unaligned_database_full( const samdump_opts * const opts,
1446 const input_table * const seq,
1447 const input_table * const prim,
1448 const matecache * const mc,
1449 const input_database * const ids )
1450 {
1451 seq_table_ctx stx;
1452 rc_t rc = prepare_seq_table_ctx( opts, seq, &stx );
1453 if ( rc == 0 )
1454 {
1455 rc = VCursorOpen( stx.cursor );
1456 if ( rc != 0 )
1457 {
1458 (void)PLOGERR( klogInt, ( klogInt, rc, "VCursorOpen( SEQUENCE ) for $(tn) failed", "tn=%s", seq->path ) );
1459 }
1460 else
1461 {
1462 prim_table_ctx ptx;
1463 if ( opts->output_format == of_sam )
1464 rc = prepare_prim_table_ctx( opts, prim, &ptx );
1465 if ( rc == 0 )
1466 {
1467 int64_t first_row, row_id;
1468 uint64_t row_count;
1469 rc = VCursorIdRange( stx.cursor, stx.read_type_idx, &first_row, &row_count );
1470 if ( rc != 0 )
1471 {
1472 (void)PLOGERR( klogInt, ( klogInt, rc, "VCursorIdRange( SEQUENCE ) for $(tn) failed", "tn=%s", seq->path ) );
1473 }
1474 else
1475 {
1476 seq_row row;
1477 for ( row_id = first_row; ( ( row_id - first_row ) < row_count ) && rc == 0; ++row_id )
1478 {
1479 rc = Quitting();
1480 if ( rc == 0 )
1481 {
1482 rc = read_seq_row( opts, &stx, row_id, &row );
1483 if ( rc == 0 && !row.filtered_out )
1484 {
1485 switch( opts->output_format )
1486 {
1487 case of_sam : rc = dump_seq_prim_row_sam( opts, &stx, &ptx, mc, ids, row_id, row.nreads ); break;
1488 case of_fasta : /* fall through intended ! */
1489 case of_fastq : rc = dump_seq_row_fastx( opts, &stx, row_id, row.nreads ); break;
1490 }
1491 }
1492 }
1493 }
1494 }
1495 if ( opts->output_format == of_sam )
1496 VCursorRelease( ptx.cursor );
1497 }
1498 }
1499 VCursorRelease( stx.cursor );
1500 }
1501 return rc;
1502 }
1503
1504
1505 /* we are printing from a (legacy) table not from a database! */
print_unaligned_table(const samdump_opts * const opts,const input_table * const seq)1506 static rc_t print_unaligned_table( const samdump_opts * const opts,
1507 const input_table * const seq )
1508 {
1509 seq_table_ctx stx;
1510 rc_t rc = prepare_seq_table_ctx( opts, seq, &stx );
1511 if ( rc == 0 )
1512 {
1513 rc = VCursorOpen( stx.cursor );
1514 if ( rc != 0 )
1515 {
1516 (void)PLOGERR( klogInt, ( klogInt, rc, "VCursorOpen( SEQUENCE ) for $(tn) failed", "tn=%s", seq->path ) );
1517 }
1518 else
1519 {
1520 int64_t first_row, row_id;
1521 uint64_t row_count;
1522 rc = VCursorIdRange( stx.cursor, stx.read_type_idx, &first_row, &row_count );
1523 if ( rc != 0 )
1524 {
1525 (void)PLOGERR( klogInt, ( klogInt, rc, "VCursorIdRange( SEQUENCE ) for $(tn) failed", "tn=%s", seq->path ) );
1526 }
1527 else
1528 {
1529 seq_row row;
1530 for ( row_id = first_row; ( ( row_id - first_row ) < row_count ) && rc == 0; ++row_id )
1531 {
1532 rc = Quitting();
1533 if ( rc == 0 )
1534 {
1535 rc = read_seq_row( opts, &stx, row_id, &row );
1536 if ( rc == 0 && !row.filtered_out )
1537 {
1538 switch( opts->output_format )
1539 {
1540 case of_sam : rc = dump_seq_row_sam( opts, &stx, row_id, row.nreads ); break;
1541 case of_fasta : /* fall through intended ! */
1542 case of_fastq : rc = dump_seq_tab_row_fastx( opts, &stx, row_id, row.nreads ); break;
1543 }
1544 }
1545 }
1546 }
1547 }
1548 }
1549 VCursorRelease( stx.cursor );
1550 }
1551 return rc;
1552 }
1553
1554
1555 /* entry point from sam-dump3.c */
print_unaligned_spots(const samdump_opts * const opts,const input_files * const ifs,const matecache * const mc)1556 rc_t print_unaligned_spots( const samdump_opts * const opts,
1557 const input_files * const ifs,
1558 const matecache * const mc )
1559 {
1560 rc_t rc = 0;
1561
1562 #if _DEBUGGING
1563 if ( opts->perf_log != NULL )
1564 perf_log_start_section( opts->perf_log, "unaligned spots" );
1565 #endif
1566
1567 if ( ( ifs->database_count > 0 ) && ( opts->dump_unaligned_reads || opts->dump_unaligned_only ) )
1568 {
1569 uint32_t db_idx;
1570 for ( db_idx = 0; db_idx < ifs->database_count && rc == 0; ++db_idx )
1571 {
1572 const input_database * ids = VectorGet( &ifs->dbs, db_idx );
1573 if ( ids != NULL )
1574 {
1575 input_table seq;
1576
1577 seq.path = ids->path;
1578 rc = VDatabaseOpenTableRead( ids->db, &seq.tab, "SEQUENCE" );
1579 if ( rc != 0 )
1580 {
1581 (void)PLOGERR( klogInt, ( klogInt, rc, "cannot open table SEQUENCE for $(tn)", "tn=%s", ids->path ) );
1582 }
1583 else
1584 {
1585 input_table prim;
1586 prim.path = ids->path;
1587 rc = VDatabaseOpenTableRead( ids->db, &prim.tab, "PRIMARY_ALIGNMENT" );
1588 if ( rc != 0 )
1589 {
1590 (void)PLOGERR( klogInt, ( klogInt, rc, "cannot open table PRIMARY_ALIGNMENT $(tn)", "tn=%s", ids->path ) );
1591 }
1592 else
1593 {
1594 if ( opts->region_count > 0 )
1595 {
1596 rc = print_unaligned_database_filtered( opts, &seq, &prim, mc, ids );
1597 }
1598 else
1599 {
1600 rc = print_unaligned_database_full( opts, &seq, &prim, mc, ids );
1601 }
1602 VTableRelease( prim.tab );
1603 }
1604 VTableRelease( seq.tab );
1605 }
1606 }
1607 }
1608 }
1609
1610 if ( rc == 0 && ifs->table_count > 0 )
1611 {
1612 uint32_t tab_idx;
1613 for ( tab_idx = 0; tab_idx < ifs->table_count && rc == 0; ++tab_idx )
1614 {
1615 input_table * itab = VectorGet( &ifs->tabs, tab_idx );
1616 if ( itab != NULL )
1617 rc = print_unaligned_table( opts, itab );
1618 }
1619 }
1620
1621 #if _DEBUGGING
1622 if ( opts->perf_log != NULL )
1623 perf_log_end_section( opts->perf_log );
1624 #endif
1625
1626 return rc;
1627 }
1628