1 /* $Id: magicblast_util.cpp 629892 2021-04-22 19:06:53Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Greg Boratyn
27 * Implements utils for MagicBLAST application
28 *
29 */
30
31
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbiapp.hpp>
34 #include <algo/blast/api/magicblast.hpp>
35 #include <algo/blast/api/remote_blast.hpp>
36 #include <algo/blast/api/magicblast_options.hpp>
37 #include <algo/blast/blastinput/blast_fasta_input.hpp>
38 #include <algo/blast/blastinput/blast_asn1_input.hpp>
39 #include <algo/blast/blast_sra_input/blast_sra_input.hpp>
40 #include <algo/blast/blastinput/magicblast_args.hpp>
41 #include <algo/blast/api/objmgr_query_data.hpp>
42 #include <algo/blast/format/blast_format.hpp>
43 #include "../blast/blast_app_util.hpp"
44
45 #include <objtools/format/sam_formatter.hpp>
46
47 #include <algo/blast/api/objmgrfree_query_data.hpp>
48 #include <objects/seqset/Seq_entry.hpp>
49 #include <objects/seqalign/Spliced_seg.hpp>
50 #include <objects/seqalign/Spliced_exon.hpp>
51 #include <objects/seqalign/Spliced_exon_chunk.hpp>
52 #include <objects/seqalign/Product_pos.hpp>
53 #include <objects/seqalign/Splice_site.hpp>
54
55 #include <objects/general/User_field.hpp>
56 #include <objects/general/User_object.hpp>
57 #include <objects/general/Dbtag.hpp>
58
59 #include <algo/blast/core/blast_nalookup.h>
60 #include <algo/blast/api/blast_seqinfosrc_aux.hpp>
61 #include <algo/sequence/consensus_splice.hpp>
62 #include <util/sequtil/sequtil_manip.hpp>
63 #include <util/sequtil/sequtil_convert.hpp>
64
65 #include "magicblast_util.hpp"
66
67 #include <unordered_set>
68 #include <unordered_map>
69 #include <memory>
70
71 #ifndef SKIP_DOXYGEN_PROCESSING
72 BEGIN_NCBI_SCOPE;
73 BEGIN_SCOPE(blast);
74 USING_SCOPE(objects);
75 #endif
76
77
78 typedef unordered_map<string, CRef<CSeq_entry> > TQueryMap;
79
80
81 static
82 CNcbiOstream& PrintTabularUnaligned(CNcbiOstream& ostr,
83 const CMagicBlastResults& results,
84 const TQueryMap& queries,
85 bool first_seg);
86
87 static
88 CNcbiOstream& PrintSAMUnaligned(CNcbiOstream& ostr,
89 const CMagicBlastResults& results,
90 const TQueryMap& queries,
91 bool first_seg,
92 bool trim_read_ids);
93
94
s_Complement(char c)95 static char s_Complement(char c)
96 {
97 char retval;
98
99 switch (c) {
100 case 'A':
101 retval = 'T';
102 break;
103
104 case 'C':
105 retval = 'G';
106 break;
107
108 case 'G':
109 retval = 'C';
110 break;
111
112 case 'T':
113 retval = 'A';
114 break;
115
116 case 'N':
117 retval = 'N';
118 break;
119
120 case '-':
121 retval = '-';
122 break;
123
124 default:
125 retval = 'x';
126 };
127
128 return retval;
129 }
130
131
s_GetBareId(const CSeq_id & id)132 static string s_GetBareId(const CSeq_id& id)
133 {
134 string retval;
135 // Gis are printed with the bar
136 if (id.IsGi()) {
137 retval = id.AsFastaString();
138 }
139 else if (id.IsGeneral()) {
140 const CDbtag& dbt = id.GetGeneral();
141 if (dbt.GetTag().IsStr()) {
142 retval = dbt.GetTag().GetStr();
143 }
144 else if (dbt.GetTag().IsId()) {
145 retval = NStr::IntToString(dbt.GetTag().GetId());
146 }
147 }
148 else {
149 retval = id.GetSeqIdString(true);
150 }
151
152 return retval;
153 }
154
155
s_GetSequenceId(const CBioseq & bioseq)156 static string s_GetSequenceId(const CBioseq& bioseq)
157 {
158 string retval;
159 if (bioseq.IsSetDescr()) {
160 for (auto it: bioseq.GetDescr().Get()) {
161 if (it->IsTitle()) {
162 vector<string> tokens;
163 NStr::Split(it->GetTitle(), " ", tokens);
164 retval = tokens[0];
165 }
166 }
167 }
168
169 if (retval.empty()) {
170 retval = s_GetBareId(*bioseq.GetFirstId());
171 }
172
173 return retval;
174 }
175
176
s_GetFastaDefline(const CBioseq & bioseq)177 static string s_GetFastaDefline(const CBioseq& bioseq)
178 {
179 string retval;
180 if (bioseq.IsSetDescr()) {
181 for (auto it: bioseq.GetDescr().Get()) {
182 if (it->IsTitle()) {
183 retval = it->GetTitle();
184 }
185 }
186 }
187
188 if (retval.empty()) {
189 retval = s_GetBareId(*bioseq.GetFirstId());
190 }
191
192 return retval;
193 }
194
195
s_CreateQueryMap(const CBioseq_set & query_batch,TQueryMap & query_map)196 static void s_CreateQueryMap(const CBioseq_set& query_batch,
197 TQueryMap& query_map)
198 {
199 query_map.clear();
200 for (auto it: query_batch.GetSeq_set()) {
201
202 CRef<CSeq_entry> seq_entry(it);
203 const CSeq_id* seq_id = seq_entry->GetSeq().GetFirstId();
204 if (!seq_id) {
205 NCBI_THROW(CException, eInvalid, "Missing Sequence Id");
206 }
207 string id = seq_id->GetSeqIdString();
208 query_map[id] = seq_entry;
209 }
210 }
211
212
s_GetQueryBioseq(const TQueryMap & queries,const CSeq_id & seqid)213 static const CBioseq& s_GetQueryBioseq(const TQueryMap& queries,
214 const CSeq_id& seqid)
215 {
216 TQueryMap::const_iterator it = queries.find(seqid.GetSeqIdString());
217 _ASSERT(it != queries.end());
218 if (it == queries.end()) {
219 NCBI_THROW(CException, eInvalid, (string)"Query Bioseq not found for "
220 "id: " + s_GetBareId(seqid));
221 }
222
223 return it->second->GetSeq();
224 }
225
s_GetQuerySequence(const CBioseq & bioseq,const CRange<TSeqPos> & range,bool reverse_complement,string & sequence)226 static int s_GetQuerySequence(const CBioseq& bioseq,
227 const CRange<TSeqPos>& range,
228 bool reverse_complement,
229 string& sequence)
230 {
231 const CSeq_data& seq_data = bioseq.GetInst().GetSeq_data();
232 switch (seq_data.Which()) {
233 case CSeq_data::e_Iupacna:
234 sequence = seq_data.GetIupacna().Get();
235 if (range.NotEmpty() && !range.IsWhole()) {
236 sequence = sequence.substr(range.GetFrom(), range.GetLength());
237 }
238 break;
239
240 case CSeq_data::e_Ncbi2na:
241 CSeqConvert::Convert(seq_data.GetNcbi2na().Get(),
242 CSeqUtil::e_Ncbi2na, range.GetFrom(),
243 range.GetLength(),
244 sequence, CSeqUtil::e_Iupacna);
245 break;
246 case CSeq_data::e_Ncbi4na:
247 CSeqConvert::Convert(seq_data.GetNcbi4na().Get(),
248 CSeqUtil::e_Ncbi4na, range.GetFrom(),
249 range.GetLength(),
250 sequence, CSeqUtil::e_Iupacna);
251 break;
252
253 case CSeq_data::e_Ncbi8na:
254 CSeqConvert::Convert(seq_data.GetNcbi8na().Get(),
255 CSeqUtil::e_Ncbi8na, range.GetFrom(),
256 range.GetLength(),
257 sequence, CSeqUtil::e_Iupacna);
258 break;
259
260 default:
261 NCBI_THROW(CException, eInvalid, "Unexpected query sequence "
262 "encoding");
263 };
264
265
266 if (reverse_complement) {
267 string tmp(sequence);
268 CSeqManip::ReverseComplement(tmp, CSeqUtil::e_Iupacna, 0, tmp.length(),
269 sequence);
270 }
271
272 return 0;
273 }
274
275
276 static
PrintFastaUnaligned(CNcbiOstream & ostr,const CMagicBlastResults & results,const TQueryMap & queries,bool first_seg)277 CNcbiOstream& PrintFastaUnaligned(CNcbiOstream& ostr,
278 const CMagicBlastResults& results,
279 const TQueryMap& queries,
280 bool first_seg)
281 {
282 CSeq_id id;
283 if (!results.IsPaired() || first_seg) {
284 id.Set(results.GetQueryId().AsFastaString());
285 }
286 else {
287 id.Set(results.GetLastId().AsFastaString());
288 }
289
290 const CBioseq& bioseq = s_GetQueryBioseq(queries, id);
291
292 // defline
293 ostr << ">" << s_GetFastaDefline(bioseq) << endl;
294
295 // sequence
296 string sequence;
297 CRange<TSeqPos> range;
298 s_GetQuerySequence(bioseq, range, false, sequence);
299 ostr << sequence;
300
301 return ostr;
302 }
303
304 static
PrintUnaligned(CNcbiOstream & ostr,CFormattingArgs::EOutputFormat fmt,const CMagicBlastResults & results,const TQueryMap & queries,bool first_seg,bool trim_read_ids)305 CNcbiOstream& PrintUnaligned(CNcbiOstream& ostr,
306 CFormattingArgs::EOutputFormat fmt,
307 const CMagicBlastResults& results,
308 const TQueryMap& queries,
309 bool first_seg,
310 bool trim_read_ids)
311 {
312
313
314 switch (fmt) {
315
316 case CFormattingArgs::eTabular:
317 return PrintTabularUnaligned(ostr, results, queries, first_seg);
318
319 case CFormattingArgs::eFasta:
320 return PrintFastaUnaligned(ostr, results, queries, first_seg);
321
322 default:
323 return PrintSAMUnaligned(ostr, results, queries, first_seg, trim_read_ids);
324 };
325 }
326
PrintTabularHeader(CNcbiOstream & ostr,const string & version,const string & cmd_line_args)327 CNcbiOstream& PrintTabularHeader(CNcbiOstream& ostr, const string& version,
328 const string& cmd_line_args)
329 {
330 string sep = "\t";
331
332 ostr << "# MAGICBLAST " << version << endl;
333 ostr << "# " << cmd_line_args << endl;
334
335 ostr << "# Fields: ";
336 ostr << "query acc." << sep;
337 ostr << "reference acc." << sep;
338 ostr << "% identity" << sep;
339 ostr << "not used" << sep;
340 ostr << "not used" << sep;
341 ostr << "not used" << sep;
342 ostr << "query start" << sep;
343 ostr << "query end" << sep;
344 ostr << "reference start" << sep;
345 ostr << "reference end" << sep;
346 ostr << "not used" << sep;
347 ostr << "not used" << sep;
348 ostr << "score" << sep;
349 ostr << "query strand" << sep;
350 ostr << "reference strand" << sep;
351 ostr << "query length" << sep;
352 ostr << "BTOP" << sep;
353 ostr << "num placements" << sep;
354 ostr << "not used" << sep;
355 ostr << "compartment" << sep;
356 ostr << "left overhang" << sep;
357 ostr << "right overhang" << sep;
358 ostr << "mate reference" << sep;
359 ostr << "mate ref. start" << sep;
360 ostr << "composite score";
361
362 ostr << endl;
363
364 return ostr;
365 }
366
367
368 static
PrintTabular(CNcbiOstream & ostr,const CSeq_align & align,const TQueryMap & queries,bool is_paired,int batch_number,int compartment,const CSeq_align * mate=NULL)369 CNcbiOstream& PrintTabular(CNcbiOstream& ostr, const CSeq_align& align,
370 const TQueryMap& queries,
371 bool is_paired, int batch_number, int compartment,
372 const CSeq_align* mate = NULL)
373 {
374 // if paired alignment
375 if (align.GetSegs().IsDisc()) {
376
377 const CSeq_align_set& disc = align.GetSegs().GetDisc();
378 _ASSERT(disc.Get().size() == 2u);
379
380 CSeq_align_set::Tdata::const_iterator first = disc.Get().begin();
381 _ASSERT(first != disc.Get().end());
382 CSeq_align_set::Tdata::const_iterator second(first);
383 ++second;
384 _ASSERT(second != disc.Get().end());
385
386 PrintTabular(ostr, **first, queries, is_paired, batch_number,
387 compartment, second->GetNonNullPointer());
388 ostr << endl;
389
390 PrintTabular(ostr, **second, queries, is_paired, batch_number,
391 compartment, first->GetNonNullPointer());
392
393 return ostr;
394 }
395
396 string sep = "\t";
397 const CBioseq& bioseq = s_GetQueryBioseq(queries, align.GetSeq_id(0));
398 ostr << s_GetSequenceId(bioseq) << sep;
399
400 ostr << s_GetBareId(align.GetSeq_id(1)) << sep;
401
402 int score;
403 double perc_identity;
404 align.GetNamedScore(CSeq_align::eScore_Score, score);
405 align.GetNamedScore(CSeq_align::eScore_PercentIdentity_Gapped,
406 perc_identity);
407
408 ostr << perc_identity << sep;
409
410 ostr << 0 << sep; // length
411 ostr << 0 << sep; // mismatch
412 ostr << 0 << sep; // gapopen
413
414
415 int query_len = 0;
416
417 if (align.GetSegs().IsDenseg()) {
418 CRange<TSeqPos> range = align.GetSeqRange(0);
419 ostr << range.GetFrom() + 1 << sep << range.GetTo() + 1 << sep;
420 range = align.GetSeqRange(1);
421 if (align.GetSeqStrand(0) == eNa_strand_minus) {
422 ostr << range.GetTo() + 1 << sep << range.GetFrom() + 1 << sep;
423 }
424 else {
425 ostr << range.GetFrom() + 1 << sep << range.GetTo() + 1 << sep;
426 }
427 }
428 else if (align.GetSegs().IsSpliced()) {
429 CRange<TSeqPos> range = align.GetSeqRange(0);
430 if (align.GetSegs().GetSpliced().IsSetProduct_length()) {
431 query_len = align.GetSegs().GetSpliced().GetProduct_length();
432 }
433 else {
434 _ASSERT(0);
435 }
436 if (align.GetSeqStrand(0) == eNa_strand_minus) {
437 ostr << query_len - range.GetTo() << sep
438 << query_len - range.GetFrom() << sep;
439
440 range = align.GetSeqRange(1);
441 ostr << range.GetTo() + 1 << sep << range.GetFrom() + 1 << sep;
442 }
443 else {
444 ostr << range.GetFrom() + 1 << sep << range.GetTo() + 1 << sep;
445 range = align.GetSeqRange(1);
446 ostr << range.GetFrom() + 1 << sep << range.GetTo() + 1 << sep;
447 }
448
449 }
450
451 ostr << 0.0 << sep; // e-value
452 ostr << 99 << sep; // bit score
453
454 ostr << score << sep;
455
456 // query is always a plus strand
457 ostr << "plus" << sep
458 << (align.GetSeqStrand(0) == 1 ? "plus" : "minus");
459
460 string btop_string;
461 Int4 num_hits = 0;
462 Int4 pair_start = 0;
463 Int4 fragment_score = 0;
464
465 CConstRef<CUser_object> ext = align.FindExt("Mapper Info");
466 if (ext.NotEmpty()) {
467
468 ITERATE (CUser_object::TData, it, ext->GetData()) {
469 if (!(*it)->GetLabel().IsStr()) {
470 continue;
471 }
472
473 if ((*it)->GetLabel().GetStr() == "btop" &&
474 (*it)->GetData().IsStr()) {
475
476 btop_string = (*it)->GetString();
477 }
478 else if ((*it)->GetLabel().GetStr() == "num_hits" &&
479 (*it)->GetData().IsInt()) {
480
481 num_hits = (*it)->GetInt();
482 }
483 }
484 }
485
486 // for alignments on the minus strand
487 if (align.GetSeqStrand(0) == eNa_strand_minus) {
488
489 // reverse btop string
490 string new_btop;
491 int i = btop_string.length() - 1;
492 while (i >= 0) {
493 int to = i;
494 while (i >= 0 && (isdigit(btop_string[i]) ||
495 btop_string[i] == ')')) {
496 i--;
497 }
498
499 new_btop += btop_string.substr(i + 1, to - i);
500
501 if (i > 0) {
502 if (isalpha(btop_string[i]) || btop_string[i] == '-') {
503 // complement bases in place
504 new_btop += s_Complement(btop_string[i - 1]);
505 new_btop += s_Complement(btop_string[i]);
506 i--;
507 }
508 else {
509 new_btop += btop_string[i];
510 }
511 }
512
513 i--;
514 }
515 btop_string.swap(new_btop);
516 }
517
518 fragment_score = score;
519 if (mate) {
520 int mate_score = 0;
521 mate->GetNamedScore(CSeq_align::eScore_Score, mate_score);
522 fragment_score += mate_score;
523 }
524
525 // report unaligned part of the query: 3' end and reverese complemented
526 // 5' end
527 string left_overhang = "-";
528 string right_overhang = "-";
529 if (query_len <= 0) {
530 query_len = bioseq.GetInst().GetLength();
531 }
532
533 CRange<TSeqPos> range = align.GetSeqRange(0);
534 int from = range.GetFrom();
535 int to = range.GetToOpen();
536 if (align.GetSeqStrand(0) == eNa_strand_minus) {
537 from = query_len - range.GetToOpen();
538 to = query_len - range.GetFrom();
539 }
540
541 // reverse complemented 5' end
542 if (from > 0) {
543 CRange<TSeqPos> r(MAX(0, from - 30), from - 1);
544 left_overhang.clear();
545 s_GetQuerySequence(bioseq, r, true, left_overhang);
546 }
547
548 // 3' end
549 if (to < query_len) {
550 CRange<TSeqPos> r(to, MIN(to + 30 - 1, query_len - 1));
551 right_overhang.clear();
552 s_GetQuerySequence(bioseq, r, false, right_overhang);
553 }
554
555 ostr << sep << query_len
556 << sep << btop_string
557 << sep << num_hits
558 << sep << /*splice*/ "-"
559 << sep << batch_number << ":" << compartment
560 << sep << left_overhang
561 << sep << right_overhang;
562
563 if (is_paired && mate) {
564 if (align.GetSeq_id(1).Match(mate->GetSeq_id(1))) {
565 ostr << sep << "-";
566 }
567 else {
568 ostr << sep << mate->GetSeq_id(1).AsFastaString();
569 }
570
571 pair_start = mate->GetSeqStart(1) + 1;
572 //FIXME: for tests
573 if (mate->GetSeqStrand(0) == eNa_strand_minus) {
574 pair_start = mate->GetSeqStop(1) + 1;
575 }
576 if ((align.GetSeqStart(1) < mate->GetSeqStart(1) &&
577 align.GetSeqStrand(0) == eNa_strand_minus) ||
578 (mate->GetSeqStart(1) < align.GetSeqStart(1) &&
579 mate->GetSeqStrand(0) == eNa_strand_minus)) {
580
581 pair_start = -pair_start;
582 }
583 ostr << sep << pair_start;
584
585 }
586 else {
587 ostr << sep << "-" << sep << "-";
588 }
589
590 ostr << sep << fragment_score;
591
592 return ostr;
593 }
594
595
PrintTabularUnaligned(CNcbiOstream & ostr,const CMagicBlastResults & results,const TQueryMap & queries,bool first_seg)596 CNcbiOstream& PrintTabularUnaligned(CNcbiOstream& ostr,
597 const CMagicBlastResults& results,
598 const TQueryMap& queries,
599 bool first_seg)
600 {
601 string sep = "\t";
602 CSeq_id id;
603 if (!results.IsPaired() || first_seg) {
604 id.Set(results.GetQueryId().AsFastaString());
605 }
606 else {
607 id.Set(results.GetLastId().AsFastaString());
608 }
609 const CBioseq& bioseq = s_GetQueryBioseq(queries, id);
610
611 // query
612 ostr << s_GetSequenceId(bioseq) << sep;
613
614 // subject
615 ostr << "-" << sep;
616
617 // percent identity
618 ostr << 0.0 << sep;
619
620 ostr << 0 << sep; // length
621 ostr << 0 << sep; // mismatch
622 ostr << 0 << sep; // gapopen
623
624 // query start and stop
625 ostr << 0 << sep << 0 << sep;
626
627 // subject start and stop
628 ostr << 0 << sep << 0 << sep;
629
630 ostr << 0 << sep; // e-value
631 ostr << 99 << sep; // bit score
632
633 ostr << 0 << sep;
634
635 // query and subject strand
636 ostr << "-" << sep << "-" << sep;
637
638 // query length
639 int query_len = bioseq.GetInst().GetLength();
640
641 ostr << query_len << sep;
642
643 // btop string
644 ostr << "-" << sep;
645
646 // number of placements
647 ostr << 0 << sep;
648
649 // splice
650 ostr << "-" << sep;
651
652 // compartment
653 string compart = "-";
654 // if a read did not pass filtering
655 CMagicBlastResults::TResultsInfo info =
656 first_seg ? results.GetFirstInfo() : results.GetLastInfo();
657 if ((info & CMagicBlastResults::fFiltered) != 0) {
658 compart = "F";
659 }
660 ostr << compart << sep;
661
662 // left overhang
663 ostr << "-" << sep;
664
665 // right overhang
666 ostr << "-" << sep;
667
668 // mate reference
669 ostr << "-" << sep;
670
671 // mate start position
672 ostr << "-" << sep;
673
674 // composite score
675 ostr << 0;
676
677 return ostr;
678 }
679
680 static
PrintTabular(CNcbiOstream & ostr,CNcbiOstream & unaligned_ostr,CFormattingArgs::EOutputFormat unaligned_fmt,const CMagicBlastResults & results,const TQueryMap & queries,bool is_paired,int batch_number,int & compartment,bool trim_read_id,bool print_unaligned,bool no_discordant)681 CNcbiOstream& PrintTabular(CNcbiOstream& ostr,
682 CNcbiOstream& unaligned_ostr,
683 CFormattingArgs::EOutputFormat unaligned_fmt,
684 const CMagicBlastResults& results,
685 const TQueryMap& queries,
686 bool is_paired, int batch_number,
687 int& compartment,
688 bool trim_read_id,
689 bool print_unaligned,
690 bool no_discordant)
691 {
692 bool is_concordant = results.IsConcordant();
693
694 if (!no_discordant || (no_discordant && is_concordant)) {
695 for (auto it: results.GetSeqAlign()->Get()) {
696 PrintTabular(ostr, *it, queries, is_paired, batch_number,
697 compartment++);
698 ostr << endl;
699 }
700 }
701
702 if (!print_unaligned) {
703 return ostr;
704 }
705
706 if ((results.GetFirstInfo() & CMagicBlastResults::fUnaligned) != 0 ||
707 (no_discordant && !is_concordant)) {
708
709 PrintUnaligned(unaligned_ostr, unaligned_fmt, results, queries, true,
710 trim_read_id);
711 unaligned_ostr << endl;
712 }
713
714 if (results.IsPaired() &&
715 ((results.GetLastInfo() & CMagicBlastResults::fUnaligned) != 0 ||
716 (no_discordant && !is_concordant))) {
717
718 PrintUnaligned(unaligned_ostr, unaligned_fmt, results, queries, false,
719 trim_read_id);
720 unaligned_ostr << endl;
721 }
722
723 return ostr;
724 }
725
726
PrintTabular(CNcbiOstream & ostr,CNcbiOstream & unaligned_ostr,CFormattingArgs::EOutputFormat unaligned_fmt,const CMagicBlastResultSet & results,const CBioseq_set & query_batch,bool is_paired,int batch_number,bool trim_read_id,bool print_unaligned,bool no_discordant)727 CNcbiOstream& PrintTabular(CNcbiOstream& ostr,
728 CNcbiOstream& unaligned_ostr,
729 CFormattingArgs::EOutputFormat unaligned_fmt,
730 const CMagicBlastResultSet& results,
731 const CBioseq_set& query_batch,
732 bool is_paired, int batch_number,
733 bool trim_read_id,
734 bool print_unaligned,
735 bool no_discordant)
736 {
737 TQueryMap queries;
738 s_CreateQueryMap(query_batch, queries);
739
740 int compartment = 0;
741 for (auto it: results) {
742 PrintTabular(ostr, unaligned_ostr, unaligned_fmt, *it, queries,
743 is_paired, batch_number, compartment, trim_read_id,
744 print_unaligned, no_discordant);
745 }
746
747 return ostr;
748 }
749
750
PrintSAMHeader(CNcbiOstream & ostr,CRef<CLocalDbAdapter> db_adapter,const string & cmd_line_args)751 CNcbiOstream& PrintSAMHeader(CNcbiOstream& ostr,
752 CRef<CLocalDbAdapter> db_adapter,
753 const string& cmd_line_args)
754 {
755 BlastSeqSrc* seq_src = db_adapter->MakeSeqSrc();
756 IBlastSeqInfoSrc* seqinfo_src = db_adapter->MakeSeqInfoSrc();
757 _ASSERT(seq_src && seqinfo_src);
758
759 CRef<CSeqDB> seqdb;
760 if (db_adapter->IsBlastDb()) {
761 seqdb.Reset(db_adapter->GetSearchDatabase()->GetSeqDb());
762 }
763
764 ostr << "@HD\t" << "VN:1.0\t" << "GO:query" << endl;
765
766 BlastSeqSrcResetChunkIterator(seq_src);
767 BlastSeqSrcIterator* it = BlastSeqSrcIteratorNew();
768 CRef<CSeq_id> seqid(new CSeq_id);
769 Uint4 length;
770 Int4 oid;
771 while ((oid = BlastSeqSrcIteratorNext(seq_src, it)) != BLAST_SEQSRC_EOF) {
772 GetSequenceLengthAndId(seqinfo_src, oid, CSeq_id::BlastRank, seqid,
773 &length);
774
775 ostr << "@SQ\t" << "SN:" << s_GetBareId(*seqid) << "\tLN:" << length;
776
777 vector<TTaxId> taxids;
778 if (seqdb.NotEmpty()) {
779 seqdb->GetTaxIDs(oid, taxids);
780 }
781
782 if (!taxids.empty() && taxids[0] != 0) {
783 ostr << "\tSP:";
784 for (vector<TTaxId>::iterator it = taxids.begin();
785 it != taxids.end(); ++it) {
786 if (it != taxids.begin()) {
787 ostr << ",";
788 }
789 ostr << *it;
790 }
791 }
792 ostr << endl;
793 }
794 BlastSeqSrcIteratorFree(it);
795
796 ostr << "@PG\tID:magicblast\tPN:magicblast\tCL:" << cmd_line_args << endl;
797
798 return ostr;
799 }
800
801
802 // hash function for pointers to Seq_id
803 struct hash_seqid
804 {
operator ()hash_seqid805 size_t operator()(const CSeq_id* s) const {
806 std::hash<string> h;
807 return h(s->AsFastaString());
808 }
809 };
810
811
812 // equal_to function for pointers to Seq_id
813 struct eq_seqid
814 {
operator ()eq_seqid815 bool operator()(const CSeq_id* a, const CSeq_id* b) const {
816 return a->Match(*b);
817 }
818 };
819
820 // hash_set of pointers to Seq_ids
821 typedef unordered_set<const CSeq_id*, hash_seqid, eq_seqid> TSeq_idHashSet;
822
823
824 static ENa_strand
s_GetSpliceSiteOrientation(const CSpliced_seg::TExons::const_iterator & exon,const CSpliced_seg::TExons::const_iterator & next_exon)825 s_GetSpliceSiteOrientation(const CSpliced_seg::TExons::const_iterator& exon,
826 const CSpliced_seg::TExons::const_iterator& next_exon)
827 {
828 ENa_strand result = eNa_strand_unknown;
829
830 // orientation is unknown if exons align on different strands or a exon's
831 // genomic strand is unknown
832 if ((*exon)->GetGenomic_strand() !=
833 (*next_exon)->GetGenomic_strand() ||
834 (*exon)->GetGenomic_strand() == eNa_strand_unknown) {
835
836 return eNa_strand_unknown;
837 }
838
839 // orientation is unknown if splice signal is not set
840 if (!(*exon)->IsSetDonor_after_exon() ||
841 !(*next_exon)->IsSetAcceptor_before_exon()) {
842
843 return eNa_strand_unknown;
844 }
845
846 // get splice signal
847 string donor = (*exon)->GetDonor_after_exon().GetBases();
848 string acceptor = (*next_exon)->GetAcceptor_before_exon().GetBases();
849
850 // if the signal is recognised then the splice orientation is the same as
851 // genomic strand
852 if (IsConsensusSplice(donor, acceptor) ||
853 IsKnownNonConsensusSplice(donor, acceptor)) {
854
855 result = (*exon)->GetGenomic_strand();
856 }
857 else {
858 // otherwise try to recognise reverse complemented splice signals
859
860 string rc_donor;
861 string rc_acceptor;
862
863 CSeqManip::ReverseComplement(donor,
864 CSeqUtil::e_Iupacna,
865 0, donor.length(),
866 rc_donor);
867
868 CSeqManip::ReverseComplement(acceptor,
869 CSeqUtil::e_Iupacna,
870 0, acceptor.length(),
871 rc_acceptor);
872
873 // if reverse complemented signals are recognised then splice
874 // orientation is opposite to genomic strand
875 if (IsConsensusSplice(rc_acceptor, rc_donor) ||
876 IsKnownNonConsensusSplice(rc_acceptor, rc_donor)) {
877
878 if ((*exon)->GetGenomic_strand() == eNa_strand_plus) {
879 result = eNa_strand_minus;
880 }
881 else if ((*exon)->GetGenomic_strand() == eNa_strand_minus) {
882 result = eNa_strand_plus;
883 }
884 else {
885 result = eNa_strand_unknown;
886 }
887 }
888 else {
889 // if neither signals are recognised then splice orientation is
890 // unknown
891 result = eNa_strand_unknown;
892 }
893
894 }
895
896 return result;
897 }
898
899
900 #define SAM_FLAG_MULTI_SEGMENTS 0x1
901 #define SAM_FLAG_SEGS_ALIGNED 0x2
902 #define SAM_FLAG_SEG_UNMAPPED 0x4
903 #define SAM_FLAG_NEXT_SEG_UNMAPPED 0x8
904 #define SAM_FLAG_SEQ_REVCOMP 0x10
905 #define SAM_FLAG_NEXT_REVCOMP 0x20
906 #define SAM_FLAG_FIRST_SEGMENT 0x40
907 #define SAM_FLAG_LAST_SEGMENT 0x80
908 #define SAM_FLAG_SECONDARY 0x100
909
910 static
PrintSAM(CNcbiOstream & ostr,const CSeq_align & align,const TQueryMap & queries,const BlastQueryInfo * query_info,bool is_spliced,int batch_number,bool & first_secondary,bool & last_secondary,bool trim_read_ids,E_StrandSpecificity strand_specific,bool only_specific,bool print_md_tag,bool other=false,const CSeq_align * mate=NULL)911 CNcbiOstream& PrintSAM(CNcbiOstream& ostr, const CSeq_align& align,
912 const TQueryMap& queries,
913 const BlastQueryInfo* query_info,
914 bool is_spliced,
915 int batch_number, bool& first_secondary,
916 bool& last_secondary, bool trim_read_ids,
917 E_StrandSpecificity strand_specific,
918 bool only_specific,
919 bool print_md_tag,
920 bool other = false,
921 const CSeq_align* mate = NULL)
922 {
923 string sep = "\t";
924
925 string btop_string;
926 string md_tag;
927 int query_len = 0;
928 int num_hits = 0;
929 int context = -1;
930 int sam_flags = 0;
931 const int kMaxInsertSize = is_spliced ?
932 MAGICBLAST_MAX_INSERT_SIZE_SPLICED :
933 MAGICBLAST_MAX_INSERT_SIZE_NONSPLICED;
934
935 // if paired alignment
936 if (align.GetSegs().IsDisc()) {
937
938 _ASSERT(align.GetSegs().GetDisc().Get().size() == 2);
939
940 const CSeq_align_set& disc = align.GetSegs().GetDisc();
941 CSeq_align_set::Tdata::const_iterator first = disc.Get().begin();
942 _ASSERT(first != disc.Get().end());
943 CSeq_align_set::Tdata::const_iterator second(first);
944 ++second;
945 _ASSERT(second != disc.Get().end());
946
947 PrintSAM(ostr, **first, queries, query_info, is_spliced,
948 batch_number, first_secondary, last_secondary,
949 trim_read_ids, strand_specific, only_specific,
950 print_md_tag, false,
951 second->GetNonNullPointer());
952 ostr << endl;
953
954 PrintSAM(ostr, **second, queries, query_info, is_spliced,
955 batch_number, first_secondary, last_secondary,
956 trim_read_ids, strand_specific, only_specific,
957 print_md_tag, true,
958 first->GetNonNullPointer());
959
960 return ostr;
961 }
962
963 // get align data saved in the user object
964 CConstRef<CUser_object> ext = align.FindExt("Mapper Info");
965 if (ext.NotEmpty()) {
966
967 ITERATE (CUser_object::TData, it, ext->GetData()) {
968 if (!(*it)->GetLabel().IsStr()) {
969 continue;
970 }
971
972 if ((*it)->GetLabel().GetStr() == "btop" &&
973 (*it)->GetData().IsStr()) {
974
975 btop_string = (*it)->GetString();
976 }
977 else if ((*it)->GetLabel().GetStr() == "num_hits" &&
978 (*it)->GetData().IsInt()) {
979
980 num_hits = (*it)->GetInt();
981 }
982 else if ((*it)->GetLabel().GetStr() == "context" &&
983 (*it)->GetData().IsInt()) {
984
985 context = (*it)->GetInt();
986 }
987 else if ((*it)->GetLabel().GetStr() == "md_tag" &&
988 (*it)->GetData().IsStr()) {
989
990 md_tag = (*it)->GetString();
991 }
992 }
993
994 }
995
996 vector<ENa_strand> orientation;
997 if (align.GetSegs().Which() == CSeq_align::TSegs::e_Spliced) {
998 const CSpliced_seg& spliced = align.GetSegs().GetSpliced();
999
1000 query_len = spliced.GetProduct_length();
1001 }
1002
1003 // observed template length
1004 int template_length = 0;
1005 CRange<TSeqPos> range = align.GetSeqRange(1);
1006 if (mate && align.GetSeq_id(1).Match(mate->GetSeq_id(1))) {
1007 CRange<TSeqPos> mate_range = mate->GetSeqRange(1);
1008 if (align.GetSeqStrand(0) == eNa_strand_plus &&
1009 align.GetSeqStrand(1) == eNa_strand_plus) {
1010
1011 template_length = (int)mate_range.GetTo() - (int)range.GetFrom() + 1;
1012 }
1013 else {
1014 template_length =
1015 -((int)range.GetTo() - (int)mate_range.GetFrom() + 1);
1016 }
1017 }
1018
1019
1020 // FIXME: if subject is on a minus strand we need to reverse
1021 // complement both
1022 if (align.GetSeqStrand(0) == eNa_strand_minus) {
1023 sam_flags |= SAM_FLAG_SEQ_REVCOMP;
1024 }
1025
1026 if (context >= 0 && query_info->contexts[context].segment_flags != 0) {
1027 sam_flags |= SAM_FLAG_MULTI_SEGMENTS;
1028
1029 if ((query_info->contexts[context].segment_flags & fFirstSegmentFlag)
1030 != 0) {
1031 sam_flags |= SAM_FLAG_FIRST_SEGMENT;
1032 }
1033
1034 if ((query_info->contexts[context].segment_flags & fLastSegmentFlag)
1035 != 0) {
1036 sam_flags |= SAM_FLAG_LAST_SEGMENT;
1037 }
1038
1039 if ((query_info->contexts[context].segment_flags & fPartialFlag) != 0
1040 || !mate) {
1041
1042 sam_flags |= SAM_FLAG_NEXT_SEG_UNMAPPED;
1043 }
1044
1045 if (mate) {
1046 // FIXME: it is assumed that subject is always in plus strand
1047 // (BLAST way)
1048 ENa_strand a_strand = align.GetSeqStrand(0);
1049 ENa_strand m_strand = mate->GetSeqStrand(0);
1050 bool plus_minus =
1051 a_strand == eNa_strand_plus && m_strand == eNa_strand_minus;
1052 bool minus_plus =
1053 a_strand == eNa_strand_minus && m_strand == eNa_strand_plus;
1054 TSeqPos a_start = align.GetSeqStart(1);
1055 TSeqPos m_start = mate->GetSeqStart(1);
1056
1057 // For strand specific output we reset SAM_FLAG_SEGS_ALIGNED
1058 // for paired alignments with the wrong configuration
1059 if (strand_specific != eNonSpecific) {
1060 // In this statement <bool1> != <bool2> is equivalent to
1061 // EXCLUSIVE-OR.
1062 // If <bool2> is false, conditional returns <bool1>.
1063 // If <bool2> is true, conditional returns <bool1> inverted.
1064 // So if "other" is true, actions based on "plus_minus"
1065 // and "minus_plus" are reversed.
1066 if (((strand_specific == eFwdRev && plus_minus != other)
1067 || (strand_specific == eRevFwd && minus_plus != other))
1068 && template_length < kMaxInsertSize) {
1069
1070 sam_flags |= SAM_FLAG_SEGS_ALIGNED;
1071 }
1072 } else {
1073 if (((a_start <= m_start && plus_minus)
1074 || (m_start <= a_start && minus_plus))
1075 && abs(template_length) < kMaxInsertSize) {
1076 sam_flags |= SAM_FLAG_SEGS_ALIGNED;
1077 }
1078 }
1079
1080 if (mate->GetSeqStrand(0) == eNa_strand_minus) {
1081 sam_flags |= SAM_FLAG_NEXT_REVCOMP;
1082 }
1083 }
1084 }
1085
1086 // set secondary alignment bit
1087 if ((sam_flags & SAM_FLAG_FIRST_SEGMENT) != 0) {
1088 if (first_secondary) {
1089 sam_flags |= SAM_FLAG_SECONDARY;
1090 }
1091 else {
1092 first_secondary = true;
1093 }
1094 }
1095 else {
1096 if (last_secondary) {
1097 sam_flags |= SAM_FLAG_SECONDARY;
1098 }
1099 else {
1100 last_secondary = true;
1101 }
1102 }
1103
1104 // read id
1105 const CBioseq& bioseq = s_GetQueryBioseq(queries, align.GetSeq_id(0));
1106 string read_id = s_GetSequenceId(bioseq);
1107 if (trim_read_ids &&
1108 (NStr::EndsWith(read_id, ".1") || NStr::EndsWith(read_id, ".2") ||
1109 NStr::EndsWith(read_id, "/1") || NStr::EndsWith(read_id, "/2"))) {
1110
1111 read_id.resize(read_id.length() - 2);
1112 }
1113 ostr << read_id << sep;
1114
1115 // flag
1116 ostr << sam_flags << sep;
1117
1118 // reference sequence id
1119 ostr << s_GetBareId(align.GetSeq_id(1)) << sep;
1120
1121 // mapping position
1122 ostr << range.GetFrom() + 1 << sep;
1123
1124 // mapping quality
1125 ostr << "255" << sep;
1126
1127 // CIGAR string
1128 string cigar;
1129 int edit_distance = 0;
1130 if (align.GetSegs().Which() == CSeq_align::TSegs::e_Denseg) {
1131 const CDense_seg& denseg = align.GetSegs().GetDenseg();
1132 const CDense_seg::TStarts& starts = denseg.GetStarts();
1133 const CDense_seg::TLens& lens = denseg.GetLens();
1134 CRange<TSeqPos> qrange = align.GetSeqRange(0);
1135
1136 if (align.GetSeqStrand(0) == eNa_strand_plus) {
1137 if (qrange.GetFrom() > 0) {
1138 cigar += NStr::IntToString(qrange.GetFrom());
1139 cigar += "S";
1140 }
1141 }
1142 else {
1143 if ((int)qrange.GetToOpen() < query_len) {
1144 cigar += NStr::IntToString(query_len - qrange.GetToOpen());
1145 cigar += "S";
1146 }
1147 }
1148 for (size_t i=0;i < starts.size();i+=2) {
1149 cigar += NStr::IntToString(lens[i/2]);
1150 if (starts[i] >= 0 && starts[i + 1] >= 0) {
1151 cigar += "M";
1152 }
1153 else if (starts[i] < 0) {
1154 if (lens[i/2] < 10) {
1155 cigar += "D";
1156 }
1157 else {
1158 cigar += "N";
1159 }
1160 }
1161 else {
1162 cigar += "I";
1163 }
1164 }
1165 if (align.GetSeqStrand(0) == eNa_strand_plus) {
1166 if ((int)qrange.GetToOpen() < query_len) {
1167 cigar += NStr::IntToString(query_len - qrange.GetToOpen());
1168 cigar += "S";
1169 }
1170 }
1171 else {
1172 if (qrange.GetFrom() > 0) {
1173 cigar += NStr::IntToString(qrange.GetFrom());
1174 cigar += "S";
1175 }
1176 }
1177 }
1178 else if (align.GetSegs().Which() == CSeq_align::TSegs::e_Spliced) {
1179 const CSpliced_seg& spliced = align.GetSegs().GetSpliced();
1180 CRange<TSeqPos> qrange = align.GetSeqRange(0);
1181
1182 if (qrange.GetFrom() > 0) {
1183 cigar += NStr::IntToString(qrange.GetFrom());
1184 cigar += "S";
1185 }
1186
1187 ITERATE (CSpliced_seg::TExons, exon, spliced.GetExons()) {
1188 int num = 0;
1189 char op = 0;
1190 ITERATE(CSpliced_exon::TParts, it, (*exon)->GetParts()) {
1191 switch ((*it)->Which()) {
1192 case CSpliced_exon_chunk::e_Match:
1193 if (op && op != 'M') {
1194 cigar += NStr::IntToString(num);
1195 cigar += op;
1196 num = 0;
1197 }
1198 num += (*it)->GetMatch();
1199 op = 'M';
1200 break;
1201
1202 case CSpliced_exon_chunk::e_Mismatch:
1203 if (op && op != 'M') {
1204 cigar += NStr::IntToString(num);
1205 cigar += op;
1206 num = 0;
1207 }
1208 edit_distance += (*it)->GetMismatch();
1209 num += (*it)->GetMismatch();
1210 op = 'M';
1211 break;
1212
1213 case CSpliced_exon_chunk::e_Product_ins:
1214 if (op && op != 'I') {
1215 cigar += NStr::IntToString(num);
1216 cigar += op;
1217 num = 0;
1218 }
1219 edit_distance += (*it)->GetProduct_ins();
1220 num += (*it)->GetProduct_ins();
1221 op = 'I';
1222 break;
1223
1224 case CSpliced_exon_chunk::e_Genomic_ins:
1225 if (op && op != 'D') {
1226 cigar += NStr::IntToString(num);
1227 cigar += op;
1228 num = 0;
1229 }
1230 edit_distance += (*it)->GetGenomic_ins();
1231 num += (*it)->GetGenomic_ins();
1232 op = 'D';
1233 break;
1234
1235 default:
1236 NCBI_THROW(CException, eInvalid, "Unsupported "
1237 "CSpliced_exon_chunk::TPart value");
1238 }
1239 }
1240 if (num > 0) {
1241 cigar += NStr::IntToString(num);
1242 cigar += op;
1243
1244 }
1245
1246 CSpliced_seg::TExons::const_iterator next_exon(exon);
1247 ++next_exon;
1248 if (next_exon != spliced.GetExons().end()) {
1249 int query_gap = (*next_exon)->GetProduct_start().GetNucpos() -
1250 (*exon)->GetProduct_end().GetNucpos() - 1;
1251 if (query_gap > 0) {
1252 cigar += NStr::IntToString(query_gap);
1253 cigar += "I";
1254 }
1255 edit_distance += query_gap;
1256
1257 int intron = (*next_exon)->GetGenomic_start() -
1258 (*exon)->GetGenomic_end() - 1;
1259 if (intron > 0) {
1260 cigar += NStr::IntToString(intron);
1261 cigar += "N";
1262 }
1263
1264 // get intron orientation
1265 orientation.push_back(
1266 s_GetSpliceSiteOrientation(exon, next_exon));
1267 }
1268 }
1269
1270 if ((int)qrange.GetToOpen() < query_len) {
1271 cigar += NStr::IntToString(query_len - qrange.GetToOpen());
1272 cigar += "S";
1273 }
1274 }
1275 else {
1276 NCBI_THROW(CSeqalignException, eUnsupported, "The SAM formatter does "
1277 "does not support this alignment structure");
1278 }
1279
1280 ostr << cigar << sep;
1281
1282 // reference name of the mate
1283 if (mate) {
1284 if (align.GetSeq_id(1).Match(mate->GetSeq_id(1))) {
1285 ostr << "=";
1286 }
1287 else {
1288 ostr << s_GetBareId(mate->GetSeq_id(1));
1289 }
1290 }
1291 else {
1292 ostr << "*";
1293 }
1294 ostr << sep;
1295
1296 // position of the mate
1297 if (mate) {
1298 ostr << MIN(mate->GetSeqStart(1), mate->GetSeqStop(1)) + 1;
1299 }
1300 else {
1301 ostr << "0";
1302 }
1303 ostr << sep;
1304
1305 // observed template length
1306 ostr << template_length;
1307 ostr << sep;
1308
1309 // read sequence
1310 string sequence;
1311 CRange<TSeqPos> r;
1312 int status = s_GetQuerySequence(bioseq, r,
1313 (sam_flags & SAM_FLAG_SEQ_REVCOMP) != 0, sequence);
1314
1315 if (!status && sequence.length() > 0) {
1316 ostr << sequence << sep;
1317 }
1318 else {
1319 ostr << "*" << sep;
1320 }
1321
1322 // quality string
1323 ostr << "*";
1324
1325 // optional fields
1326 // number of hits reported for the query
1327 ostr << sep << "NH:i:" << num_hits;
1328
1329 // score
1330 int score = 0;
1331 align.GetNamedScore(CSeq_align::eScore_Score, score);
1332 ostr << sep << "AS:i:" << score;
1333
1334 // edit distance
1335 ostr << sep << "NM:i:" << edit_distance;
1336
1337 // splice site orientation
1338 // The final splice orientation is positive or negative, if all introns in
1339 // the alignment have the same orientation, or unknown if orientation
1340 // changes.
1341 if (!orientation.empty()) {
1342 char ori;
1343
1344 switch (orientation[0]) {
1345 case eNa_strand_plus:
1346 ori = '+';
1347 break;
1348
1349 case eNa_strand_minus:
1350 ori = '-';
1351 break;
1352
1353 default:
1354 ori = '?';
1355 }
1356
1357 for (size_t i=1;i < orientation.size();i++) {
1358 if (orientation[i] != orientation[0]) {
1359 ori = '?';
1360 }
1361 }
1362
1363 ostr << sep << "XS:A:" << ori;
1364 }
1365
1366 // MD tag in Seq-align has long subject gaps (deletions) encoded as
1367 // !<gap length>!. 'x' is printed as each deletec base, because we do not
1368 // have access to subject sequence.
1369 if (print_md_tag && !md_tag.empty()) {
1370 vector<string> tokens;
1371 NStr::Split(md_tag, "!", tokens);
1372
1373 ostr << sep << "MD:Z:";
1374 size_t i = 0;
1375 for (;i < tokens.size();i+=2) {
1376 ostr << tokens[i];
1377
1378 if (i < tokens.size() - 1) {
1379 int num = NStr::StringToInt(tokens[i + 1]);
1380 _ASSERT(num > 0);
1381 ostr << "^";
1382 for (int k=0;k < num;k++) {
1383 ostr << "x";
1384 }
1385 }
1386 }
1387 }
1388
1389 return ostr;
1390 }
1391
1392
PrintSAMUnaligned(CNcbiOstream & ostr,const CMagicBlastResults & results,const TQueryMap & queries,bool first_seg,bool trim_read_ids)1393 CNcbiOstream& PrintSAMUnaligned(CNcbiOstream& ostr,
1394 const CMagicBlastResults& results,
1395 const TQueryMap& queries,
1396 bool first_seg,
1397 bool trim_read_ids)
1398 {
1399 string sep = "\t";
1400
1401 CSeq_id id;
1402 if (!results.IsPaired() || first_seg) {
1403 id.Set(results.GetQueryId().AsFastaString());
1404 }
1405 else {
1406 id.Set(results.GetLastId().AsFastaString());
1407 }
1408
1409 // read id
1410 const CBioseq& bioseq = s_GetQueryBioseq(queries, id);
1411 string read_id = s_GetSequenceId(bioseq);
1412 if (trim_read_ids &&
1413 (NStr::EndsWith(read_id, ".1") || NStr::EndsWith(read_id, ".2") ||
1414 NStr::EndsWith(read_id, "/1") || NStr::EndsWith(read_id, "/2"))) {
1415
1416 read_id.resize(read_id.length() - 2);
1417 }
1418 ostr << read_id << sep;
1419
1420 // SAM flags
1421 int flags = SAM_FLAG_SEG_UNMAPPED;
1422 if (results.IsPaired()) {
1423 flags |= SAM_FLAG_MULTI_SEGMENTS;
1424 if ((first_seg && !results.LastAligned()) ||
1425 (!first_seg && !results.FirstAligned())) {
1426
1427 flags |= SAM_FLAG_NEXT_SEG_UNMAPPED;
1428 }
1429
1430 if (first_seg) {
1431 flags |= SAM_FLAG_FIRST_SEGMENT;
1432 }
1433 else {
1434 flags |= SAM_FLAG_LAST_SEGMENT;
1435 }
1436 }
1437 ostr << flags << sep;
1438
1439 // reference sequence id
1440 ostr << "*" << sep;
1441
1442 // mapping position
1443 ostr << "0" << sep;
1444
1445 // mapping quality
1446 ostr << "0" << sep;
1447
1448 // CIGAR
1449 ostr << "*" << sep;
1450
1451 // mate reference sequence id
1452 ostr << "*" << sep;
1453
1454 // mate postition
1455 ostr << "0" << sep;
1456
1457 // template length
1458 ostr << "0" << sep;
1459
1460 // sequence
1461 string sequence;
1462 CRange<TSeqPos> range;
1463 int status = s_GetQuerySequence(bioseq, range, false, sequence);
1464 if (status || sequence.empty()) {
1465 ostr << "*" << sep;
1466 }
1467 else {
1468 ostr << sequence << sep;
1469 }
1470
1471 // quality string
1472 ostr << "*";
1473
1474 // read did not pass filtering
1475 CMagicBlastResults::TResultsInfo info =
1476 first_seg ? results.GetFirstInfo() : results.GetLastInfo();
1477 if ((info & CMagicBlastResults::fFiltered) != 0) {
1478 ostr << sep << "YF:Z:F";
1479 }
1480
1481 return ostr;
1482 }
1483
1484 static
PrintSAM(CNcbiOstream & ostr,CNcbiOstream & unaligned_ostr,CFormattingArgs::EOutputFormat unaligned_fmt,CMagicBlastResults & results,const TQueryMap & queries,const BlastQueryInfo * query_info,bool is_spliced,int batch_number,bool trim_read_id,bool print_unaligned,bool no_discordant,E_StrandSpecificity strand_specific,bool only_specific,bool print_md_tag)1485 CNcbiOstream& PrintSAM(CNcbiOstream& ostr,
1486 CNcbiOstream& unaligned_ostr,
1487 CFormattingArgs::EOutputFormat unaligned_fmt,
1488 CMagicBlastResults& results,
1489 const TQueryMap& queries,
1490 const BlastQueryInfo* query_info,
1491 bool is_spliced, int batch_number,
1492 bool trim_read_id, bool print_unaligned,
1493 bool no_discordant, E_StrandSpecificity strand_specific,
1494 bool only_specific,
1495 bool print_md_tag)
1496 {
1497 bool first_secondary = false;
1498 bool last_secondary = false;
1499
1500 if (strand_specific == eFwdRev) {
1501 results.SortAlignments(CMagicBlastResults::eFwRevFirst);
1502 }
1503 else if (strand_specific == eRevFwd) {
1504 results.SortAlignments(CMagicBlastResults::eRevFwFirst);
1505 }
1506
1507 // Is the pair aligned concordantly? (Unpaired are treated as concordant.)
1508 bool is_concordant = results.IsConcordant();
1509
1510 if (!no_discordant || (no_discordant && is_concordant)) {
1511 for (auto it: results.GetSeqAlign()->Get()) {
1512 PrintSAM(ostr, *it, queries, query_info, is_spliced, batch_number,
1513 first_secondary, last_secondary, trim_read_id,
1514 strand_specific, only_specific, print_md_tag);
1515 ostr << endl;
1516 }
1517 }
1518
1519 if (!print_unaligned) {
1520 return ostr;
1521 }
1522
1523 if ((results.GetFirstInfo() & CMagicBlastResults::fUnaligned) != 0 ||
1524 (no_discordant && !is_concordant)) {
1525
1526 PrintUnaligned(unaligned_ostr, unaligned_fmt, results, queries, true,
1527 trim_read_id);
1528 unaligned_ostr << endl;
1529 }
1530
1531 if (results.IsPaired() &&
1532 ((results.GetLastInfo() & CMagicBlastResults::fUnaligned) != 0 ||
1533 (no_discordant && !is_concordant))) {
1534 PrintUnaligned(unaligned_ostr, unaligned_fmt, results, queries, false,
1535 trim_read_id);
1536 unaligned_ostr << endl;
1537 }
1538
1539 return ostr;
1540 }
1541
1542
PrintSAM(CNcbiOstream & ostr,CNcbiOstream & unaligned_ostr,CFormattingArgs::EOutputFormat unaligned_fmt,const CMagicBlastResultSet & results,const CBioseq_set & query_batch,const BlastQueryInfo * query_info,bool is_spliced,int batch_number,bool trim_read_id,bool print_unaligned,bool no_discordant,E_StrandSpecificity strand_specific,bool only_specific,bool print_md_tag)1543 CNcbiOstream& PrintSAM(CNcbiOstream& ostr,
1544 CNcbiOstream& unaligned_ostr,
1545 CFormattingArgs::EOutputFormat unaligned_fmt,
1546 const CMagicBlastResultSet& results,
1547 const CBioseq_set& query_batch,
1548 const BlastQueryInfo* query_info,
1549 bool is_spliced,
1550 int batch_number,
1551 bool trim_read_id,
1552 bool print_unaligned,
1553 bool no_discordant,
1554 E_StrandSpecificity strand_specific,
1555 bool only_specific,
1556 bool print_md_tag)
1557 {
1558 TQueryMap bioseqs;
1559 s_CreateQueryMap(query_batch, bioseqs);
1560
1561 for (auto it: results) {
1562 PrintSAM(ostr, unaligned_ostr, unaligned_fmt, *it, bioseqs, query_info,
1563 is_spliced, batch_number, trim_read_id, print_unaligned,
1564 no_discordant, strand_specific, only_specific, print_md_tag);
1565 }
1566
1567 return ostr;
1568 }
1569
1570
PrintASN1(CNcbiOstream & ostr,const CBioseq_set & query_batch,CSeq_align_set & aligns)1571 CNcbiOstream& PrintASN1(CNcbiOstream& ostr, const CBioseq_set& query_batch,
1572 CSeq_align_set& aligns)
1573 {
1574 TQueryMap queries;
1575 s_CreateQueryMap(query_batch, queries);
1576
1577 for (auto it: aligns.Set()) {
1578 if (it->GetSegs().Which() != CSeq_align::TSegs::e_Spliced) {
1579 continue;
1580 }
1581
1582 const CBioseq& bioseq = s_GetQueryBioseq(queries, it->GetSeq_id(0));
1583 CRef<CSeq_id> seqid;
1584 if (bioseq.IsSetDescr()) {
1585 for (auto it: bioseq.GetDescr().Get()) {
1586 if (it->IsTitle()) {
1587 vector<string> tokens;
1588 NStr::Split(it->GetTitle(), " ", tokens);
1589 seqid.Reset(new CSeq_id(CSeq_id::e_Local, tokens[0]));
1590 }
1591 }
1592 }
1593
1594 if (seqid.NotEmpty()) {
1595 it->SetSegs().SetSpliced().SetProduct_id(*seqid);
1596 }
1597 }
1598
1599 ostr << MSerial_AsnText << aligns;
1600
1601 return ostr;
1602 }
1603
1604
1605 END_SCOPE(blast)
1606 END_NCBI_SCOPE
1607
1608