1 /*  $Id: magicblast_util.cpp 629892 2021-04-22 19:06:53Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Greg Boratyn
27  *      Implements utils for MagicBLAST application
28  *
29  */
30 
31 
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbiapp.hpp>
34 #include <algo/blast/api/magicblast.hpp>
35 #include <algo/blast/api/remote_blast.hpp>
36 #include <algo/blast/api/magicblast_options.hpp>
37 #include <algo/blast/blastinput/blast_fasta_input.hpp>
38 #include <algo/blast/blastinput/blast_asn1_input.hpp>
39 #include <algo/blast/blast_sra_input/blast_sra_input.hpp>
40 #include <algo/blast/blastinput/magicblast_args.hpp>
41 #include <algo/blast/api/objmgr_query_data.hpp>
42 #include <algo/blast/format/blast_format.hpp>
43 #include "../blast/blast_app_util.hpp"
44 
45 #include <objtools/format/sam_formatter.hpp>
46 
47 #include <algo/blast/api/objmgrfree_query_data.hpp>
48 #include <objects/seqset/Seq_entry.hpp>
49 #include <objects/seqalign/Spliced_seg.hpp>
50 #include <objects/seqalign/Spliced_exon.hpp>
51 #include <objects/seqalign/Spliced_exon_chunk.hpp>
52 #include <objects/seqalign/Product_pos.hpp>
53 #include <objects/seqalign/Splice_site.hpp>
54 
55 #include <objects/general/User_field.hpp>
56 #include <objects/general/User_object.hpp>
57 #include <objects/general/Dbtag.hpp>
58 
59 #include <algo/blast/core/blast_nalookup.h>
60 #include <algo/blast/api/blast_seqinfosrc_aux.hpp>
61 #include <algo/sequence/consensus_splice.hpp>
62 #include <util/sequtil/sequtil_manip.hpp>
63 #include <util/sequtil/sequtil_convert.hpp>
64 
65 #include "magicblast_util.hpp"
66 
67 #include <unordered_set>
68 #include <unordered_map>
69 #include <memory>
70 
71 #ifndef SKIP_DOXYGEN_PROCESSING
72 BEGIN_NCBI_SCOPE;
73 BEGIN_SCOPE(blast);
74 USING_SCOPE(objects);
75 #endif
76 
77 
78 typedef unordered_map<string, CRef<CSeq_entry> > TQueryMap;
79 
80 
81 static
82 CNcbiOstream& PrintTabularUnaligned(CNcbiOstream& ostr,
83                                     const CMagicBlastResults& results,
84                                     const TQueryMap& queries,
85                                     bool first_seg);
86 
87 static
88 CNcbiOstream& PrintSAMUnaligned(CNcbiOstream& ostr,
89                                 const CMagicBlastResults& results,
90                                 const TQueryMap& queries,
91                                 bool first_seg,
92                                 bool trim_read_ids);
93 
94 
s_Complement(char c)95 static char s_Complement(char c)
96 {
97     char retval;
98 
99     switch (c) {
100     case 'A':
101         retval = 'T';
102         break;
103 
104     case 'C':
105         retval = 'G';
106         break;
107 
108     case 'G':
109         retval = 'C';
110         break;
111 
112     case 'T':
113         retval = 'A';
114         break;
115 
116     case 'N':
117         retval = 'N';
118         break;
119 
120     case '-':
121         retval = '-';
122         break;
123 
124     default:
125         retval = 'x';
126     };
127 
128     return retval;
129 }
130 
131 
s_GetBareId(const CSeq_id & id)132 static string s_GetBareId(const CSeq_id& id)
133 {
134     string retval;
135     // Gis are printed with the bar
136     if (id.IsGi()) {
137         retval = id.AsFastaString();
138     }
139     else if (id.IsGeneral()) {
140         const CDbtag& dbt = id.GetGeneral();
141         if (dbt.GetTag().IsStr()) {
142             retval = dbt.GetTag().GetStr();
143         }
144         else if (dbt.GetTag().IsId()) {
145             retval = NStr::IntToString(dbt.GetTag().GetId());
146         }
147     }
148     else {
149         retval = id.GetSeqIdString(true);
150     }
151 
152     return retval;
153 }
154 
155 
s_GetSequenceId(const CBioseq & bioseq)156 static string s_GetSequenceId(const CBioseq& bioseq)
157 {
158     string retval;
159     if (bioseq.IsSetDescr()) {
160         for (auto it: bioseq.GetDescr().Get()) {
161             if (it->IsTitle()) {
162                 vector<string> tokens;
163                 NStr::Split(it->GetTitle(), " ", tokens);
164                 retval = tokens[0];
165             }
166         }
167     }
168 
169     if (retval.empty()) {
170         retval = s_GetBareId(*bioseq.GetFirstId());
171     }
172 
173     return retval;
174 }
175 
176 
s_GetFastaDefline(const CBioseq & bioseq)177 static string s_GetFastaDefline(const CBioseq& bioseq)
178 {
179     string retval;
180     if (bioseq.IsSetDescr()) {
181         for (auto it: bioseq.GetDescr().Get()) {
182             if (it->IsTitle()) {
183                 retval = it->GetTitle();
184             }
185         }
186     }
187 
188     if (retval.empty()) {
189         retval = s_GetBareId(*bioseq.GetFirstId());
190     }
191 
192     return retval;
193 }
194 
195 
s_CreateQueryMap(const CBioseq_set & query_batch,TQueryMap & query_map)196 static void s_CreateQueryMap(const CBioseq_set& query_batch,
197                              TQueryMap& query_map)
198 {
199     query_map.clear();
200     for (auto it: query_batch.GetSeq_set()) {
201 
202         CRef<CSeq_entry> seq_entry(it);
203         const CSeq_id* seq_id = seq_entry->GetSeq().GetFirstId();
204         if (!seq_id) {
205             NCBI_THROW(CException, eInvalid, "Missing Sequence Id");
206         }
207         string id = seq_id->GetSeqIdString();
208         query_map[id] = seq_entry;
209     }
210 }
211 
212 
s_GetQueryBioseq(const TQueryMap & queries,const CSeq_id & seqid)213 static const CBioseq& s_GetQueryBioseq(const TQueryMap& queries,
214                                        const CSeq_id& seqid)
215 {
216     TQueryMap::const_iterator it = queries.find(seqid.GetSeqIdString());
217     _ASSERT(it != queries.end());
218     if (it == queries.end()) {
219         NCBI_THROW(CException, eInvalid, (string)"Query Bioseq not found for "
220                    "id: " + s_GetBareId(seqid));
221     }
222 
223     return it->second->GetSeq();
224 }
225 
s_GetQuerySequence(const CBioseq & bioseq,const CRange<TSeqPos> & range,bool reverse_complement,string & sequence)226 static int s_GetQuerySequence(const CBioseq& bioseq,
227                               const CRange<TSeqPos>& range,
228                               bool reverse_complement,
229                               string& sequence)
230 {
231     const CSeq_data& seq_data = bioseq.GetInst().GetSeq_data();
232     switch (seq_data.Which()) {
233     case CSeq_data::e_Iupacna:
234         sequence = seq_data.GetIupacna().Get();
235         if (range.NotEmpty() && !range.IsWhole()) {
236             sequence = sequence.substr(range.GetFrom(), range.GetLength());
237         }
238         break;
239 
240     case CSeq_data::e_Ncbi2na:
241         CSeqConvert::Convert(seq_data.GetNcbi2na().Get(),
242                              CSeqUtil::e_Ncbi2na, range.GetFrom(),
243                              range.GetLength(),
244                              sequence, CSeqUtil::e_Iupacna);
245         break;
246     case CSeq_data::e_Ncbi4na:
247         CSeqConvert::Convert(seq_data.GetNcbi4na().Get(),
248                              CSeqUtil::e_Ncbi4na, range.GetFrom(),
249                              range.GetLength(),
250                              sequence, CSeqUtil::e_Iupacna);
251         break;
252 
253     case CSeq_data::e_Ncbi8na:
254         CSeqConvert::Convert(seq_data.GetNcbi8na().Get(),
255                              CSeqUtil::e_Ncbi8na, range.GetFrom(),
256                              range.GetLength(),
257                              sequence, CSeqUtil::e_Iupacna);
258         break;
259 
260     default:
261         NCBI_THROW(CException, eInvalid, "Unexpected query sequence "
262                    "encoding");
263     };
264 
265 
266     if (reverse_complement) {
267         string tmp(sequence);
268         CSeqManip::ReverseComplement(tmp, CSeqUtil::e_Iupacna, 0, tmp.length(),
269                                      sequence);
270     }
271 
272     return 0;
273 }
274 
275 
276 static
PrintFastaUnaligned(CNcbiOstream & ostr,const CMagicBlastResults & results,const TQueryMap & queries,bool first_seg)277 CNcbiOstream& PrintFastaUnaligned(CNcbiOstream& ostr,
278                                   const CMagicBlastResults& results,
279                                   const TQueryMap& queries,
280                                   bool first_seg)
281 {
282     CSeq_id id;
283     if (!results.IsPaired() || first_seg) {
284         id.Set(results.GetQueryId().AsFastaString());
285     }
286     else {
287         id.Set(results.GetLastId().AsFastaString());
288     }
289 
290     const CBioseq& bioseq = s_GetQueryBioseq(queries, id);
291 
292     // defline
293     ostr << ">" << s_GetFastaDefline(bioseq) << endl;
294 
295     // sequence
296     string sequence;
297     CRange<TSeqPos> range;
298     s_GetQuerySequence(bioseq, range, false, sequence);
299     ostr << sequence;
300 
301     return ostr;
302 }
303 
304 static
PrintUnaligned(CNcbiOstream & ostr,CFormattingArgs::EOutputFormat fmt,const CMagicBlastResults & results,const TQueryMap & queries,bool first_seg,bool trim_read_ids)305 CNcbiOstream& PrintUnaligned(CNcbiOstream& ostr,
306                              CFormattingArgs::EOutputFormat fmt,
307                              const CMagicBlastResults& results,
308                              const TQueryMap& queries,
309                              bool first_seg,
310                              bool trim_read_ids)
311 {
312 
313 
314     switch (fmt) {
315 
316     case CFormattingArgs::eTabular:
317         return PrintTabularUnaligned(ostr, results, queries, first_seg);
318 
319     case CFormattingArgs::eFasta:
320         return PrintFastaUnaligned(ostr, results, queries, first_seg);
321 
322     default:
323         return PrintSAMUnaligned(ostr, results, queries, first_seg, trim_read_ids);
324     };
325 }
326 
PrintTabularHeader(CNcbiOstream & ostr,const string & version,const string & cmd_line_args)327 CNcbiOstream& PrintTabularHeader(CNcbiOstream& ostr, const string& version,
328                                  const string& cmd_line_args)
329 {
330     string sep = "\t";
331 
332     ostr << "# MAGICBLAST " << version << endl;
333     ostr << "# " << cmd_line_args << endl;
334 
335     ostr << "# Fields: ";
336     ostr << "query acc." << sep;
337     ostr << "reference acc." << sep;
338     ostr << "% identity" << sep;
339     ostr << "not used" << sep;
340     ostr << "not used" << sep;
341     ostr << "not used" << sep;
342     ostr << "query start" << sep;
343     ostr << "query end" << sep;
344     ostr << "reference start" << sep;
345     ostr << "reference end" << sep;
346     ostr << "not used" << sep;
347     ostr << "not used" << sep;
348     ostr << "score" << sep;
349     ostr << "query strand" << sep;
350     ostr << "reference strand" << sep;
351     ostr << "query length" << sep;
352     ostr << "BTOP" << sep;
353     ostr << "num placements" << sep;
354     ostr << "not used" << sep;
355     ostr << "compartment" << sep;
356     ostr << "left overhang" << sep;
357     ostr << "right overhang" << sep;
358     ostr << "mate reference" << sep;
359     ostr << "mate ref. start" << sep;
360     ostr << "composite score";
361 
362     ostr << endl;
363 
364     return ostr;
365 }
366 
367 
368 static
PrintTabular(CNcbiOstream & ostr,const CSeq_align & align,const TQueryMap & queries,bool is_paired,int batch_number,int compartment,const CSeq_align * mate=NULL)369 CNcbiOstream& PrintTabular(CNcbiOstream& ostr, const CSeq_align& align,
370                            const TQueryMap& queries,
371                            bool is_paired, int batch_number, int compartment,
372                            const CSeq_align* mate = NULL)
373 {
374     // if paired alignment
375     if (align.GetSegs().IsDisc()) {
376 
377         const CSeq_align_set& disc = align.GetSegs().GetDisc();
378         _ASSERT(disc.Get().size() == 2u);
379 
380         CSeq_align_set::Tdata::const_iterator first = disc.Get().begin();
381         _ASSERT(first != disc.Get().end());
382         CSeq_align_set::Tdata::const_iterator second(first);
383         ++second;
384         _ASSERT(second != disc.Get().end());
385 
386         PrintTabular(ostr, **first, queries, is_paired, batch_number,
387                      compartment, second->GetNonNullPointer());
388         ostr << endl;
389 
390         PrintTabular(ostr, **second, queries, is_paired, batch_number,
391                      compartment, first->GetNonNullPointer());
392 
393         return ostr;
394     }
395 
396     string sep = "\t";
397     const CBioseq& bioseq = s_GetQueryBioseq(queries, align.GetSeq_id(0));
398     ostr << s_GetSequenceId(bioseq) << sep;
399 
400     ostr << s_GetBareId(align.GetSeq_id(1)) << sep;
401 
402     int score;
403     double perc_identity;
404     align.GetNamedScore(CSeq_align::eScore_Score, score);
405     align.GetNamedScore(CSeq_align::eScore_PercentIdentity_Gapped,
406                         perc_identity);
407 
408     ostr << perc_identity << sep;
409 
410     ostr << 0 << sep; // length
411     ostr << 0 << sep; // mismatch
412     ostr << 0 << sep; // gapopen
413 
414 
415     int query_len = 0;
416 
417     if (align.GetSegs().IsDenseg()) {
418         CRange<TSeqPos> range = align.GetSeqRange(0);
419         ostr << range.GetFrom() + 1 << sep << range.GetTo() + 1 << sep;
420         range = align.GetSeqRange(1);
421         if (align.GetSeqStrand(0) == eNa_strand_minus) {
422             ostr << range.GetTo() + 1 << sep << range.GetFrom() + 1 << sep;
423         }
424         else {
425             ostr << range.GetFrom() + 1 << sep << range.GetTo() + 1 << sep;
426         }
427     }
428     else if (align.GetSegs().IsSpliced()) {
429         CRange<TSeqPos> range = align.GetSeqRange(0);
430         if (align.GetSegs().GetSpliced().IsSetProduct_length()) {
431             query_len = align.GetSegs().GetSpliced().GetProduct_length();
432         }
433         else {
434             _ASSERT(0);
435         }
436         if (align.GetSeqStrand(0) == eNa_strand_minus) {
437             ostr << query_len - range.GetTo() << sep
438                  << query_len - range.GetFrom() << sep;
439 
440             range = align.GetSeqRange(1);
441             ostr << range.GetTo() + 1 << sep << range.GetFrom() + 1 << sep;
442         }
443         else {
444             ostr << range.GetFrom() + 1 << sep << range.GetTo() + 1 << sep;
445             range = align.GetSeqRange(1);
446             ostr << range.GetFrom() + 1 << sep << range.GetTo() + 1 << sep;
447         }
448 
449     }
450 
451     ostr << 0.0 << sep; // e-value
452     ostr << 99 << sep;  // bit score
453 
454     ostr << score << sep;
455 
456     // query is always a plus strand
457     ostr << "plus" << sep
458          << (align.GetSeqStrand(0) == 1 ? "plus" : "minus");
459 
460     string btop_string;
461     Int4 num_hits = 0;
462     Int4 pair_start = 0;
463     Int4 fragment_score = 0;
464 
465     CConstRef<CUser_object> ext = align.FindExt("Mapper Info");
466     if (ext.NotEmpty()) {
467 
468         ITERATE (CUser_object::TData, it, ext->GetData()) {
469             if (!(*it)->GetLabel().IsStr()) {
470                 continue;
471             }
472 
473             if ((*it)->GetLabel().GetStr() == "btop" &&
474                 (*it)->GetData().IsStr()) {
475 
476                 btop_string = (*it)->GetString();
477             }
478             else if ((*it)->GetLabel().GetStr() == "num_hits" &&
479                      (*it)->GetData().IsInt()) {
480 
481                 num_hits = (*it)->GetInt();
482             }
483         }
484     }
485 
486     // for alignments on the minus strand
487     if (align.GetSeqStrand(0) == eNa_strand_minus) {
488 
489         // reverse btop string
490         string new_btop;
491         int i = btop_string.length() - 1;
492         while (i >= 0) {
493             int to = i;
494             while (i >= 0 && (isdigit(btop_string[i]) ||
495                               btop_string[i] == ')')) {
496                 i--;
497             }
498 
499             new_btop += btop_string.substr(i + 1, to - i);
500 
501             if (i > 0) {
502                 if (isalpha(btop_string[i]) || btop_string[i] == '-') {
503                     // complement bases in place
504                     new_btop += s_Complement(btop_string[i - 1]);
505                     new_btop += s_Complement(btop_string[i]);
506                     i--;
507                 }
508                 else {
509                     new_btop += btop_string[i];
510                 }
511             }
512 
513             i--;
514         }
515         btop_string.swap(new_btop);
516     }
517 
518     fragment_score = score;
519     if (mate) {
520         int mate_score = 0;
521         mate->GetNamedScore(CSeq_align::eScore_Score, mate_score);
522         fragment_score += mate_score;
523     }
524 
525     // report unaligned part of the query: 3' end and reverese complemented
526     // 5' end
527     string left_overhang = "-";
528     string right_overhang = "-";
529     if (query_len <= 0) {
530         query_len = bioseq.GetInst().GetLength();
531     }
532 
533     CRange<TSeqPos> range = align.GetSeqRange(0);
534     int from = range.GetFrom();
535     int to = range.GetToOpen();
536     if (align.GetSeqStrand(0) == eNa_strand_minus) {
537         from = query_len - range.GetToOpen();
538         to = query_len - range.GetFrom();
539     }
540 
541     // reverse complemented 5' end
542     if (from > 0) {
543         CRange<TSeqPos> r(MAX(0, from - 30), from - 1);
544         left_overhang.clear();
545         s_GetQuerySequence(bioseq, r, true, left_overhang);
546     }
547 
548     // 3' end
549     if (to < query_len) {
550         CRange<TSeqPos> r(to, MIN(to + 30 - 1, query_len - 1));
551         right_overhang.clear();
552         s_GetQuerySequence(bioseq, r, false, right_overhang);
553     }
554 
555     ostr << sep << query_len
556          << sep << btop_string
557          << sep << num_hits
558          << sep << /*splice*/ "-"
559          << sep << batch_number << ":" << compartment
560          << sep << left_overhang
561          << sep << right_overhang;
562 
563     if (is_paired && mate) {
564         if (align.GetSeq_id(1).Match(mate->GetSeq_id(1))) {
565             ostr << sep << "-";
566         }
567         else {
568             ostr << sep << mate->GetSeq_id(1).AsFastaString();
569         }
570 
571         pair_start = mate->GetSeqStart(1) + 1;
572         //FIXME: for tests
573         if (mate->GetSeqStrand(0) == eNa_strand_minus) {
574             pair_start = mate->GetSeqStop(1) + 1;
575         }
576         if ((align.GetSeqStart(1) < mate->GetSeqStart(1) &&
577              align.GetSeqStrand(0) == eNa_strand_minus) ||
578             (mate->GetSeqStart(1) < align.GetSeqStart(1) &&
579              mate->GetSeqStrand(0) == eNa_strand_minus)) {
580 
581             pair_start = -pair_start;
582         }
583         ostr << sep << pair_start;
584 
585     }
586     else {
587         ostr << sep << "-" << sep << "-";
588     }
589 
590     ostr << sep << fragment_score;
591 
592     return ostr;
593 }
594 
595 
PrintTabularUnaligned(CNcbiOstream & ostr,const CMagicBlastResults & results,const TQueryMap & queries,bool first_seg)596 CNcbiOstream& PrintTabularUnaligned(CNcbiOstream& ostr,
597                                     const CMagicBlastResults& results,
598                                     const TQueryMap& queries,
599                                     bool first_seg)
600 {
601     string sep = "\t";
602     CSeq_id id;
603     if (!results.IsPaired() || first_seg) {
604         id.Set(results.GetQueryId().AsFastaString());
605     }
606     else {
607         id.Set(results.GetLastId().AsFastaString());
608     }
609     const CBioseq& bioseq = s_GetQueryBioseq(queries, id);
610 
611     // query
612     ostr << s_GetSequenceId(bioseq) << sep;
613 
614     // subject
615     ostr << "-" << sep;
616 
617     // percent identity
618     ostr << 0.0 << sep;
619 
620     ostr << 0 << sep; // length
621     ostr << 0 << sep; // mismatch
622     ostr << 0 << sep; // gapopen
623 
624     // query start and stop
625     ostr << 0 << sep << 0 << sep;
626 
627     // subject start and stop
628     ostr << 0 << sep << 0 << sep;
629 
630     ostr << 0 << sep; // e-value
631     ostr << 99 << sep;  // bit score
632 
633     ostr << 0 << sep;
634 
635     // query and subject strand
636     ostr << "-" << sep << "-" << sep;
637 
638     // query length
639     int query_len = bioseq.GetInst().GetLength();
640 
641     ostr << query_len << sep;
642 
643     // btop string
644     ostr << "-" << sep;
645 
646     // number of placements
647     ostr << 0 << sep;
648 
649     // splice
650     ostr << "-" << sep;
651 
652     // compartment
653     string compart = "-";
654     // if a read did not pass filtering
655     CMagicBlastResults::TResultsInfo info =
656         first_seg ? results.GetFirstInfo() : results.GetLastInfo();
657     if ((info & CMagicBlastResults::fFiltered) != 0) {
658         compart = "F";
659     }
660     ostr << compart << sep;
661 
662     // left overhang
663     ostr << "-" << sep;
664 
665     // right overhang
666     ostr << "-" << sep;
667 
668     // mate reference
669     ostr << "-" << sep;
670 
671     // mate start position
672     ostr << "-" << sep;
673 
674     // composite score
675     ostr << 0;
676 
677     return ostr;
678 }
679 
680 static
PrintTabular(CNcbiOstream & ostr,CNcbiOstream & unaligned_ostr,CFormattingArgs::EOutputFormat unaligned_fmt,const CMagicBlastResults & results,const TQueryMap & queries,bool is_paired,int batch_number,int & compartment,bool trim_read_id,bool print_unaligned,bool no_discordant)681 CNcbiOstream& PrintTabular(CNcbiOstream& ostr,
682                            CNcbiOstream& unaligned_ostr,
683                            CFormattingArgs::EOutputFormat unaligned_fmt,
684                            const CMagicBlastResults& results,
685                            const TQueryMap& queries,
686                            bool is_paired, int batch_number,
687                            int& compartment,
688                            bool trim_read_id,
689                            bool print_unaligned,
690                            bool no_discordant)
691 {
692     bool is_concordant = results.IsConcordant();
693 
694     if (!no_discordant || (no_discordant && is_concordant)) {
695         for (auto it: results.GetSeqAlign()->Get()) {
696             PrintTabular(ostr, *it, queries, is_paired, batch_number,
697                          compartment++);
698             ostr << endl;
699         }
700     }
701 
702     if (!print_unaligned) {
703         return ostr;
704     }
705 
706     if ((results.GetFirstInfo() & CMagicBlastResults::fUnaligned) != 0 ||
707         (no_discordant && !is_concordant)) {
708 
709         PrintUnaligned(unaligned_ostr, unaligned_fmt, results, queries, true,
710                        trim_read_id);
711         unaligned_ostr << endl;
712     }
713 
714     if (results.IsPaired() &&
715         ((results.GetLastInfo() & CMagicBlastResults::fUnaligned) != 0 ||
716          (no_discordant && !is_concordant))) {
717 
718         PrintUnaligned(unaligned_ostr, unaligned_fmt, results, queries, false,
719                        trim_read_id);
720         unaligned_ostr << endl;
721     }
722 
723     return ostr;
724 }
725 
726 
PrintTabular(CNcbiOstream & ostr,CNcbiOstream & unaligned_ostr,CFormattingArgs::EOutputFormat unaligned_fmt,const CMagicBlastResultSet & results,const CBioseq_set & query_batch,bool is_paired,int batch_number,bool trim_read_id,bool print_unaligned,bool no_discordant)727 CNcbiOstream& PrintTabular(CNcbiOstream& ostr,
728                            CNcbiOstream& unaligned_ostr,
729                            CFormattingArgs::EOutputFormat unaligned_fmt,
730                            const CMagicBlastResultSet& results,
731                            const CBioseq_set& query_batch,
732                            bool is_paired, int batch_number,
733                            bool trim_read_id,
734                            bool print_unaligned,
735                            bool no_discordant)
736 {
737     TQueryMap queries;
738     s_CreateQueryMap(query_batch, queries);
739 
740     int compartment = 0;
741     for (auto it: results) {
742         PrintTabular(ostr, unaligned_ostr, unaligned_fmt, *it, queries,
743                      is_paired, batch_number, compartment, trim_read_id,
744                      print_unaligned, no_discordant);
745     }
746 
747     return ostr;
748 }
749 
750 
PrintSAMHeader(CNcbiOstream & ostr,CRef<CLocalDbAdapter> db_adapter,const string & cmd_line_args)751 CNcbiOstream& PrintSAMHeader(CNcbiOstream& ostr,
752                              CRef<CLocalDbAdapter> db_adapter,
753                              const string& cmd_line_args)
754 {
755     BlastSeqSrc* seq_src = db_adapter->MakeSeqSrc();
756     IBlastSeqInfoSrc* seqinfo_src = db_adapter->MakeSeqInfoSrc();
757     _ASSERT(seq_src && seqinfo_src);
758 
759     CRef<CSeqDB> seqdb;
760     if (db_adapter->IsBlastDb()) {
761         seqdb.Reset(db_adapter->GetSearchDatabase()->GetSeqDb());
762     }
763 
764     ostr << "@HD\t" << "VN:1.0\t" << "GO:query" << endl;
765 
766     BlastSeqSrcResetChunkIterator(seq_src);
767     BlastSeqSrcIterator* it = BlastSeqSrcIteratorNew();
768     CRef<CSeq_id> seqid(new CSeq_id);
769     Uint4 length;
770     Int4 oid;
771     while ((oid = BlastSeqSrcIteratorNext(seq_src, it)) != BLAST_SEQSRC_EOF) {
772         GetSequenceLengthAndId(seqinfo_src, oid, CSeq_id::BlastRank, seqid,
773                                &length);
774 
775         ostr << "@SQ\t" << "SN:" << s_GetBareId(*seqid) << "\tLN:" << length;
776 
777         vector<TTaxId> taxids;
778         if (seqdb.NotEmpty()) {
779             seqdb->GetTaxIDs(oid, taxids);
780         }
781 
782         if (!taxids.empty() && taxids[0] != 0) {
783             ostr << "\tSP:";
784             for (vector<TTaxId>::iterator it = taxids.begin();
785                  it != taxids.end(); ++it) {
786                 if (it != taxids.begin()) {
787                     ostr << ",";
788                 }
789                 ostr << *it;
790             }
791         }
792         ostr << endl;
793     }
794     BlastSeqSrcIteratorFree(it);
795 
796     ostr << "@PG\tID:magicblast\tPN:magicblast\tCL:" << cmd_line_args << endl;
797 
798     return ostr;
799 }
800 
801 
802 // hash function for pointers to Seq_id
803 struct hash_seqid
804 {
operator ()hash_seqid805     size_t operator()(const CSeq_id* s) const {
806         std::hash<string> h;
807         return h(s->AsFastaString());
808     }
809 };
810 
811 
812 // equal_to function for pointers to Seq_id
813 struct eq_seqid
814 {
operator ()eq_seqid815     bool operator()(const CSeq_id* a, const CSeq_id* b) const {
816         return a->Match(*b);
817     }
818 };
819 
820 // hash_set of pointers to Seq_ids
821 typedef unordered_set<const CSeq_id*, hash_seqid, eq_seqid> TSeq_idHashSet;
822 
823 
824 static ENa_strand
s_GetSpliceSiteOrientation(const CSpliced_seg::TExons::const_iterator & exon,const CSpliced_seg::TExons::const_iterator & next_exon)825 s_GetSpliceSiteOrientation(const CSpliced_seg::TExons::const_iterator& exon,
826                            const CSpliced_seg::TExons::const_iterator& next_exon)
827 {
828     ENa_strand result = eNa_strand_unknown;
829 
830     // orientation is unknown if exons align on different strands or a exon's
831     // genomic strand is unknown
832     if ((*exon)->GetGenomic_strand() !=
833         (*next_exon)->GetGenomic_strand() ||
834         (*exon)->GetGenomic_strand() == eNa_strand_unknown) {
835 
836         return eNa_strand_unknown;
837     }
838 
839     // orientation is unknown if splice signal is not set
840     if (!(*exon)->IsSetDonor_after_exon() ||
841         !(*next_exon)->IsSetAcceptor_before_exon()) {
842 
843         return eNa_strand_unknown;
844     }
845 
846     // get splice signal
847     string donor = (*exon)->GetDonor_after_exon().GetBases();
848     string acceptor = (*next_exon)->GetAcceptor_before_exon().GetBases();
849 
850     // if the signal is recognised then the splice orientation is the same as
851     // genomic strand
852     if (IsConsensusSplice(donor, acceptor) ||
853         IsKnownNonConsensusSplice(donor, acceptor)) {
854 
855         result = (*exon)->GetGenomic_strand();
856     }
857     else {
858         // otherwise try to recognise reverse complemented splice signals
859 
860         string rc_donor;
861         string rc_acceptor;
862 
863         CSeqManip::ReverseComplement(donor,
864                                      CSeqUtil::e_Iupacna,
865                                      0, donor.length(),
866                                      rc_donor);
867 
868         CSeqManip::ReverseComplement(acceptor,
869                                      CSeqUtil::e_Iupacna,
870                                      0, acceptor.length(),
871                                      rc_acceptor);
872 
873         // if reverse complemented signals are recognised then splice
874         // orientation is opposite to genomic strand
875         if (IsConsensusSplice(rc_acceptor, rc_donor) ||
876             IsKnownNonConsensusSplice(rc_acceptor, rc_donor)) {
877 
878             if ((*exon)->GetGenomic_strand() == eNa_strand_plus) {
879                 result = eNa_strand_minus;
880             }
881             else if ((*exon)->GetGenomic_strand() == eNa_strand_minus) {
882                 result = eNa_strand_plus;
883             }
884             else {
885                 result = eNa_strand_unknown;
886             }
887         }
888         else {
889             // if neither signals are recognised then splice orientation is
890             // unknown
891             result = eNa_strand_unknown;
892         }
893 
894     }
895 
896     return result;
897 }
898 
899 
900 #define SAM_FLAG_MULTI_SEGMENTS  0x1
901 #define SAM_FLAG_SEGS_ALIGNED    0x2
902 #define SAM_FLAG_SEG_UNMAPPED    0x4
903 #define SAM_FLAG_NEXT_SEG_UNMAPPED 0x8
904 #define SAM_FLAG_SEQ_REVCOMP    0x10
905 #define SAM_FLAG_NEXT_REVCOMP   0x20
906 #define SAM_FLAG_FIRST_SEGMENT  0x40
907 #define SAM_FLAG_LAST_SEGMENT   0x80
908 #define SAM_FLAG_SECONDARY      0x100
909 
910 static
PrintSAM(CNcbiOstream & ostr,const CSeq_align & align,const TQueryMap & queries,const BlastQueryInfo * query_info,bool is_spliced,int batch_number,bool & first_secondary,bool & last_secondary,bool trim_read_ids,E_StrandSpecificity strand_specific,bool only_specific,bool print_md_tag,bool other=false,const CSeq_align * mate=NULL)911 CNcbiOstream& PrintSAM(CNcbiOstream& ostr, const CSeq_align& align,
912                        const TQueryMap& queries,
913                        const BlastQueryInfo* query_info,
914                        bool is_spliced,
915                        int batch_number, bool& first_secondary,
916                        bool& last_secondary, bool trim_read_ids,
917                        E_StrandSpecificity strand_specific,
918                        bool only_specific,
919                        bool print_md_tag,
920                        bool other = false,
921                        const CSeq_align* mate = NULL)
922 {
923     string sep = "\t";
924 
925     string btop_string;
926     string md_tag;
927     int query_len = 0;
928     int num_hits = 0;
929     int context = -1;
930     int sam_flags = 0;
931     const int kMaxInsertSize = is_spliced ?
932         MAGICBLAST_MAX_INSERT_SIZE_SPLICED :
933         MAGICBLAST_MAX_INSERT_SIZE_NONSPLICED;
934 
935     // if paired alignment
936     if (align.GetSegs().IsDisc()) {
937 
938         _ASSERT(align.GetSegs().GetDisc().Get().size() == 2);
939 
940         const CSeq_align_set& disc = align.GetSegs().GetDisc();
941         CSeq_align_set::Tdata::const_iterator first = disc.Get().begin();
942         _ASSERT(first != disc.Get().end());
943         CSeq_align_set::Tdata::const_iterator second(first);
944         ++second;
945         _ASSERT(second != disc.Get().end());
946 
947         PrintSAM(ostr, **first, queries, query_info, is_spliced,
948                  batch_number, first_secondary, last_secondary,
949                  trim_read_ids, strand_specific, only_specific,
950                  print_md_tag, false,
951                  second->GetNonNullPointer());
952         ostr << endl;
953 
954         PrintSAM(ostr, **second, queries, query_info, is_spliced,
955                  batch_number, first_secondary, last_secondary,
956                  trim_read_ids, strand_specific, only_specific,
957                  print_md_tag, true,
958                  first->GetNonNullPointer());
959 
960         return ostr;
961     }
962 
963     // get align data saved in the user object
964     CConstRef<CUser_object> ext = align.FindExt("Mapper Info");
965     if (ext.NotEmpty()) {
966 
967         ITERATE (CUser_object::TData, it, ext->GetData()) {
968             if (!(*it)->GetLabel().IsStr()) {
969                 continue;
970             }
971 
972             if ((*it)->GetLabel().GetStr() == "btop" &&
973                 (*it)->GetData().IsStr()) {
974 
975                 btop_string = (*it)->GetString();
976             }
977             else if ((*it)->GetLabel().GetStr() == "num_hits" &&
978                      (*it)->GetData().IsInt()) {
979 
980                 num_hits = (*it)->GetInt();
981             }
982             else if ((*it)->GetLabel().GetStr() == "context" &&
983                      (*it)->GetData().IsInt()) {
984 
985                 context = (*it)->GetInt();
986             }
987             else if ((*it)->GetLabel().GetStr() == "md_tag" &&
988                      (*it)->GetData().IsStr()) {
989 
990                 md_tag = (*it)->GetString();
991             }
992         }
993 
994     }
995 
996     vector<ENa_strand> orientation;
997     if (align.GetSegs().Which() == CSeq_align::TSegs::e_Spliced) {
998         const CSpliced_seg& spliced = align.GetSegs().GetSpliced();
999 
1000         query_len = spliced.GetProduct_length();
1001     }
1002 
1003     // observed template length
1004     int template_length = 0;
1005     CRange<TSeqPos> range = align.GetSeqRange(1);
1006     if (mate && align.GetSeq_id(1).Match(mate->GetSeq_id(1))) {
1007         CRange<TSeqPos> mate_range = mate->GetSeqRange(1);
1008         if (align.GetSeqStrand(0) == eNa_strand_plus &&
1009             align.GetSeqStrand(1) == eNa_strand_plus) {
1010 
1011             template_length = (int)mate_range.GetTo() - (int)range.GetFrom() + 1;
1012         }
1013         else {
1014             template_length =
1015                 -((int)range.GetTo() - (int)mate_range.GetFrom() + 1);
1016         }
1017     }
1018 
1019 
1020     // FIXME: if subject is on a minus strand we need to reverse
1021     // complement both
1022     if (align.GetSeqStrand(0) == eNa_strand_minus) {
1023         sam_flags |= SAM_FLAG_SEQ_REVCOMP;
1024     }
1025 
1026     if (context >= 0 && query_info->contexts[context].segment_flags != 0) {
1027         sam_flags |= SAM_FLAG_MULTI_SEGMENTS;
1028 
1029         if ((query_info->contexts[context].segment_flags & fFirstSegmentFlag)
1030             != 0) {
1031             sam_flags |= SAM_FLAG_FIRST_SEGMENT;
1032         }
1033 
1034         if ((query_info->contexts[context].segment_flags & fLastSegmentFlag)
1035             != 0) {
1036             sam_flags |= SAM_FLAG_LAST_SEGMENT;
1037         }
1038 
1039         if ((query_info->contexts[context].segment_flags & fPartialFlag) != 0
1040             || !mate) {
1041 
1042             sam_flags |= SAM_FLAG_NEXT_SEG_UNMAPPED;
1043         }
1044 
1045         if (mate) {
1046             // FIXME: it is assumed that subject is always in plus strand
1047             // (BLAST way)
1048             ENa_strand a_strand = align.GetSeqStrand(0);
1049             ENa_strand m_strand = mate->GetSeqStrand(0);
1050             bool plus_minus =
1051                     a_strand == eNa_strand_plus && m_strand == eNa_strand_minus;
1052             bool minus_plus =
1053                     a_strand == eNa_strand_minus && m_strand == eNa_strand_plus;
1054             TSeqPos a_start = align.GetSeqStart(1);
1055             TSeqPos m_start = mate->GetSeqStart(1);
1056 
1057             // For strand specific output we reset SAM_FLAG_SEGS_ALIGNED
1058             // for paired alignments with the wrong configuration
1059             if (strand_specific != eNonSpecific) {
1060                 // In this statement <bool1> != <bool2> is equivalent to
1061                 // EXCLUSIVE-OR.
1062                 // If <bool2> is false, conditional returns <bool1>.
1063                 // If <bool2> is true, conditional returns <bool1> inverted.
1064                 // So if "other" is true, actions based on "plus_minus"
1065                 // and "minus_plus" are reversed.
1066                 if (((strand_specific == eFwdRev  &&  plus_minus != other)
1067                      || (strand_specific == eRevFwd  &&  minus_plus != other))
1068                     && template_length < kMaxInsertSize) {
1069 
1070                     sam_flags |= SAM_FLAG_SEGS_ALIGNED;
1071                 }
1072             } else {
1073                 if (((a_start <= m_start && plus_minus)
1074                      || (m_start <= a_start && minus_plus))
1075                     && abs(template_length) < kMaxInsertSize) {
1076                     sam_flags |= SAM_FLAG_SEGS_ALIGNED;
1077                 }
1078             }
1079 
1080             if (mate->GetSeqStrand(0) == eNa_strand_minus) {
1081                 sam_flags |= SAM_FLAG_NEXT_REVCOMP;
1082             }
1083         }
1084     }
1085 
1086     // set secondary alignment bit
1087     if ((sam_flags & SAM_FLAG_FIRST_SEGMENT) != 0) {
1088         if (first_secondary) {
1089             sam_flags |= SAM_FLAG_SECONDARY;
1090         }
1091         else {
1092             first_secondary = true;
1093         }
1094     }
1095     else {
1096         if (last_secondary) {
1097             sam_flags |= SAM_FLAG_SECONDARY;
1098         }
1099         else {
1100             last_secondary = true;
1101         }
1102     }
1103 
1104     // read id
1105     const CBioseq& bioseq = s_GetQueryBioseq(queries, align.GetSeq_id(0));
1106     string read_id = s_GetSequenceId(bioseq);
1107     if (trim_read_ids &&
1108         (NStr::EndsWith(read_id, ".1") || NStr::EndsWith(read_id, ".2") ||
1109          NStr::EndsWith(read_id, "/1") || NStr::EndsWith(read_id, "/2"))) {
1110 
1111         read_id.resize(read_id.length() - 2);
1112     }
1113     ostr << read_id << sep;
1114 
1115     // flag
1116     ostr << sam_flags << sep;
1117 
1118     // reference sequence id
1119     ostr << s_GetBareId(align.GetSeq_id(1)) << sep;
1120 
1121     // mapping position
1122     ostr << range.GetFrom() + 1 << sep;
1123 
1124     // mapping quality
1125     ostr << "255" << sep;
1126 
1127     // CIGAR string
1128     string cigar;
1129     int edit_distance = 0;
1130     if (align.GetSegs().Which() == CSeq_align::TSegs::e_Denseg) {
1131         const CDense_seg& denseg = align.GetSegs().GetDenseg();
1132         const CDense_seg::TStarts& starts = denseg.GetStarts();
1133         const CDense_seg::TLens& lens = denseg.GetLens();
1134         CRange<TSeqPos> qrange = align.GetSeqRange(0);
1135 
1136         if (align.GetSeqStrand(0) == eNa_strand_plus) {
1137             if (qrange.GetFrom() > 0) {
1138                 cigar += NStr::IntToString(qrange.GetFrom());
1139                 cigar += "S";
1140             }
1141         }
1142         else {
1143             if ((int)qrange.GetToOpen() < query_len) {
1144                 cigar += NStr::IntToString(query_len - qrange.GetToOpen());
1145                 cigar += "S";
1146             }
1147         }
1148         for (size_t i=0;i < starts.size();i+=2) {
1149             cigar += NStr::IntToString(lens[i/2]);
1150             if (starts[i] >= 0 && starts[i + 1] >= 0) {
1151                 cigar += "M";
1152             }
1153             else if (starts[i] < 0) {
1154                 if (lens[i/2] < 10) {
1155                     cigar += "D";
1156                 }
1157                 else {
1158                     cigar += "N";
1159                 }
1160             }
1161             else {
1162                 cigar += "I";
1163             }
1164         }
1165         if (align.GetSeqStrand(0) == eNa_strand_plus) {
1166             if ((int)qrange.GetToOpen() < query_len) {
1167                 cigar += NStr::IntToString(query_len - qrange.GetToOpen());
1168                 cigar += "S";
1169             }
1170         }
1171         else {
1172             if (qrange.GetFrom() > 0) {
1173                 cigar += NStr::IntToString(qrange.GetFrom());
1174                 cigar += "S";
1175             }
1176         }
1177     }
1178     else if (align.GetSegs().Which() == CSeq_align::TSegs::e_Spliced) {
1179         const CSpliced_seg& spliced = align.GetSegs().GetSpliced();
1180         CRange<TSeqPos> qrange = align.GetSeqRange(0);
1181 
1182         if (qrange.GetFrom() > 0) {
1183             cigar += NStr::IntToString(qrange.GetFrom());
1184             cigar += "S";
1185         }
1186 
1187         ITERATE (CSpliced_seg::TExons, exon, spliced.GetExons()) {
1188             int num = 0;
1189             char op = 0;
1190             ITERATE(CSpliced_exon::TParts, it, (*exon)->GetParts()) {
1191                 switch ((*it)->Which()) {
1192                 case CSpliced_exon_chunk::e_Match:
1193                     if (op && op != 'M') {
1194                         cigar += NStr::IntToString(num);
1195                         cigar += op;
1196                         num = 0;
1197                     }
1198                     num += (*it)->GetMatch();
1199                     op = 'M';
1200                     break;
1201 
1202                 case CSpliced_exon_chunk::e_Mismatch:
1203                     if (op && op != 'M') {
1204                         cigar += NStr::IntToString(num);
1205                         cigar += op;
1206                         num = 0;
1207                     }
1208                     edit_distance += (*it)->GetMismatch();
1209                     num += (*it)->GetMismatch();
1210                     op = 'M';
1211                     break;
1212 
1213                 case CSpliced_exon_chunk::e_Product_ins:
1214                     if (op && op != 'I') {
1215                         cigar += NStr::IntToString(num);
1216                         cigar += op;
1217                         num = 0;
1218                     }
1219                     edit_distance += (*it)->GetProduct_ins();
1220                     num += (*it)->GetProduct_ins();
1221                     op = 'I';
1222                     break;
1223 
1224                 case CSpliced_exon_chunk::e_Genomic_ins:
1225                     if (op && op != 'D') {
1226                         cigar += NStr::IntToString(num);
1227                         cigar += op;
1228                         num = 0;
1229                     }
1230                     edit_distance += (*it)->GetGenomic_ins();
1231                     num += (*it)->GetGenomic_ins();
1232                     op = 'D';
1233                     break;
1234 
1235                 default:
1236                     NCBI_THROW(CException, eInvalid, "Unsupported "
1237                                "CSpliced_exon_chunk::TPart value");
1238                 }
1239             }
1240             if (num > 0) {
1241                 cigar += NStr::IntToString(num);
1242                 cigar += op;
1243 
1244             }
1245 
1246             CSpliced_seg::TExons::const_iterator next_exon(exon);
1247             ++next_exon;
1248             if (next_exon != spliced.GetExons().end()) {
1249                 int query_gap = (*next_exon)->GetProduct_start().GetNucpos() -
1250                     (*exon)->GetProduct_end().GetNucpos() - 1;
1251                 if (query_gap > 0) {
1252                     cigar += NStr::IntToString(query_gap);
1253                     cigar += "I";
1254                 }
1255                 edit_distance += query_gap;
1256 
1257                 int intron = (*next_exon)->GetGenomic_start() -
1258                     (*exon)->GetGenomic_end() - 1;
1259                 if (intron > 0) {
1260                     cigar += NStr::IntToString(intron);
1261                     cigar += "N";
1262                 }
1263 
1264                 // get intron orientation
1265                 orientation.push_back(
1266                                s_GetSpliceSiteOrientation(exon, next_exon));
1267             }
1268         }
1269 
1270         if ((int)qrange.GetToOpen() < query_len) {
1271             cigar += NStr::IntToString(query_len - qrange.GetToOpen());
1272             cigar += "S";
1273         }
1274     }
1275     else {
1276         NCBI_THROW(CSeqalignException, eUnsupported, "The SAM formatter does "
1277                    "does not support this alignment structure");
1278     }
1279 
1280     ostr << cigar << sep;
1281 
1282     // reference name of the mate
1283     if (mate) {
1284         if (align.GetSeq_id(1).Match(mate->GetSeq_id(1))) {
1285             ostr << "=";
1286         }
1287         else {
1288             ostr << s_GetBareId(mate->GetSeq_id(1));
1289         }
1290     }
1291     else {
1292         ostr << "*";
1293     }
1294     ostr << sep;
1295 
1296     // position of the mate
1297     if (mate) {
1298         ostr << MIN(mate->GetSeqStart(1), mate->GetSeqStop(1)) + 1;
1299     }
1300     else {
1301         ostr << "0";
1302     }
1303     ostr << sep;
1304 
1305     // observed template length
1306     ostr << template_length;
1307     ostr << sep;
1308 
1309     // read sequence
1310     string sequence;
1311     CRange<TSeqPos> r;
1312     int status = s_GetQuerySequence(bioseq, r,
1313                            (sam_flags & SAM_FLAG_SEQ_REVCOMP) != 0, sequence);
1314 
1315     if (!status && sequence.length() > 0) {
1316         ostr << sequence << sep;
1317     }
1318     else {
1319         ostr << "*" << sep;
1320     }
1321 
1322     // quality string
1323     ostr << "*";
1324 
1325     // optional fields
1326     // number of hits reported for the query
1327     ostr << sep << "NH:i:" << num_hits;
1328 
1329     // score
1330     int score = 0;
1331     align.GetNamedScore(CSeq_align::eScore_Score, score);
1332     ostr << sep << "AS:i:" << score;
1333 
1334     // edit distance
1335     ostr << sep << "NM:i:" << edit_distance;
1336 
1337     // splice site orientation
1338     // The final splice orientation is positive or negative, if all introns in
1339     // the alignment have the same orientation, or unknown if orientation
1340     // changes.
1341     if (!orientation.empty()) {
1342         char ori;
1343 
1344         switch (orientation[0]) {
1345         case eNa_strand_plus:
1346             ori = '+';
1347             break;
1348 
1349         case eNa_strand_minus:
1350             ori = '-';
1351             break;
1352 
1353         default:
1354             ori = '?';
1355         }
1356 
1357         for (size_t i=1;i < orientation.size();i++) {
1358             if (orientation[i] != orientation[0]) {
1359                 ori = '?';
1360             }
1361         }
1362 
1363         ostr << sep << "XS:A:" << ori;
1364     }
1365 
1366     // MD tag in Seq-align has long subject gaps (deletions) encoded as
1367     // !<gap length>!. 'x' is printed as each deletec base, because we do not
1368     // have access to subject sequence.
1369     if (print_md_tag && !md_tag.empty()) {
1370         vector<string> tokens;
1371         NStr::Split(md_tag, "!", tokens);
1372 
1373         ostr << sep << "MD:Z:";
1374         size_t i = 0;
1375         for (;i < tokens.size();i+=2) {
1376             ostr << tokens[i];
1377 
1378             if (i < tokens.size() - 1) {
1379                 int num = NStr::StringToInt(tokens[i + 1]);
1380                 _ASSERT(num > 0);
1381                 ostr << "^";
1382                 for (int k=0;k < num;k++) {
1383                     ostr << "x";
1384                 }
1385             }
1386         }
1387     }
1388 
1389     return ostr;
1390 }
1391 
1392 
PrintSAMUnaligned(CNcbiOstream & ostr,const CMagicBlastResults & results,const TQueryMap & queries,bool first_seg,bool trim_read_ids)1393 CNcbiOstream& PrintSAMUnaligned(CNcbiOstream& ostr,
1394                                 const CMagicBlastResults& results,
1395                                 const TQueryMap& queries,
1396                                 bool first_seg,
1397                                 bool trim_read_ids)
1398 {
1399     string sep = "\t";
1400 
1401     CSeq_id id;
1402     if (!results.IsPaired() || first_seg) {
1403         id.Set(results.GetQueryId().AsFastaString());
1404     }
1405     else {
1406         id.Set(results.GetLastId().AsFastaString());
1407     }
1408 
1409     // read id
1410     const CBioseq& bioseq = s_GetQueryBioseq(queries, id);
1411     string read_id = s_GetSequenceId(bioseq);
1412     if (trim_read_ids &&
1413         (NStr::EndsWith(read_id, ".1") || NStr::EndsWith(read_id, ".2") ||
1414          NStr::EndsWith(read_id, "/1") || NStr::EndsWith(read_id, "/2"))) {
1415 
1416         read_id.resize(read_id.length() - 2);
1417     }
1418     ostr << read_id << sep;
1419 
1420     // SAM flags
1421     int flags = SAM_FLAG_SEG_UNMAPPED;
1422     if (results.IsPaired()) {
1423         flags |= SAM_FLAG_MULTI_SEGMENTS;
1424         if ((first_seg && !results.LastAligned()) ||
1425             (!first_seg && !results.FirstAligned())) {
1426 
1427             flags |= SAM_FLAG_NEXT_SEG_UNMAPPED;
1428         }
1429 
1430         if (first_seg) {
1431             flags |= SAM_FLAG_FIRST_SEGMENT;
1432         }
1433         else {
1434             flags |= SAM_FLAG_LAST_SEGMENT;
1435         }
1436     }
1437     ostr << flags << sep;
1438 
1439     // reference sequence id
1440     ostr << "*" << sep;
1441 
1442     // mapping position
1443     ostr << "0" << sep;
1444 
1445     // mapping quality
1446     ostr << "0" << sep;
1447 
1448     // CIGAR
1449     ostr << "*" << sep;
1450 
1451     // mate reference sequence id
1452     ostr << "*" << sep;
1453 
1454     // mate postition
1455     ostr << "0" << sep;
1456 
1457     // template length
1458     ostr << "0" << sep;
1459 
1460     // sequence
1461     string sequence;
1462     CRange<TSeqPos> range;
1463     int status = s_GetQuerySequence(bioseq, range, false, sequence);
1464     if (status || sequence.empty()) {
1465         ostr << "*" << sep;
1466     }
1467     else {
1468         ostr << sequence << sep;
1469     }
1470 
1471     // quality string
1472     ostr << "*";
1473 
1474     // read did not pass filtering
1475     CMagicBlastResults::TResultsInfo info =
1476         first_seg ? results.GetFirstInfo() : results.GetLastInfo();
1477     if ((info & CMagicBlastResults::fFiltered) != 0) {
1478         ostr << sep << "YF:Z:F";
1479     }
1480 
1481     return ostr;
1482 }
1483 
1484 static
PrintSAM(CNcbiOstream & ostr,CNcbiOstream & unaligned_ostr,CFormattingArgs::EOutputFormat unaligned_fmt,CMagicBlastResults & results,const TQueryMap & queries,const BlastQueryInfo * query_info,bool is_spliced,int batch_number,bool trim_read_id,bool print_unaligned,bool no_discordant,E_StrandSpecificity strand_specific,bool only_specific,bool print_md_tag)1485 CNcbiOstream& PrintSAM(CNcbiOstream& ostr,
1486                        CNcbiOstream& unaligned_ostr,
1487                        CFormattingArgs::EOutputFormat unaligned_fmt,
1488                        CMagicBlastResults& results,
1489                        const TQueryMap& queries,
1490                        const BlastQueryInfo* query_info,
1491                        bool is_spliced, int batch_number,
1492                        bool trim_read_id, bool print_unaligned,
1493                        bool no_discordant, E_StrandSpecificity strand_specific,
1494                        bool only_specific,
1495                        bool print_md_tag)
1496 {
1497     bool first_secondary = false;
1498     bool last_secondary = false;
1499 
1500     if (strand_specific == eFwdRev) {
1501         results.SortAlignments(CMagicBlastResults::eFwRevFirst);
1502     }
1503     else if (strand_specific == eRevFwd) {
1504         results.SortAlignments(CMagicBlastResults::eRevFwFirst);
1505     }
1506 
1507      // Is the pair aligned concordantly? (Unpaired are treated as concordant.)
1508     bool is_concordant = results.IsConcordant();
1509 
1510     if (!no_discordant || (no_discordant && is_concordant)) {
1511         for (auto it: results.GetSeqAlign()->Get()) {
1512             PrintSAM(ostr, *it, queries, query_info, is_spliced, batch_number,
1513                      first_secondary, last_secondary, trim_read_id,
1514                      strand_specific, only_specific, print_md_tag);
1515             ostr << endl;
1516         }
1517     }
1518 
1519     if (!print_unaligned) {
1520         return ostr;
1521     }
1522 
1523     if ((results.GetFirstInfo() & CMagicBlastResults::fUnaligned) != 0 ||
1524         (no_discordant && !is_concordant)) {
1525 
1526         PrintUnaligned(unaligned_ostr, unaligned_fmt, results, queries, true,
1527                        trim_read_id);
1528         unaligned_ostr << endl;
1529     }
1530 
1531     if (results.IsPaired() &&
1532         ((results.GetLastInfo() & CMagicBlastResults::fUnaligned) != 0 ||
1533          (no_discordant && !is_concordant))) {
1534         PrintUnaligned(unaligned_ostr, unaligned_fmt, results, queries, false,
1535                        trim_read_id);
1536         unaligned_ostr << endl;
1537     }
1538 
1539     return ostr;
1540 }
1541 
1542 
PrintSAM(CNcbiOstream & ostr,CNcbiOstream & unaligned_ostr,CFormattingArgs::EOutputFormat unaligned_fmt,const CMagicBlastResultSet & results,const CBioseq_set & query_batch,const BlastQueryInfo * query_info,bool is_spliced,int batch_number,bool trim_read_id,bool print_unaligned,bool no_discordant,E_StrandSpecificity strand_specific,bool only_specific,bool print_md_tag)1543 CNcbiOstream& PrintSAM(CNcbiOstream& ostr,
1544                        CNcbiOstream& unaligned_ostr,
1545                        CFormattingArgs::EOutputFormat unaligned_fmt,
1546                        const CMagicBlastResultSet& results,
1547                        const CBioseq_set& query_batch,
1548                        const BlastQueryInfo* query_info,
1549                        bool is_spliced,
1550                        int batch_number,
1551                        bool trim_read_id,
1552                        bool print_unaligned,
1553                        bool no_discordant,
1554                        E_StrandSpecificity strand_specific,
1555                        bool only_specific,
1556                        bool print_md_tag)
1557 {
1558     TQueryMap bioseqs;
1559     s_CreateQueryMap(query_batch, bioseqs);
1560 
1561     for (auto it: results) {
1562         PrintSAM(ostr, unaligned_ostr, unaligned_fmt, *it, bioseqs, query_info,
1563                  is_spliced, batch_number, trim_read_id, print_unaligned,
1564                  no_discordant, strand_specific, only_specific, print_md_tag);
1565     }
1566 
1567     return ostr;
1568 }
1569 
1570 
PrintASN1(CNcbiOstream & ostr,const CBioseq_set & query_batch,CSeq_align_set & aligns)1571 CNcbiOstream& PrintASN1(CNcbiOstream& ostr, const CBioseq_set& query_batch,
1572                         CSeq_align_set& aligns)
1573 {
1574     TQueryMap queries;
1575     s_CreateQueryMap(query_batch, queries);
1576 
1577     for (auto it: aligns.Set()) {
1578         if (it->GetSegs().Which() != CSeq_align::TSegs::e_Spliced) {
1579             continue;
1580         }
1581 
1582         const CBioseq& bioseq = s_GetQueryBioseq(queries, it->GetSeq_id(0));
1583         CRef<CSeq_id> seqid;
1584         if (bioseq.IsSetDescr()) {
1585             for (auto it: bioseq.GetDescr().Get()) {
1586                 if (it->IsTitle()) {
1587                     vector<string> tokens;
1588                     NStr::Split(it->GetTitle(), " ", tokens);
1589                     seqid.Reset(new CSeq_id(CSeq_id::e_Local, tokens[0]));
1590                 }
1591             }
1592         }
1593 
1594         if (seqid.NotEmpty()) {
1595             it->SetSegs().SetSpliced().SetProduct_id(*seqid);
1596         }
1597     }
1598 
1599     ostr << MSerial_AsnText << aligns;
1600 
1601     return ostr;
1602 }
1603 
1604 
1605 END_SCOPE(blast)
1606 END_NCBI_SCOPE
1607 
1608