1 /*
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's offical duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================*/
25 
26 /*****************************************************************************
27 
28 Author: Jason Papadopoulos
29 
30 ******************************************************************************/
31 
32 /** @file blast_format.cpppiv
33  * Produce formatted blast output
34 */
35 
36 #include <ncbi_pch.hpp>
37 #include <algo/blast/format/blast_format.hpp>
38 #include <objects/seq/Seq_annot.hpp>
39 #include <objects/seq/Seq_descr.hpp>
40 #include <objmgr/seq_loc_mapper.hpp>
41 #include <objmgr/util/sequence.hpp>
42 #include <objmgr/util/create_defline.hpp>
43 #include <algo/blast/core/blast_stat.h>
44 #include <corelib/ncbiutil.hpp>                 // for FindBestChoice
45 #include <algo/blast/api/sseqloc.hpp>
46 #include <algo/blast/api/objmgr_query_data.hpp>
47 
48 #include <algo/blast/format/blastxml_format.hpp>
49 #include <algo/blast/format/data4xmlformat.hpp>       /* NCBI_FAKE_WARNING */
50 #include <algo/blast/format/blastxml2_format.hpp>
51 #include <algo/blast/format/data4xml2format.hpp>       /* NCBI_FAKE_WARNING */
52 #include <algo/blast/format/build_archive.hpp>
53 #include <misc/jsonwrapp/jsonwrapp.hpp>
54 #include <objtools/blast/seqdb_reader/seqdb.hpp>   // for CSeqDB
55 #include <serial/objostrxml.hpp>
56 
57 #include <corelib/ncbistre.hpp>
58 
59 #ifndef SKIP_DOXYGEN_PROCESSING
60 USING_NCBI_SCOPE;
61 USING_SCOPE(blast);
62 USING_SCOPE(objects);
63 USING_SCOPE(align_format);
64 USING_SCOPE(sequence);
65 #endif
66 
67 
CBlastFormat(const blast::CBlastOptions & options,blast::CLocalDbAdapter & db_adapter,blast::CFormattingArgs::EOutputFormat format_type,bool believe_query,CNcbiOstream & outfile,int num_summary,int num_alignments,CScope & scope,const char * matrix_name,bool show_gi,bool is_html,int qgencode,int dbgencode,bool use_sum_statistics,bool is_remote_search,int dbfilt_algorithm,const string & custom_output_format,bool is_megablast,bool is_indexed,const blast::CIgBlastOptions * ig_opts,const blast::CLocalDbAdapter * domain_db_adapter,const string & cmdline,const string & subjectTag)68 CBlastFormat::CBlastFormat(const blast::CBlastOptions& options,
69                            blast::CLocalDbAdapter& db_adapter,
70                  blast::CFormattingArgs::EOutputFormat format_type,
71                  bool believe_query, CNcbiOstream& outfile,
72                  int num_summary,
73                  int num_alignments,
74                  CScope & scope,
75                  const char *matrix_name /* = BLAST_DEFAULT_MATRIX */,
76                  bool show_gi /* = false */,
77                  bool is_html /* = false */,
78                  int qgencode /* = BLAST_GENETIC_CODE */,
79                  int dbgencode /* = BLAST_GENETIC_CODE */,
80                  bool use_sum_statistics /* = false */,
81                  bool is_remote_search /* = false */,
82                  int dbfilt_algorithm /* = -1 */,
83                  const string& custom_output_format /* = kEmptyStr */,
84                  bool is_megablast /* = false */,
85                  bool is_indexed /* = false */,
86                  const blast::CIgBlastOptions *ig_opts /* = NULL */,
87                  const blast::CLocalDbAdapter* domain_db_adapter /* = NULL*/,
88                  const string & cmdline /* =kEMptyStr*/,
89 		 const string& subjectTag /* =kEmptyStr */)
90         : m_FormatType(format_type), m_IsHTML(is_html),
91           m_DbIsAA(db_adapter.IsProtein()), m_BelieveQuery(believe_query),
92           m_Outfile(outfile), m_NumSummary(num_summary),
93           m_NumAlignments(num_alignments), m_HitlistSize(options.GetHitlistSize()),
94           m_Program(Blast_ProgramNameFromType(options.GetProgramType())),
95           m_DbName(kEmptyStr),
96           m_QueryGenCode(qgencode), m_DbGenCode(dbgencode),
97           m_ShowGi(show_gi), m_ShowLinkedSetSize(false),
98           m_IsUngappedSearch(!options.GetGappedMode()),
99           m_MatrixName(matrix_name),
100           m_Scope(& scope),
101           m_IsBl2Seq(false),
102           m_IsDbScan(false),
103           m_SubjectTag(subjectTag),
104           m_IsRemoteSearch(is_remote_search),
105           m_QueriesFormatted(0),
106           m_Megablast(is_megablast),
107           m_IndexedMegablast(is_indexed),
108           m_CustomOutputFormatSpec(custom_output_format),
109           m_IgOptions(ig_opts),
110           m_Options(&options),
111           m_IsVdb(false),
112           m_IsIterative(false),
113           m_BaseFile(kEmptyStr),
114           m_XMLFileCount(0),
115           m_LineLength(align_format::kDfltLineLength),
116           m_OrigExceptionMask(outfile.exceptions()),
117           m_Cmdline(cmdline)
118 {
119     m_Outfile.exceptions(NcbiBadbit);
120     m_DbName = db_adapter.GetDatabaseName();
121     m_IsBl2Seq = (m_DbName == kEmptyStr ? true : false);
122     m_IsDbScan = db_adapter.IsDbScanMode();
123     if (m_IsBl2Seq) {
124         m_SeqInfoSrc.Reset(db_adapter.MakeSeqInfoSrc());
125     }
126     else {
127     	m_SearchDb = db_adapter.GetSearchDatabase();
128     }
129     if(m_IsDbScan) {
130 	int num_seqs=0;
131         int total_length=0;
132 	if (!is_remote_search)
133         {
134                 BlastSeqSrc* seqsrc = db_adapter.MakeSeqSrc();
135                 num_seqs=BlastSeqSrcGetNumSeqs(seqsrc);
136                 total_length=BlastSeqSrcGetTotLen(seqsrc);
137         }
138 	CBlastFormatUtil::FillScanModeBlastDbInfo(m_DbInfo, m_DbIsAA,
139                 num_seqs, total_length, m_SubjectTag);
140     } else {
141         int filteringAlgorithmId = db_adapter.GetFilteringAlgorithm();
142         if(filteringAlgorithmId == -1) {
143             CRef <CSearchDatabase> db_Info = db_adapter.GetSearchDatabase();
144             if (db_Info && db_Info.NotEmpty()) {
145                 ESubjectMaskingType maskType = db_Info->GetMaskType();
146                 if(maskType != eNoSubjMasking) {
147                     db_Info->SetFilteringAlgorithm(-1, eNoSubjMasking);
148                     ERR_POST(Warning << "Subject mask not found in " + m_DbName +", proceeding without subject masking.");
149                 }
150             }
151         }
152         CBlastFormatUtil::GetBlastDbInfo(m_DbInfo, m_DbName, m_DbIsAA,
153                                    dbfilt_algorithm, is_remote_search);
154     }
155     if (m_FormatType == CFormattingArgs::eXml) {
156         m_AccumulatedQueries.Reset(new CBlastQueryVector());
157         m_BlastXMLIncremental.Reset(new SBlastXMLIncremental());
158     }
159 
160     if ((m_FormatType == CFormattingArgs::eXml2) || (m_FormatType == CFormattingArgs::eJson) ||
161         (m_FormatType == CFormattingArgs::eXml2_S) || (m_FormatType == CFormattingArgs::eJson_S)){
162            m_AccumulatedQueries.Reset(new CBlastQueryVector());
163     }
164 
165     if (use_sum_statistics && m_IsUngappedSearch) {
166         m_ShowLinkedSetSize = true;
167     }
168     if ( m_Program == "blastn" &&
169          options.GetMatchReward() == 0 &&
170          options.GetMismatchPenalty() == 0 )
171     {
172        /* This combination is an indicator that we have used matrices
173         * solely to develop the hsp score.  Also for the time being it
174         * indicates that KA stats are not available. -RMH-
175         */
176         m_DisableKAStats = true;
177     }
178     else
179     {
180         m_DisableKAStats = false;
181     }
182 
183     CAlignFormatUtil::GetAsciiProteinMatrix(m_MatrixName, m_ScoringMatrix);
184 
185     if (options.GetProgram() == eDeltaBlast) {
186         _ASSERT(options.GetProgramType() == eBlastTypePsiBlast);
187         m_Program = "deltablast";
188 
189         if (domain_db_adapter) {
190             CBlastFormatUtil::GetBlastDbInfo(m_DomainDbInfo,
191                                          domain_db_adapter->GetDatabaseName(),
192                                          true, -1, is_remote_search);
193         }
194     }
195 
196     m_IsIterative = options.IsIterativeSearch();
197     if (m_FormatType == CFormattingArgs::eSAM) {
198     	x_InitSAMFormatter();
199     }
200 
201     CNcbiApplication* app = CNcbiApplication::Instance();
202     if (app) {
203         const CNcbiRegistry& registry = app->GetConfig();
204         m_LongSeqId = (registry.Get("BLAST", "LONG_SEQID") == "1");
205     }
206     m_HitsSortOption = -1;
207     m_HspsSortOption = -1;
208 }
209 
CBlastFormat(const blast::CBlastOptions & opts,const vector<CBlastFormatUtil::SDbInfo> & dbinfo_list,blast::CFormattingArgs::EOutputFormat format_type,bool believe_query,CNcbiOstream & outfile,int num_summary,int num_alignments,CScope & scope,bool show_gi,bool is_html,bool is_remote_search,const string & custom_output_format,bool is_vdb,const string & cmdline)210 CBlastFormat::CBlastFormat(const blast::CBlastOptions& opts,
211                  const vector< CBlastFormatUtil::SDbInfo >& dbinfo_list,
212                  blast::CFormattingArgs::EOutputFormat format_type,
213                  bool believe_query, CNcbiOstream& outfile,
214                  int num_summary,
215                  int num_alignments,
216                  CScope& scope,
217                  bool show_gi,
218                  bool is_html,
219                  bool is_remote_search,
220                  const string& custom_output_format,
221                  bool is_vdb,
222                  const string & cmdline)
223         : m_FormatType(format_type),
224           m_IsHTML(is_html),
225           m_DbIsAA(!Blast_SubjectIsNucleotide(opts.GetProgramType())),
226           m_BelieveQuery(believe_query),
227           m_Outfile(outfile),
228           m_NumSummary(num_summary),
229           m_NumAlignments(num_alignments),
230           m_HitlistSize(opts.GetHitlistSize()),
231           m_Program(Blast_ProgramNameFromType(opts.GetProgramType())),
232           m_DbName(kEmptyStr),
233           m_QueryGenCode(opts.GetQueryGeneticCode()),
234           m_DbGenCode(opts.GetDbGeneticCode()),
235           m_ShowGi(show_gi),
236           m_ShowLinkedSetSize(false),
237           m_IsUngappedSearch(!opts.GetGappedMode()),
238           m_MatrixName(opts.GetMatrixName()),
239           m_Scope(&scope),
240           m_IsBl2Seq(false),
241           m_IsDbScan (false),
242           m_IsRemoteSearch(is_remote_search),
243           m_QueriesFormatted(0),
244           m_Megablast(opts.GetProgram() == eMegablast ||
245                       opts.GetProgram() == eDiscMegablast),
246           m_IndexedMegablast(opts.GetMBIndexLoaded()),
247           m_CustomOutputFormatSpec(custom_output_format),
248           m_Options(&opts),
249           m_IsVdb(is_vdb),
250           m_IsIterative(false),
251           m_BaseFile(kEmptyStr),
252           m_XMLFileCount(0),
253           m_LineLength(align_format::kDfltLineLength),
254           m_OrigExceptionMask(outfile.exceptions()),
255           m_Cmdline(cmdline)
256 {
257     m_Outfile.exceptions(NcbiBadbit);
258     m_DbInfo.assign(dbinfo_list.begin(), dbinfo_list.end());
259     vector< CBlastFormatUtil::SDbInfo >::const_iterator itInfo;
260     for (itInfo = m_DbInfo.begin(); itInfo != m_DbInfo.end(); itInfo++)
261     {
262     	if(itInfo != m_DbInfo.begin())
263     		m_DbName += " ";
264 
265         m_DbName += itInfo->name;
266     }
267 
268     m_IsBl2Seq = false;
269 
270     if (m_FormatType == CFormattingArgs::eXml) {
271         m_AccumulatedQueries.Reset(new CBlastQueryVector());
272         m_BlastXMLIncremental.Reset(new SBlastXMLIncremental());
273     }
274 
275     if ((m_FormatType == CFormattingArgs::eXml2) || (m_FormatType == CFormattingArgs::eJson) ||
276         (m_FormatType == CFormattingArgs::eXml2_S) || (m_FormatType == CFormattingArgs::eJson_S)) {
277            m_AccumulatedQueries.Reset(new CBlastQueryVector());
278     }
279 
280     if (opts.GetSumStatisticsMode() && m_IsUngappedSearch) {
281         m_ShowLinkedSetSize = true;
282     }
283 
284     if ( m_Program == "blastn" &&
285          opts.GetMatchReward() == 0 &&
286          opts.GetMismatchPenalty() == 0 )
287     {
288        /* This combination is an indicator that we have used matrices
289         * solely to develop the hsp score.  Also for the time being it
290         * indicates that KA stats are not available. -RMH-
291         */
292         m_DisableKAStats = true;
293     }
294     else
295     {
296         m_DisableKAStats = false;
297     }
298 
299     CAlignFormatUtil::GetAsciiProteinMatrix(m_MatrixName, m_ScoringMatrix);
300 
301     if (opts.GetProgram() == eDeltaBlast) {
302         _ASSERT(opts.GetProgramType() == eBlastTypePsiBlast);
303         m_Program = "deltablast";
304     }
305     m_IsIterative = opts.IsIterativeSearch();
306     if (m_FormatType == CFormattingArgs::eSAM) {
307     	x_InitSAMFormatter();
308     }
309     CNcbiApplication* app = CNcbiApplication::Instance();
310     if (app) {
311         const CNcbiRegistry& registry = app->GetConfig();
312         m_LongSeqId = (registry.Get("BLAST", "LONG_SEQID") == "1");
313     }
314     m_HitsSortOption = -1;
315     m_HspsSortOption = -1;
316 }
317 
~CBlastFormat()318 CBlastFormat::~CBlastFormat()
319 {
320     try {
321         m_Outfile.exceptions(m_OrigExceptionMask);
322     } catch (...) {/*ignore exceptions*/}
323     m_Outfile.flush();
324 }
325 
326 static const string kHTML_Prefix =
327 "<HTML>\n"
328 "<HEAD><TITLE>BLAST Search Results</TITLE></HEAD>\n"
329 "<BODY BGCOLOR=\"#FFFFFF\" LINK=\"#0000FF\" VLINK=\"#660099\" ALINK=\"#660099\">\n"
330 "<PRE>\n";
331 
332 static const string kHTML_Suffix =
333 "</PRE>\n"
334 "</BODY>\n"
335 "</HTML>";
336 
337 Int8
GetDbTotalLength()338 CBlastFormat::GetDbTotalLength()
339 {
340     Int8 retv = 0L;
341     for (size_t i = 0; i < m_DbInfo.size(); i++) {
342         retv += m_DbInfo[i].total_length;
343     }
344     return retv;
345 }
346 
347 void
PrintProlog()348 CBlastFormat::PrintProlog()
349 {
350     // no header for some output types
351     if (m_FormatType >= CFormattingArgs::eXml) {
352     	if(m_FormatType == CFormattingArgs::eXml2_S) {
353     		BlastXML2_PrintHeader(&m_Outfile);
354     	}
355     	else if(m_FormatType == CFormattingArgs::eJson_S){
356     		BlastJSON_PrintHeader(&m_Outfile);
357     	}
358         return;
359     }
360 
361     if (m_IsHTML) {
362         m_Outfile << kHTML_Prefix << "\n";
363     }
364     // Make sure no-one confuses us with the standard BLASTN
365     // algorithm.  -RMH-
366     if ( m_Program == "blastn" &&
367          m_DisableKAStats == true )
368     {
369       CBlastFormatUtil::BlastPrintVersionInfo("rmblastn", m_IsHTML,
370                                               m_Outfile);
371       m_Outfile << "\n\n";
372       m_Outfile << "Reference: Robert M. Hubley, Arian Smit\n";
373       m_Outfile << "RMBlast - RepeatMasker Search Engine\n";
374       m_Outfile << "2010 <http://www.repeatmasker.org>";
375     }else
376     {
377       CBlastFormatUtil::BlastPrintVersionInfo(m_Program, m_IsHTML,
378                                               m_Outfile);
379     }
380 
381     if (m_IsBl2Seq && !m_IsDbScan) {
382         return;
383     }
384 
385     m_Outfile << NcbiEndl << NcbiEndl;
386     if (m_Program == "deltablast") {
387         CBlastFormatUtil::BlastPrintReference(m_IsHTML, kFormatLineLength,
388                               m_Outfile, CReference::eDeltaBlast);
389         m_Outfile << "\n";
390     }
391 
392     if (m_Megablast)
393         CBlastFormatUtil::BlastPrintReference(m_IsHTML, kFormatLineLength,
394                                           m_Outfile, CReference::eMegaBlast);
395     else
396         CBlastFormatUtil::BlastPrintReference(m_IsHTML, kFormatLineLength,
397                                           m_Outfile);
398 
399     if (m_Megablast && m_IndexedMegablast)
400     {
401         m_Outfile << "\n";
402         CBlastFormatUtil::BlastPrintReference(m_IsHTML, kFormatLineLength,
403                               m_Outfile, CReference::eIndexedMegablast);
404     }
405 
406     if (m_Program == "psiblast" || m_Program == "deltablast") {
407         m_Outfile << "\n";
408         CBlastFormatUtil::BlastPrintReference(m_IsHTML, kFormatLineLength,
409                               m_Outfile, CReference::eCompAdjustedMatrices);
410     }
411     if (m_Program == "psiblast" || m_Program == "blastp") {
412         m_Outfile << "\n";
413         CBlastFormatUtil::BlastPrintReference(m_IsHTML, kFormatLineLength,
414                               m_Outfile, CReference::eCompBasedStats,
415                               (bool)(m_Program == "psiblast"));
416     }
417 
418     if (m_Program == "deltablast" || !m_DomainDbInfo.empty()) {
419         m_Outfile << "\n\n";
420         if (!m_DomainDbInfo.empty()) {
421         	m_Outfile << "\n\n" << "Conserved Domain ";
422         	CBlastFormatUtil::PrintDbReport(m_DomainDbInfo, kFormatLineLength,
423                                         m_Outfile, true);
424         }
425     }
426     else {
427         m_Outfile << "\n\n";
428     }
429     if (!m_IsBl2Seq || m_IsDbScan)
430         CBlastFormatUtil::PrintDbReport(m_DbInfo, kFormatLineLength,
431                                     m_Outfile, true);
432 }
433 
434 void
x_PrintOneQueryFooter(const blast::CBlastAncillaryData & summary)435 CBlastFormat::x_PrintOneQueryFooter(const blast::CBlastAncillaryData& summary)
436 {
437     /* Skip printing KA parameters if the program is rmblastn -RMH- */
438     if ( m_DisableKAStats )
439       return;
440 
441     const Blast_KarlinBlk *kbp_ungap =
442         (m_Program == "psiblast" || m_Program == "deltablast")
443         ? summary.GetPsiUngappedKarlinBlk()
444         : summary.GetUngappedKarlinBlk();
445     const Blast_GumbelBlk *gbp = summary.GetGumbelBlk();
446     m_Outfile << NcbiEndl;
447     if (kbp_ungap) {
448         CBlastFormatUtil::PrintKAParameters(kbp_ungap->Lambda,
449                                             kbp_ungap->K, kbp_ungap->H,
450                                             kFormatLineLength, m_Outfile,
451                                             false, gbp);
452     }
453 
454     const Blast_KarlinBlk *kbp_gap =
455         (m_Program == "psiblast" || m_Program == "deltablast")
456         ? summary.GetPsiGappedKarlinBlk()
457         : summary.GetGappedKarlinBlk();
458     m_Outfile << "\n";
459     if (kbp_gap) {
460         CBlastFormatUtil::PrintKAParameters(kbp_gap->Lambda,
461                                             kbp_gap->K, kbp_gap->H,
462                                             kFormatLineLength, m_Outfile,
463                                             true, gbp);
464     }
465 
466     m_Outfile << "\n";
467     m_Outfile << "Effective search space used: " <<
468                         summary.GetSearchSpace() << "\n";
469 }
470 
471 /// Auxialiary function to determine if there are local IDs in the identifiers
472 /// of the query sequences
473 /// @param queries query sequence(s) [in]
474 static bool
s_HasLocalIDs(CConstRef<CBlastQueryVector> queries)475 s_HasLocalIDs(CConstRef<CBlastQueryVector> queries)
476 {
477     bool retval = false;
478     ITERATE(CBlastQueryVector, itr, *queries) {
479         if (blast::IsLocalId((*itr)->GetQuerySeqLoc()->GetId())) {
480             retval = true;
481             break;
482         }
483     }
484     return retval;
485 }
486 
487 void
x_ConfigCShowBlastDefline(CShowBlastDefline & showdef,int skip_from,int skip_to,int index,int num_descriptions_to_show)488 CBlastFormat::x_ConfigCShowBlastDefline(CShowBlastDefline& showdef,
489                                         int skip_from, int skip_to, int index,
490                                         int num_descriptions_to_show /* = -1 */)
491 {
492     int flags = 0;
493     if (m_ShowLinkedSetSize)
494         flags |= CShowBlastDefline::eShowSumN;
495     if (m_IsHTML){
496         flags |= CShowBlastDefline::eHtml;
497         if (index >= 0) {
498             showdef.SetResultPosIndex(index);
499         }
500     }
501     if (m_ShowGi)
502         flags |= CShowBlastDefline::eShowGi;
503     if (num_descriptions_to_show == 0)
504         flags |= CShowBlastDefline::eNoShowHeader;
505     if (m_LongSeqId) {
506         flags |= CShowBlastDefline::eLongSeqId;
507     }
508     if(m_HitsSortOption >= 0) {
509         flags |= CShowBlastDefline::eShowPercentIdent;
510         flags |= CShowBlastDefline::eShowTotalScore;
511         flags |= CShowBlastDefline::eShowQueryCoverage;
512     }
513     showdef.SetOption(flags);
514     showdef.SetDbName(m_DbName);
515     showdef.SetDbType(!m_DbIsAA);
516     showdef.SetSkipRange(skip_from, skip_to);
517 }
518 
519 void
x_SplitSeqAlign(CConstRef<CSeq_align_set> full_alignment,CSeq_align_set & repeated_seqs,CSeq_align_set & new_seqs,blast::CPsiBlastIterationState::TSeqIds & prev_seqids)520 CBlastFormat::x_SplitSeqAlign(CConstRef<CSeq_align_set> full_alignment,
521                        CSeq_align_set& repeated_seqs,
522                        CSeq_align_set& new_seqs,
523                        blast::CPsiBlastIterationState::TSeqIds& prev_seqids)
524 {
525     static const CSeq_align::TDim kSubjRow = 1;
526     _ASSERT( !prev_seqids.empty() );
527     _ASSERT( !full_alignment->IsEmpty() );
528     _ASSERT(repeated_seqs.IsEmpty());
529     _ASSERT(new_seqs.IsEmpty());
530 
531     unsigned int count = 0;
532     ITERATE(CSeq_align_set::Tdata, alignment, full_alignment->Get()) {
533         CSeq_id_Handle subj_id =
534             CSeq_id_Handle::GetHandle((*alignment)->GetSeq_id(kSubjRow));
535         if (prev_seqids.find(subj_id) != prev_seqids.end()) {
536             // if found among previously seen Seq-ids...
537             repeated_seqs.Set().push_back(*alignment);
538         } else {
539             // ... else add them as new
540             new_seqs.Set().push_back(*alignment);
541         }
542         count++;
543         if(count >= (unsigned int)m_NumSummary)
544         	break;
545     }
546 }
547 
548 bool
s_IsGlobalSeqAlign(CConstRef<objects::CSeq_align_set> seqalign_set)549 s_IsGlobalSeqAlign(CConstRef<objects::CSeq_align_set> seqalign_set)
550 {
551    bool kIsGlobal = (seqalign_set->IsSet() && seqalign_set->CanGet() &&
552           seqalign_set->Get().front()->CanGetType() &&
553           seqalign_set->Get().front()->GetType() == CSeq_align_Base::eType_global);
554 
555    return kIsGlobal;
556 }
557 
558 
559 void
x_DisplayDeflines(CConstRef<CSeq_align_set> aln_set,unsigned int itr_num,blast::CPsiBlastIterationState::TSeqIds & prev_seqids,int additional,int index,int defline_length)560 CBlastFormat::x_DisplayDeflines(CConstRef<CSeq_align_set> aln_set,
561                                 unsigned int itr_num,
562                                 blast::CPsiBlastIterationState::TSeqIds& prev_seqids,
563                                 int additional,
564                                 int index,
565                                 int defline_length )
566 {
567 
568     if (itr_num != numeric_limits<unsigned int>::max() &&
569         !prev_seqids.empty()) {
570         // Split seq-align-set
571         CSeq_align_set repeated_seqs, new_seqs;
572         x_SplitSeqAlign(aln_set, repeated_seqs, new_seqs, prev_seqids);
573 
574         // Show deflines for 'repeat' sequences
575         {{
576             CShowBlastDefline showdef(repeated_seqs, *m_Scope,
577                                       kFormatLineLength,
578                                   	  repeated_seqs.Size());
579             x_ConfigCShowBlastDefline(showdef);
580             showdef.SetupPsiblast(NULL, CShowBlastDefline::eRepeatPass);
581             showdef.DisplayBlastDefline(m_Outfile);
582         }}
583         m_Outfile << "\n";
584 
585         // Show deflines for 'new' sequences
586         {{
587             CShowBlastDefline showdef(new_seqs, *m_Scope, kFormatLineLength,
588                               	  	  new_seqs.Size());
589             x_ConfigCShowBlastDefline(showdef);
590             showdef.SetupPsiblast(NULL, CShowBlastDefline::eNewPass);
591             showdef.DisplayBlastDefline(m_Outfile);
592         }}
593 
594     } else {
595 
596         CShowBlastDefline showdef(*aln_set, *m_Scope,
597                                   defline_length == -1 ? kFormatLineLength:defline_length,
598                                   m_NumSummary + additional);
599         x_ConfigCShowBlastDefline(showdef, -1, -1, index,
600                                   m_NumSummary+additional);
601         showdef.DisplayBlastDefline(m_Outfile);
602     }
603     m_Outfile << "\n";
604 }
605 
606 int
s_SetFlags(string & program,blast::CFormattingArgs::EOutputFormat format_type,bool html,bool showgi,bool isbl2seq,bool disableKAStats)607 s_SetFlags(string& program,
608     blast::CFormattingArgs::EOutputFormat format_type,
609     bool html, bool showgi, bool isbl2seq, bool disableKAStats)
610 {
611    // set the alignment flags
612     int flags = CDisplaySeqalign::eShowBlastInfo;
613 
614     if ( isbl2seq ) {
615         flags |= CDisplaySeqalign::eShowNoDeflineInfo;
616     }
617 
618     if (html)
619         flags |= CDisplaySeqalign::eHtml;
620     if (showgi)
621         flags |= CDisplaySeqalign::eShowGi;
622 
623     if (format_type >= CFormattingArgs::eQueryAnchoredIdentities &&
624         format_type <= CFormattingArgs::eFlatQueryAnchoredNoIdentities) {
625         flags |= CDisplaySeqalign::eMergeAlign;
626     }
627     else {
628         flags |= CDisplaySeqalign::eShowBlastStyleId |
629                  CDisplaySeqalign::eShowMiddleLine;
630     }
631 
632     if (format_type == CFormattingArgs::eQueryAnchoredIdentities ||
633         format_type == CFormattingArgs::eFlatQueryAnchoredIdentities) {
634         flags |= CDisplaySeqalign::eShowIdentity;
635     }
636     if (format_type == CFormattingArgs::eQueryAnchoredIdentities ||
637         format_type == CFormattingArgs::eQueryAnchoredNoIdentities) {
638         flags |= CDisplaySeqalign::eMasterAnchored;
639     }
640     if (program == "tblastx") {
641         flags |= CDisplaySeqalign::eTranslateNucToNucAlignment;
642     }
643 
644     if (disableKAStats)
645         flags |= CDisplaySeqalign::eShowRawScoreOnly;
646 
647     return flags;
648 }
649 
650 bool
x_IsVdbSearch() const651 CBlastFormat::x_IsVdbSearch() const
652 {
653 	return m_IsVdb;
654 }
655 // Port of jzmisc.c's AddAlignInfoToSeqAnnotEx (CVS revision 6.11)
656 CRef<objects::CSeq_annot>
x_WrapAlignmentInSeqAnnot(CConstRef<objects::CSeq_align_set> alnset,const string & db_title) const657 CBlastFormat::x_WrapAlignmentInSeqAnnot(CConstRef<objects::CSeq_align_set> alnset,
658                                         const string& db_title) const
659 {
660 	return CBlastFormatUtil::CreateSeqAnnotFromSeqAlignSet(*alnset,
661                                                            ProgramNameToEnum(m_Program),
662                                                            m_DbName, db_title,
663                                                            x_IsVdbSearch());
664 }
665 
666 void
x_PrintStructuredReport(const blast::CSearchResults & results,CConstRef<blast::CBlastQueryVector> queries)667 CBlastFormat::x_PrintStructuredReport(const blast::CSearchResults& results,
668               CConstRef<blast::CBlastQueryVector> queries)
669 {
670     string db_title;
671     if (!m_DbInfo.empty()) {
672         db_title = m_DbInfo.front().definition;
673         for (size_t i=1;i < m_DbInfo.size();i++) {
674             db_title += "; ";
675             db_title += m_DbInfo[i].definition;
676         }
677     }
678 
679     // ASN.1 formatting is straightforward
680     if (m_FormatType == CFormattingArgs::eAsnText || m_FormatType == CFormattingArgs::eJsonSeqalign) {
681         if (results.HasAlignments()) {
682     	    CRef<CSeq_align_set> aln_set (new CSeq_align_set);
683             CBlastFormatUtil::PruneSeqalign(*(results.GetSeqAlign()), *aln_set, m_HitlistSize);
684             if(m_FormatType == CFormattingArgs::eAsnText)
685             	m_Outfile << MSerial_AsnText << *x_WrapAlignmentInSeqAnnot(aln_set, db_title);
686             else
687             	m_Outfile << MSerial_Json << *x_WrapAlignmentInSeqAnnot(aln_set, db_title);
688         }
689         return;
690     } else if (m_FormatType == CFormattingArgs::eAsnBinary) {
691         if (results.HasAlignments()) {
692             CRef<CSeq_align_set> aln_set (new CSeq_align_set);
693             CBlastFormatUtil::PruneSeqalign(*(results.GetSeqAlign()), *aln_set, m_HitlistSize);
694             m_Outfile << MSerial_AsnBinary <<
695                 *x_WrapAlignmentInSeqAnnot(aln_set, db_title);
696         }
697         return;
698     } else if (m_FormatType == CFormattingArgs::eXml) {
699         CRef<CSearchResults> res(const_cast<CSearchResults*>(&results));
700         res->TrimSeqAlign(m_HitlistSize);
701         m_AccumulatedResults.push_back(res);
702         CConstRef<CSeq_id> query_id = results.GetSeqId();
703         // FIXME: this can be a bottleneck with large numbers of queries
704         ITERATE(CBlastQueryVector, itr, *queries) {
705             if (query_id->Match(*(*itr)->GetQueryId())) {
706                 m_AccumulatedQueries->push_back(*itr);
707                 break;
708             }
709         }
710 
711         objects::CBlastOutput xml_output;
712         if(x_IsVdbSearch()) {
713         	CCmdLineBlastXMLReportData report_data(m_AccumulatedQueries,
714                                                    m_AccumulatedResults,
715                                                    *m_Options, m_DbInfo,
716                                                    m_QueryGenCode, m_DbGenCode,
717                                                    m_IsRemoteSearch);
718         	BlastXML_FormatReport(xml_output, &report_data, &m_Outfile,
719                                   m_BlastXMLIncremental.GetPointer());
720 
721         }
722         else {
723         	CCmdLineBlastXMLReportData report_data(m_AccumulatedQueries,
724                                                    m_AccumulatedResults,
725                                                    *m_Options, m_DbName, m_DbIsAA,
726                                                    m_QueryGenCode, m_DbGenCode,
727                                                    m_IsRemoteSearch);
728         	BlastXML_FormatReport(xml_output, &report_data, &m_Outfile,
729                                   m_BlastXMLIncremental.GetPointer());
730         }
731         m_AccumulatedResults.clear();
732         m_AccumulatedQueries->clear();
733         return;
734     }
735     else if(m_FormatType == CFormattingArgs::eXml2 || m_FormatType == CFormattingArgs::eJson ||
736             m_FormatType == CFormattingArgs::eXml2_S || m_FormatType == CFormattingArgs::eJson_S) {
737     	x_PrintXML2Report(results, queries);
738     	return;
739     }
740     else if (m_FormatType == CFormattingArgs::eSAM) {
741     	if(results.HasAlignments()) {
742             m_SamFormatter->Print(*(results.GetSeqAlign()));
743         }
744         return;
745     }
746 }
747 
748 void
x_PrintTabularReport(const blast::CSearchResults & results,unsigned int itr_num)749 CBlastFormat::x_PrintTabularReport(const blast::CSearchResults& results,
750                                    unsigned int itr_num)
751 {
752     CConstRef<CSeq_align_set> aln_set = results.GetSeqAlign();
753     if (m_IsUngappedSearch && results.HasAlignments()) {
754         aln_set.Reset(CDisplaySeqalign::PrepareBlastUngappedSeqalign(*aln_set));
755     }
756     // other output types will need a bioseq handle
757     CBioseq_Handle bhandle = m_Scope->GetBioseqHandle(*results.GetSeqId(),
758                                                       CScope::eGetBioseq_All);
759 
760     // tabular formatting just prints each alignment in turn
761     // (plus a header)
762     if (m_FormatType == CFormattingArgs::eTabular ||
763         m_FormatType == CFormattingArgs::eTabularWithComments ||
764         m_FormatType == CFormattingArgs::eCommaSeparatedValues) {
765         const CBlastTabularInfo::EFieldDelimiter kDelim =
766             (m_FormatType == CFormattingArgs::eCommaSeparatedValues
767              ? CBlastTabularInfo::eComma : CBlastTabularInfo::eTab);
768 
769         CBlastTabularInfo tabinfo(m_Outfile, m_CustomOutputFormatSpec, kDelim);
770         if(!m_CustomDelim.empty()) {
771             tabinfo.SetCustomDelim(m_CustomDelim);
772         }
773         tabinfo.SetParseLocalIds(m_BelieveQuery);
774         if((m_IsBl2Seq && (!m_BelieveQuery))|| m_IsRemoteSearch) {
775         	tabinfo.SetParseSubjectDefline(true);
776         }
777         tabinfo.SetQueryRange(m_QueryRange);
778         if (ncbi::NStr::ToLower(m_Program) == string("blastn"))
779         	tabinfo.SetNoFetch(true);
780 
781         if (m_FormatType == CFormattingArgs::eTabularWithComments) {
782             string strProgVersion =
783                 NStr::ToUpper(m_Program) + " " + blast::CBlastVersion().Print();
784 	    string dbname;
785 	    if (m_IsDbScan)
786 		dbname = string("User specified sequence set (Input: ") + m_SubjectTag + string(")");
787 	    else
788 		dbname = m_DbName;
789             CConstRef<CBioseq> subject_bioseq;
790             // dbname used in place of Bioseq in most cases.
791             if (dbname.empty())
792             	subject_bioseq.Reset(x_CreateSubjectBioseq());
793             tabinfo.PrintHeader(strProgVersion, *(bhandle.GetBioseqCore()),
794                                 dbname, results.GetRID(), itr_num, aln_set,
795                                 subject_bioseq);
796         }
797 
798         if (results.HasAlignments()) {
799     	    CSeq_align_set copy_aln_set;
800             CBlastFormatUtil::PruneSeqalign(*aln_set, copy_aln_set, m_HitlistSize);
801 
802             {
803             	unsigned int scores = CBlastFormatUtil::eNoQuerySubjCov;
804             	if(string::npos != m_CustomOutputFormatSpec.find("qcovs"))
805             		scores |= CBlastFormatUtil::eQueryCovPerSubj ;
806             	if(string::npos != m_CustomOutputFormatSpec.find("qcovus") &&
807             			ncbi::NStr::ToLower(m_Program) == string("blastn"))
808             		scores |= CBlastFormatUtil::eQueryCovPerUniqSubj ;
809 
810             	if(scores != CBlastFormatUtil::eNoQuerySubjCov)
811             		CBlastFormatUtil::InsertSubjectScores (copy_aln_set, bhandle, m_QueryRange, (CBlastFormatUtil::ESubjectScores) scores);
812            	}
813             tabinfo.SetQueryGeneticCode(m_QueryGenCode);
814             tabinfo.SetDbGeneticCode(m_DbGenCode);
815             ITERATE(CSeq_align_set::Tdata, itr, copy_aln_set.Get()) {
816                     const CSeq_align& s = **itr;
817                     tabinfo.SetFields(s, *m_Scope, &m_ScoringMatrix);
818                     tabinfo.Print();
819             }
820         }
821         return;
822     }
823 }
824 
s_SetCloneInfo(const CIgBlastTabularInfo & tabinfo,const CBioseq_Handle & handle,CBlastFormat::SClone & clone_info)825 static void s_SetCloneInfo(const CIgBlastTabularInfo& tabinfo,
826                            const CBioseq_Handle& handle,
827                            CBlastFormat::SClone& clone_info) {
828 
829     if (handle.GetSeqId()->Which() == CSeq_id::e_Local){
830         CDeflineGenerator defline (handle.GetSeq_entry_Handle());
831         clone_info.seqid = defline.GenerateDefline(handle).substr(0, 45);
832 
833     //    clone_info.seqid = CDeflineGenerator.substr(0, 45);
834     } else {
835         string seqid;
836         CRef<CSeq_id> wid = FindBestChoice(handle.GetBioseqCore()->GetId(), CSeq_id::WorstRank);
837         wid->GetLabel(&seqid, CSeq_id::eContent);
838         clone_info.seqid = seqid.substr(0, 45);
839     }
840     tabinfo.GetIgInfo (clone_info.v_gene, clone_info.d_gene, clone_info.j_gene,
841                        clone_info.chain_type, clone_info.na, clone_info.aa, clone_info.productive);
842     clone_info.identity = 0;
843     const vector<CIgBlastTabularInfo::SIgDomain*>& domains = tabinfo.GetIgDomains();
844     int length = 0;
845     int num_match = 0;
846     for (unsigned int i=0; i<domains.size(); ++i) {
847         if (domains[i]->length > 0) {
848             length += domains[i]->length;
849             num_match += domains[i]->num_match;
850         }
851     }
852     if (length > 0){
853         clone_info.identity = ((double)num_match)/length;
854 
855     }
856 
857 }
858 
859 void
x_PrintTaxReport(const blast::CSearchResults & results)860 CBlastFormat::x_PrintTaxReport(const blast::CSearchResults& results)
861 {
862     CBioseq_Handle bhandle = m_Scope->GetBioseqHandle(*results.GetSeqId(),
863                                                       CScope::eGetBioseq_All);
864     CConstRef<CBioseq> bioseq = bhandle.GetBioseqCore();
865     if(m_IsHTML) {
866         m_Outfile <<  "<pre>";
867     }
868     else {
869         m_Outfile <<  "\n";
870     }
871     CBlastFormatUtil::AcknowledgeBlastQuery(*bioseq, kFormatLineLength,
872                                             m_Outfile, m_BelieveQuery,
873                                             m_IsHTML, false,
874                                             results.GetRID());
875 
876     if(m_IsHTML) {
877         m_Outfile <<  "</pre>";
878     }
879     CConstRef<CSeq_align_set> aln_set = results.GetSeqAlign();
880     if (m_IsUngappedSearch && results.HasAlignments()) {
881         aln_set.Reset(CDisplaySeqalign::PrepareBlastUngappedSeqalign(*aln_set));
882     }
883 
884     CRef<CSeq_align_set> new_aln_set(const_cast<CSeq_align_set*>(aln_set.GetPointer()));
885     CTaxFormat *taxFormatRes = new CTaxFormat (*new_aln_set,*m_Scope,m_IsHTML ? CTaxFormat::eHtml:CTaxFormat::eText, false,max(kMinTaxFormatLineLength,kFormatLineLength));
886     taxFormatRes->DisplayOrgReport(m_Outfile);
887 }
888 
889 void
x_PrintIgTabularReport(const blast::CIgBlastResults & results,SClone & clone_info,bool fill_clone_info)890 CBlastFormat::x_PrintIgTabularReport(const blast::CIgBlastResults& results,
891                                      SClone& clone_info,
892                                      bool fill_clone_info)
893 {
894     CConstRef<CSeq_align_set> aln_set = results.GetSeqAlign();
895     /* TODO do we support ungapped Igblast search?
896     if (m_IsUngappedSearch && results.HasAlignments()) {
897         aln_set.Reset(CDisplaySeqalign::PrepareBlastUngappedSeqalign(*aln_set));
898     } */
899     // other output types will need a bioseq handle
900     CBioseq_Handle bhandle = m_Scope->GetBioseqHandle(*results.GetSeqId(),
901                                                       CScope::eGetBioseq_All);
902 
903     // tabular formatting just prints each alignment in turn
904     // (plus a header)
905     if (m_FormatType != CFormattingArgs::eTabular &&
906         m_FormatType != CFormattingArgs::eTabularWithComments &&
907         m_FormatType != CFormattingArgs::eCommaSeparatedValues) return;
908 
909     const CBlastTabularInfo::EFieldDelimiter kDelim =
910             (m_FormatType == CFormattingArgs::eCommaSeparatedValues
911              ? CBlastTabularInfo::eComma : CBlastTabularInfo::eTab);
912 
913     CIgBlastTabularInfo tabinfo(m_Outfile, m_CustomOutputFormatSpec, kDelim);
914     tabinfo.SetParseLocalIds(m_BelieveQuery);
915 
916     string strProgVersion =
917         "IG" + NStr::ToUpper(m_Program);
918     CConstRef<CBioseq> subject_bioseq = x_CreateSubjectBioseq();
919 
920     if (m_IsHTML) {
921         m_Outfile << "<html><body><pre>\n";
922     }
923     if (results.HasAlignments()) {
924         const CRef<CIgAnnotation> & annots = results.GetIgAnnotation();
925         CSeq_align_set::Tdata::const_iterator itr = aln_set->Get().begin();
926         tabinfo.SetMasterFields(**itr, *m_Scope,
927                                 annots->m_ChainType[0],
928                                 annots->m_ChainTypeToShow,
929                                 &m_ScoringMatrix);
930         tabinfo.SetIgAnnotation(annots, m_IgOptions, aln_set, *m_Scope);
931         if (fill_clone_info) {
932             s_SetCloneInfo(tabinfo, bhandle, clone_info);
933         }
934         tabinfo.PrintHeader(strProgVersion, *(bhandle.GetBioseqCore()),
935                             m_DbName,
936                             m_IgOptions->m_DomainSystem,
937                             results.GetRID(),
938                             numeric_limits<unsigned int>::max(),
939                             aln_set, subject_bioseq);
940 
941         int j = 1;
942         for (; itr != aln_set->Get().end(); ++itr) {
943             tabinfo.SetFields(**itr, *m_Scope,
944                               annots->m_ChainType[j++],
945                               annots->m_ChainTypeToShow,
946                               &m_ScoringMatrix);
947             tabinfo.Print();
948         }
949     } else {
950         tabinfo.PrintHeader(strProgVersion, *(bhandle.GetBioseqCore()),
951                                 m_DbName,
952                                 m_IgOptions->m_DomainSystem,
953                                 results.GetRID(),
954                                 numeric_limits<unsigned int>::max(),
955                                 0, subject_bioseq);
956     }
957     if (m_IsHTML) {
958         m_Outfile << "\n</pre></body></html>\n";
959     }
960 }
961 
962 
x_PrintAirrRearrangement(const blast::CIgBlastResults & results,SClone & clone_info,bool fill_clone_info,bool print_airr_format_header)963 void CBlastFormat::x_PrintAirrRearrangement(const blast::CIgBlastResults& results,
964                                             SClone& clone_info,
965                                             bool fill_clone_info,
966                                             bool print_airr_format_header)
967 {
968     CConstRef<CSeq_align_set> aln_set = results.GetSeqAlign();
969 
970     // other output types will need a bioseq handle
971     CBioseq_Handle bhandle = m_Scope->GetBioseqHandle(*results.GetSeqId(),
972                                                       CScope::eGetBioseq_All);
973 
974     // tabular formatting just prints each alignment in turn
975     // (plus a header)
976 
977     const CBlastTabularInfo::EFieldDelimiter kDelim =
978             (m_FormatType == CFormattingArgs::eCommaSeparatedValues
979              ? CBlastTabularInfo::eComma : CBlastTabularInfo::eTab);
980 
981     CIgBlastTabularInfo tabinfo(m_Outfile, m_CustomOutputFormatSpec, kDelim);
982     tabinfo.SetParseLocalIds(m_BelieveQuery);
983 
984     string strProgVersion =
985         "IG" + NStr::ToUpper(m_Program);
986     CConstRef<CBioseq> subject_bioseq = x_CreateSubjectBioseq();
987 
988     CRef<CIgAnnotation> annots(null);
989     if (results.HasAlignments()) {
990         annots = results.GetIgAnnotation();
991         tabinfo.SetIgAnnotation(annots, m_IgOptions, aln_set, *m_Scope);
992         if (fill_clone_info) {
993             s_SetCloneInfo(tabinfo, bhandle, clone_info);
994         }
995     }
996     tabinfo.SetAirrFormatData(*m_Scope, annots,
997                               bhandle, aln_set, m_IgOptions);
998 
999 
1000     tabinfo.PrintAirrRearrangement(*m_Scope, annots, strProgVersion,
1001                                    *(bhandle.GetBioseqCore()),
1002                                    m_DbName,
1003                                    m_IgOptions->m_DomainSystem,
1004                                        results.GetRID(),
1005                                        numeric_limits<unsigned int>::max(),
1006                                        aln_set, subject_bioseq, &m_ScoringMatrix,
1007                                        print_airr_format_header,
1008                                        m_IgOptions);
1009 
1010 }
1011 
x_CreateSubjectBioseq()1012 CConstRef<objects::CBioseq> CBlastFormat::x_CreateSubjectBioseq()
1013 {
1014     if ( !m_IsBl2Seq && !m_IsDbScan) {
1015         return CConstRef<CBioseq>();
1016     }
1017 
1018     _ASSERT(m_IsBl2Seq);
1019     _ASSERT(m_SeqInfoSrc);
1020     static Uint4 subj_index = 0;
1021 
1022     list< CRef<CSeq_id> > ids = m_SeqInfoSrc->GetId(subj_index++);
1023     CRef<CSeq_id> id = FindBestChoice(ids, CSeq_id::BestRank);
1024     CBioseq_Handle bhandle = m_Scope->GetBioseqHandle(*id,
1025                                                       CScope::eGetBioseq_All);
1026     // If this assertion fails, we're not able to get the subject, possibly a
1027     // programming error (see @note in this function's declaration - was the
1028     // order of calls altered?)
1029     _ASSERT(bhandle);
1030 
1031     // reset the subject index if necessary
1032     if (subj_index >= m_SeqInfoSrc->Size()) {
1033         subj_index = 0;
1034     }
1035     return bhandle.GetBioseqCore();
1036 }
1037 
1038 /// Auxiliary function to print the BLAST Archive in multiple output formats
PrintArchive(CRef<objects::CBlast4_archive> archive,CNcbiOstream & out)1039 void CBlastFormat::PrintArchive(CRef<objects::CBlast4_archive> archive, CNcbiOstream& out)
1040 {
1041     if (archive.Empty()) {
1042         return;
1043     }
1044     string outfmt = CNcbiEnvironment().Get("ARCHIVE_FORMAT");
1045     if (outfmt.empty()) {
1046         out << MSerial_AsnText << *archive;
1047     } else if (!NStr::CompareNocase(outfmt, "xml")) {
1048         out << MSerial_Xml << *archive;
1049     } else if (NStr::StartsWith(outfmt, "bin", NStr::eNocase)) {
1050         out << MSerial_AsnBinary << *archive;
1051     }
1052 }
1053 
1054 void
WriteArchive(blast::IQueryFactory & queries,blast::CBlastOptionsHandle & options_handle,const CSearchResultSet & results,unsigned int num_iters,const list<CRef<CBlast4_error>> & msg)1055 CBlastFormat::WriteArchive(blast::IQueryFactory& queries,
1056                            blast::CBlastOptionsHandle& options_handle,
1057                            const CSearchResultSet& results,
1058                            unsigned int num_iters,
1059                            const list<CRef<CBlast4_error> > & msg)
1060 {
1061     CRef<objects::CBlast4_archive>  archive;
1062     if (m_IsBl2Seq)
1063     {
1064 	CRef<CBlastQueryVector> query_vector(new CBlastQueryVector);
1065 	for (unsigned int i=0; i<m_SeqInfoSrc->Size(); i++)
1066         {
1067 		list< CRef<CSeq_id> > ids = m_SeqInfoSrc->GetId(i);
1068                 CRef<CSeq_id> id = FindBestChoice(ids, CSeq_id::BestRank);
1069                 CRef<CSeq_loc> seq_loc(new CSeq_loc);
1070                 seq_loc->SetWhole(*id);
1071                 CRef<CBlastSearchQuery> search_query(new CBlastSearchQuery(*seq_loc, *m_Scope));
1072                 query_vector->AddQuery(search_query);
1073         }
1074         CObjMgr_QueryFactory subjects(*query_vector);
1075         archive = BlastBuildArchive(queries, options_handle, results, subjects);
1076 
1077     }
1078     else
1079     {
1080     	// Use only by psi blast
1081     	if(num_iters != 0) {
1082     		archive = BlastBuildArchive(queries, options_handle, results, m_SearchDb , num_iters);
1083     	}
1084     	else {
1085     		archive = BlastBuildArchive(queries, options_handle, results, m_SearchDb );
1086     	}
1087     }
1088 
1089     if(msg.size() > 0) {
1090     	archive->SetMessages() = msg;
1091     }
1092     PrintArchive(archive, m_Outfile);
1093 }
1094 
1095 void
WriteArchive(objects::CPssmWithParameters & pssm,blast::CBlastOptionsHandle & options_handle,const CSearchResultSet & results,unsigned int num_iters,const list<CRef<CBlast4_error>> & msg)1096 CBlastFormat::WriteArchive(objects::CPssmWithParameters & pssm,
1097                            blast::CBlastOptionsHandle& options_handle,
1098                            const CSearchResultSet& results,
1099                            unsigned int num_iters,
1100                            const list<CRef<CBlast4_error> > & msg)
1101 {
1102     CRef<objects::CBlast4_archive> archive(BlastBuildArchive(pssm, options_handle, results,  m_SearchDb, num_iters));
1103 
1104     if(msg.size() > 0) {
1105     	archive->SetMessages() = msg;
1106     }
1107     PrintArchive(archive, m_Outfile);
1108 }
1109 
1110 
x_CreateDeflinesJson(CConstRef<CSeq_align_set> aln_set)1111 void CBlastFormat::x_CreateDeflinesJson(CConstRef<CSeq_align_set> aln_set)
1112 {
1113 
1114     int delineFormatOption = 0;
1115     CShowBlastDefline deflines(*aln_set, *m_Scope,kFormatLineLength,m_NumSummary);
1116 
1117     deflines.SetQueryNumber(1);//m_Query_number
1118     deflines.SetDbType (!m_DbIsAA);
1119     deflines.SetDbName(m_DbName);
1120     delineFormatOption |= CShowBlastDefline::eHtml;
1121     delineFormatOption |= CShowBlastDefline::eShowPercentIdent;
1122     deflines.SetOption(delineFormatOption); //m_defline_option
1123 
1124     //Next three lines are for proper initialization in formatting of defline
1125     CShowBlastDefline::SDeflineTemplates *deflineTemplates = new CShowBlastDefline::SDeflineTemplates;
1126     deflineTemplates->advancedView = true;
1127     deflines.SetDeflineTemplates (deflineTemplates);
1128 
1129 
1130     vector <CShowBlastDefline::SDeflineFormattingInfo *> sdlFortInfoVec = deflines.GetFormattingInfo();
1131     CJson_Document doc;
1132     CJson_Object top_obj = doc.SetObject();
1133     CJson_Array defline_array = top_obj.insert_array("deflines");
1134 
1135     for(size_t i = 0; i < sdlFortInfoVec.size(); i++) {
1136         CJson_Object obj = defline_array.push_back_object();
1137 
1138         obj.insert("dfln_url",sdlFortInfoVec[i]->dfln_url);
1139         obj.insert("dfln_rid",sdlFortInfoVec[i]->dfln_rid);
1140         obj.insert("dfln_gi",sdlFortInfoVec[i]->dfln_gi);
1141         obj.insert("dfln_seqid",sdlFortInfoVec[i]->dfln_seqid);
1142         obj.insert("full_dfln_defline",sdlFortInfoVec[i]->full_dfln_defline);
1143         obj.insert("dfln_defline",sdlFortInfoVec[i]->dfln_defline);
1144         obj.insert("dfln_id",sdlFortInfoVec[i]->dfln_id);
1145         obj.insert("dflnFrm_id",sdlFortInfoVec[i]->dflnFrm_id);
1146         obj.insert("dflnFASTA_id",sdlFortInfoVec[i]->dflnFASTA_id);
1147         obj.insert("dflnAccs",sdlFortInfoVec[i]->dflnAccs);
1148 
1149         obj.insert("score_info",sdlFortInfoVec[i]->score_info);
1150         obj.insert("dfln_hspnum",sdlFortInfoVec[i]->dfln_hspnum);
1151         obj.insert("dfln_alnLen",sdlFortInfoVec[i]->dfln_alnLen);
1152         obj.insert("dfln_blast_rank",sdlFortInfoVec[i]->dfln_blast_rank);
1153 	    obj.insert("total_bit_string",sdlFortInfoVec[i]->total_bit_string);
1154         obj.insert("percent_coverage",sdlFortInfoVec[i]->percent_coverage);
1155         obj.insert("evalue_string",sdlFortInfoVec[i]->evalue_string);
1156         obj.insert("percent_identity",sdlFortInfoVec[i]->percent_identity);
1157     }
1158     doc.Write(m_Outfile);
1159 }
1160 
1161 
x_DisplayDeflinesWithTemplates(CConstRef<CSeq_align_set> aln_set)1162 void CBlastFormat::x_DisplayDeflinesWithTemplates(CConstRef<CSeq_align_set> aln_set)
1163 {
1164     x_InitDeflineTemplates();
1165     _ASSERT(m_DeflineTemplates);
1166 
1167     int delineFormatOption = 0;
1168     CShowBlastDefline deflines(*aln_set, *m_Scope,kFormatLineLength,m_NumSummary);
1169 
1170     deflines.SetQueryNumber(1);//m_Query_number
1171     deflines.SetDbType (!m_DbIsAA);
1172     deflines.SetDbName(m_DbName);
1173     delineFormatOption |= CShowBlastDefline::eHtml;
1174     delineFormatOption |= CShowBlastDefline::eShowPercentIdent;
1175     deflines.SetOption(delineFormatOption); //m_defline_option
1176     deflines.SetDeflineTemplates (m_DeflineTemplates);
1177 
1178     deflines.Init();
1179     deflines.Display(m_Outfile);
1180 }
1181 
1182 
x_DisplayAlignsWithTemplates(CConstRef<CSeq_align_set> aln_set,const blast::CSearchResults & results)1183 void CBlastFormat::x_DisplayAlignsWithTemplates(CConstRef<CSeq_align_set> aln_set,const blast::CSearchResults& results)
1184 {
1185     x_InitAlignTemplates();
1186     _ASSERT(m_AlignTemplates);
1187 
1188     TMaskedQueryRegions masklocs;
1189     results.GetMaskedQueryRegions(masklocs);
1190 
1191     CSeq_align_set copy_aln_set;
1192     CBlastFormatUtil::PruneSeqalign(*aln_set, copy_aln_set, m_NumAlignments);
1193 
1194     CRef<CSeq_align_set> seqAlnSet(const_cast<CSeq_align_set*>(&copy_aln_set));
1195     if(!m_AlignSeqList.empty()) {
1196         CAlignFormatUtil::ExtractSeqAlignForSeqList(seqAlnSet, m_AlignSeqList);
1197     }
1198 
1199     CDisplaySeqalign display(*seqAlnSet, *m_Scope, &masklocs, NULL, m_MatrixName);
1200     x_SetAlignParameters(display);
1201     display.SetAlignTemplates(m_AlignTemplates);
1202 
1203     display.DisplaySeqalign(m_Outfile);
1204 }
1205 
x_InitDeflineTemplates(void)1206 void CBlastFormat::x_InitDeflineTemplates(void)
1207 {
1208     CNcbiApplication* app = CNcbiApplication::Instance();
1209     if(!app) return;
1210     const CNcbiRegistry& reg = app->GetConfig();
1211 
1212 
1213     m_DeflineTemplates = new CShowBlastDefline::SDeflineTemplates;
1214     string defLineTmpl;
1215 
1216     m_DeflineTemplates->defLineTmpl = reg.Get("Templates", "DFL_TABLE_ROW");
1217     m_DeflineTemplates->scoreInfoTmpl = reg.Get("Templates", "DFL_TABLE_SCORE_INFO");
1218     m_DeflineTemplates->seqInfoTmpl = reg.Get("Templates", "DFL_TABLE_SEQ_INFO");
1219     m_DeflineTemplates->advancedView = true;
1220 }
1221 
x_InitAlignTemplates(void)1222 void CBlastFormat::x_InitAlignTemplates(void)
1223 {
1224     CNcbiApplication* app = CNcbiApplication::Instance();
1225     if(!app) return;
1226     const CNcbiRegistry& reg = app->GetConfig();
1227 
1228     m_AlignTemplates = new CDisplaySeqalign::SAlignTemplates;
1229 
1230     m_AlignTemplates->alignHeaderTmpl = reg.Get("Templates", "BLAST_ALIGN_HEADER");
1231     string blastAlignParamsTemplData = reg.Get("Templates", "BLAST_ALIGN_PARAMS");
1232     string blastAlignParamsTag = (m_Program == "blastn") ? "ALIGN_PARAMS_NUC" : "ALIGN_PARAMS_PROT";
1233     string blastAlignProtParamsTable = reg.Get("Templates", blastAlignParamsTag);
1234     m_AlignTemplates->alignInfoTmpl = CAlignFormatUtil::MapTemplate(blastAlignParamsTemplData,"align_params",blastAlignProtParamsTable);
1235     m_AlignTemplates->sortInfoTmpl = reg.Get("Templates", "SORT_ALIGNS_SEQ");
1236     m_AlignTemplates->alignFeatureTmpl = reg.Get("Templates", "ALN_FEATURES");
1237     m_AlignTemplates->alignFeatureLinkTmpl = reg.Get("Templates", "ALN_FEATURES_LINK");
1238 
1239 	m_AlignTemplates->alnDefLineTmpl = reg.Get("Templates", "ALN_DEFLINE_ROW");
1240     m_AlignTemplates->alnTitlesLinkTmpl = reg.Get("Templates", "ALN_DEFLINE_TITLES_LNK");
1241     m_AlignTemplates->alnTitlesTmpl = reg.Get("Templates", "ALN_DEFLINE_TITLES");
1242     m_AlignTemplates->alnSeqInfoTmpl = reg.Get("Templates", "ALN_DEFLINE_SEQ_INFO");
1243     m_AlignTemplates->alignRowTmpl = reg.Get("Templates", "BLAST_ALIGN_ROWS");
1244     m_AlignTemplates->alignRowTmplLast = reg.Get("Templates", "BLAST_ALIGN_ROWS_LST");
1245 }
1246 
1247 
1248 
x_SetAlignParameters(CDisplaySeqalign & cds)1249 void CBlastFormat::x_SetAlignParameters(CDisplaySeqalign& cds)
1250 {
1251 
1252     int AlignOption = 0;
1253 
1254     AlignOption += CDisplaySeqalign::eShowMiddleLine;
1255 
1256     if (m_Program == "tblastx") {
1257         AlignOption += CDisplaySeqalign::eTranslateNucToNucAlignment;
1258     }
1259     AlignOption += CDisplaySeqalign::eShowBlastInfo;
1260     AlignOption += CDisplaySeqalign::eShowBlastStyleId;
1261     AlignOption += CDisplaySeqalign::eHtml;
1262     AlignOption += CDisplaySeqalign::eShowSortControls;//*******????
1263     AlignOption += CDisplaySeqalign::eDynamicFeature;
1264     cds.SetAlignOption(AlignOption);
1265 
1266     cds.SetDbName(m_DbName);
1267     cds.SetDbType(!m_DbIsAA);
1268     cds.SetLineLen(m_LineLength);
1269 
1270     if (m_Program == "blastn" || m_Program == "megablast") {
1271         cds.SetMiddleLineStyle (CDisplaySeqalign::eBar);
1272         cds.SetAlignType(CDisplaySeqalign::eNuc);
1273     } else {
1274         cds.SetMiddleLineStyle (CDisplaySeqalign::eChar);
1275         cds.SetAlignType(CDisplaySeqalign::eProt);
1276     }
1277     cds.SetQueryNumber(1); //m_Query_number
1278     cds.SetSeqLocChar (CDisplaySeqalign::eLowerCase);
1279     cds.SetSeqLocColor ( CDisplaySeqalign::eGrey);
1280     cds.SetMasterGeneticCode(m_QueryGenCode);
1281     cds.SetSlaveGeneticCode(m_DbGenCode);
1282 }
1283 
1284 
1285 
s_GetMolType(const CBioseq_Handle & bioseqHandle)1286 static string s_GetMolType(const CBioseq_Handle& bioseqHandle)
1287 {
1288     int molType = bioseqHandle.GetBioseqMolType();
1289     string molTypeString;
1290 
1291     switch(molType) {
1292         case CSeq_inst::eMol_not_set:
1293             molTypeString = "cdna";
1294             break;
1295         case CSeq_inst::eMol_dna:
1296             molTypeString = "dna";
1297             break;
1298         case CSeq_inst::eMol_rna:
1299             molTypeString = "rna";
1300             break;
1301         case CSeq_inst::eMol_aa:
1302             molTypeString = "amino acid";
1303             break;
1304         case CSeq_inst::eMol_na:
1305             molTypeString = "nucleic acid";
1306             break;
1307         default:
1308             molTypeString = "Unknown";
1309     }
1310     return  molTypeString;
1311 }
1312 
1313 void
PrintReport(const blast::CSearchResults & results,CBlastFormat::DisplayOption displayOption)1314 CBlastFormat::PrintReport(const blast::CSearchResults& results,
1315                           CBlastFormat::DisplayOption displayOption)
1316 {
1317     if (displayOption == eMetadata) {//Metadata in json format
1318         CBioseq_Handle bhandle = m_Scope->GetBioseqHandle(*results.GetSeqId(), CScope::eGetBioseq_All);
1319         CConstRef<CBioseq> bioseq = bhandle.GetBioseqCore();
1320 
1321         //string seqID = CAlignFormatUtil::GetSeqIdString(*bioseq, m_BelieveQuery);
1322         string seqID;
1323         CConstRef <CSeq_id> queryID = sequence::GetId(bhandle).GetSeqId();
1324         CSeq_id::ELabelType labelType = (queryID->IsLocal()) ? CSeq_id::eDefault : CSeq_id::eContent;
1325         queryID->GetLabel(&seqID,labelType);
1326 
1327 
1328         string seqDescr = CBlastFormatUtil::GetSeqDescrString(*bioseq);
1329         seqDescr = seqDescr.empty() ? "None" : seqDescr;
1330 
1331         string molType = s_GetMolType(bhandle);
1332 
1333         int length = 0;
1334         if(bioseq->IsSetInst() && bioseq->GetInst().CanGetLength()){
1335             length = bioseq->GetInst().GetLength();
1336         }
1337 
1338         CJson_Document doc;
1339         CJson_Object obj = doc.SetObject();
1340         obj.insert("Query",seqID);
1341         obj.insert("Query_descr",seqDescr);
1342         obj.insert("IsQueryLocal",queryID->IsLocal());
1343         obj.insert("Length",NStr::IntToString(length));
1344         obj.insert("Moltype",molType);
1345         obj.insert("Database",m_DbName);
1346         string dbTitle;
1347         try {
1348             CRef<CSeqDB> seqdb;
1349             seqdb = new CSeqDB(m_DbName, m_DbIsAA ? CSeqDB::eProtein : CSeqDB::eNucleotide);
1350             dbTitle = seqdb->GetTitle();
1351         }
1352         catch (...) {/*ignore exceptions for now*/}
1353         obj.insert("Database_descr",dbTitle);
1354         obj.insert("IsDBProtein",m_DbIsAA);
1355         obj.insert("Program",m_Program);
1356 
1357 
1358         if (results.HasErrors()) {
1359             obj.insert("Error",results.GetErrorStrings());
1360         }
1361         if (results.HasWarnings()) {
1362             obj.insert("Warning",results.GetWarningStrings());
1363         }
1364         doc.Write(m_Outfile);
1365     }
1366     else {
1367         CConstRef<CSeq_align_set> aln_set = results.GetSeqAlign();
1368         _ASSERT(results.HasAlignments());
1369         if (m_IsUngappedSearch) {
1370             aln_set.Reset(CDisplaySeqalign::PrepareBlastUngappedSeqalign(*aln_set));
1371         }
1372 
1373         if (displayOption == eDescriptionsWithTemplates) {//Descriptions with html templates
1374             x_DisplayDeflinesWithTemplates(aln_set);
1375         }
1376         if (displayOption == eDescriptions) {//Descriptions with html templates
1377             x_CreateDeflinesJson(aln_set);
1378         }
1379         else if (displayOption == eAlignments) {// print the alignments with html templates
1380             x_DisplayAlignsWithTemplates(aln_set,results);
1381         }
1382     }
1383 }
1384 
1385 void
PrintOneResultSet(const blast::CSearchResults & results,CConstRef<blast::CBlastQueryVector> queries,unsigned int itr_num,blast::CPsiBlastIterationState::TSeqIds prev_seqids,bool is_deltablast_domain_result)1386 CBlastFormat::PrintOneResultSet(const blast::CSearchResults& results,
1387                         CConstRef<blast::CBlastQueryVector> queries,
1388                         unsigned int itr_num
1389                         /* = numeric_limits<unsigned int>::max() */,
1390                         blast::CPsiBlastIterationState::TSeqIds prev_seqids
1391                         /* = CPsiBlastIterationState::TSeqIds() */,
1392                         bool is_deltablast_domain_result /* = false */)
1393 {
1394     // For remote searches, we don't retrieve the sequence data for the query
1395     // sequence when initially sending the request to the BLAST server (if it's
1396     // a GI/accession/TI), so we flush the scope so that it can be retrieved
1397     // (needed if a self-hit is found) again. This is not applicable if the
1398     // query sequence(s) are specified as FASTA (will be identified by local
1399     // IDs).
1400     if (m_IsRemoteSearch && !s_HasLocalIDs(queries)) {
1401         ResetScopeHistory();
1402     }
1403 
1404     // Used with tabular output to print number of searches formatted at end.
1405     m_QueriesFormatted++;
1406 
1407     if (m_FormatType == CFormattingArgs::eAsnText
1408       || m_FormatType == CFormattingArgs::eAsnBinary
1409       || m_FormatType == CFormattingArgs::eXml
1410       || m_FormatType == CFormattingArgs::eXml2
1411       || m_FormatType == CFormattingArgs::eJson
1412       || m_FormatType == CFormattingArgs::eXml2_S
1413       || m_FormatType == CFormattingArgs::eJson_S
1414       || m_FormatType == CFormattingArgs::eJsonSeqalign
1415       || m_FormatType == CFormattingArgs::eSAM)
1416     {
1417         x_PrintStructuredReport(results, queries);
1418         return;
1419     }
1420 
1421     if (results.HasErrors()) {
1422         ERR_POST(Error << results.GetErrorStrings());
1423         return; // errors are deemed fatal
1424     }
1425     if (results.HasWarnings()) {
1426         ERR_POST(Warning << results.GetWarningStrings());
1427     }
1428 
1429     if (m_FormatType == CFormattingArgs::eTabular ||
1430         m_FormatType == CFormattingArgs::eTabularWithComments ||
1431         m_FormatType == CFormattingArgs::eCommaSeparatedValues) {
1432         x_PrintTabularReport(results, itr_num);
1433         return;
1434     }
1435     if (m_FormatType == CFormattingArgs::eTaxFormat) {
1436         string reportCaption = "Tax BLAST report";
1437         reportCaption = m_IsHTML ? "<h1>" + reportCaption + "</h1>" : CAlignFormatUtil::AddSpaces(reportCaption,max(kMinTaxFormatLineLength,kFormatLineLength),CAlignFormatUtil::eSpacePosToCenter | CAlignFormatUtil::eAddEOLAtLineStart | CAlignFormatUtil::eAddEOLAtLineEnd);
1438         m_Outfile << reportCaption;
1439         x_PrintTaxReport(results);
1440         return;
1441     }
1442     const bool kIsTabularOutput = false;
1443 
1444     if (is_deltablast_domain_result) {
1445         m_Outfile << "Results from domain search" << "\n";
1446     }
1447 
1448     if (itr_num != numeric_limits<unsigned int>::max()) {
1449         m_Outfile << "Results from round " << itr_num << "\n";
1450     }
1451 
1452     // other output types will need a bioseq handle
1453     CBioseq_Handle bhandle = m_Scope->GetBioseqHandle(*results.GetSeqId(),
1454                                                       CScope::eGetBioseq_All);
1455     // If we're not able to get the query, most likely a bug. SB-981 , GP-2207
1456     if( !bhandle  ){
1457         string message = "Failed to resolve SeqId: "+results.GetSeqId()->AsFastaString();
1458 	ERR_POST(message);
1459         NCBI_THROW(CException, eUnknown, message);
1460     }
1461     CConstRef<CBioseq> bioseq = bhandle.GetBioseqCore();
1462 
1463     // print the preamble for this query
1464 
1465     m_Outfile << "\n\n";
1466     CBlastFormatUtil::AcknowledgeBlastQuery(*bioseq, kFormatLineLength,
1467                                             m_Outfile, m_BelieveQuery,
1468                                             m_IsHTML, kIsTabularOutput,
1469                                             results.GetRID());
1470 
1471     if (m_IsBl2Seq && !m_IsDbScan) {
1472         m_Outfile << "\n";
1473         // FIXME: this might be configurable in the future
1474         const bool kBelieveSubject = false;
1475         CConstRef<CBioseq> subject_bioseq = x_CreateSubjectBioseq();
1476         CBlastFormatUtil::AcknowledgeBlastSubject(*subject_bioseq,
1477                                                   kFormatLineLength,
1478                                                   m_Outfile, kBelieveSubject,
1479                                                   m_IsHTML, kIsTabularOutput);
1480     }
1481 
1482     // quit early if there are no hits
1483     if ( !results.HasAlignments() ) {
1484         m_Outfile << "\n\n"
1485               << "***** " << CBlastFormatUtil::kNoHitsFound << " *****" << "\n"
1486               << "\n\n";
1487         x_PrintOneQueryFooter(*results.GetAncillaryData());
1488         return;
1489     }
1490 
1491     CConstRef<CSeq_align_set> aln_set = results.GetSeqAlign();
1492     _ASSERT(results.HasAlignments());
1493     if (m_IsUngappedSearch) {
1494         aln_set.Reset(CDisplaySeqalign::PrepareBlastUngappedSeqalign(*aln_set));
1495     }
1496 
1497     //invoke sorting only for m_HitsSortOption > CAlignFormatUtil::eEvalue or m_HspsSortOption > CAlignFormatUtil::eHspEvalue
1498     if(m_HitsSortOption > 0 || m_HspsSortOption > 0) {
1499         aln_set = CBlastFormatUtil::SortSeqalignForSortableFormat(
1500                                       *(const_cast<CSeq_align_set*>(aln_set.GetPointer())),
1501                                       (m_Program == "tblastx") ? true : false,
1502                                       m_HitsSortOption,
1503                                       m_HspsSortOption);
1504     }
1505 
1506     const bool kIsGlobal = s_IsGlobalSeqAlign(aln_set);
1507 
1508     //-------------------------------------------------
1509     // print 1-line summaries
1510     // Also disable when program is rmblastn.  At this time
1511     // we do not want summary bit scores/evalues for this
1512     // program. -RMH-
1513     if ( (!m_IsBl2Seq || m_IsDbScan) && !(m_DisableKAStats || kIsGlobal) ) {
1514         x_DisplayDeflines(aln_set, itr_num, prev_seqids);
1515     }
1516 
1517     //-------------------------------------------------
1518     // print the alignments
1519     m_Outfile << "\n";
1520 
1521     TMaskedQueryRegions masklocs;
1522     results.GetMaskedQueryRegions(masklocs);
1523 
1524     CSeq_align_set copy_aln_set;
1525     CBlastFormatUtil::PruneSeqalign(*aln_set, copy_aln_set, m_NumAlignments);
1526 
1527     int flags = s_SetFlags(m_Program, m_FormatType, m_IsHTML, m_ShowGi,
1528                            (m_IsBl2Seq && !m_IsDbScan), (m_DisableKAStats || kIsGlobal));
1529 
1530     CDisplaySeqalign display(copy_aln_set, *m_Scope, &masklocs, NULL, m_MatrixName);
1531     display.SetDbName(m_DbName);
1532     display.SetDbType(!m_DbIsAA);
1533     display.SetLineLen(m_LineLength);
1534     int kAlignToShow=2000000000;  // Nice large number per SB-1817
1535     display.SetNumAlignToShow(kAlignToShow);
1536 
1537     // set the alignment flags
1538     display.SetAlignOption(flags);
1539 
1540     if (m_LongSeqId) {
1541         display.UseLongSequenceIds();
1542     }
1543 
1544     if (m_Program == "blastn" || m_Program == "megablast") {
1545             display.SetMiddleLineStyle(CDisplaySeqalign::eBar);
1546             display.SetAlignType(CDisplaySeqalign::eNuc);
1547     }
1548     else {
1549             display.SetMiddleLineStyle(CDisplaySeqalign::eChar);
1550             display.SetAlignType(CDisplaySeqalign::eProt);
1551     }
1552 
1553     display.SetMasterGeneticCode(m_QueryGenCode);
1554     display.SetSlaveGeneticCode(m_DbGenCode);
1555     display.SetSeqLocChar(CDisplaySeqalign::eLowerCase);
1556     TSeqLocInfoVector subj_masks;
1557     results.GetSubjectMasks(subj_masks);
1558     display.SetSubjectMasks(subj_masks);
1559     display.DisplaySeqalign(m_Outfile);
1560 
1561     // print the ancillary data for this query
1562 
1563     x_PrintOneQueryFooter(*results.GetAncillaryData());
1564 }
1565 
1566 void
PrintOneResultSet(blast::CIgBlastResults & results,CConstRef<blast::CBlastQueryVector> queries,SClone & clone_info,bool fill_clone_info,bool print_airr_format_header,int index)1567 CBlastFormat::PrintOneResultSet(blast::CIgBlastResults& results,
1568                                 CConstRef<blast::CBlastQueryVector> queries,
1569                                 SClone& clone_info,
1570                                 bool fill_clone_info,
1571                                 bool print_airr_format_header,
1572                                 int index)
1573 {
1574     clone_info.na = NcbiEmptyString;
1575     clone_info.aa = NcbiEmptyString;
1576 
1577     // For remote searches, we don't retrieve the sequence data for the query
1578     // sequence when initially sending the request to the BLAST server (if it's
1579     // a GI/accession/TI), so we flush the scope so that it can be retrieved
1580     // (needed if a self-hit is found) again. This is not applicable if the
1581     // query sequence(s) are specified as FASTA (will be identified by local
1582     // IDs).
1583     if (m_IsRemoteSearch && !s_HasLocalIDs(queries)) {
1584         ResetScopeHistory();
1585     }
1586 
1587     // Used with tabular output to print number of searches formatted at end.
1588     m_QueriesFormatted++;
1589 
1590     if (m_FormatType == CFormattingArgs::eAsnText
1591       || m_FormatType == CFormattingArgs::eAsnBinary
1592       || m_FormatType == CFormattingArgs::eXml
1593       || m_FormatType == CFormattingArgs::eXml2
1594       || m_FormatType == CFormattingArgs::eJson
1595       || m_FormatType == CFormattingArgs::eXml2_S
1596       || m_FormatType == CFormattingArgs::eJson_S
1597       || m_FormatType == CFormattingArgs::eJsonSeqalign)
1598     {
1599         x_PrintStructuredReport(results, queries);
1600         return;
1601     }
1602 
1603     if (results.HasErrors()) {
1604         ERR_POST(Error << results.GetErrorStrings());
1605         return; // errors are deemed fatal
1606     }
1607     if (results.HasWarnings()) {
1608         ERR_POST(Warning << results.GetWarningStrings());
1609     }
1610 
1611     if (results.GetIgAnnotation()->m_MinusStrand) {
1612         x_ReverseQuery(results);
1613     }
1614     //set j domain
1615     CRef<CIgAnnotation> & annots_edit = results.SetIgAnnotation();
1616     if (annots_edit->m_JDomain[1] > 0 && annots_edit->m_DomainInfo[9] > 0 &&
1617         annots_edit->m_JDomain[1] > annots_edit->m_DomainInfo[9]){
1618         annots_edit->m_JDomain[0] = annots_edit->m_DomainInfo[9] + 1 ;
1619         //fwr4
1620         if (annots_edit->m_JDomain[3] > 0) {
1621             annots_edit->m_JDomain[2] = annots_edit->m_JDomain[1] + 1 ;
1622         }
1623     }
1624 
1625     if (m_FormatType == CFormattingArgs::eTabular ||
1626         m_FormatType == CFormattingArgs::eTabularWithComments ||
1627         m_FormatType == CFormattingArgs::eCommaSeparatedValues) {
1628         m_FormatType = CFormattingArgs::eTabularWithComments;
1629         x_PrintIgTabularReport(results, clone_info, fill_clone_info);
1630         return;
1631     }
1632 
1633     if (m_FormatType == CFormattingArgs::eAirrRearrangement) {
1634 
1635         if (m_Program == "blastn" || m_Program == "BLASTN") {
1636             x_PrintAirrRearrangement(results, clone_info, fill_clone_info, print_airr_format_header);
1637         } else {
1638             m_Outfile << "The AIRR format is only available for nucleotide sequence search" << endl;
1639         }
1640         return;
1641     }
1642 
1643     if (m_FormatType == CFormattingArgs::eTaxFormat) {
1644         string reportCaption = "Tax BLAST report";
1645         reportCaption = m_IsHTML ? "<h1>" + reportCaption + "</h1>" : CAlignFormatUtil::AddSpaces(reportCaption,max(kMinTaxFormatLineLength,kFormatLineLength),CAlignFormatUtil::eSpacePosToCenter | CAlignFormatUtil::eAddEOLAtLineStart | CAlignFormatUtil::eAddEOLAtLineEnd);
1646         m_Outfile << reportCaption;
1647         x_PrintTaxReport(results);
1648         return;
1649     }
1650 
1651     const bool kIsTabularOutput = false;
1652 
1653     // other output types will need a bioseq handle
1654     CBioseq_Handle bhandle = m_Scope->GetBioseqHandle(*results.GetSeqId(),
1655                                                       CScope::eGetBioseq_All);
1656     // If this assertion fails, we're not able to get the query, most likely a
1657     // bug
1658     _ASSERT(bhandle);
1659     CConstRef<CBioseq> bioseq = bhandle.GetBioseqCore();
1660 
1661     // print the preamble for this query
1662 
1663     m_Outfile << "\n\n";
1664 
1665     CBlastFormatUtil::AcknowledgeBlastQuery(*bioseq, kFormatLineLength,
1666                                             m_Outfile, m_BelieveQuery,
1667                                             m_IsHTML, kIsTabularOutput,
1668                                             results.GetRID());
1669 
1670     // quit early if there are no hits
1671     if ( !results.HasAlignments() ) {
1672         m_Outfile << "\n\n"
1673               << "***** " << CBlastFormatUtil::kNoHitsFound << " *****" << "\n"
1674               << "\n\n";
1675         x_PrintOneQueryFooter(*results.GetAncillaryData());
1676         return;
1677     }
1678 
1679     CConstRef<CSeq_align_set> aln_set = results.GetSeqAlign();
1680     _ASSERT(results.HasAlignments());
1681     if (m_IsUngappedSearch) {
1682         aln_set.Reset(CDisplaySeqalign::PrepareBlastUngappedSeqalign(*aln_set));
1683     }
1684 
1685     //-------------------------------------------------
1686     // print 1-line summaries
1687     if ( !m_IsBl2Seq ) {
1688         CPsiBlastIterationState::TSeqIds prev_ids = CPsiBlastIterationState::TSeqIds();
1689         int additional =  results.m_NumActualV +results.m_NumActualD + results.m_NumActualJ;
1690         x_DisplayDeflines(aln_set, numeric_limits<unsigned int>::max(), prev_ids, additional, index, 100);
1691     }
1692 
1693     //-------------------------------------------------
1694     // print the alignments
1695     m_Outfile << "\n";
1696 
1697     const CBlastTabularInfo::EFieldDelimiter kDelim =
1698             (m_FormatType == CFormattingArgs::eCommaSeparatedValues
1699              ? CBlastTabularInfo::eComma : CBlastTabularInfo::eTab);
1700 
1701     CIgBlastTabularInfo tabinfo(m_Outfile, m_CustomOutputFormatSpec, kDelim);
1702     tabinfo.SetParseLocalIds(m_BelieveQuery);
1703 
1704     // print the master alignment
1705     if (results.HasAlignments()) {
1706         const CRef<CIgAnnotation> & annots = results.GetIgAnnotation();
1707         CSeq_align_set::Tdata::const_iterator itr = aln_set->Get().begin();
1708         tabinfo.SetMasterFields(**itr, *m_Scope,
1709                                 annots->m_ChainType[0],
1710                                 annots->m_ChainTypeToShow,
1711                                 &m_ScoringMatrix);
1712         tabinfo.SetIgAnnotation(annots, m_IgOptions, aln_set, *m_Scope);
1713         if (fill_clone_info) {
1714             s_SetCloneInfo(tabinfo, bhandle, clone_info);
1715         }
1716         m_Outfile << "Domain classification requested: " << m_IgOptions->m_DomainSystem << endl << endl;
1717         if (m_IsHTML) {
1718             tabinfo.PrintHtmlSummary();
1719         } else {
1720             tabinfo.PrintMasterAlign("");
1721         }
1722     }
1723 
1724     TMaskedQueryRegions masklocs;
1725     results.GetMaskedQueryRegions(masklocs);
1726 
1727     int flags = CDisplaySeqalign::eMergeAlign
1728         + CDisplaySeqalign::eShowIdentity
1729         + CDisplaySeqalign::eNewTargetWindow
1730         + CDisplaySeqalign::eShowEndGaps
1731         + CDisplaySeqalign::eShowAlignStatsForMultiAlignView;
1732 
1733     if (m_FormatType == CFormattingArgs::eFlatQueryAnchoredNoIdentities) {
1734         flags -= CDisplaySeqalign::eShowIdentity;
1735     }
1736 
1737     if (m_IsHTML) {
1738           flags += CDisplaySeqalign::eHtml;
1739           flags += CDisplaySeqalign::eHyperLinkSlaveSeqid;
1740     }
1741 
1742     list < CRef<CDisplaySeqalign::DomainInfo> >  domain;
1743 
1744     string kabat_domain_name[] = {"FR1", "CDR1", "FR2", "CDR2", "FR3", "CDR3", "FR4"};
1745     string imgt_domain_name[] = {"FR1-IMGT", "CDR1-IMGT", "FR2-IMGT", "CDR2-IMGT", "FR3-IMGT", "CDR3-IMGT", "FR4-IMGT"};
1746     int domain_name_length = 7;
1747     vector<string> domain_name;
1748     if (m_IgOptions->m_DomainSystem == "kabat") {
1749         for (int i = 0; i < domain_name_length; i ++) {
1750             domain_name.push_back(kabat_domain_name[i]);
1751         }
1752     } else {
1753         for (int i = 0; i < domain_name_length; i ++) {
1754             domain_name.push_back(imgt_domain_name[i]);
1755         }
1756     }
1757 
1758     const CRef<CIgAnnotation> & annots = results.GetIgAnnotation();
1759 
1760     for (int i=0; i<9; i = i + 2) {
1761         if (annots->m_DomainInfo[i] >= 0){
1762             CRef<CDisplaySeqalign::DomainInfo> temp(new CDisplaySeqalign::DomainInfo);
1763             int start = annots->m_DomainInfo[i];
1764             int subject_start = annots->m_DomainInfo_S[i];
1765 
1766             int stop = annots->m_DomainInfo[i+1];
1767             int subject_stop = annots->m_DomainInfo_S[i+1];
1768 
1769             temp->seqloc = new CSeq_loc((CSeq_loc::TId &) aln_set->Get().front()->GetSeq_id(0),
1770                                         (CSeq_loc::TPoint) start,
1771                                         (CSeq_loc::TPoint) stop);
1772             temp->subject_seqloc = new CSeq_loc((CSeq_loc::TId &) aln_set->Get().front()->GetSeq_id(1),
1773                                                 (CSeq_loc::TPoint) subject_start,
1774                                                 (CSeq_loc::TPoint) subject_stop);
1775             temp->is_subject_start_valid = subject_start > 0 ? true:false;
1776             temp->is_subject_stop_valid = subject_stop > 0 ? true:false;
1777             temp->domain_name = domain_name[i/2];
1778             domain.push_back(temp);
1779         }
1780     }
1781 
1782     //J domain
1783     //cdr3
1784     if (annots->m_JDomain[0] > 0 && annots->m_JDomain[1] > 0){
1785         CRef<CDisplaySeqalign::DomainInfo> temp(new CDisplaySeqalign::DomainInfo);
1786         int start = annots->m_JDomain[0];
1787         int subject_start = -1;
1788         int stop = annots->m_JDomain[1];
1789         int subject_stop = -1;
1790 
1791         temp->seqloc = new CSeq_loc((CSeq_loc::TId &) aln_set->Get().front()->GetSeq_id(0),
1792                                         (CSeq_loc::TPoint) start,
1793                                         (CSeq_loc::TPoint) stop);
1794         CRef<CSeq_id> id_holder (new CSeq_id);
1795         temp->subject_seqloc = new CSeq_loc(*id_holder,
1796                                             (CSeq_loc::TPoint) subject_start,
1797                                             (CSeq_loc::TPoint) subject_stop);
1798         temp->is_subject_start_valid = subject_start > 0 ? true:false;
1799         temp->is_subject_stop_valid = subject_stop > 0 ? true:false;
1800         temp->domain_name = domain_name[5];
1801         domain.push_back(temp);
1802     }
1803     //fwr4
1804     if (annots->m_JDomain[2] > 0 && annots->m_JDomain[3] > 0){
1805         CRef<CDisplaySeqalign::DomainInfo> temp(new CDisplaySeqalign::DomainInfo);
1806         int start = annots->m_JDomain[2];
1807         int subject_start = -1;
1808         int stop = annots->m_JDomain[3];
1809         int subject_stop = -1;
1810 
1811         temp->seqloc = new CSeq_loc((CSeq_loc::TId &) aln_set->Get().front()->GetSeq_id(0),
1812                                     (CSeq_loc::TPoint) start,
1813                                     (CSeq_loc::TPoint) stop);
1814         CRef<CSeq_id> id_holder (new CSeq_id);
1815         temp->subject_seqloc = new CSeq_loc(*id_holder,
1816                                             (CSeq_loc::TPoint) subject_start,
1817                                             (CSeq_loc::TPoint) subject_stop);
1818         temp->is_subject_start_valid = subject_start > 0 ? true:false;
1819         temp->is_subject_stop_valid = subject_stop > 0 ? true:false;
1820         temp->domain_name = domain_name[6];
1821         domain.push_back(temp);
1822     }
1823 
1824     CDisplaySeqalign display(*aln_set, *m_Scope, &masklocs, NULL,  m_MatrixName);
1825     int num_align_to_show = results.m_NumActualV + results.m_NumActualD +
1826         results.m_NumActualJ;
1827     if (m_DbName != m_IgOptions->m_Db[0]->GetDatabaseName()){
1828         num_align_to_show += m_NumAlignments;
1829     }
1830     display.SetNumAlignToShow(num_align_to_show);
1831     display.SetMasterDomain(&domain);
1832     display.SetDbName(m_DbName);
1833     display.SetDbType(!m_DbIsAA);
1834     display.SetLineLen(90);
1835 
1836     if (m_LongSeqId) {
1837         display.UseLongSequenceIds();
1838     }
1839 
1840     if (annots->m_FrameInfo[0] >= 0 && m_IgOptions->m_Translate) {
1841         display.SetTranslatedFrameForLocalSeq((CDisplaySeqalign::TranslatedFrameForLocalSeq) (annots->m_FrameInfo[0]%3));
1842         flags += CDisplaySeqalign::eShowTranslationForLocalSeq;
1843     }
1844     flags += CDisplaySeqalign::eShowSequencePropertyLabel;
1845     flags += CDisplaySeqalign::eShowInfoOnMouseOverSeqid;
1846     vector<string> chain_type_list;
1847     ITERATE(vector<string>, iter, annots->m_ChainType) {
1848         if (*iter=="N/A"){
1849             chain_type_list.push_back(NcbiEmptyString);
1850         } else {
1851             chain_type_list.push_back(*iter);
1852         }
1853     }
1854     display.SetSequencePropertyLabel(&chain_type_list);
1855     // set the alignment flags
1856 
1857     display.SetAlignOption(flags);
1858     if (m_Program == "blastn" || m_Program == "BLASTN") {
1859         display.SetAlignType(CDisplaySeqalign::eNuc);
1860     } else {
1861         display.SetAlignType(CDisplaySeqalign::eProt);
1862     }
1863     display.SetMasterGeneticCode(m_QueryGenCode);
1864     display.SetSlaveGeneticCode(m_DbGenCode);
1865     display.SetSeqLocChar(CDisplaySeqalign::eLowerCase);
1866     TSeqLocInfoVector subj_masks;
1867     results.GetSubjectMasks(subj_masks);
1868     display.SetSubjectMasks(subj_masks);
1869 
1870     if (m_IsHTML) {
1871         display.SetResultPositionIndex(index);
1872         m_Outfile << "\n<CENTER><b><FONT color=\"green\">Alignments</FONT></b></CENTER>"
1873                   << endl;
1874 
1875     } else {
1876         m_Outfile << "\nAlignments" << endl;
1877     }
1878 
1879     display.DisplaySeqalign(m_Outfile);
1880 
1881     // print the ancillary data for this query
1882 
1883     x_PrintOneQueryFooter(*results.GetAncillaryData());
1884     if (m_IsHTML) {
1885         m_Outfile << "<hr>" << endl;
1886     }
1887 }
1888 
1889 void
x_ReverseQuery(blast::CIgBlastResults & results)1890 CBlastFormat::x_ReverseQuery(blast::CIgBlastResults& results)
1891 {
1892     if (!results.HasAlignments()){
1893         return;
1894     }
1895     // create a temporary seq_id
1896     CConstRef<CSeq_id> qid = results.GetSeqId();
1897     string new_id = qid->AsFastaString() + "_reversed";
1898 
1899     // create a bioseq
1900     CBioseq_Handle q_bh = m_Scope->GetBioseqHandle(*qid);
1901     int len = q_bh.GetBioseqLength();
1902     CSeq_loc loc(*(const_cast<CSeq_id *>(&*qid)), 0, len-1, eNa_strand_minus);
1903     CRef<CBioseq> q_new(new CBioseq(loc, new_id));
1904     CConstRef<CSeq_id> new_qid = m_Scope->AddBioseq(*q_new).GetSeqId();
1905     if (qid->IsLocal()) {
1906         string title = sequence::CDeflineGenerator().GenerateDefline(q_bh);
1907         if (title != "") {
1908             CRef<CSeqdesc> des(new CSeqdesc());
1909             if (m_FormatType != CFormattingArgs::eAirrRearrangement) {
1910                 des->SetTitle("reversed|" + title);
1911             } else {
1912                 des->SetTitle(title);
1913             }
1914             m_Scope->GetBioseqEditHandle(*q_new).SetDescr().Set().push_back(des);
1915         }
1916     }
1917 
1918     // set up the mapping
1919     CSeq_loc new_loc(*(const_cast<CSeq_id *>(&*new_qid)), 0, len-1, eNa_strand_plus);
1920     CSeq_loc_Mapper mapper(loc, new_loc, &*m_Scope);
1921 
1922     // replace the alignment with the new query
1923     CRef<CSeq_align_set> align_set(new CSeq_align_set());
1924     ITERATE(CSeq_align_set::Tdata, align, results.GetSeqAlign()->Get()) {
1925         CRef<CSeq_align> new_align = mapper.Map(**align, 0);
1926         align_set->Set().push_back(new_align);
1927     }
1928     results.SetSeqAlign().Reset(&*align_set);
1929 
1930     // reverse IgAnnotations
1931     CRef<CIgAnnotation> &annots = results.SetIgAnnotation();
1932     for (int i=0; i<6; i+=2) {
1933         int start = annots->m_GeneInfo[i];
1934         if (start >= 0) {
1935             annots->m_GeneInfo[i] = len - annots->m_GeneInfo[i+1];
1936             annots->m_GeneInfo[i+1] = len - start;
1937         }
1938     }
1939 
1940     for (int i=0; i<12; ++i) {
1941         int pos = annots->m_DomainInfo[i];
1942         if (pos >= 0) {
1943             annots->m_DomainInfo[i] = max(0, len - 1 - pos);
1944         }
1945     }
1946 
1947     for (int i=0; i<3; ++i) {
1948         int pos = annots->m_FrameInfo[i];
1949         if (pos >= 0) {
1950             annots->m_FrameInfo[i] = len -1 - pos;
1951         }
1952     }
1953 }
1954 
1955 void
PrintPhiResult(const blast::CSearchResultSet & result_set,CConstRef<blast::CBlastQueryVector> queries,unsigned int itr_num,blast::CPsiBlastIterationState::TSeqIds prev_seqids)1956 CBlastFormat::PrintPhiResult(const blast::CSearchResultSet& result_set,
1957                         CConstRef<blast::CBlastQueryVector> queries,
1958                         unsigned int itr_num
1959                         /* = numeric_limits<unsigned int>::max() */,
1960                         blast::CPsiBlastIterationState::TSeqIds prev_seqids
1961                         /* = CPsiBlastIterationState::TSeqIds() */)
1962 {
1963     // For remote searches, we don't retrieve the sequence data for the query
1964     // sequence when initially sending the request to the BLAST server (if it's
1965     // a GI/accession/TI), so we flush the scope so that it can be retrieved
1966     // (needed if a self-hit is found) again. This is not applicable if the
1967     // query sequence(s) are specified as FASTA (will be identified by local
1968     // IDs).
1969     if (m_IsRemoteSearch && !s_HasLocalIDs(queries)) {
1970         ResetScopeHistory();
1971     }
1972 
1973     if (m_FormatType == CFormattingArgs::eAsnText
1974       || m_FormatType == CFormattingArgs::eAsnBinary
1975       || m_FormatType == CFormattingArgs::eXml
1976       || m_FormatType == CFormattingArgs::eXml2
1977       || m_FormatType == CFormattingArgs::eJson
1978       || m_FormatType == CFormattingArgs::eXml2_S
1979       || m_FormatType == CFormattingArgs::eJson_S
1980       || m_FormatType == CFormattingArgs::eJsonSeqalign)
1981     {
1982         ITERATE(CSearchResultSet, result, result_set) {
1983            x_PrintStructuredReport(**result, queries);
1984         }
1985         return;
1986     }
1987 
1988     ITERATE(CSearchResultSet, result, result_set) {
1989         if ((**result).HasErrors()) {
1990             m_Outfile << "\n" << (**result).GetErrorStrings() << "\n";
1991             return; // errors are deemed fatal
1992         }
1993         if ((**result).HasWarnings()) {
1994             m_Outfile << "\n" << (**result).GetWarningStrings() << "\n";
1995         }
1996     }
1997 
1998     if (m_FormatType == CFormattingArgs::eTabular ||
1999         m_FormatType == CFormattingArgs::eTabularWithComments ||
2000         m_FormatType == CFormattingArgs::eCommaSeparatedValues) {
2001         ITERATE(CSearchResultSet, result, result_set) {
2002            x_PrintTabularReport(**result, itr_num);
2003         }
2004         return;
2005     }
2006     if (m_FormatType == CFormattingArgs::eTaxFormat) {
2007         string reportCaption = "Tax BLAST report";
2008         reportCaption = m_IsHTML ? "<h1>" + reportCaption + "</h1>" : CAlignFormatUtil::AddSpaces(reportCaption,max(kMinTaxFormatLineLength,kFormatLineLength),CAlignFormatUtil::eSpacePosToCenter | CAlignFormatUtil::eAddEOLAtLineStart | CAlignFormatUtil::eAddEOLAtLineEnd);
2009         m_Outfile << reportCaption;
2010         ITERATE(CSearchResultSet, result, result_set) {
2011             x_PrintTaxReport(**result);
2012         }
2013         return;
2014     }
2015 
2016     const CSearchResults& first_results = result_set[0];
2017 
2018     if (itr_num != numeric_limits<unsigned int>::max()) {
2019         m_Outfile << "Results from round " << itr_num << "\n";
2020     }
2021 
2022     CBioseq_Handle bhandle = m_Scope->GetBioseqHandle(*first_results.GetSeqId(),
2023                                                       CScope::eGetBioseq_All);
2024     CConstRef<CBioseq> bioseq = bhandle.GetBioseqCore();
2025 
2026     // print the preamble for this query
2027 
2028     m_Outfile << "\n\n";
2029     CBlastFormatUtil::AcknowledgeBlastQuery(*bioseq, kFormatLineLength,
2030                                             m_Outfile, m_BelieveQuery,
2031                                             m_IsHTML, false,
2032                                             first_results.GetRID());
2033 
2034     if (m_FormatType == CFormattingArgs::eTaxFormat) {
2035         string reportCaption = "Tax BLAST report";
2036         reportCaption = m_IsHTML ? "<h1>" + reportCaption + "</h1>" : CAlignFormatUtil::AddSpaces(reportCaption,max(kMinTaxFormatLineLength,kFormatLineLength),CAlignFormatUtil::eSpacePosToCenter | CAlignFormatUtil::eAddEOLAtLineStart | CAlignFormatUtil::eAddEOLAtLineEnd);
2037         m_Outfile << reportCaption;
2038         ITERATE(CSearchResultSet, result, result_set) {
2039            x_PrintTaxReport(**result);
2040         }
2041         return;
2042     }
2043 
2044     const SPHIQueryInfo *phi_query_info = first_results.GetPhiQueryInfo();
2045 
2046     if (phi_query_info)
2047     {
2048         vector<int> offsets;
2049         for (int index=0; index<phi_query_info->num_patterns; index++)
2050             offsets.push_back(phi_query_info->occurrences[index].offset);
2051 
2052         CBlastFormatUtil::PrintPhiInfo(phi_query_info->num_patterns,
2053                                    string(phi_query_info->pattern),
2054                                    phi_query_info->probability,
2055                                    offsets, m_Outfile);
2056     }
2057 
2058     // quit early if there are no hits
2059     if ( !first_results.HasAlignments() ) {
2060         m_Outfile << "\n\n"
2061               << "***** " << CBlastFormatUtil::kNoHitsFound << " *****" << "\n"
2062               << "\n\n";
2063         x_PrintOneQueryFooter(*first_results.GetAncillaryData());
2064         return;
2065     }
2066 
2067     _ASSERT(first_results.HasAlignments());
2068     //-------------------------------------------------
2069 
2070     ITERATE(CSearchResultSet, result, result_set)
2071     {
2072         CConstRef<CSeq_align_set> aln_set = (**result).GetSeqAlign();
2073         x_DisplayDeflines(aln_set, itr_num, prev_seqids);
2074     }
2075 
2076     //-------------------------------------------------
2077     // print the alignments
2078     m_Outfile << "\n";
2079 
2080 
2081     int flags = s_SetFlags(m_Program, m_FormatType, m_IsHTML, m_ShowGi,
2082                            (m_IsBl2Seq && !m_IsDbScan), false);
2083 
2084     if (phi_query_info)
2085     {
2086         SPHIPatternInfo *occurrences = phi_query_info->occurrences;
2087         int index;
2088         for (index=0; index<phi_query_info->num_patterns; index++)
2089         {
2090            list <CDisplaySeqalign::FeatureInfo*> phiblast_pattern;
2091            CSeq_id* id = new CSeq_id;
2092            id->Assign(*(result_set[index]).GetSeqId());
2093            CDisplaySeqalign::FeatureInfo*  feature_info = new CDisplaySeqalign::FeatureInfo;
2094            feature_info->seqloc = new CSeq_loc(*id, (TSeqPos) occurrences[index].offset,
2095                   (TSeqPos) (occurrences[index].offset + occurrences[index].length - 1));
2096            feature_info->feature_char = '*';
2097            feature_info->feature_id = "pattern";
2098            phiblast_pattern.push_back(feature_info);
2099 
2100            m_Outfile << "\nSignificant alignments for pattern occurrence " << index+1
2101                  << " at position " << 1+occurrences[index].offset << "\n\n";
2102 
2103            TMaskedQueryRegions masklocs;
2104            result_set[index].GetMaskedQueryRegions(masklocs);
2105            CConstRef<CSeq_align_set> aln_set = result_set[index].GetSeqAlign();
2106            CSeq_align_set copy_aln_set;
2107            CBlastFormatUtil::PruneSeqalign(*aln_set, copy_aln_set, m_NumAlignments);
2108 
2109            CDisplaySeqalign display(copy_aln_set, *m_Scope, &masklocs, &phiblast_pattern,
2110                              m_MatrixName);
2111 
2112            display.SetDbName(m_DbName);
2113            display.SetDbType(!m_DbIsAA);
2114            display.SetLineLen(m_LineLength);
2115 
2116            // set the alignment flags
2117            display.SetAlignOption(flags);
2118 
2119            if (m_LongSeqId) {
2120                display.UseLongSequenceIds();
2121            }
2122 
2123            if (m_Program == "blastn" || m_Program == "megablast") {
2124                display.SetMiddleLineStyle(CDisplaySeqalign::eBar);
2125                display.SetAlignType(CDisplaySeqalign::eNuc);
2126            }
2127            else {
2128                display.SetMiddleLineStyle(CDisplaySeqalign::eChar);
2129                display.SetAlignType(CDisplaySeqalign::eProt);
2130            }
2131 
2132            display.SetMasterGeneticCode(m_QueryGenCode);
2133            display.SetSlaveGeneticCode(m_DbGenCode);
2134            display.SetSeqLocChar(CDisplaySeqalign::eLowerCase);
2135            display.DisplaySeqalign(m_Outfile);
2136            m_Outfile << "\n";
2137 
2138            NON_CONST_ITERATE(list<CDisplaySeqalign::FeatureInfo*>, itr, phiblast_pattern) {
2139                delete *itr;
2140            }
2141         }
2142     }
2143 
2144     // print the ancillary data for this query
2145 
2146     x_PrintOneQueryFooter(*first_results.GetAncillaryData());
2147 }
2148 
2149 
2150 
2151 void
PrintEpilog(const blast::CBlastOptions & options)2152 CBlastFormat::PrintEpilog(const blast::CBlastOptions& options)
2153 {
2154     if ((m_FormatType == CFormattingArgs::eXml2) || (m_FormatType == CFormattingArgs::eJson) ||
2155         (m_FormatType == CFormattingArgs::eXml2_S) || (m_FormatType == CFormattingArgs::eJson_S)) {
2156     	if(!m_AccumulatedResults.empty()) {
2157     		CRef <CBlastSearchQuery> q = m_AccumulatedQueries->GetBlastSearchQuery(0);
2158     		if(m_IsBl2Seq) {
2159     			CCmdLineBlastXML2ReportData report_data(q, m_AccumulatedResults, m_Options,
2160     		 		   		   	   	   	   	   	   	    m_Scope, m_SeqInfoSrc);
2161 		    	x_WriteXML2(report_data);
2162     		}
2163     		else if(m_IsIterative){
2164     			CCmdLineBlastXML2ReportData report_data (q, m_AccumulatedResults, m_Options,
2165     					 	 	 	 	 				 m_Scope, m_DbInfo);
2166 		    	x_WriteXML2(report_data);
2167     		}
2168     		m_AccumulatedResults.clear();
2169     		m_AccumulatedQueries->clear();
2170     	}
2171     	if (m_FormatType == CFormattingArgs::eXml2
2172     		|| m_FormatType == CFormattingArgs::eXml2_S) {
2173     		x_GenerateXML2MasterFile();
2174     	}
2175     	else {
2176     		x_GenerateJSONMasterFile();
2177     	}
2178     	return;
2179     }
2180 
2181     if (m_FormatType == CFormattingArgs::eTabularWithComments) {
2182         CBlastTabularInfo tabinfo(m_Outfile, m_CustomOutputFormatSpec);
2183         tabinfo.PrintNumProcessed(m_QueriesFormatted);
2184         return;
2185     } else if (m_FormatType >= CFormattingArgs::eTabular)
2186         return;  // No footer for these.
2187 
2188     // Most of XML is printed as it's finished.
2189     // the epilog closes the report.
2190     if (m_FormatType == CFormattingArgs::eXml) {
2191         m_Outfile << m_BlastXMLIncremental->m_SerialXmlEnd << endl;
2192         m_AccumulatedResults.clear();
2193         m_AccumulatedQueries->clear();
2194         return;
2195     }
2196 
2197     m_Outfile << NcbiEndl << NcbiEndl;
2198     if (m_Program == "deltablast" && !m_DomainDbInfo.empty()) {
2199         m_Outfile << "Conserved Domain";
2200         CBlastFormatUtil::PrintDbReport(m_DomainDbInfo, kFormatLineLength,
2201                                         m_Outfile, false);
2202     }
2203 
2204     if ( !m_IsBl2Seq || m_IsDbScan) {
2205         CBlastFormatUtil::PrintDbReport(m_DbInfo, kFormatLineLength,
2206                                         m_Outfile, false);
2207     }
2208 
2209     if (m_Program == "blastn" || m_Program == "megablast") {
2210         m_Outfile << "\n\nMatrix: " << "blastn matrix " <<
2211                         options.GetMatchReward() << " " <<
2212                         options.GetMismatchPenalty() << "\n";
2213     }
2214     else {
2215         m_Outfile << "\n\nMatrix: " << options.GetMatrixName() << "\n";
2216     }
2217 
2218     if (options.GetGappedMode() == true) {
2219         double gap_extension = (double) options.GetGapExtensionCost();
2220         if ((m_Program == "megablast" || m_Program == "blastn") && options.GetGapExtensionCost() == 0)
2221         { // Formula from PMID 10890397 applies if both gap values are zero.
2222                gap_extension = -2*options.GetMismatchPenalty() + options.GetMatchReward();
2223                gap_extension /= 2.0;
2224         }
2225         m_Outfile << "Gap Penalties: Existence: "
2226                 << options.GetGapOpeningCost() << ", Extension: "
2227                 << gap_extension << "\n";
2228     }
2229     if (options.GetWordThreshold()) {
2230         m_Outfile << "Neighboring words threshold: " <<
2231                         options.GetWordThreshold() << "\n";
2232     }
2233     if (options.GetWindowSize()) {
2234         m_Outfile << "Window for multiple hits: " <<
2235                         options.GetWindowSize() << "\n";
2236     }
2237 
2238     if (m_IsHTML) {
2239         m_Outfile << kHTML_Suffix << "\n";
2240     }
2241 }
2242 
ResetScopeHistory()2243 void CBlastFormat::ResetScopeHistory()
2244 {
2245     // Do not reset the scope for BLAST2Sequences or else we'll loose the
2246     // sequence data! (see x_CreateSubjectBioseq)
2247     if ((m_IsBl2Seq)
2248     	|| (m_FormatType == CFormattingArgs::eXml2)
2249     	|| (m_FormatType == CFormattingArgs::eJson)
2250     	|| (m_FormatType == CFormattingArgs::eXml2_S)
2251     	|| (m_FormatType == CFormattingArgs::eJson_S)){
2252         return;
2253     }
2254 
2255     // Our current XML/ASN.1 libraries do not have provisions for
2256     // incremental object input/output, so with XML output format we
2257     // need to accumulate the whole document before writing any data.
2258 
2259     // This means that XML output requires more memory than other
2260     // output formats.
2261 
2262     if (m_FormatType != CFormattingArgs::eXml)
2263     {
2264         m_Scope->ResetDataAndHistory();
2265     }
2266 }
2267 
s_GetBaseName(const string & baseFile,bool isXML,bool withPath)2268 static string s_GetBaseName(const string & baseFile, bool isXML, bool withPath)
2269 {
2270 	string dir = kEmptyStr;
2271 	string base = kEmptyStr;
2272 	string ext = kEmptyStr;
2273 	CDirEntry::SplitPath(baseFile, withPath ? &dir:NULL, &base, &ext );
2274 	if(!((isXML && NStr::CompareNocase(ext, ".xml") == 0 ) ||
2275 	     (!isXML && NStr::CompareNocase(ext, ".json") == 0))){
2276 		 base += ext;
2277 	}
2278 	if(withPath)
2279 		return dir + base;
2280 
2281 	return base;
2282 }
2283 
x_WriteXML2(CCmdLineBlastXML2ReportData & report_data)2284 void CBlastFormat::x_WriteXML2(CCmdLineBlastXML2ReportData & report_data)
2285 {
2286 	if(m_FormatType == CFormattingArgs::eXml2_S) {
2287 			BlastXML2_FormatReport(&report_data, &m_Outfile);
2288 	}
2289 	else if (m_FormatType == CFormattingArgs::eJson_S) {
2290 			m_XMLFileCount++;
2291 			if(m_XMLFileCount > 1) {
2292 				m_Outfile << ",\n";
2293 			}
2294 			BlastJSON_FormatReport(&report_data, &m_Outfile);
2295 	}
2296 	else {
2297 		m_XMLFileCount++;
2298 
2299 		if(m_FormatType == CFormattingArgs::eXml2) {
2300 			string file_name = s_GetBaseName(m_BaseFile, true, true) + "_" + NStr::IntToString(m_XMLFileCount) + ".xml";
2301 			BlastXML2_FormatReport(&report_data, file_name);
2302 		}
2303 		else {
2304 			string file_name = s_GetBaseName(m_BaseFile, false, true) + "_" + NStr::IntToString(m_XMLFileCount) + ".json";
2305 			BlastJSON_FormatReport(&report_data, file_name);
2306 		}
2307 	}
2308 }
2309 
x_PrintXML2Report(const blast::CSearchResults & results,CConstRef<blast::CBlastQueryVector> queries)2310 void CBlastFormat::x_PrintXML2Report(const blast::CSearchResults& results,
2311         							 CConstRef<blast::CBlastQueryVector> queries)
2312 {
2313 	CRef<CSearchResults> res(const_cast<CSearchResults*>(&results));
2314     res->TrimSeqAlign(m_HitlistSize);
2315 	if((m_IsIterative) || (m_IsBl2Seq)) {
2316 		if(m_AccumulatedResults.empty()) {
2317 			_ASSERT(m_AccumulatedQueries->size() == 0);
2318 			m_AccumulatedResults.push_back(res);
2319 			CConstRef<CSeq_id> query_id = results.GetSeqId();
2320 			ITERATE(CBlastQueryVector, itr, *queries) {
2321 				if (query_id->Match(*(*itr)->GetQueryId())) {
2322 			 		m_AccumulatedQueries->push_back(*itr);
2323 			 		break;
2324 			   	}
2325 			}
2326 	    }
2327 		else {
2328 			CConstRef<CSeq_id> query_id = results.GetSeqId();
2329 			if(m_AccumulatedResults[0].GetSeqId()->Match(*query_id)) {
2330 				m_AccumulatedResults.push_back(res);
2331 			}
2332 			else {
2333 				CRef <CBlastSearchQuery> q = m_AccumulatedQueries->GetBlastSearchQuery(0);
2334 				if(m_IsBl2Seq) {
2335 				    CCmdLineBlastXML2ReportData report_data(q, m_AccumulatedResults, m_Options,
2336 				    		   	   	   	   	   	   	    m_Scope, m_SeqInfoSrc);
2337 				    x_WriteXML2(report_data);
2338 				}
2339 				else {
2340 					CCmdLineBlastXML2ReportData report_data (q, m_AccumulatedResults, m_Options,
2341 							 	 	 	 	 				 m_Scope, m_DbInfo);
2342 				    x_WriteXML2(report_data);
2343 				}
2344 				m_AccumulatedResults.clear();
2345 				m_AccumulatedQueries->clear();
2346 
2347 				m_AccumulatedResults.push_back(res);
2348 				ITERATE(CBlastQueryVector, itr, *queries) {
2349 					if (query_id->Match(*(*itr)->GetQueryId())) {
2350 				 		m_AccumulatedQueries->push_back(*itr);
2351 				 		break;
2352 				   	}
2353 				}
2354 			}
2355 		}
2356 	}
2357 	else {
2358 		CRef<CBlastSearchQuery> q;
2359 			CConstRef<CSeq_id> query_id = results.GetSeqId();
2360 		ITERATE(CBlastQueryVector, itr, *queries) {
2361 			if (query_id->Match(*(*itr)->GetQueryId())) {
2362 				q = *itr;
2363 				break;
2364 			}
2365 		}
2366 		CCmdLineBlastXML2ReportData report_data (q, *res,  m_Options, m_Scope, m_DbInfo);
2367 	    x_WriteXML2(report_data);
2368 	}
2369 }
2370 
x_GenerateXML2MasterFile(void)2371 void CBlastFormat::x_GenerateXML2MasterFile(void)
2372 {
2373 	if(m_FormatType == CFormattingArgs::eXml2_S) {
2374 		m_Outfile << "</BlastXML2>\n";
2375 		return;
2376 	}
2377 
2378 	m_Outfile << "<?xml version=\"1.0\"?>\n<BlastXML2\n"
2379 			"xmlns=\"http://www.ncbi.nlm.nih.gov\"\n"
2380 			"xmlns:xi=\"http://www.w3.org/2003/XInclude\"\n"
2381 			"xmlns:xs=\"http://www.w3.org/2001/XMLSchema-instance\"\n"
2382 			"xs:schemaLocation=\"http://www.ncbi.nlm.nih.gov http://www.ncbi.nlm.nih.gov/data_specs/schema_alt/NCBI_BlastOutput2.xsd\">\n";
2383 
2384 	string base = s_GetBaseName(m_BaseFile, true, false);
2385 	for(int i = 1; i <= m_XMLFileCount; i ++) {
2386 		string file_name = base + "_" + NStr::IntToString(i) + ".xml";
2387 		m_Outfile << "\t<xi:include href=\"" + file_name + "\"/>\n";
2388 	}
2389 	m_Outfile << "</BlastXML2>\n";
2390 }
2391 
x_GenerateJSONMasterFile(void)2392 void CBlastFormat::x_GenerateJSONMasterFile(void)
2393 {
2394 	if(m_FormatType == CFormattingArgs::eJson_S) {
2395 		m_Outfile << "]\n}\n";
2396 		return;
2397 	}
2398 
2399 	m_Outfile << "{\n\t\"BlastJSON\": [\n";
2400 
2401 	string base = s_GetBaseName(m_BaseFile, true, false);
2402 	for(int i = 1; i <= m_XMLFileCount; i ++) {
2403 		string file_name = base + "_" + NStr::IntToString(i) + ".json";
2404 		m_Outfile << "\t\t{\"File\": \"" + file_name + "\" }";
2405 		if(i != m_XMLFileCount)
2406 			m_Outfile << ",";
2407 		m_Outfile << "\n";
2408 	}
2409 	m_Outfile << "\t]\n}";
2410 }
2411 
x_InitSAMFormatter()2412 void CBlastFormat::x_InitSAMFormatter()
2413 {
2414 	CSAM_Formatter::SProgramInfo  pg("0", blast::CBlastVersion().Print(), m_Cmdline);
2415    	pg.m_Name = m_Program;
2416     m_SamFormatter.reset(new CBlast_SAM_Formatter(m_Outfile, *m_Scope,
2417         		                                  m_CustomOutputFormatSpec, pg));
2418 }
2419 
s_SetCompBasedStats(EProgram program)2420 bool s_SetCompBasedStats(EProgram program)
2421 {
2422 	 if (program == eBlastp || program == eTblastn ||
2423 	        program == ePSIBlast || program == ePSITblastn ||
2424 	        program == eRPSBlast || program == eRPSTblastn ||
2425 	        program == eBlastx  ||  program == eDeltaBlast) {
2426 		 return true;
2427 	 }
2428 	 return false;
2429 }
2430 
LogBlastSearchInfo(CBlastUsageReport & report)2431 void CBlastFormat::LogBlastSearchInfo(CBlastUsageReport & report)
2432 {
2433 	if (report.IsEnabled()) {
2434 		report.AddParam(CBlastUsageReport::eProgram, m_Program);
2435 		EProgram task = m_Options->GetProgram();
2436 		string task_str =  EProgramToTaskName(task);
2437 		report.AddParam(CBlastUsageReport::eTask, task_str);
2438 		report.AddParam(CBlastUsageReport::eEvalueThreshold, m_Options->GetEvalueThreshold());
2439 		report.AddParam(CBlastUsageReport::eHitListSize, m_Options->GetHitlistSize());
2440 		report.AddParam(CBlastUsageReport::eOutputFmt, m_FormatType);
2441 
2442 		if (s_SetCompBasedStats(task)) {
2443 			report.AddParam(CBlastUsageReport::eCompBasedStats, m_Options->GetCompositionBasedStats());
2444 		}
2445 
2446 		int num_seqs = 0;
2447 	    for (size_t i = 0; i < m_DbInfo.size(); i++) {
2448 	        num_seqs += m_DbInfo[i].number_seqs;
2449 	    }
2450 		if( m_IsBl2Seq) {
2451 			report.AddParam(CBlastUsageReport::eBl2seq, "true");
2452 			if (m_IsDbScan) {
2453 				report.AddParam(CBlastUsageReport::eNumSubjects, num_seqs);
2454 				report.AddParam(CBlastUsageReport::eSubjectsLength, GetDbTotalLength());
2455 			}
2456 			else if (m_SeqInfoSrc.NotEmpty()){
2457 				report.AddParam(CBlastUsageReport::eNumSubjects, (int) m_SeqInfoSrc->Size());
2458 				int total_subj_length = 0;
2459 				for (size_t i = 0; i < m_SeqInfoSrc->Size(); i++) {
2460 				       total_subj_length += m_SeqInfoSrc->GetLength(i);
2461 				}
2462 				report.AddParam(CBlastUsageReport::eSubjectsLength, total_subj_length);
2463 			}
2464 		}
2465 		else {
2466 			string dir = kEmptyStr;
2467 			CFile::SplitPath(m_DbName, &dir);
2468 			string db_name = m_DbName;
2469 			if (dir != kEmptyStr) {
2470 				db_name = m_DbName.substr(dir.length());
2471 			}
2472 			report.AddParam(CBlastUsageReport::eDBName, db_name);
2473 			report.AddParam(CBlastUsageReport::eDBLength, GetDbTotalLength());
2474 			report.AddParam(CBlastUsageReport::eDBNumSeqs, num_seqs);
2475 			report.AddParam(CBlastUsageReport::eDBDate, m_DbInfo[0].date);
2476 			if(m_SearchDb.NotEmpty()){
2477 				if(m_SearchDb->GetGiList().NotEmpty()) {
2478 					 CRef<CSeqDBGiList>  l = m_SearchDb->GetGiList();
2479 					 if (l->GetNumGis()) {
2480 						 report.AddParam(CBlastUsageReport::eGIList, true);
2481 					 }
2482 					 if (l->GetNumSis()){
2483 						 report.AddParam(CBlastUsageReport::eSeqIdList, true);
2484 					 }
2485 					 if (l->GetNumTaxIds()){
2486 						 report.AddParam(CBlastUsageReport::eTaxIdList, true);
2487 					 }
2488 					 if (l->GetNumPigs()) {
2489 						 report.AddParam(CBlastUsageReport::eIPGList, true);
2490 					 }
2491 				}
2492 				if(m_SearchDb->GetNegativeGiList().NotEmpty()) {
2493 					 CRef<CSeqDBGiList>  l = m_SearchDb->GetNegativeGiList();
2494 					 if (l->GetNumGis()) {
2495 						 report.AddParam(CBlastUsageReport::eNegGIList, true);
2496 					 }
2497 					 if (l->GetNumSis()){
2498 						 report.AddParam(CBlastUsageReport::eNegSeqIdList, true);
2499 					 }
2500 					 if (l->GetNumTaxIds()){
2501 						 report.AddParam(CBlastUsageReport::eNegTaxIdList, true);
2502 					 }
2503 					 if (l->GetNumPigs()) {
2504 						 report.AddParam(CBlastUsageReport::eNegIPGList, true);
2505 					 }
2506 				}
2507 			}
2508 		}
2509 	}
2510 }
2511