1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's offical duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================*/
25
26 /*****************************************************************************
27
28 Author: Jason Papadopoulos
29
30 ******************************************************************************/
31
32 /** @file blast_format.cpppiv
33 * Produce formatted blast output
34 */
35
36 #include <ncbi_pch.hpp>
37 #include <algo/blast/format/blast_format.hpp>
38 #include <objects/seq/Seq_annot.hpp>
39 #include <objects/seq/Seq_descr.hpp>
40 #include <objmgr/seq_loc_mapper.hpp>
41 #include <objmgr/util/sequence.hpp>
42 #include <objmgr/util/create_defline.hpp>
43 #include <algo/blast/core/blast_stat.h>
44 #include <corelib/ncbiutil.hpp> // for FindBestChoice
45 #include <algo/blast/api/sseqloc.hpp>
46 #include <algo/blast/api/objmgr_query_data.hpp>
47
48 #include <algo/blast/format/blastxml_format.hpp>
49 #include <algo/blast/format/data4xmlformat.hpp> /* NCBI_FAKE_WARNING */
50 #include <algo/blast/format/blastxml2_format.hpp>
51 #include <algo/blast/format/data4xml2format.hpp> /* NCBI_FAKE_WARNING */
52 #include <algo/blast/format/build_archive.hpp>
53 #include <misc/jsonwrapp/jsonwrapp.hpp>
54 #include <objtools/blast/seqdb_reader/seqdb.hpp> // for CSeqDB
55 #include <serial/objostrxml.hpp>
56
57 #include <corelib/ncbistre.hpp>
58
59 #ifndef SKIP_DOXYGEN_PROCESSING
60 USING_NCBI_SCOPE;
61 USING_SCOPE(blast);
62 USING_SCOPE(objects);
63 USING_SCOPE(align_format);
64 USING_SCOPE(sequence);
65 #endif
66
67
CBlastFormat(const blast::CBlastOptions & options,blast::CLocalDbAdapter & db_adapter,blast::CFormattingArgs::EOutputFormat format_type,bool believe_query,CNcbiOstream & outfile,int num_summary,int num_alignments,CScope & scope,const char * matrix_name,bool show_gi,bool is_html,int qgencode,int dbgencode,bool use_sum_statistics,bool is_remote_search,int dbfilt_algorithm,const string & custom_output_format,bool is_megablast,bool is_indexed,const blast::CIgBlastOptions * ig_opts,const blast::CLocalDbAdapter * domain_db_adapter,const string & cmdline,const string & subjectTag)68 CBlastFormat::CBlastFormat(const blast::CBlastOptions& options,
69 blast::CLocalDbAdapter& db_adapter,
70 blast::CFormattingArgs::EOutputFormat format_type,
71 bool believe_query, CNcbiOstream& outfile,
72 int num_summary,
73 int num_alignments,
74 CScope & scope,
75 const char *matrix_name /* = BLAST_DEFAULT_MATRIX */,
76 bool show_gi /* = false */,
77 bool is_html /* = false */,
78 int qgencode /* = BLAST_GENETIC_CODE */,
79 int dbgencode /* = BLAST_GENETIC_CODE */,
80 bool use_sum_statistics /* = false */,
81 bool is_remote_search /* = false */,
82 int dbfilt_algorithm /* = -1 */,
83 const string& custom_output_format /* = kEmptyStr */,
84 bool is_megablast /* = false */,
85 bool is_indexed /* = false */,
86 const blast::CIgBlastOptions *ig_opts /* = NULL */,
87 const blast::CLocalDbAdapter* domain_db_adapter /* = NULL*/,
88 const string & cmdline /* =kEMptyStr*/,
89 const string& subjectTag /* =kEmptyStr */)
90 : m_FormatType(format_type), m_IsHTML(is_html),
91 m_DbIsAA(db_adapter.IsProtein()), m_BelieveQuery(believe_query),
92 m_Outfile(outfile), m_NumSummary(num_summary),
93 m_NumAlignments(num_alignments), m_HitlistSize(options.GetHitlistSize()),
94 m_Program(Blast_ProgramNameFromType(options.GetProgramType())),
95 m_DbName(kEmptyStr),
96 m_QueryGenCode(qgencode), m_DbGenCode(dbgencode),
97 m_ShowGi(show_gi), m_ShowLinkedSetSize(false),
98 m_IsUngappedSearch(!options.GetGappedMode()),
99 m_MatrixName(matrix_name),
100 m_Scope(& scope),
101 m_IsBl2Seq(false),
102 m_IsDbScan(false),
103 m_SubjectTag(subjectTag),
104 m_IsRemoteSearch(is_remote_search),
105 m_QueriesFormatted(0),
106 m_Megablast(is_megablast),
107 m_IndexedMegablast(is_indexed),
108 m_CustomOutputFormatSpec(custom_output_format),
109 m_IgOptions(ig_opts),
110 m_Options(&options),
111 m_IsVdb(false),
112 m_IsIterative(false),
113 m_BaseFile(kEmptyStr),
114 m_XMLFileCount(0),
115 m_LineLength(align_format::kDfltLineLength),
116 m_OrigExceptionMask(outfile.exceptions()),
117 m_Cmdline(cmdline)
118 {
119 m_Outfile.exceptions(NcbiBadbit);
120 m_DbName = db_adapter.GetDatabaseName();
121 m_IsBl2Seq = (m_DbName == kEmptyStr ? true : false);
122 m_IsDbScan = db_adapter.IsDbScanMode();
123 if (m_IsBl2Seq) {
124 m_SeqInfoSrc.Reset(db_adapter.MakeSeqInfoSrc());
125 }
126 else {
127 m_SearchDb = db_adapter.GetSearchDatabase();
128 }
129 if(m_IsDbScan) {
130 int num_seqs=0;
131 int total_length=0;
132 if (!is_remote_search)
133 {
134 BlastSeqSrc* seqsrc = db_adapter.MakeSeqSrc();
135 num_seqs=BlastSeqSrcGetNumSeqs(seqsrc);
136 total_length=BlastSeqSrcGetTotLen(seqsrc);
137 }
138 CBlastFormatUtil::FillScanModeBlastDbInfo(m_DbInfo, m_DbIsAA,
139 num_seqs, total_length, m_SubjectTag);
140 } else {
141 int filteringAlgorithmId = db_adapter.GetFilteringAlgorithm();
142 if(filteringAlgorithmId == -1) {
143 CRef <CSearchDatabase> db_Info = db_adapter.GetSearchDatabase();
144 if (db_Info && db_Info.NotEmpty()) {
145 ESubjectMaskingType maskType = db_Info->GetMaskType();
146 if(maskType != eNoSubjMasking) {
147 db_Info->SetFilteringAlgorithm(-1, eNoSubjMasking);
148 ERR_POST(Warning << "Subject mask not found in " + m_DbName +", proceeding without subject masking.");
149 }
150 }
151 }
152 CBlastFormatUtil::GetBlastDbInfo(m_DbInfo, m_DbName, m_DbIsAA,
153 dbfilt_algorithm, is_remote_search);
154 }
155 if (m_FormatType == CFormattingArgs::eXml) {
156 m_AccumulatedQueries.Reset(new CBlastQueryVector());
157 m_BlastXMLIncremental.Reset(new SBlastXMLIncremental());
158 }
159
160 if ((m_FormatType == CFormattingArgs::eXml2) || (m_FormatType == CFormattingArgs::eJson) ||
161 (m_FormatType == CFormattingArgs::eXml2_S) || (m_FormatType == CFormattingArgs::eJson_S)){
162 m_AccumulatedQueries.Reset(new CBlastQueryVector());
163 }
164
165 if (use_sum_statistics && m_IsUngappedSearch) {
166 m_ShowLinkedSetSize = true;
167 }
168 if ( m_Program == "blastn" &&
169 options.GetMatchReward() == 0 &&
170 options.GetMismatchPenalty() == 0 )
171 {
172 /* This combination is an indicator that we have used matrices
173 * solely to develop the hsp score. Also for the time being it
174 * indicates that KA stats are not available. -RMH-
175 */
176 m_DisableKAStats = true;
177 }
178 else
179 {
180 m_DisableKAStats = false;
181 }
182
183 CAlignFormatUtil::GetAsciiProteinMatrix(m_MatrixName, m_ScoringMatrix);
184
185 if (options.GetProgram() == eDeltaBlast) {
186 _ASSERT(options.GetProgramType() == eBlastTypePsiBlast);
187 m_Program = "deltablast";
188
189 if (domain_db_adapter) {
190 CBlastFormatUtil::GetBlastDbInfo(m_DomainDbInfo,
191 domain_db_adapter->GetDatabaseName(),
192 true, -1, is_remote_search);
193 }
194 }
195
196 m_IsIterative = options.IsIterativeSearch();
197 if (m_FormatType == CFormattingArgs::eSAM) {
198 x_InitSAMFormatter();
199 }
200
201 CNcbiApplication* app = CNcbiApplication::Instance();
202 if (app) {
203 const CNcbiRegistry& registry = app->GetConfig();
204 m_LongSeqId = (registry.Get("BLAST", "LONG_SEQID") == "1");
205 }
206 m_HitsSortOption = -1;
207 m_HspsSortOption = -1;
208 }
209
CBlastFormat(const blast::CBlastOptions & opts,const vector<CBlastFormatUtil::SDbInfo> & dbinfo_list,blast::CFormattingArgs::EOutputFormat format_type,bool believe_query,CNcbiOstream & outfile,int num_summary,int num_alignments,CScope & scope,bool show_gi,bool is_html,bool is_remote_search,const string & custom_output_format,bool is_vdb,const string & cmdline)210 CBlastFormat::CBlastFormat(const blast::CBlastOptions& opts,
211 const vector< CBlastFormatUtil::SDbInfo >& dbinfo_list,
212 blast::CFormattingArgs::EOutputFormat format_type,
213 bool believe_query, CNcbiOstream& outfile,
214 int num_summary,
215 int num_alignments,
216 CScope& scope,
217 bool show_gi,
218 bool is_html,
219 bool is_remote_search,
220 const string& custom_output_format,
221 bool is_vdb,
222 const string & cmdline)
223 : m_FormatType(format_type),
224 m_IsHTML(is_html),
225 m_DbIsAA(!Blast_SubjectIsNucleotide(opts.GetProgramType())),
226 m_BelieveQuery(believe_query),
227 m_Outfile(outfile),
228 m_NumSummary(num_summary),
229 m_NumAlignments(num_alignments),
230 m_HitlistSize(opts.GetHitlistSize()),
231 m_Program(Blast_ProgramNameFromType(opts.GetProgramType())),
232 m_DbName(kEmptyStr),
233 m_QueryGenCode(opts.GetQueryGeneticCode()),
234 m_DbGenCode(opts.GetDbGeneticCode()),
235 m_ShowGi(show_gi),
236 m_ShowLinkedSetSize(false),
237 m_IsUngappedSearch(!opts.GetGappedMode()),
238 m_MatrixName(opts.GetMatrixName()),
239 m_Scope(&scope),
240 m_IsBl2Seq(false),
241 m_IsDbScan (false),
242 m_IsRemoteSearch(is_remote_search),
243 m_QueriesFormatted(0),
244 m_Megablast(opts.GetProgram() == eMegablast ||
245 opts.GetProgram() == eDiscMegablast),
246 m_IndexedMegablast(opts.GetMBIndexLoaded()),
247 m_CustomOutputFormatSpec(custom_output_format),
248 m_Options(&opts),
249 m_IsVdb(is_vdb),
250 m_IsIterative(false),
251 m_BaseFile(kEmptyStr),
252 m_XMLFileCount(0),
253 m_LineLength(align_format::kDfltLineLength),
254 m_OrigExceptionMask(outfile.exceptions()),
255 m_Cmdline(cmdline)
256 {
257 m_Outfile.exceptions(NcbiBadbit);
258 m_DbInfo.assign(dbinfo_list.begin(), dbinfo_list.end());
259 vector< CBlastFormatUtil::SDbInfo >::const_iterator itInfo;
260 for (itInfo = m_DbInfo.begin(); itInfo != m_DbInfo.end(); itInfo++)
261 {
262 if(itInfo != m_DbInfo.begin())
263 m_DbName += " ";
264
265 m_DbName += itInfo->name;
266 }
267
268 m_IsBl2Seq = false;
269
270 if (m_FormatType == CFormattingArgs::eXml) {
271 m_AccumulatedQueries.Reset(new CBlastQueryVector());
272 m_BlastXMLIncremental.Reset(new SBlastXMLIncremental());
273 }
274
275 if ((m_FormatType == CFormattingArgs::eXml2) || (m_FormatType == CFormattingArgs::eJson) ||
276 (m_FormatType == CFormattingArgs::eXml2_S) || (m_FormatType == CFormattingArgs::eJson_S)) {
277 m_AccumulatedQueries.Reset(new CBlastQueryVector());
278 }
279
280 if (opts.GetSumStatisticsMode() && m_IsUngappedSearch) {
281 m_ShowLinkedSetSize = true;
282 }
283
284 if ( m_Program == "blastn" &&
285 opts.GetMatchReward() == 0 &&
286 opts.GetMismatchPenalty() == 0 )
287 {
288 /* This combination is an indicator that we have used matrices
289 * solely to develop the hsp score. Also for the time being it
290 * indicates that KA stats are not available. -RMH-
291 */
292 m_DisableKAStats = true;
293 }
294 else
295 {
296 m_DisableKAStats = false;
297 }
298
299 CAlignFormatUtil::GetAsciiProteinMatrix(m_MatrixName, m_ScoringMatrix);
300
301 if (opts.GetProgram() == eDeltaBlast) {
302 _ASSERT(opts.GetProgramType() == eBlastTypePsiBlast);
303 m_Program = "deltablast";
304 }
305 m_IsIterative = opts.IsIterativeSearch();
306 if (m_FormatType == CFormattingArgs::eSAM) {
307 x_InitSAMFormatter();
308 }
309 CNcbiApplication* app = CNcbiApplication::Instance();
310 if (app) {
311 const CNcbiRegistry& registry = app->GetConfig();
312 m_LongSeqId = (registry.Get("BLAST", "LONG_SEQID") == "1");
313 }
314 m_HitsSortOption = -1;
315 m_HspsSortOption = -1;
316 }
317
~CBlastFormat()318 CBlastFormat::~CBlastFormat()
319 {
320 try {
321 m_Outfile.exceptions(m_OrigExceptionMask);
322 } catch (...) {/*ignore exceptions*/}
323 m_Outfile.flush();
324 }
325
326 static const string kHTML_Prefix =
327 "<HTML>\n"
328 "<HEAD><TITLE>BLAST Search Results</TITLE></HEAD>\n"
329 "<BODY BGCOLOR=\"#FFFFFF\" LINK=\"#0000FF\" VLINK=\"#660099\" ALINK=\"#660099\">\n"
330 "<PRE>\n";
331
332 static const string kHTML_Suffix =
333 "</PRE>\n"
334 "</BODY>\n"
335 "</HTML>";
336
337 Int8
GetDbTotalLength()338 CBlastFormat::GetDbTotalLength()
339 {
340 Int8 retv = 0L;
341 for (size_t i = 0; i < m_DbInfo.size(); i++) {
342 retv += m_DbInfo[i].total_length;
343 }
344 return retv;
345 }
346
347 void
PrintProlog()348 CBlastFormat::PrintProlog()
349 {
350 // no header for some output types
351 if (m_FormatType >= CFormattingArgs::eXml) {
352 if(m_FormatType == CFormattingArgs::eXml2_S) {
353 BlastXML2_PrintHeader(&m_Outfile);
354 }
355 else if(m_FormatType == CFormattingArgs::eJson_S){
356 BlastJSON_PrintHeader(&m_Outfile);
357 }
358 return;
359 }
360
361 if (m_IsHTML) {
362 m_Outfile << kHTML_Prefix << "\n";
363 }
364 // Make sure no-one confuses us with the standard BLASTN
365 // algorithm. -RMH-
366 if ( m_Program == "blastn" &&
367 m_DisableKAStats == true )
368 {
369 CBlastFormatUtil::BlastPrintVersionInfo("rmblastn", m_IsHTML,
370 m_Outfile);
371 m_Outfile << "\n\n";
372 m_Outfile << "Reference: Robert M. Hubley, Arian Smit\n";
373 m_Outfile << "RMBlast - RepeatMasker Search Engine\n";
374 m_Outfile << "2010 <http://www.repeatmasker.org>";
375 }else
376 {
377 CBlastFormatUtil::BlastPrintVersionInfo(m_Program, m_IsHTML,
378 m_Outfile);
379 }
380
381 if (m_IsBl2Seq && !m_IsDbScan) {
382 return;
383 }
384
385 m_Outfile << NcbiEndl << NcbiEndl;
386 if (m_Program == "deltablast") {
387 CBlastFormatUtil::BlastPrintReference(m_IsHTML, kFormatLineLength,
388 m_Outfile, CReference::eDeltaBlast);
389 m_Outfile << "\n";
390 }
391
392 if (m_Megablast)
393 CBlastFormatUtil::BlastPrintReference(m_IsHTML, kFormatLineLength,
394 m_Outfile, CReference::eMegaBlast);
395 else
396 CBlastFormatUtil::BlastPrintReference(m_IsHTML, kFormatLineLength,
397 m_Outfile);
398
399 if (m_Megablast && m_IndexedMegablast)
400 {
401 m_Outfile << "\n";
402 CBlastFormatUtil::BlastPrintReference(m_IsHTML, kFormatLineLength,
403 m_Outfile, CReference::eIndexedMegablast);
404 }
405
406 if (m_Program == "psiblast" || m_Program == "deltablast") {
407 m_Outfile << "\n";
408 CBlastFormatUtil::BlastPrintReference(m_IsHTML, kFormatLineLength,
409 m_Outfile, CReference::eCompAdjustedMatrices);
410 }
411 if (m_Program == "psiblast" || m_Program == "blastp") {
412 m_Outfile << "\n";
413 CBlastFormatUtil::BlastPrintReference(m_IsHTML, kFormatLineLength,
414 m_Outfile, CReference::eCompBasedStats,
415 (bool)(m_Program == "psiblast"));
416 }
417
418 if (m_Program == "deltablast" || !m_DomainDbInfo.empty()) {
419 m_Outfile << "\n\n";
420 if (!m_DomainDbInfo.empty()) {
421 m_Outfile << "\n\n" << "Conserved Domain ";
422 CBlastFormatUtil::PrintDbReport(m_DomainDbInfo, kFormatLineLength,
423 m_Outfile, true);
424 }
425 }
426 else {
427 m_Outfile << "\n\n";
428 }
429 if (!m_IsBl2Seq || m_IsDbScan)
430 CBlastFormatUtil::PrintDbReport(m_DbInfo, kFormatLineLength,
431 m_Outfile, true);
432 }
433
434 void
x_PrintOneQueryFooter(const blast::CBlastAncillaryData & summary)435 CBlastFormat::x_PrintOneQueryFooter(const blast::CBlastAncillaryData& summary)
436 {
437 /* Skip printing KA parameters if the program is rmblastn -RMH- */
438 if ( m_DisableKAStats )
439 return;
440
441 const Blast_KarlinBlk *kbp_ungap =
442 (m_Program == "psiblast" || m_Program == "deltablast")
443 ? summary.GetPsiUngappedKarlinBlk()
444 : summary.GetUngappedKarlinBlk();
445 const Blast_GumbelBlk *gbp = summary.GetGumbelBlk();
446 m_Outfile << NcbiEndl;
447 if (kbp_ungap) {
448 CBlastFormatUtil::PrintKAParameters(kbp_ungap->Lambda,
449 kbp_ungap->K, kbp_ungap->H,
450 kFormatLineLength, m_Outfile,
451 false, gbp);
452 }
453
454 const Blast_KarlinBlk *kbp_gap =
455 (m_Program == "psiblast" || m_Program == "deltablast")
456 ? summary.GetPsiGappedKarlinBlk()
457 : summary.GetGappedKarlinBlk();
458 m_Outfile << "\n";
459 if (kbp_gap) {
460 CBlastFormatUtil::PrintKAParameters(kbp_gap->Lambda,
461 kbp_gap->K, kbp_gap->H,
462 kFormatLineLength, m_Outfile,
463 true, gbp);
464 }
465
466 m_Outfile << "\n";
467 m_Outfile << "Effective search space used: " <<
468 summary.GetSearchSpace() << "\n";
469 }
470
471 /// Auxialiary function to determine if there are local IDs in the identifiers
472 /// of the query sequences
473 /// @param queries query sequence(s) [in]
474 static bool
s_HasLocalIDs(CConstRef<CBlastQueryVector> queries)475 s_HasLocalIDs(CConstRef<CBlastQueryVector> queries)
476 {
477 bool retval = false;
478 ITERATE(CBlastQueryVector, itr, *queries) {
479 if (blast::IsLocalId((*itr)->GetQuerySeqLoc()->GetId())) {
480 retval = true;
481 break;
482 }
483 }
484 return retval;
485 }
486
487 void
x_ConfigCShowBlastDefline(CShowBlastDefline & showdef,int skip_from,int skip_to,int index,int num_descriptions_to_show)488 CBlastFormat::x_ConfigCShowBlastDefline(CShowBlastDefline& showdef,
489 int skip_from, int skip_to, int index,
490 int num_descriptions_to_show /* = -1 */)
491 {
492 int flags = 0;
493 if (m_ShowLinkedSetSize)
494 flags |= CShowBlastDefline::eShowSumN;
495 if (m_IsHTML){
496 flags |= CShowBlastDefline::eHtml;
497 if (index >= 0) {
498 showdef.SetResultPosIndex(index);
499 }
500 }
501 if (m_ShowGi)
502 flags |= CShowBlastDefline::eShowGi;
503 if (num_descriptions_to_show == 0)
504 flags |= CShowBlastDefline::eNoShowHeader;
505 if (m_LongSeqId) {
506 flags |= CShowBlastDefline::eLongSeqId;
507 }
508 if(m_HitsSortOption >= 0) {
509 flags |= CShowBlastDefline::eShowPercentIdent;
510 flags |= CShowBlastDefline::eShowTotalScore;
511 flags |= CShowBlastDefline::eShowQueryCoverage;
512 }
513 showdef.SetOption(flags);
514 showdef.SetDbName(m_DbName);
515 showdef.SetDbType(!m_DbIsAA);
516 showdef.SetSkipRange(skip_from, skip_to);
517 }
518
519 void
x_SplitSeqAlign(CConstRef<CSeq_align_set> full_alignment,CSeq_align_set & repeated_seqs,CSeq_align_set & new_seqs,blast::CPsiBlastIterationState::TSeqIds & prev_seqids)520 CBlastFormat::x_SplitSeqAlign(CConstRef<CSeq_align_set> full_alignment,
521 CSeq_align_set& repeated_seqs,
522 CSeq_align_set& new_seqs,
523 blast::CPsiBlastIterationState::TSeqIds& prev_seqids)
524 {
525 static const CSeq_align::TDim kSubjRow = 1;
526 _ASSERT( !prev_seqids.empty() );
527 _ASSERT( !full_alignment->IsEmpty() );
528 _ASSERT(repeated_seqs.IsEmpty());
529 _ASSERT(new_seqs.IsEmpty());
530
531 unsigned int count = 0;
532 ITERATE(CSeq_align_set::Tdata, alignment, full_alignment->Get()) {
533 CSeq_id_Handle subj_id =
534 CSeq_id_Handle::GetHandle((*alignment)->GetSeq_id(kSubjRow));
535 if (prev_seqids.find(subj_id) != prev_seqids.end()) {
536 // if found among previously seen Seq-ids...
537 repeated_seqs.Set().push_back(*alignment);
538 } else {
539 // ... else add them as new
540 new_seqs.Set().push_back(*alignment);
541 }
542 count++;
543 if(count >= (unsigned int)m_NumSummary)
544 break;
545 }
546 }
547
548 bool
s_IsGlobalSeqAlign(CConstRef<objects::CSeq_align_set> seqalign_set)549 s_IsGlobalSeqAlign(CConstRef<objects::CSeq_align_set> seqalign_set)
550 {
551 bool kIsGlobal = (seqalign_set->IsSet() && seqalign_set->CanGet() &&
552 seqalign_set->Get().front()->CanGetType() &&
553 seqalign_set->Get().front()->GetType() == CSeq_align_Base::eType_global);
554
555 return kIsGlobal;
556 }
557
558
559 void
x_DisplayDeflines(CConstRef<CSeq_align_set> aln_set,unsigned int itr_num,blast::CPsiBlastIterationState::TSeqIds & prev_seqids,int additional,int index,int defline_length)560 CBlastFormat::x_DisplayDeflines(CConstRef<CSeq_align_set> aln_set,
561 unsigned int itr_num,
562 blast::CPsiBlastIterationState::TSeqIds& prev_seqids,
563 int additional,
564 int index,
565 int defline_length )
566 {
567
568 if (itr_num != numeric_limits<unsigned int>::max() &&
569 !prev_seqids.empty()) {
570 // Split seq-align-set
571 CSeq_align_set repeated_seqs, new_seqs;
572 x_SplitSeqAlign(aln_set, repeated_seqs, new_seqs, prev_seqids);
573
574 // Show deflines for 'repeat' sequences
575 {{
576 CShowBlastDefline showdef(repeated_seqs, *m_Scope,
577 kFormatLineLength,
578 repeated_seqs.Size());
579 x_ConfigCShowBlastDefline(showdef);
580 showdef.SetupPsiblast(NULL, CShowBlastDefline::eRepeatPass);
581 showdef.DisplayBlastDefline(m_Outfile);
582 }}
583 m_Outfile << "\n";
584
585 // Show deflines for 'new' sequences
586 {{
587 CShowBlastDefline showdef(new_seqs, *m_Scope, kFormatLineLength,
588 new_seqs.Size());
589 x_ConfigCShowBlastDefline(showdef);
590 showdef.SetupPsiblast(NULL, CShowBlastDefline::eNewPass);
591 showdef.DisplayBlastDefline(m_Outfile);
592 }}
593
594 } else {
595
596 CShowBlastDefline showdef(*aln_set, *m_Scope,
597 defline_length == -1 ? kFormatLineLength:defline_length,
598 m_NumSummary + additional);
599 x_ConfigCShowBlastDefline(showdef, -1, -1, index,
600 m_NumSummary+additional);
601 showdef.DisplayBlastDefline(m_Outfile);
602 }
603 m_Outfile << "\n";
604 }
605
606 int
s_SetFlags(string & program,blast::CFormattingArgs::EOutputFormat format_type,bool html,bool showgi,bool isbl2seq,bool disableKAStats)607 s_SetFlags(string& program,
608 blast::CFormattingArgs::EOutputFormat format_type,
609 bool html, bool showgi, bool isbl2seq, bool disableKAStats)
610 {
611 // set the alignment flags
612 int flags = CDisplaySeqalign::eShowBlastInfo;
613
614 if ( isbl2seq ) {
615 flags |= CDisplaySeqalign::eShowNoDeflineInfo;
616 }
617
618 if (html)
619 flags |= CDisplaySeqalign::eHtml;
620 if (showgi)
621 flags |= CDisplaySeqalign::eShowGi;
622
623 if (format_type >= CFormattingArgs::eQueryAnchoredIdentities &&
624 format_type <= CFormattingArgs::eFlatQueryAnchoredNoIdentities) {
625 flags |= CDisplaySeqalign::eMergeAlign;
626 }
627 else {
628 flags |= CDisplaySeqalign::eShowBlastStyleId |
629 CDisplaySeqalign::eShowMiddleLine;
630 }
631
632 if (format_type == CFormattingArgs::eQueryAnchoredIdentities ||
633 format_type == CFormattingArgs::eFlatQueryAnchoredIdentities) {
634 flags |= CDisplaySeqalign::eShowIdentity;
635 }
636 if (format_type == CFormattingArgs::eQueryAnchoredIdentities ||
637 format_type == CFormattingArgs::eQueryAnchoredNoIdentities) {
638 flags |= CDisplaySeqalign::eMasterAnchored;
639 }
640 if (program == "tblastx") {
641 flags |= CDisplaySeqalign::eTranslateNucToNucAlignment;
642 }
643
644 if (disableKAStats)
645 flags |= CDisplaySeqalign::eShowRawScoreOnly;
646
647 return flags;
648 }
649
650 bool
x_IsVdbSearch() const651 CBlastFormat::x_IsVdbSearch() const
652 {
653 return m_IsVdb;
654 }
655 // Port of jzmisc.c's AddAlignInfoToSeqAnnotEx (CVS revision 6.11)
656 CRef<objects::CSeq_annot>
x_WrapAlignmentInSeqAnnot(CConstRef<objects::CSeq_align_set> alnset,const string & db_title) const657 CBlastFormat::x_WrapAlignmentInSeqAnnot(CConstRef<objects::CSeq_align_set> alnset,
658 const string& db_title) const
659 {
660 return CBlastFormatUtil::CreateSeqAnnotFromSeqAlignSet(*alnset,
661 ProgramNameToEnum(m_Program),
662 m_DbName, db_title,
663 x_IsVdbSearch());
664 }
665
666 void
x_PrintStructuredReport(const blast::CSearchResults & results,CConstRef<blast::CBlastQueryVector> queries)667 CBlastFormat::x_PrintStructuredReport(const blast::CSearchResults& results,
668 CConstRef<blast::CBlastQueryVector> queries)
669 {
670 string db_title;
671 if (!m_DbInfo.empty()) {
672 db_title = m_DbInfo.front().definition;
673 for (size_t i=1;i < m_DbInfo.size();i++) {
674 db_title += "; ";
675 db_title += m_DbInfo[i].definition;
676 }
677 }
678
679 // ASN.1 formatting is straightforward
680 if (m_FormatType == CFormattingArgs::eAsnText || m_FormatType == CFormattingArgs::eJsonSeqalign) {
681 if (results.HasAlignments()) {
682 CRef<CSeq_align_set> aln_set (new CSeq_align_set);
683 CBlastFormatUtil::PruneSeqalign(*(results.GetSeqAlign()), *aln_set, m_HitlistSize);
684 if(m_FormatType == CFormattingArgs::eAsnText)
685 m_Outfile << MSerial_AsnText << *x_WrapAlignmentInSeqAnnot(aln_set, db_title);
686 else
687 m_Outfile << MSerial_Json << *x_WrapAlignmentInSeqAnnot(aln_set, db_title);
688 }
689 return;
690 } else if (m_FormatType == CFormattingArgs::eAsnBinary) {
691 if (results.HasAlignments()) {
692 CRef<CSeq_align_set> aln_set (new CSeq_align_set);
693 CBlastFormatUtil::PruneSeqalign(*(results.GetSeqAlign()), *aln_set, m_HitlistSize);
694 m_Outfile << MSerial_AsnBinary <<
695 *x_WrapAlignmentInSeqAnnot(aln_set, db_title);
696 }
697 return;
698 } else if (m_FormatType == CFormattingArgs::eXml) {
699 CRef<CSearchResults> res(const_cast<CSearchResults*>(&results));
700 res->TrimSeqAlign(m_HitlistSize);
701 m_AccumulatedResults.push_back(res);
702 CConstRef<CSeq_id> query_id = results.GetSeqId();
703 // FIXME: this can be a bottleneck with large numbers of queries
704 ITERATE(CBlastQueryVector, itr, *queries) {
705 if (query_id->Match(*(*itr)->GetQueryId())) {
706 m_AccumulatedQueries->push_back(*itr);
707 break;
708 }
709 }
710
711 objects::CBlastOutput xml_output;
712 if(x_IsVdbSearch()) {
713 CCmdLineBlastXMLReportData report_data(m_AccumulatedQueries,
714 m_AccumulatedResults,
715 *m_Options, m_DbInfo,
716 m_QueryGenCode, m_DbGenCode,
717 m_IsRemoteSearch);
718 BlastXML_FormatReport(xml_output, &report_data, &m_Outfile,
719 m_BlastXMLIncremental.GetPointer());
720
721 }
722 else {
723 CCmdLineBlastXMLReportData report_data(m_AccumulatedQueries,
724 m_AccumulatedResults,
725 *m_Options, m_DbName, m_DbIsAA,
726 m_QueryGenCode, m_DbGenCode,
727 m_IsRemoteSearch);
728 BlastXML_FormatReport(xml_output, &report_data, &m_Outfile,
729 m_BlastXMLIncremental.GetPointer());
730 }
731 m_AccumulatedResults.clear();
732 m_AccumulatedQueries->clear();
733 return;
734 }
735 else if(m_FormatType == CFormattingArgs::eXml2 || m_FormatType == CFormattingArgs::eJson ||
736 m_FormatType == CFormattingArgs::eXml2_S || m_FormatType == CFormattingArgs::eJson_S) {
737 x_PrintXML2Report(results, queries);
738 return;
739 }
740 else if (m_FormatType == CFormattingArgs::eSAM) {
741 if(results.HasAlignments()) {
742 m_SamFormatter->Print(*(results.GetSeqAlign()));
743 }
744 return;
745 }
746 }
747
748 void
x_PrintTabularReport(const blast::CSearchResults & results,unsigned int itr_num)749 CBlastFormat::x_PrintTabularReport(const blast::CSearchResults& results,
750 unsigned int itr_num)
751 {
752 CConstRef<CSeq_align_set> aln_set = results.GetSeqAlign();
753 if (m_IsUngappedSearch && results.HasAlignments()) {
754 aln_set.Reset(CDisplaySeqalign::PrepareBlastUngappedSeqalign(*aln_set));
755 }
756 // other output types will need a bioseq handle
757 CBioseq_Handle bhandle = m_Scope->GetBioseqHandle(*results.GetSeqId(),
758 CScope::eGetBioseq_All);
759
760 // tabular formatting just prints each alignment in turn
761 // (plus a header)
762 if (m_FormatType == CFormattingArgs::eTabular ||
763 m_FormatType == CFormattingArgs::eTabularWithComments ||
764 m_FormatType == CFormattingArgs::eCommaSeparatedValues) {
765 const CBlastTabularInfo::EFieldDelimiter kDelim =
766 (m_FormatType == CFormattingArgs::eCommaSeparatedValues
767 ? CBlastTabularInfo::eComma : CBlastTabularInfo::eTab);
768
769 CBlastTabularInfo tabinfo(m_Outfile, m_CustomOutputFormatSpec, kDelim);
770 if(!m_CustomDelim.empty()) {
771 tabinfo.SetCustomDelim(m_CustomDelim);
772 }
773 tabinfo.SetParseLocalIds(m_BelieveQuery);
774 if((m_IsBl2Seq && (!m_BelieveQuery))|| m_IsRemoteSearch) {
775 tabinfo.SetParseSubjectDefline(true);
776 }
777 tabinfo.SetQueryRange(m_QueryRange);
778 if (ncbi::NStr::ToLower(m_Program) == string("blastn"))
779 tabinfo.SetNoFetch(true);
780
781 if (m_FormatType == CFormattingArgs::eTabularWithComments) {
782 string strProgVersion =
783 NStr::ToUpper(m_Program) + " " + blast::CBlastVersion().Print();
784 string dbname;
785 if (m_IsDbScan)
786 dbname = string("User specified sequence set (Input: ") + m_SubjectTag + string(")");
787 else
788 dbname = m_DbName;
789 CConstRef<CBioseq> subject_bioseq;
790 // dbname used in place of Bioseq in most cases.
791 if (dbname.empty())
792 subject_bioseq.Reset(x_CreateSubjectBioseq());
793 tabinfo.PrintHeader(strProgVersion, *(bhandle.GetBioseqCore()),
794 dbname, results.GetRID(), itr_num, aln_set,
795 subject_bioseq);
796 }
797
798 if (results.HasAlignments()) {
799 CSeq_align_set copy_aln_set;
800 CBlastFormatUtil::PruneSeqalign(*aln_set, copy_aln_set, m_HitlistSize);
801
802 {
803 unsigned int scores = CBlastFormatUtil::eNoQuerySubjCov;
804 if(string::npos != m_CustomOutputFormatSpec.find("qcovs"))
805 scores |= CBlastFormatUtil::eQueryCovPerSubj ;
806 if(string::npos != m_CustomOutputFormatSpec.find("qcovus") &&
807 ncbi::NStr::ToLower(m_Program) == string("blastn"))
808 scores |= CBlastFormatUtil::eQueryCovPerUniqSubj ;
809
810 if(scores != CBlastFormatUtil::eNoQuerySubjCov)
811 CBlastFormatUtil::InsertSubjectScores (copy_aln_set, bhandle, m_QueryRange, (CBlastFormatUtil::ESubjectScores) scores);
812 }
813 tabinfo.SetQueryGeneticCode(m_QueryGenCode);
814 tabinfo.SetDbGeneticCode(m_DbGenCode);
815 ITERATE(CSeq_align_set::Tdata, itr, copy_aln_set.Get()) {
816 const CSeq_align& s = **itr;
817 tabinfo.SetFields(s, *m_Scope, &m_ScoringMatrix);
818 tabinfo.Print();
819 }
820 }
821 return;
822 }
823 }
824
s_SetCloneInfo(const CIgBlastTabularInfo & tabinfo,const CBioseq_Handle & handle,CBlastFormat::SClone & clone_info)825 static void s_SetCloneInfo(const CIgBlastTabularInfo& tabinfo,
826 const CBioseq_Handle& handle,
827 CBlastFormat::SClone& clone_info) {
828
829 if (handle.GetSeqId()->Which() == CSeq_id::e_Local){
830 CDeflineGenerator defline (handle.GetSeq_entry_Handle());
831 clone_info.seqid = defline.GenerateDefline(handle).substr(0, 45);
832
833 // clone_info.seqid = CDeflineGenerator.substr(0, 45);
834 } else {
835 string seqid;
836 CRef<CSeq_id> wid = FindBestChoice(handle.GetBioseqCore()->GetId(), CSeq_id::WorstRank);
837 wid->GetLabel(&seqid, CSeq_id::eContent);
838 clone_info.seqid = seqid.substr(0, 45);
839 }
840 tabinfo.GetIgInfo (clone_info.v_gene, clone_info.d_gene, clone_info.j_gene,
841 clone_info.chain_type, clone_info.na, clone_info.aa, clone_info.productive);
842 clone_info.identity = 0;
843 const vector<CIgBlastTabularInfo::SIgDomain*>& domains = tabinfo.GetIgDomains();
844 int length = 0;
845 int num_match = 0;
846 for (unsigned int i=0; i<domains.size(); ++i) {
847 if (domains[i]->length > 0) {
848 length += domains[i]->length;
849 num_match += domains[i]->num_match;
850 }
851 }
852 if (length > 0){
853 clone_info.identity = ((double)num_match)/length;
854
855 }
856
857 }
858
859 void
x_PrintTaxReport(const blast::CSearchResults & results)860 CBlastFormat::x_PrintTaxReport(const blast::CSearchResults& results)
861 {
862 CBioseq_Handle bhandle = m_Scope->GetBioseqHandle(*results.GetSeqId(),
863 CScope::eGetBioseq_All);
864 CConstRef<CBioseq> bioseq = bhandle.GetBioseqCore();
865 if(m_IsHTML) {
866 m_Outfile << "<pre>";
867 }
868 else {
869 m_Outfile << "\n";
870 }
871 CBlastFormatUtil::AcknowledgeBlastQuery(*bioseq, kFormatLineLength,
872 m_Outfile, m_BelieveQuery,
873 m_IsHTML, false,
874 results.GetRID());
875
876 if(m_IsHTML) {
877 m_Outfile << "</pre>";
878 }
879 CConstRef<CSeq_align_set> aln_set = results.GetSeqAlign();
880 if (m_IsUngappedSearch && results.HasAlignments()) {
881 aln_set.Reset(CDisplaySeqalign::PrepareBlastUngappedSeqalign(*aln_set));
882 }
883
884 CRef<CSeq_align_set> new_aln_set(const_cast<CSeq_align_set*>(aln_set.GetPointer()));
885 CTaxFormat *taxFormatRes = new CTaxFormat (*new_aln_set,*m_Scope,m_IsHTML ? CTaxFormat::eHtml:CTaxFormat::eText, false,max(kMinTaxFormatLineLength,kFormatLineLength));
886 taxFormatRes->DisplayOrgReport(m_Outfile);
887 }
888
889 void
x_PrintIgTabularReport(const blast::CIgBlastResults & results,SClone & clone_info,bool fill_clone_info)890 CBlastFormat::x_PrintIgTabularReport(const blast::CIgBlastResults& results,
891 SClone& clone_info,
892 bool fill_clone_info)
893 {
894 CConstRef<CSeq_align_set> aln_set = results.GetSeqAlign();
895 /* TODO do we support ungapped Igblast search?
896 if (m_IsUngappedSearch && results.HasAlignments()) {
897 aln_set.Reset(CDisplaySeqalign::PrepareBlastUngappedSeqalign(*aln_set));
898 } */
899 // other output types will need a bioseq handle
900 CBioseq_Handle bhandle = m_Scope->GetBioseqHandle(*results.GetSeqId(),
901 CScope::eGetBioseq_All);
902
903 // tabular formatting just prints each alignment in turn
904 // (plus a header)
905 if (m_FormatType != CFormattingArgs::eTabular &&
906 m_FormatType != CFormattingArgs::eTabularWithComments &&
907 m_FormatType != CFormattingArgs::eCommaSeparatedValues) return;
908
909 const CBlastTabularInfo::EFieldDelimiter kDelim =
910 (m_FormatType == CFormattingArgs::eCommaSeparatedValues
911 ? CBlastTabularInfo::eComma : CBlastTabularInfo::eTab);
912
913 CIgBlastTabularInfo tabinfo(m_Outfile, m_CustomOutputFormatSpec, kDelim);
914 tabinfo.SetParseLocalIds(m_BelieveQuery);
915
916 string strProgVersion =
917 "IG" + NStr::ToUpper(m_Program);
918 CConstRef<CBioseq> subject_bioseq = x_CreateSubjectBioseq();
919
920 if (m_IsHTML) {
921 m_Outfile << "<html><body><pre>\n";
922 }
923 if (results.HasAlignments()) {
924 const CRef<CIgAnnotation> & annots = results.GetIgAnnotation();
925 CSeq_align_set::Tdata::const_iterator itr = aln_set->Get().begin();
926 tabinfo.SetMasterFields(**itr, *m_Scope,
927 annots->m_ChainType[0],
928 annots->m_ChainTypeToShow,
929 &m_ScoringMatrix);
930 tabinfo.SetIgAnnotation(annots, m_IgOptions, aln_set, *m_Scope);
931 if (fill_clone_info) {
932 s_SetCloneInfo(tabinfo, bhandle, clone_info);
933 }
934 tabinfo.PrintHeader(strProgVersion, *(bhandle.GetBioseqCore()),
935 m_DbName,
936 m_IgOptions->m_DomainSystem,
937 results.GetRID(),
938 numeric_limits<unsigned int>::max(),
939 aln_set, subject_bioseq);
940
941 int j = 1;
942 for (; itr != aln_set->Get().end(); ++itr) {
943 tabinfo.SetFields(**itr, *m_Scope,
944 annots->m_ChainType[j++],
945 annots->m_ChainTypeToShow,
946 &m_ScoringMatrix);
947 tabinfo.Print();
948 }
949 } else {
950 tabinfo.PrintHeader(strProgVersion, *(bhandle.GetBioseqCore()),
951 m_DbName,
952 m_IgOptions->m_DomainSystem,
953 results.GetRID(),
954 numeric_limits<unsigned int>::max(),
955 0, subject_bioseq);
956 }
957 if (m_IsHTML) {
958 m_Outfile << "\n</pre></body></html>\n";
959 }
960 }
961
962
x_PrintAirrRearrangement(const blast::CIgBlastResults & results,SClone & clone_info,bool fill_clone_info,bool print_airr_format_header)963 void CBlastFormat::x_PrintAirrRearrangement(const blast::CIgBlastResults& results,
964 SClone& clone_info,
965 bool fill_clone_info,
966 bool print_airr_format_header)
967 {
968 CConstRef<CSeq_align_set> aln_set = results.GetSeqAlign();
969
970 // other output types will need a bioseq handle
971 CBioseq_Handle bhandle = m_Scope->GetBioseqHandle(*results.GetSeqId(),
972 CScope::eGetBioseq_All);
973
974 // tabular formatting just prints each alignment in turn
975 // (plus a header)
976
977 const CBlastTabularInfo::EFieldDelimiter kDelim =
978 (m_FormatType == CFormattingArgs::eCommaSeparatedValues
979 ? CBlastTabularInfo::eComma : CBlastTabularInfo::eTab);
980
981 CIgBlastTabularInfo tabinfo(m_Outfile, m_CustomOutputFormatSpec, kDelim);
982 tabinfo.SetParseLocalIds(m_BelieveQuery);
983
984 string strProgVersion =
985 "IG" + NStr::ToUpper(m_Program);
986 CConstRef<CBioseq> subject_bioseq = x_CreateSubjectBioseq();
987
988 CRef<CIgAnnotation> annots(null);
989 if (results.HasAlignments()) {
990 annots = results.GetIgAnnotation();
991 tabinfo.SetIgAnnotation(annots, m_IgOptions, aln_set, *m_Scope);
992 if (fill_clone_info) {
993 s_SetCloneInfo(tabinfo, bhandle, clone_info);
994 }
995 }
996 tabinfo.SetAirrFormatData(*m_Scope, annots,
997 bhandle, aln_set, m_IgOptions);
998
999
1000 tabinfo.PrintAirrRearrangement(*m_Scope, annots, strProgVersion,
1001 *(bhandle.GetBioseqCore()),
1002 m_DbName,
1003 m_IgOptions->m_DomainSystem,
1004 results.GetRID(),
1005 numeric_limits<unsigned int>::max(),
1006 aln_set, subject_bioseq, &m_ScoringMatrix,
1007 print_airr_format_header,
1008 m_IgOptions);
1009
1010 }
1011
x_CreateSubjectBioseq()1012 CConstRef<objects::CBioseq> CBlastFormat::x_CreateSubjectBioseq()
1013 {
1014 if ( !m_IsBl2Seq && !m_IsDbScan) {
1015 return CConstRef<CBioseq>();
1016 }
1017
1018 _ASSERT(m_IsBl2Seq);
1019 _ASSERT(m_SeqInfoSrc);
1020 static Uint4 subj_index = 0;
1021
1022 list< CRef<CSeq_id> > ids = m_SeqInfoSrc->GetId(subj_index++);
1023 CRef<CSeq_id> id = FindBestChoice(ids, CSeq_id::BestRank);
1024 CBioseq_Handle bhandle = m_Scope->GetBioseqHandle(*id,
1025 CScope::eGetBioseq_All);
1026 // If this assertion fails, we're not able to get the subject, possibly a
1027 // programming error (see @note in this function's declaration - was the
1028 // order of calls altered?)
1029 _ASSERT(bhandle);
1030
1031 // reset the subject index if necessary
1032 if (subj_index >= m_SeqInfoSrc->Size()) {
1033 subj_index = 0;
1034 }
1035 return bhandle.GetBioseqCore();
1036 }
1037
1038 /// Auxiliary function to print the BLAST Archive in multiple output formats
PrintArchive(CRef<objects::CBlast4_archive> archive,CNcbiOstream & out)1039 void CBlastFormat::PrintArchive(CRef<objects::CBlast4_archive> archive, CNcbiOstream& out)
1040 {
1041 if (archive.Empty()) {
1042 return;
1043 }
1044 string outfmt = CNcbiEnvironment().Get("ARCHIVE_FORMAT");
1045 if (outfmt.empty()) {
1046 out << MSerial_AsnText << *archive;
1047 } else if (!NStr::CompareNocase(outfmt, "xml")) {
1048 out << MSerial_Xml << *archive;
1049 } else if (NStr::StartsWith(outfmt, "bin", NStr::eNocase)) {
1050 out << MSerial_AsnBinary << *archive;
1051 }
1052 }
1053
1054 void
WriteArchive(blast::IQueryFactory & queries,blast::CBlastOptionsHandle & options_handle,const CSearchResultSet & results,unsigned int num_iters,const list<CRef<CBlast4_error>> & msg)1055 CBlastFormat::WriteArchive(blast::IQueryFactory& queries,
1056 blast::CBlastOptionsHandle& options_handle,
1057 const CSearchResultSet& results,
1058 unsigned int num_iters,
1059 const list<CRef<CBlast4_error> > & msg)
1060 {
1061 CRef<objects::CBlast4_archive> archive;
1062 if (m_IsBl2Seq)
1063 {
1064 CRef<CBlastQueryVector> query_vector(new CBlastQueryVector);
1065 for (unsigned int i=0; i<m_SeqInfoSrc->Size(); i++)
1066 {
1067 list< CRef<CSeq_id> > ids = m_SeqInfoSrc->GetId(i);
1068 CRef<CSeq_id> id = FindBestChoice(ids, CSeq_id::BestRank);
1069 CRef<CSeq_loc> seq_loc(new CSeq_loc);
1070 seq_loc->SetWhole(*id);
1071 CRef<CBlastSearchQuery> search_query(new CBlastSearchQuery(*seq_loc, *m_Scope));
1072 query_vector->AddQuery(search_query);
1073 }
1074 CObjMgr_QueryFactory subjects(*query_vector);
1075 archive = BlastBuildArchive(queries, options_handle, results, subjects);
1076
1077 }
1078 else
1079 {
1080 // Use only by psi blast
1081 if(num_iters != 0) {
1082 archive = BlastBuildArchive(queries, options_handle, results, m_SearchDb , num_iters);
1083 }
1084 else {
1085 archive = BlastBuildArchive(queries, options_handle, results, m_SearchDb );
1086 }
1087 }
1088
1089 if(msg.size() > 0) {
1090 archive->SetMessages() = msg;
1091 }
1092 PrintArchive(archive, m_Outfile);
1093 }
1094
1095 void
WriteArchive(objects::CPssmWithParameters & pssm,blast::CBlastOptionsHandle & options_handle,const CSearchResultSet & results,unsigned int num_iters,const list<CRef<CBlast4_error>> & msg)1096 CBlastFormat::WriteArchive(objects::CPssmWithParameters & pssm,
1097 blast::CBlastOptionsHandle& options_handle,
1098 const CSearchResultSet& results,
1099 unsigned int num_iters,
1100 const list<CRef<CBlast4_error> > & msg)
1101 {
1102 CRef<objects::CBlast4_archive> archive(BlastBuildArchive(pssm, options_handle, results, m_SearchDb, num_iters));
1103
1104 if(msg.size() > 0) {
1105 archive->SetMessages() = msg;
1106 }
1107 PrintArchive(archive, m_Outfile);
1108 }
1109
1110
x_CreateDeflinesJson(CConstRef<CSeq_align_set> aln_set)1111 void CBlastFormat::x_CreateDeflinesJson(CConstRef<CSeq_align_set> aln_set)
1112 {
1113
1114 int delineFormatOption = 0;
1115 CShowBlastDefline deflines(*aln_set, *m_Scope,kFormatLineLength,m_NumSummary);
1116
1117 deflines.SetQueryNumber(1);//m_Query_number
1118 deflines.SetDbType (!m_DbIsAA);
1119 deflines.SetDbName(m_DbName);
1120 delineFormatOption |= CShowBlastDefline::eHtml;
1121 delineFormatOption |= CShowBlastDefline::eShowPercentIdent;
1122 deflines.SetOption(delineFormatOption); //m_defline_option
1123
1124 //Next three lines are for proper initialization in formatting of defline
1125 CShowBlastDefline::SDeflineTemplates *deflineTemplates = new CShowBlastDefline::SDeflineTemplates;
1126 deflineTemplates->advancedView = true;
1127 deflines.SetDeflineTemplates (deflineTemplates);
1128
1129
1130 vector <CShowBlastDefline::SDeflineFormattingInfo *> sdlFortInfoVec = deflines.GetFormattingInfo();
1131 CJson_Document doc;
1132 CJson_Object top_obj = doc.SetObject();
1133 CJson_Array defline_array = top_obj.insert_array("deflines");
1134
1135 for(size_t i = 0; i < sdlFortInfoVec.size(); i++) {
1136 CJson_Object obj = defline_array.push_back_object();
1137
1138 obj.insert("dfln_url",sdlFortInfoVec[i]->dfln_url);
1139 obj.insert("dfln_rid",sdlFortInfoVec[i]->dfln_rid);
1140 obj.insert("dfln_gi",sdlFortInfoVec[i]->dfln_gi);
1141 obj.insert("dfln_seqid",sdlFortInfoVec[i]->dfln_seqid);
1142 obj.insert("full_dfln_defline",sdlFortInfoVec[i]->full_dfln_defline);
1143 obj.insert("dfln_defline",sdlFortInfoVec[i]->dfln_defline);
1144 obj.insert("dfln_id",sdlFortInfoVec[i]->dfln_id);
1145 obj.insert("dflnFrm_id",sdlFortInfoVec[i]->dflnFrm_id);
1146 obj.insert("dflnFASTA_id",sdlFortInfoVec[i]->dflnFASTA_id);
1147 obj.insert("dflnAccs",sdlFortInfoVec[i]->dflnAccs);
1148
1149 obj.insert("score_info",sdlFortInfoVec[i]->score_info);
1150 obj.insert("dfln_hspnum",sdlFortInfoVec[i]->dfln_hspnum);
1151 obj.insert("dfln_alnLen",sdlFortInfoVec[i]->dfln_alnLen);
1152 obj.insert("dfln_blast_rank",sdlFortInfoVec[i]->dfln_blast_rank);
1153 obj.insert("total_bit_string",sdlFortInfoVec[i]->total_bit_string);
1154 obj.insert("percent_coverage",sdlFortInfoVec[i]->percent_coverage);
1155 obj.insert("evalue_string",sdlFortInfoVec[i]->evalue_string);
1156 obj.insert("percent_identity",sdlFortInfoVec[i]->percent_identity);
1157 }
1158 doc.Write(m_Outfile);
1159 }
1160
1161
x_DisplayDeflinesWithTemplates(CConstRef<CSeq_align_set> aln_set)1162 void CBlastFormat::x_DisplayDeflinesWithTemplates(CConstRef<CSeq_align_set> aln_set)
1163 {
1164 x_InitDeflineTemplates();
1165 _ASSERT(m_DeflineTemplates);
1166
1167 int delineFormatOption = 0;
1168 CShowBlastDefline deflines(*aln_set, *m_Scope,kFormatLineLength,m_NumSummary);
1169
1170 deflines.SetQueryNumber(1);//m_Query_number
1171 deflines.SetDbType (!m_DbIsAA);
1172 deflines.SetDbName(m_DbName);
1173 delineFormatOption |= CShowBlastDefline::eHtml;
1174 delineFormatOption |= CShowBlastDefline::eShowPercentIdent;
1175 deflines.SetOption(delineFormatOption); //m_defline_option
1176 deflines.SetDeflineTemplates (m_DeflineTemplates);
1177
1178 deflines.Init();
1179 deflines.Display(m_Outfile);
1180 }
1181
1182
x_DisplayAlignsWithTemplates(CConstRef<CSeq_align_set> aln_set,const blast::CSearchResults & results)1183 void CBlastFormat::x_DisplayAlignsWithTemplates(CConstRef<CSeq_align_set> aln_set,const blast::CSearchResults& results)
1184 {
1185 x_InitAlignTemplates();
1186 _ASSERT(m_AlignTemplates);
1187
1188 TMaskedQueryRegions masklocs;
1189 results.GetMaskedQueryRegions(masklocs);
1190
1191 CSeq_align_set copy_aln_set;
1192 CBlastFormatUtil::PruneSeqalign(*aln_set, copy_aln_set, m_NumAlignments);
1193
1194 CRef<CSeq_align_set> seqAlnSet(const_cast<CSeq_align_set*>(©_aln_set));
1195 if(!m_AlignSeqList.empty()) {
1196 CAlignFormatUtil::ExtractSeqAlignForSeqList(seqAlnSet, m_AlignSeqList);
1197 }
1198
1199 CDisplaySeqalign display(*seqAlnSet, *m_Scope, &masklocs, NULL, m_MatrixName);
1200 x_SetAlignParameters(display);
1201 display.SetAlignTemplates(m_AlignTemplates);
1202
1203 display.DisplaySeqalign(m_Outfile);
1204 }
1205
x_InitDeflineTemplates(void)1206 void CBlastFormat::x_InitDeflineTemplates(void)
1207 {
1208 CNcbiApplication* app = CNcbiApplication::Instance();
1209 if(!app) return;
1210 const CNcbiRegistry& reg = app->GetConfig();
1211
1212
1213 m_DeflineTemplates = new CShowBlastDefline::SDeflineTemplates;
1214 string defLineTmpl;
1215
1216 m_DeflineTemplates->defLineTmpl = reg.Get("Templates", "DFL_TABLE_ROW");
1217 m_DeflineTemplates->scoreInfoTmpl = reg.Get("Templates", "DFL_TABLE_SCORE_INFO");
1218 m_DeflineTemplates->seqInfoTmpl = reg.Get("Templates", "DFL_TABLE_SEQ_INFO");
1219 m_DeflineTemplates->advancedView = true;
1220 }
1221
x_InitAlignTemplates(void)1222 void CBlastFormat::x_InitAlignTemplates(void)
1223 {
1224 CNcbiApplication* app = CNcbiApplication::Instance();
1225 if(!app) return;
1226 const CNcbiRegistry& reg = app->GetConfig();
1227
1228 m_AlignTemplates = new CDisplaySeqalign::SAlignTemplates;
1229
1230 m_AlignTemplates->alignHeaderTmpl = reg.Get("Templates", "BLAST_ALIGN_HEADER");
1231 string blastAlignParamsTemplData = reg.Get("Templates", "BLAST_ALIGN_PARAMS");
1232 string blastAlignParamsTag = (m_Program == "blastn") ? "ALIGN_PARAMS_NUC" : "ALIGN_PARAMS_PROT";
1233 string blastAlignProtParamsTable = reg.Get("Templates", blastAlignParamsTag);
1234 m_AlignTemplates->alignInfoTmpl = CAlignFormatUtil::MapTemplate(blastAlignParamsTemplData,"align_params",blastAlignProtParamsTable);
1235 m_AlignTemplates->sortInfoTmpl = reg.Get("Templates", "SORT_ALIGNS_SEQ");
1236 m_AlignTemplates->alignFeatureTmpl = reg.Get("Templates", "ALN_FEATURES");
1237 m_AlignTemplates->alignFeatureLinkTmpl = reg.Get("Templates", "ALN_FEATURES_LINK");
1238
1239 m_AlignTemplates->alnDefLineTmpl = reg.Get("Templates", "ALN_DEFLINE_ROW");
1240 m_AlignTemplates->alnTitlesLinkTmpl = reg.Get("Templates", "ALN_DEFLINE_TITLES_LNK");
1241 m_AlignTemplates->alnTitlesTmpl = reg.Get("Templates", "ALN_DEFLINE_TITLES");
1242 m_AlignTemplates->alnSeqInfoTmpl = reg.Get("Templates", "ALN_DEFLINE_SEQ_INFO");
1243 m_AlignTemplates->alignRowTmpl = reg.Get("Templates", "BLAST_ALIGN_ROWS");
1244 m_AlignTemplates->alignRowTmplLast = reg.Get("Templates", "BLAST_ALIGN_ROWS_LST");
1245 }
1246
1247
1248
x_SetAlignParameters(CDisplaySeqalign & cds)1249 void CBlastFormat::x_SetAlignParameters(CDisplaySeqalign& cds)
1250 {
1251
1252 int AlignOption = 0;
1253
1254 AlignOption += CDisplaySeqalign::eShowMiddleLine;
1255
1256 if (m_Program == "tblastx") {
1257 AlignOption += CDisplaySeqalign::eTranslateNucToNucAlignment;
1258 }
1259 AlignOption += CDisplaySeqalign::eShowBlastInfo;
1260 AlignOption += CDisplaySeqalign::eShowBlastStyleId;
1261 AlignOption += CDisplaySeqalign::eHtml;
1262 AlignOption += CDisplaySeqalign::eShowSortControls;//*******????
1263 AlignOption += CDisplaySeqalign::eDynamicFeature;
1264 cds.SetAlignOption(AlignOption);
1265
1266 cds.SetDbName(m_DbName);
1267 cds.SetDbType(!m_DbIsAA);
1268 cds.SetLineLen(m_LineLength);
1269
1270 if (m_Program == "blastn" || m_Program == "megablast") {
1271 cds.SetMiddleLineStyle (CDisplaySeqalign::eBar);
1272 cds.SetAlignType(CDisplaySeqalign::eNuc);
1273 } else {
1274 cds.SetMiddleLineStyle (CDisplaySeqalign::eChar);
1275 cds.SetAlignType(CDisplaySeqalign::eProt);
1276 }
1277 cds.SetQueryNumber(1); //m_Query_number
1278 cds.SetSeqLocChar (CDisplaySeqalign::eLowerCase);
1279 cds.SetSeqLocColor ( CDisplaySeqalign::eGrey);
1280 cds.SetMasterGeneticCode(m_QueryGenCode);
1281 cds.SetSlaveGeneticCode(m_DbGenCode);
1282 }
1283
1284
1285
s_GetMolType(const CBioseq_Handle & bioseqHandle)1286 static string s_GetMolType(const CBioseq_Handle& bioseqHandle)
1287 {
1288 int molType = bioseqHandle.GetBioseqMolType();
1289 string molTypeString;
1290
1291 switch(molType) {
1292 case CSeq_inst::eMol_not_set:
1293 molTypeString = "cdna";
1294 break;
1295 case CSeq_inst::eMol_dna:
1296 molTypeString = "dna";
1297 break;
1298 case CSeq_inst::eMol_rna:
1299 molTypeString = "rna";
1300 break;
1301 case CSeq_inst::eMol_aa:
1302 molTypeString = "amino acid";
1303 break;
1304 case CSeq_inst::eMol_na:
1305 molTypeString = "nucleic acid";
1306 break;
1307 default:
1308 molTypeString = "Unknown";
1309 }
1310 return molTypeString;
1311 }
1312
1313 void
PrintReport(const blast::CSearchResults & results,CBlastFormat::DisplayOption displayOption)1314 CBlastFormat::PrintReport(const blast::CSearchResults& results,
1315 CBlastFormat::DisplayOption displayOption)
1316 {
1317 if (displayOption == eMetadata) {//Metadata in json format
1318 CBioseq_Handle bhandle = m_Scope->GetBioseqHandle(*results.GetSeqId(), CScope::eGetBioseq_All);
1319 CConstRef<CBioseq> bioseq = bhandle.GetBioseqCore();
1320
1321 //string seqID = CAlignFormatUtil::GetSeqIdString(*bioseq, m_BelieveQuery);
1322 string seqID;
1323 CConstRef <CSeq_id> queryID = sequence::GetId(bhandle).GetSeqId();
1324 CSeq_id::ELabelType labelType = (queryID->IsLocal()) ? CSeq_id::eDefault : CSeq_id::eContent;
1325 queryID->GetLabel(&seqID,labelType);
1326
1327
1328 string seqDescr = CBlastFormatUtil::GetSeqDescrString(*bioseq);
1329 seqDescr = seqDescr.empty() ? "None" : seqDescr;
1330
1331 string molType = s_GetMolType(bhandle);
1332
1333 int length = 0;
1334 if(bioseq->IsSetInst() && bioseq->GetInst().CanGetLength()){
1335 length = bioseq->GetInst().GetLength();
1336 }
1337
1338 CJson_Document doc;
1339 CJson_Object obj = doc.SetObject();
1340 obj.insert("Query",seqID);
1341 obj.insert("Query_descr",seqDescr);
1342 obj.insert("IsQueryLocal",queryID->IsLocal());
1343 obj.insert("Length",NStr::IntToString(length));
1344 obj.insert("Moltype",molType);
1345 obj.insert("Database",m_DbName);
1346 string dbTitle;
1347 try {
1348 CRef<CSeqDB> seqdb;
1349 seqdb = new CSeqDB(m_DbName, m_DbIsAA ? CSeqDB::eProtein : CSeqDB::eNucleotide);
1350 dbTitle = seqdb->GetTitle();
1351 }
1352 catch (...) {/*ignore exceptions for now*/}
1353 obj.insert("Database_descr",dbTitle);
1354 obj.insert("IsDBProtein",m_DbIsAA);
1355 obj.insert("Program",m_Program);
1356
1357
1358 if (results.HasErrors()) {
1359 obj.insert("Error",results.GetErrorStrings());
1360 }
1361 if (results.HasWarnings()) {
1362 obj.insert("Warning",results.GetWarningStrings());
1363 }
1364 doc.Write(m_Outfile);
1365 }
1366 else {
1367 CConstRef<CSeq_align_set> aln_set = results.GetSeqAlign();
1368 _ASSERT(results.HasAlignments());
1369 if (m_IsUngappedSearch) {
1370 aln_set.Reset(CDisplaySeqalign::PrepareBlastUngappedSeqalign(*aln_set));
1371 }
1372
1373 if (displayOption == eDescriptionsWithTemplates) {//Descriptions with html templates
1374 x_DisplayDeflinesWithTemplates(aln_set);
1375 }
1376 if (displayOption == eDescriptions) {//Descriptions with html templates
1377 x_CreateDeflinesJson(aln_set);
1378 }
1379 else if (displayOption == eAlignments) {// print the alignments with html templates
1380 x_DisplayAlignsWithTemplates(aln_set,results);
1381 }
1382 }
1383 }
1384
1385 void
PrintOneResultSet(const blast::CSearchResults & results,CConstRef<blast::CBlastQueryVector> queries,unsigned int itr_num,blast::CPsiBlastIterationState::TSeqIds prev_seqids,bool is_deltablast_domain_result)1386 CBlastFormat::PrintOneResultSet(const blast::CSearchResults& results,
1387 CConstRef<blast::CBlastQueryVector> queries,
1388 unsigned int itr_num
1389 /* = numeric_limits<unsigned int>::max() */,
1390 blast::CPsiBlastIterationState::TSeqIds prev_seqids
1391 /* = CPsiBlastIterationState::TSeqIds() */,
1392 bool is_deltablast_domain_result /* = false */)
1393 {
1394 // For remote searches, we don't retrieve the sequence data for the query
1395 // sequence when initially sending the request to the BLAST server (if it's
1396 // a GI/accession/TI), so we flush the scope so that it can be retrieved
1397 // (needed if a self-hit is found) again. This is not applicable if the
1398 // query sequence(s) are specified as FASTA (will be identified by local
1399 // IDs).
1400 if (m_IsRemoteSearch && !s_HasLocalIDs(queries)) {
1401 ResetScopeHistory();
1402 }
1403
1404 // Used with tabular output to print number of searches formatted at end.
1405 m_QueriesFormatted++;
1406
1407 if (m_FormatType == CFormattingArgs::eAsnText
1408 || m_FormatType == CFormattingArgs::eAsnBinary
1409 || m_FormatType == CFormattingArgs::eXml
1410 || m_FormatType == CFormattingArgs::eXml2
1411 || m_FormatType == CFormattingArgs::eJson
1412 || m_FormatType == CFormattingArgs::eXml2_S
1413 || m_FormatType == CFormattingArgs::eJson_S
1414 || m_FormatType == CFormattingArgs::eJsonSeqalign
1415 || m_FormatType == CFormattingArgs::eSAM)
1416 {
1417 x_PrintStructuredReport(results, queries);
1418 return;
1419 }
1420
1421 if (results.HasErrors()) {
1422 ERR_POST(Error << results.GetErrorStrings());
1423 return; // errors are deemed fatal
1424 }
1425 if (results.HasWarnings()) {
1426 ERR_POST(Warning << results.GetWarningStrings());
1427 }
1428
1429 if (m_FormatType == CFormattingArgs::eTabular ||
1430 m_FormatType == CFormattingArgs::eTabularWithComments ||
1431 m_FormatType == CFormattingArgs::eCommaSeparatedValues) {
1432 x_PrintTabularReport(results, itr_num);
1433 return;
1434 }
1435 if (m_FormatType == CFormattingArgs::eTaxFormat) {
1436 string reportCaption = "Tax BLAST report";
1437 reportCaption = m_IsHTML ? "<h1>" + reportCaption + "</h1>" : CAlignFormatUtil::AddSpaces(reportCaption,max(kMinTaxFormatLineLength,kFormatLineLength),CAlignFormatUtil::eSpacePosToCenter | CAlignFormatUtil::eAddEOLAtLineStart | CAlignFormatUtil::eAddEOLAtLineEnd);
1438 m_Outfile << reportCaption;
1439 x_PrintTaxReport(results);
1440 return;
1441 }
1442 const bool kIsTabularOutput = false;
1443
1444 if (is_deltablast_domain_result) {
1445 m_Outfile << "Results from domain search" << "\n";
1446 }
1447
1448 if (itr_num != numeric_limits<unsigned int>::max()) {
1449 m_Outfile << "Results from round " << itr_num << "\n";
1450 }
1451
1452 // other output types will need a bioseq handle
1453 CBioseq_Handle bhandle = m_Scope->GetBioseqHandle(*results.GetSeqId(),
1454 CScope::eGetBioseq_All);
1455 // If we're not able to get the query, most likely a bug. SB-981 , GP-2207
1456 if( !bhandle ){
1457 string message = "Failed to resolve SeqId: "+results.GetSeqId()->AsFastaString();
1458 ERR_POST(message);
1459 NCBI_THROW(CException, eUnknown, message);
1460 }
1461 CConstRef<CBioseq> bioseq = bhandle.GetBioseqCore();
1462
1463 // print the preamble for this query
1464
1465 m_Outfile << "\n\n";
1466 CBlastFormatUtil::AcknowledgeBlastQuery(*bioseq, kFormatLineLength,
1467 m_Outfile, m_BelieveQuery,
1468 m_IsHTML, kIsTabularOutput,
1469 results.GetRID());
1470
1471 if (m_IsBl2Seq && !m_IsDbScan) {
1472 m_Outfile << "\n";
1473 // FIXME: this might be configurable in the future
1474 const bool kBelieveSubject = false;
1475 CConstRef<CBioseq> subject_bioseq = x_CreateSubjectBioseq();
1476 CBlastFormatUtil::AcknowledgeBlastSubject(*subject_bioseq,
1477 kFormatLineLength,
1478 m_Outfile, kBelieveSubject,
1479 m_IsHTML, kIsTabularOutput);
1480 }
1481
1482 // quit early if there are no hits
1483 if ( !results.HasAlignments() ) {
1484 m_Outfile << "\n\n"
1485 << "***** " << CBlastFormatUtil::kNoHitsFound << " *****" << "\n"
1486 << "\n\n";
1487 x_PrintOneQueryFooter(*results.GetAncillaryData());
1488 return;
1489 }
1490
1491 CConstRef<CSeq_align_set> aln_set = results.GetSeqAlign();
1492 _ASSERT(results.HasAlignments());
1493 if (m_IsUngappedSearch) {
1494 aln_set.Reset(CDisplaySeqalign::PrepareBlastUngappedSeqalign(*aln_set));
1495 }
1496
1497 //invoke sorting only for m_HitsSortOption > CAlignFormatUtil::eEvalue or m_HspsSortOption > CAlignFormatUtil::eHspEvalue
1498 if(m_HitsSortOption > 0 || m_HspsSortOption > 0) {
1499 aln_set = CBlastFormatUtil::SortSeqalignForSortableFormat(
1500 *(const_cast<CSeq_align_set*>(aln_set.GetPointer())),
1501 (m_Program == "tblastx") ? true : false,
1502 m_HitsSortOption,
1503 m_HspsSortOption);
1504 }
1505
1506 const bool kIsGlobal = s_IsGlobalSeqAlign(aln_set);
1507
1508 //-------------------------------------------------
1509 // print 1-line summaries
1510 // Also disable when program is rmblastn. At this time
1511 // we do not want summary bit scores/evalues for this
1512 // program. -RMH-
1513 if ( (!m_IsBl2Seq || m_IsDbScan) && !(m_DisableKAStats || kIsGlobal) ) {
1514 x_DisplayDeflines(aln_set, itr_num, prev_seqids);
1515 }
1516
1517 //-------------------------------------------------
1518 // print the alignments
1519 m_Outfile << "\n";
1520
1521 TMaskedQueryRegions masklocs;
1522 results.GetMaskedQueryRegions(masklocs);
1523
1524 CSeq_align_set copy_aln_set;
1525 CBlastFormatUtil::PruneSeqalign(*aln_set, copy_aln_set, m_NumAlignments);
1526
1527 int flags = s_SetFlags(m_Program, m_FormatType, m_IsHTML, m_ShowGi,
1528 (m_IsBl2Seq && !m_IsDbScan), (m_DisableKAStats || kIsGlobal));
1529
1530 CDisplaySeqalign display(copy_aln_set, *m_Scope, &masklocs, NULL, m_MatrixName);
1531 display.SetDbName(m_DbName);
1532 display.SetDbType(!m_DbIsAA);
1533 display.SetLineLen(m_LineLength);
1534 int kAlignToShow=2000000000; // Nice large number per SB-1817
1535 display.SetNumAlignToShow(kAlignToShow);
1536
1537 // set the alignment flags
1538 display.SetAlignOption(flags);
1539
1540 if (m_LongSeqId) {
1541 display.UseLongSequenceIds();
1542 }
1543
1544 if (m_Program == "blastn" || m_Program == "megablast") {
1545 display.SetMiddleLineStyle(CDisplaySeqalign::eBar);
1546 display.SetAlignType(CDisplaySeqalign::eNuc);
1547 }
1548 else {
1549 display.SetMiddleLineStyle(CDisplaySeqalign::eChar);
1550 display.SetAlignType(CDisplaySeqalign::eProt);
1551 }
1552
1553 display.SetMasterGeneticCode(m_QueryGenCode);
1554 display.SetSlaveGeneticCode(m_DbGenCode);
1555 display.SetSeqLocChar(CDisplaySeqalign::eLowerCase);
1556 TSeqLocInfoVector subj_masks;
1557 results.GetSubjectMasks(subj_masks);
1558 display.SetSubjectMasks(subj_masks);
1559 display.DisplaySeqalign(m_Outfile);
1560
1561 // print the ancillary data for this query
1562
1563 x_PrintOneQueryFooter(*results.GetAncillaryData());
1564 }
1565
1566 void
PrintOneResultSet(blast::CIgBlastResults & results,CConstRef<blast::CBlastQueryVector> queries,SClone & clone_info,bool fill_clone_info,bool print_airr_format_header,int index)1567 CBlastFormat::PrintOneResultSet(blast::CIgBlastResults& results,
1568 CConstRef<blast::CBlastQueryVector> queries,
1569 SClone& clone_info,
1570 bool fill_clone_info,
1571 bool print_airr_format_header,
1572 int index)
1573 {
1574 clone_info.na = NcbiEmptyString;
1575 clone_info.aa = NcbiEmptyString;
1576
1577 // For remote searches, we don't retrieve the sequence data for the query
1578 // sequence when initially sending the request to the BLAST server (if it's
1579 // a GI/accession/TI), so we flush the scope so that it can be retrieved
1580 // (needed if a self-hit is found) again. This is not applicable if the
1581 // query sequence(s) are specified as FASTA (will be identified by local
1582 // IDs).
1583 if (m_IsRemoteSearch && !s_HasLocalIDs(queries)) {
1584 ResetScopeHistory();
1585 }
1586
1587 // Used with tabular output to print number of searches formatted at end.
1588 m_QueriesFormatted++;
1589
1590 if (m_FormatType == CFormattingArgs::eAsnText
1591 || m_FormatType == CFormattingArgs::eAsnBinary
1592 || m_FormatType == CFormattingArgs::eXml
1593 || m_FormatType == CFormattingArgs::eXml2
1594 || m_FormatType == CFormattingArgs::eJson
1595 || m_FormatType == CFormattingArgs::eXml2_S
1596 || m_FormatType == CFormattingArgs::eJson_S
1597 || m_FormatType == CFormattingArgs::eJsonSeqalign)
1598 {
1599 x_PrintStructuredReport(results, queries);
1600 return;
1601 }
1602
1603 if (results.HasErrors()) {
1604 ERR_POST(Error << results.GetErrorStrings());
1605 return; // errors are deemed fatal
1606 }
1607 if (results.HasWarnings()) {
1608 ERR_POST(Warning << results.GetWarningStrings());
1609 }
1610
1611 if (results.GetIgAnnotation()->m_MinusStrand) {
1612 x_ReverseQuery(results);
1613 }
1614 //set j domain
1615 CRef<CIgAnnotation> & annots_edit = results.SetIgAnnotation();
1616 if (annots_edit->m_JDomain[1] > 0 && annots_edit->m_DomainInfo[9] > 0 &&
1617 annots_edit->m_JDomain[1] > annots_edit->m_DomainInfo[9]){
1618 annots_edit->m_JDomain[0] = annots_edit->m_DomainInfo[9] + 1 ;
1619 //fwr4
1620 if (annots_edit->m_JDomain[3] > 0) {
1621 annots_edit->m_JDomain[2] = annots_edit->m_JDomain[1] + 1 ;
1622 }
1623 }
1624
1625 if (m_FormatType == CFormattingArgs::eTabular ||
1626 m_FormatType == CFormattingArgs::eTabularWithComments ||
1627 m_FormatType == CFormattingArgs::eCommaSeparatedValues) {
1628 m_FormatType = CFormattingArgs::eTabularWithComments;
1629 x_PrintIgTabularReport(results, clone_info, fill_clone_info);
1630 return;
1631 }
1632
1633 if (m_FormatType == CFormattingArgs::eAirrRearrangement) {
1634
1635 if (m_Program == "blastn" || m_Program == "BLASTN") {
1636 x_PrintAirrRearrangement(results, clone_info, fill_clone_info, print_airr_format_header);
1637 } else {
1638 m_Outfile << "The AIRR format is only available for nucleotide sequence search" << endl;
1639 }
1640 return;
1641 }
1642
1643 if (m_FormatType == CFormattingArgs::eTaxFormat) {
1644 string reportCaption = "Tax BLAST report";
1645 reportCaption = m_IsHTML ? "<h1>" + reportCaption + "</h1>" : CAlignFormatUtil::AddSpaces(reportCaption,max(kMinTaxFormatLineLength,kFormatLineLength),CAlignFormatUtil::eSpacePosToCenter | CAlignFormatUtil::eAddEOLAtLineStart | CAlignFormatUtil::eAddEOLAtLineEnd);
1646 m_Outfile << reportCaption;
1647 x_PrintTaxReport(results);
1648 return;
1649 }
1650
1651 const bool kIsTabularOutput = false;
1652
1653 // other output types will need a bioseq handle
1654 CBioseq_Handle bhandle = m_Scope->GetBioseqHandle(*results.GetSeqId(),
1655 CScope::eGetBioseq_All);
1656 // If this assertion fails, we're not able to get the query, most likely a
1657 // bug
1658 _ASSERT(bhandle);
1659 CConstRef<CBioseq> bioseq = bhandle.GetBioseqCore();
1660
1661 // print the preamble for this query
1662
1663 m_Outfile << "\n\n";
1664
1665 CBlastFormatUtil::AcknowledgeBlastQuery(*bioseq, kFormatLineLength,
1666 m_Outfile, m_BelieveQuery,
1667 m_IsHTML, kIsTabularOutput,
1668 results.GetRID());
1669
1670 // quit early if there are no hits
1671 if ( !results.HasAlignments() ) {
1672 m_Outfile << "\n\n"
1673 << "***** " << CBlastFormatUtil::kNoHitsFound << " *****" << "\n"
1674 << "\n\n";
1675 x_PrintOneQueryFooter(*results.GetAncillaryData());
1676 return;
1677 }
1678
1679 CConstRef<CSeq_align_set> aln_set = results.GetSeqAlign();
1680 _ASSERT(results.HasAlignments());
1681 if (m_IsUngappedSearch) {
1682 aln_set.Reset(CDisplaySeqalign::PrepareBlastUngappedSeqalign(*aln_set));
1683 }
1684
1685 //-------------------------------------------------
1686 // print 1-line summaries
1687 if ( !m_IsBl2Seq ) {
1688 CPsiBlastIterationState::TSeqIds prev_ids = CPsiBlastIterationState::TSeqIds();
1689 int additional = results.m_NumActualV +results.m_NumActualD + results.m_NumActualJ;
1690 x_DisplayDeflines(aln_set, numeric_limits<unsigned int>::max(), prev_ids, additional, index, 100);
1691 }
1692
1693 //-------------------------------------------------
1694 // print the alignments
1695 m_Outfile << "\n";
1696
1697 const CBlastTabularInfo::EFieldDelimiter kDelim =
1698 (m_FormatType == CFormattingArgs::eCommaSeparatedValues
1699 ? CBlastTabularInfo::eComma : CBlastTabularInfo::eTab);
1700
1701 CIgBlastTabularInfo tabinfo(m_Outfile, m_CustomOutputFormatSpec, kDelim);
1702 tabinfo.SetParseLocalIds(m_BelieveQuery);
1703
1704 // print the master alignment
1705 if (results.HasAlignments()) {
1706 const CRef<CIgAnnotation> & annots = results.GetIgAnnotation();
1707 CSeq_align_set::Tdata::const_iterator itr = aln_set->Get().begin();
1708 tabinfo.SetMasterFields(**itr, *m_Scope,
1709 annots->m_ChainType[0],
1710 annots->m_ChainTypeToShow,
1711 &m_ScoringMatrix);
1712 tabinfo.SetIgAnnotation(annots, m_IgOptions, aln_set, *m_Scope);
1713 if (fill_clone_info) {
1714 s_SetCloneInfo(tabinfo, bhandle, clone_info);
1715 }
1716 m_Outfile << "Domain classification requested: " << m_IgOptions->m_DomainSystem << endl << endl;
1717 if (m_IsHTML) {
1718 tabinfo.PrintHtmlSummary();
1719 } else {
1720 tabinfo.PrintMasterAlign("");
1721 }
1722 }
1723
1724 TMaskedQueryRegions masklocs;
1725 results.GetMaskedQueryRegions(masklocs);
1726
1727 int flags = CDisplaySeqalign::eMergeAlign
1728 + CDisplaySeqalign::eShowIdentity
1729 + CDisplaySeqalign::eNewTargetWindow
1730 + CDisplaySeqalign::eShowEndGaps
1731 + CDisplaySeqalign::eShowAlignStatsForMultiAlignView;
1732
1733 if (m_FormatType == CFormattingArgs::eFlatQueryAnchoredNoIdentities) {
1734 flags -= CDisplaySeqalign::eShowIdentity;
1735 }
1736
1737 if (m_IsHTML) {
1738 flags += CDisplaySeqalign::eHtml;
1739 flags += CDisplaySeqalign::eHyperLinkSlaveSeqid;
1740 }
1741
1742 list < CRef<CDisplaySeqalign::DomainInfo> > domain;
1743
1744 string kabat_domain_name[] = {"FR1", "CDR1", "FR2", "CDR2", "FR3", "CDR3", "FR4"};
1745 string imgt_domain_name[] = {"FR1-IMGT", "CDR1-IMGT", "FR2-IMGT", "CDR2-IMGT", "FR3-IMGT", "CDR3-IMGT", "FR4-IMGT"};
1746 int domain_name_length = 7;
1747 vector<string> domain_name;
1748 if (m_IgOptions->m_DomainSystem == "kabat") {
1749 for (int i = 0; i < domain_name_length; i ++) {
1750 domain_name.push_back(kabat_domain_name[i]);
1751 }
1752 } else {
1753 for (int i = 0; i < domain_name_length; i ++) {
1754 domain_name.push_back(imgt_domain_name[i]);
1755 }
1756 }
1757
1758 const CRef<CIgAnnotation> & annots = results.GetIgAnnotation();
1759
1760 for (int i=0; i<9; i = i + 2) {
1761 if (annots->m_DomainInfo[i] >= 0){
1762 CRef<CDisplaySeqalign::DomainInfo> temp(new CDisplaySeqalign::DomainInfo);
1763 int start = annots->m_DomainInfo[i];
1764 int subject_start = annots->m_DomainInfo_S[i];
1765
1766 int stop = annots->m_DomainInfo[i+1];
1767 int subject_stop = annots->m_DomainInfo_S[i+1];
1768
1769 temp->seqloc = new CSeq_loc((CSeq_loc::TId &) aln_set->Get().front()->GetSeq_id(0),
1770 (CSeq_loc::TPoint) start,
1771 (CSeq_loc::TPoint) stop);
1772 temp->subject_seqloc = new CSeq_loc((CSeq_loc::TId &) aln_set->Get().front()->GetSeq_id(1),
1773 (CSeq_loc::TPoint) subject_start,
1774 (CSeq_loc::TPoint) subject_stop);
1775 temp->is_subject_start_valid = subject_start > 0 ? true:false;
1776 temp->is_subject_stop_valid = subject_stop > 0 ? true:false;
1777 temp->domain_name = domain_name[i/2];
1778 domain.push_back(temp);
1779 }
1780 }
1781
1782 //J domain
1783 //cdr3
1784 if (annots->m_JDomain[0] > 0 && annots->m_JDomain[1] > 0){
1785 CRef<CDisplaySeqalign::DomainInfo> temp(new CDisplaySeqalign::DomainInfo);
1786 int start = annots->m_JDomain[0];
1787 int subject_start = -1;
1788 int stop = annots->m_JDomain[1];
1789 int subject_stop = -1;
1790
1791 temp->seqloc = new CSeq_loc((CSeq_loc::TId &) aln_set->Get().front()->GetSeq_id(0),
1792 (CSeq_loc::TPoint) start,
1793 (CSeq_loc::TPoint) stop);
1794 CRef<CSeq_id> id_holder (new CSeq_id);
1795 temp->subject_seqloc = new CSeq_loc(*id_holder,
1796 (CSeq_loc::TPoint) subject_start,
1797 (CSeq_loc::TPoint) subject_stop);
1798 temp->is_subject_start_valid = subject_start > 0 ? true:false;
1799 temp->is_subject_stop_valid = subject_stop > 0 ? true:false;
1800 temp->domain_name = domain_name[5];
1801 domain.push_back(temp);
1802 }
1803 //fwr4
1804 if (annots->m_JDomain[2] > 0 && annots->m_JDomain[3] > 0){
1805 CRef<CDisplaySeqalign::DomainInfo> temp(new CDisplaySeqalign::DomainInfo);
1806 int start = annots->m_JDomain[2];
1807 int subject_start = -1;
1808 int stop = annots->m_JDomain[3];
1809 int subject_stop = -1;
1810
1811 temp->seqloc = new CSeq_loc((CSeq_loc::TId &) aln_set->Get().front()->GetSeq_id(0),
1812 (CSeq_loc::TPoint) start,
1813 (CSeq_loc::TPoint) stop);
1814 CRef<CSeq_id> id_holder (new CSeq_id);
1815 temp->subject_seqloc = new CSeq_loc(*id_holder,
1816 (CSeq_loc::TPoint) subject_start,
1817 (CSeq_loc::TPoint) subject_stop);
1818 temp->is_subject_start_valid = subject_start > 0 ? true:false;
1819 temp->is_subject_stop_valid = subject_stop > 0 ? true:false;
1820 temp->domain_name = domain_name[6];
1821 domain.push_back(temp);
1822 }
1823
1824 CDisplaySeqalign display(*aln_set, *m_Scope, &masklocs, NULL, m_MatrixName);
1825 int num_align_to_show = results.m_NumActualV + results.m_NumActualD +
1826 results.m_NumActualJ;
1827 if (m_DbName != m_IgOptions->m_Db[0]->GetDatabaseName()){
1828 num_align_to_show += m_NumAlignments;
1829 }
1830 display.SetNumAlignToShow(num_align_to_show);
1831 display.SetMasterDomain(&domain);
1832 display.SetDbName(m_DbName);
1833 display.SetDbType(!m_DbIsAA);
1834 display.SetLineLen(90);
1835
1836 if (m_LongSeqId) {
1837 display.UseLongSequenceIds();
1838 }
1839
1840 if (annots->m_FrameInfo[0] >= 0 && m_IgOptions->m_Translate) {
1841 display.SetTranslatedFrameForLocalSeq((CDisplaySeqalign::TranslatedFrameForLocalSeq) (annots->m_FrameInfo[0]%3));
1842 flags += CDisplaySeqalign::eShowTranslationForLocalSeq;
1843 }
1844 flags += CDisplaySeqalign::eShowSequencePropertyLabel;
1845 flags += CDisplaySeqalign::eShowInfoOnMouseOverSeqid;
1846 vector<string> chain_type_list;
1847 ITERATE(vector<string>, iter, annots->m_ChainType) {
1848 if (*iter=="N/A"){
1849 chain_type_list.push_back(NcbiEmptyString);
1850 } else {
1851 chain_type_list.push_back(*iter);
1852 }
1853 }
1854 display.SetSequencePropertyLabel(&chain_type_list);
1855 // set the alignment flags
1856
1857 display.SetAlignOption(flags);
1858 if (m_Program == "blastn" || m_Program == "BLASTN") {
1859 display.SetAlignType(CDisplaySeqalign::eNuc);
1860 } else {
1861 display.SetAlignType(CDisplaySeqalign::eProt);
1862 }
1863 display.SetMasterGeneticCode(m_QueryGenCode);
1864 display.SetSlaveGeneticCode(m_DbGenCode);
1865 display.SetSeqLocChar(CDisplaySeqalign::eLowerCase);
1866 TSeqLocInfoVector subj_masks;
1867 results.GetSubjectMasks(subj_masks);
1868 display.SetSubjectMasks(subj_masks);
1869
1870 if (m_IsHTML) {
1871 display.SetResultPositionIndex(index);
1872 m_Outfile << "\n<CENTER><b><FONT color=\"green\">Alignments</FONT></b></CENTER>"
1873 << endl;
1874
1875 } else {
1876 m_Outfile << "\nAlignments" << endl;
1877 }
1878
1879 display.DisplaySeqalign(m_Outfile);
1880
1881 // print the ancillary data for this query
1882
1883 x_PrintOneQueryFooter(*results.GetAncillaryData());
1884 if (m_IsHTML) {
1885 m_Outfile << "<hr>" << endl;
1886 }
1887 }
1888
1889 void
x_ReverseQuery(blast::CIgBlastResults & results)1890 CBlastFormat::x_ReverseQuery(blast::CIgBlastResults& results)
1891 {
1892 if (!results.HasAlignments()){
1893 return;
1894 }
1895 // create a temporary seq_id
1896 CConstRef<CSeq_id> qid = results.GetSeqId();
1897 string new_id = qid->AsFastaString() + "_reversed";
1898
1899 // create a bioseq
1900 CBioseq_Handle q_bh = m_Scope->GetBioseqHandle(*qid);
1901 int len = q_bh.GetBioseqLength();
1902 CSeq_loc loc(*(const_cast<CSeq_id *>(&*qid)), 0, len-1, eNa_strand_minus);
1903 CRef<CBioseq> q_new(new CBioseq(loc, new_id));
1904 CConstRef<CSeq_id> new_qid = m_Scope->AddBioseq(*q_new).GetSeqId();
1905 if (qid->IsLocal()) {
1906 string title = sequence::CDeflineGenerator().GenerateDefline(q_bh);
1907 if (title != "") {
1908 CRef<CSeqdesc> des(new CSeqdesc());
1909 if (m_FormatType != CFormattingArgs::eAirrRearrangement) {
1910 des->SetTitle("reversed|" + title);
1911 } else {
1912 des->SetTitle(title);
1913 }
1914 m_Scope->GetBioseqEditHandle(*q_new).SetDescr().Set().push_back(des);
1915 }
1916 }
1917
1918 // set up the mapping
1919 CSeq_loc new_loc(*(const_cast<CSeq_id *>(&*new_qid)), 0, len-1, eNa_strand_plus);
1920 CSeq_loc_Mapper mapper(loc, new_loc, &*m_Scope);
1921
1922 // replace the alignment with the new query
1923 CRef<CSeq_align_set> align_set(new CSeq_align_set());
1924 ITERATE(CSeq_align_set::Tdata, align, results.GetSeqAlign()->Get()) {
1925 CRef<CSeq_align> new_align = mapper.Map(**align, 0);
1926 align_set->Set().push_back(new_align);
1927 }
1928 results.SetSeqAlign().Reset(&*align_set);
1929
1930 // reverse IgAnnotations
1931 CRef<CIgAnnotation> &annots = results.SetIgAnnotation();
1932 for (int i=0; i<6; i+=2) {
1933 int start = annots->m_GeneInfo[i];
1934 if (start >= 0) {
1935 annots->m_GeneInfo[i] = len - annots->m_GeneInfo[i+1];
1936 annots->m_GeneInfo[i+1] = len - start;
1937 }
1938 }
1939
1940 for (int i=0; i<12; ++i) {
1941 int pos = annots->m_DomainInfo[i];
1942 if (pos >= 0) {
1943 annots->m_DomainInfo[i] = max(0, len - 1 - pos);
1944 }
1945 }
1946
1947 for (int i=0; i<3; ++i) {
1948 int pos = annots->m_FrameInfo[i];
1949 if (pos >= 0) {
1950 annots->m_FrameInfo[i] = len -1 - pos;
1951 }
1952 }
1953 }
1954
1955 void
PrintPhiResult(const blast::CSearchResultSet & result_set,CConstRef<blast::CBlastQueryVector> queries,unsigned int itr_num,blast::CPsiBlastIterationState::TSeqIds prev_seqids)1956 CBlastFormat::PrintPhiResult(const blast::CSearchResultSet& result_set,
1957 CConstRef<blast::CBlastQueryVector> queries,
1958 unsigned int itr_num
1959 /* = numeric_limits<unsigned int>::max() */,
1960 blast::CPsiBlastIterationState::TSeqIds prev_seqids
1961 /* = CPsiBlastIterationState::TSeqIds() */)
1962 {
1963 // For remote searches, we don't retrieve the sequence data for the query
1964 // sequence when initially sending the request to the BLAST server (if it's
1965 // a GI/accession/TI), so we flush the scope so that it can be retrieved
1966 // (needed if a self-hit is found) again. This is not applicable if the
1967 // query sequence(s) are specified as FASTA (will be identified by local
1968 // IDs).
1969 if (m_IsRemoteSearch && !s_HasLocalIDs(queries)) {
1970 ResetScopeHistory();
1971 }
1972
1973 if (m_FormatType == CFormattingArgs::eAsnText
1974 || m_FormatType == CFormattingArgs::eAsnBinary
1975 || m_FormatType == CFormattingArgs::eXml
1976 || m_FormatType == CFormattingArgs::eXml2
1977 || m_FormatType == CFormattingArgs::eJson
1978 || m_FormatType == CFormattingArgs::eXml2_S
1979 || m_FormatType == CFormattingArgs::eJson_S
1980 || m_FormatType == CFormattingArgs::eJsonSeqalign)
1981 {
1982 ITERATE(CSearchResultSet, result, result_set) {
1983 x_PrintStructuredReport(**result, queries);
1984 }
1985 return;
1986 }
1987
1988 ITERATE(CSearchResultSet, result, result_set) {
1989 if ((**result).HasErrors()) {
1990 m_Outfile << "\n" << (**result).GetErrorStrings() << "\n";
1991 return; // errors are deemed fatal
1992 }
1993 if ((**result).HasWarnings()) {
1994 m_Outfile << "\n" << (**result).GetWarningStrings() << "\n";
1995 }
1996 }
1997
1998 if (m_FormatType == CFormattingArgs::eTabular ||
1999 m_FormatType == CFormattingArgs::eTabularWithComments ||
2000 m_FormatType == CFormattingArgs::eCommaSeparatedValues) {
2001 ITERATE(CSearchResultSet, result, result_set) {
2002 x_PrintTabularReport(**result, itr_num);
2003 }
2004 return;
2005 }
2006 if (m_FormatType == CFormattingArgs::eTaxFormat) {
2007 string reportCaption = "Tax BLAST report";
2008 reportCaption = m_IsHTML ? "<h1>" + reportCaption + "</h1>" : CAlignFormatUtil::AddSpaces(reportCaption,max(kMinTaxFormatLineLength,kFormatLineLength),CAlignFormatUtil::eSpacePosToCenter | CAlignFormatUtil::eAddEOLAtLineStart | CAlignFormatUtil::eAddEOLAtLineEnd);
2009 m_Outfile << reportCaption;
2010 ITERATE(CSearchResultSet, result, result_set) {
2011 x_PrintTaxReport(**result);
2012 }
2013 return;
2014 }
2015
2016 const CSearchResults& first_results = result_set[0];
2017
2018 if (itr_num != numeric_limits<unsigned int>::max()) {
2019 m_Outfile << "Results from round " << itr_num << "\n";
2020 }
2021
2022 CBioseq_Handle bhandle = m_Scope->GetBioseqHandle(*first_results.GetSeqId(),
2023 CScope::eGetBioseq_All);
2024 CConstRef<CBioseq> bioseq = bhandle.GetBioseqCore();
2025
2026 // print the preamble for this query
2027
2028 m_Outfile << "\n\n";
2029 CBlastFormatUtil::AcknowledgeBlastQuery(*bioseq, kFormatLineLength,
2030 m_Outfile, m_BelieveQuery,
2031 m_IsHTML, false,
2032 first_results.GetRID());
2033
2034 if (m_FormatType == CFormattingArgs::eTaxFormat) {
2035 string reportCaption = "Tax BLAST report";
2036 reportCaption = m_IsHTML ? "<h1>" + reportCaption + "</h1>" : CAlignFormatUtil::AddSpaces(reportCaption,max(kMinTaxFormatLineLength,kFormatLineLength),CAlignFormatUtil::eSpacePosToCenter | CAlignFormatUtil::eAddEOLAtLineStart | CAlignFormatUtil::eAddEOLAtLineEnd);
2037 m_Outfile << reportCaption;
2038 ITERATE(CSearchResultSet, result, result_set) {
2039 x_PrintTaxReport(**result);
2040 }
2041 return;
2042 }
2043
2044 const SPHIQueryInfo *phi_query_info = first_results.GetPhiQueryInfo();
2045
2046 if (phi_query_info)
2047 {
2048 vector<int> offsets;
2049 for (int index=0; index<phi_query_info->num_patterns; index++)
2050 offsets.push_back(phi_query_info->occurrences[index].offset);
2051
2052 CBlastFormatUtil::PrintPhiInfo(phi_query_info->num_patterns,
2053 string(phi_query_info->pattern),
2054 phi_query_info->probability,
2055 offsets, m_Outfile);
2056 }
2057
2058 // quit early if there are no hits
2059 if ( !first_results.HasAlignments() ) {
2060 m_Outfile << "\n\n"
2061 << "***** " << CBlastFormatUtil::kNoHitsFound << " *****" << "\n"
2062 << "\n\n";
2063 x_PrintOneQueryFooter(*first_results.GetAncillaryData());
2064 return;
2065 }
2066
2067 _ASSERT(first_results.HasAlignments());
2068 //-------------------------------------------------
2069
2070 ITERATE(CSearchResultSet, result, result_set)
2071 {
2072 CConstRef<CSeq_align_set> aln_set = (**result).GetSeqAlign();
2073 x_DisplayDeflines(aln_set, itr_num, prev_seqids);
2074 }
2075
2076 //-------------------------------------------------
2077 // print the alignments
2078 m_Outfile << "\n";
2079
2080
2081 int flags = s_SetFlags(m_Program, m_FormatType, m_IsHTML, m_ShowGi,
2082 (m_IsBl2Seq && !m_IsDbScan), false);
2083
2084 if (phi_query_info)
2085 {
2086 SPHIPatternInfo *occurrences = phi_query_info->occurrences;
2087 int index;
2088 for (index=0; index<phi_query_info->num_patterns; index++)
2089 {
2090 list <CDisplaySeqalign::FeatureInfo*> phiblast_pattern;
2091 CSeq_id* id = new CSeq_id;
2092 id->Assign(*(result_set[index]).GetSeqId());
2093 CDisplaySeqalign::FeatureInfo* feature_info = new CDisplaySeqalign::FeatureInfo;
2094 feature_info->seqloc = new CSeq_loc(*id, (TSeqPos) occurrences[index].offset,
2095 (TSeqPos) (occurrences[index].offset + occurrences[index].length - 1));
2096 feature_info->feature_char = '*';
2097 feature_info->feature_id = "pattern";
2098 phiblast_pattern.push_back(feature_info);
2099
2100 m_Outfile << "\nSignificant alignments for pattern occurrence " << index+1
2101 << " at position " << 1+occurrences[index].offset << "\n\n";
2102
2103 TMaskedQueryRegions masklocs;
2104 result_set[index].GetMaskedQueryRegions(masklocs);
2105 CConstRef<CSeq_align_set> aln_set = result_set[index].GetSeqAlign();
2106 CSeq_align_set copy_aln_set;
2107 CBlastFormatUtil::PruneSeqalign(*aln_set, copy_aln_set, m_NumAlignments);
2108
2109 CDisplaySeqalign display(copy_aln_set, *m_Scope, &masklocs, &phiblast_pattern,
2110 m_MatrixName);
2111
2112 display.SetDbName(m_DbName);
2113 display.SetDbType(!m_DbIsAA);
2114 display.SetLineLen(m_LineLength);
2115
2116 // set the alignment flags
2117 display.SetAlignOption(flags);
2118
2119 if (m_LongSeqId) {
2120 display.UseLongSequenceIds();
2121 }
2122
2123 if (m_Program == "blastn" || m_Program == "megablast") {
2124 display.SetMiddleLineStyle(CDisplaySeqalign::eBar);
2125 display.SetAlignType(CDisplaySeqalign::eNuc);
2126 }
2127 else {
2128 display.SetMiddleLineStyle(CDisplaySeqalign::eChar);
2129 display.SetAlignType(CDisplaySeqalign::eProt);
2130 }
2131
2132 display.SetMasterGeneticCode(m_QueryGenCode);
2133 display.SetSlaveGeneticCode(m_DbGenCode);
2134 display.SetSeqLocChar(CDisplaySeqalign::eLowerCase);
2135 display.DisplaySeqalign(m_Outfile);
2136 m_Outfile << "\n";
2137
2138 NON_CONST_ITERATE(list<CDisplaySeqalign::FeatureInfo*>, itr, phiblast_pattern) {
2139 delete *itr;
2140 }
2141 }
2142 }
2143
2144 // print the ancillary data for this query
2145
2146 x_PrintOneQueryFooter(*first_results.GetAncillaryData());
2147 }
2148
2149
2150
2151 void
PrintEpilog(const blast::CBlastOptions & options)2152 CBlastFormat::PrintEpilog(const blast::CBlastOptions& options)
2153 {
2154 if ((m_FormatType == CFormattingArgs::eXml2) || (m_FormatType == CFormattingArgs::eJson) ||
2155 (m_FormatType == CFormattingArgs::eXml2_S) || (m_FormatType == CFormattingArgs::eJson_S)) {
2156 if(!m_AccumulatedResults.empty()) {
2157 CRef <CBlastSearchQuery> q = m_AccumulatedQueries->GetBlastSearchQuery(0);
2158 if(m_IsBl2Seq) {
2159 CCmdLineBlastXML2ReportData report_data(q, m_AccumulatedResults, m_Options,
2160 m_Scope, m_SeqInfoSrc);
2161 x_WriteXML2(report_data);
2162 }
2163 else if(m_IsIterative){
2164 CCmdLineBlastXML2ReportData report_data (q, m_AccumulatedResults, m_Options,
2165 m_Scope, m_DbInfo);
2166 x_WriteXML2(report_data);
2167 }
2168 m_AccumulatedResults.clear();
2169 m_AccumulatedQueries->clear();
2170 }
2171 if (m_FormatType == CFormattingArgs::eXml2
2172 || m_FormatType == CFormattingArgs::eXml2_S) {
2173 x_GenerateXML2MasterFile();
2174 }
2175 else {
2176 x_GenerateJSONMasterFile();
2177 }
2178 return;
2179 }
2180
2181 if (m_FormatType == CFormattingArgs::eTabularWithComments) {
2182 CBlastTabularInfo tabinfo(m_Outfile, m_CustomOutputFormatSpec);
2183 tabinfo.PrintNumProcessed(m_QueriesFormatted);
2184 return;
2185 } else if (m_FormatType >= CFormattingArgs::eTabular)
2186 return; // No footer for these.
2187
2188 // Most of XML is printed as it's finished.
2189 // the epilog closes the report.
2190 if (m_FormatType == CFormattingArgs::eXml) {
2191 m_Outfile << m_BlastXMLIncremental->m_SerialXmlEnd << endl;
2192 m_AccumulatedResults.clear();
2193 m_AccumulatedQueries->clear();
2194 return;
2195 }
2196
2197 m_Outfile << NcbiEndl << NcbiEndl;
2198 if (m_Program == "deltablast" && !m_DomainDbInfo.empty()) {
2199 m_Outfile << "Conserved Domain";
2200 CBlastFormatUtil::PrintDbReport(m_DomainDbInfo, kFormatLineLength,
2201 m_Outfile, false);
2202 }
2203
2204 if ( !m_IsBl2Seq || m_IsDbScan) {
2205 CBlastFormatUtil::PrintDbReport(m_DbInfo, kFormatLineLength,
2206 m_Outfile, false);
2207 }
2208
2209 if (m_Program == "blastn" || m_Program == "megablast") {
2210 m_Outfile << "\n\nMatrix: " << "blastn matrix " <<
2211 options.GetMatchReward() << " " <<
2212 options.GetMismatchPenalty() << "\n";
2213 }
2214 else {
2215 m_Outfile << "\n\nMatrix: " << options.GetMatrixName() << "\n";
2216 }
2217
2218 if (options.GetGappedMode() == true) {
2219 double gap_extension = (double) options.GetGapExtensionCost();
2220 if ((m_Program == "megablast" || m_Program == "blastn") && options.GetGapExtensionCost() == 0)
2221 { // Formula from PMID 10890397 applies if both gap values are zero.
2222 gap_extension = -2*options.GetMismatchPenalty() + options.GetMatchReward();
2223 gap_extension /= 2.0;
2224 }
2225 m_Outfile << "Gap Penalties: Existence: "
2226 << options.GetGapOpeningCost() << ", Extension: "
2227 << gap_extension << "\n";
2228 }
2229 if (options.GetWordThreshold()) {
2230 m_Outfile << "Neighboring words threshold: " <<
2231 options.GetWordThreshold() << "\n";
2232 }
2233 if (options.GetWindowSize()) {
2234 m_Outfile << "Window for multiple hits: " <<
2235 options.GetWindowSize() << "\n";
2236 }
2237
2238 if (m_IsHTML) {
2239 m_Outfile << kHTML_Suffix << "\n";
2240 }
2241 }
2242
ResetScopeHistory()2243 void CBlastFormat::ResetScopeHistory()
2244 {
2245 // Do not reset the scope for BLAST2Sequences or else we'll loose the
2246 // sequence data! (see x_CreateSubjectBioseq)
2247 if ((m_IsBl2Seq)
2248 || (m_FormatType == CFormattingArgs::eXml2)
2249 || (m_FormatType == CFormattingArgs::eJson)
2250 || (m_FormatType == CFormattingArgs::eXml2_S)
2251 || (m_FormatType == CFormattingArgs::eJson_S)){
2252 return;
2253 }
2254
2255 // Our current XML/ASN.1 libraries do not have provisions for
2256 // incremental object input/output, so with XML output format we
2257 // need to accumulate the whole document before writing any data.
2258
2259 // This means that XML output requires more memory than other
2260 // output formats.
2261
2262 if (m_FormatType != CFormattingArgs::eXml)
2263 {
2264 m_Scope->ResetDataAndHistory();
2265 }
2266 }
2267
s_GetBaseName(const string & baseFile,bool isXML,bool withPath)2268 static string s_GetBaseName(const string & baseFile, bool isXML, bool withPath)
2269 {
2270 string dir = kEmptyStr;
2271 string base = kEmptyStr;
2272 string ext = kEmptyStr;
2273 CDirEntry::SplitPath(baseFile, withPath ? &dir:NULL, &base, &ext );
2274 if(!((isXML && NStr::CompareNocase(ext, ".xml") == 0 ) ||
2275 (!isXML && NStr::CompareNocase(ext, ".json") == 0))){
2276 base += ext;
2277 }
2278 if(withPath)
2279 return dir + base;
2280
2281 return base;
2282 }
2283
x_WriteXML2(CCmdLineBlastXML2ReportData & report_data)2284 void CBlastFormat::x_WriteXML2(CCmdLineBlastXML2ReportData & report_data)
2285 {
2286 if(m_FormatType == CFormattingArgs::eXml2_S) {
2287 BlastXML2_FormatReport(&report_data, &m_Outfile);
2288 }
2289 else if (m_FormatType == CFormattingArgs::eJson_S) {
2290 m_XMLFileCount++;
2291 if(m_XMLFileCount > 1) {
2292 m_Outfile << ",\n";
2293 }
2294 BlastJSON_FormatReport(&report_data, &m_Outfile);
2295 }
2296 else {
2297 m_XMLFileCount++;
2298
2299 if(m_FormatType == CFormattingArgs::eXml2) {
2300 string file_name = s_GetBaseName(m_BaseFile, true, true) + "_" + NStr::IntToString(m_XMLFileCount) + ".xml";
2301 BlastXML2_FormatReport(&report_data, file_name);
2302 }
2303 else {
2304 string file_name = s_GetBaseName(m_BaseFile, false, true) + "_" + NStr::IntToString(m_XMLFileCount) + ".json";
2305 BlastJSON_FormatReport(&report_data, file_name);
2306 }
2307 }
2308 }
2309
x_PrintXML2Report(const blast::CSearchResults & results,CConstRef<blast::CBlastQueryVector> queries)2310 void CBlastFormat::x_PrintXML2Report(const blast::CSearchResults& results,
2311 CConstRef<blast::CBlastQueryVector> queries)
2312 {
2313 CRef<CSearchResults> res(const_cast<CSearchResults*>(&results));
2314 res->TrimSeqAlign(m_HitlistSize);
2315 if((m_IsIterative) || (m_IsBl2Seq)) {
2316 if(m_AccumulatedResults.empty()) {
2317 _ASSERT(m_AccumulatedQueries->size() == 0);
2318 m_AccumulatedResults.push_back(res);
2319 CConstRef<CSeq_id> query_id = results.GetSeqId();
2320 ITERATE(CBlastQueryVector, itr, *queries) {
2321 if (query_id->Match(*(*itr)->GetQueryId())) {
2322 m_AccumulatedQueries->push_back(*itr);
2323 break;
2324 }
2325 }
2326 }
2327 else {
2328 CConstRef<CSeq_id> query_id = results.GetSeqId();
2329 if(m_AccumulatedResults[0].GetSeqId()->Match(*query_id)) {
2330 m_AccumulatedResults.push_back(res);
2331 }
2332 else {
2333 CRef <CBlastSearchQuery> q = m_AccumulatedQueries->GetBlastSearchQuery(0);
2334 if(m_IsBl2Seq) {
2335 CCmdLineBlastXML2ReportData report_data(q, m_AccumulatedResults, m_Options,
2336 m_Scope, m_SeqInfoSrc);
2337 x_WriteXML2(report_data);
2338 }
2339 else {
2340 CCmdLineBlastXML2ReportData report_data (q, m_AccumulatedResults, m_Options,
2341 m_Scope, m_DbInfo);
2342 x_WriteXML2(report_data);
2343 }
2344 m_AccumulatedResults.clear();
2345 m_AccumulatedQueries->clear();
2346
2347 m_AccumulatedResults.push_back(res);
2348 ITERATE(CBlastQueryVector, itr, *queries) {
2349 if (query_id->Match(*(*itr)->GetQueryId())) {
2350 m_AccumulatedQueries->push_back(*itr);
2351 break;
2352 }
2353 }
2354 }
2355 }
2356 }
2357 else {
2358 CRef<CBlastSearchQuery> q;
2359 CConstRef<CSeq_id> query_id = results.GetSeqId();
2360 ITERATE(CBlastQueryVector, itr, *queries) {
2361 if (query_id->Match(*(*itr)->GetQueryId())) {
2362 q = *itr;
2363 break;
2364 }
2365 }
2366 CCmdLineBlastXML2ReportData report_data (q, *res, m_Options, m_Scope, m_DbInfo);
2367 x_WriteXML2(report_data);
2368 }
2369 }
2370
x_GenerateXML2MasterFile(void)2371 void CBlastFormat::x_GenerateXML2MasterFile(void)
2372 {
2373 if(m_FormatType == CFormattingArgs::eXml2_S) {
2374 m_Outfile << "</BlastXML2>\n";
2375 return;
2376 }
2377
2378 m_Outfile << "<?xml version=\"1.0\"?>\n<BlastXML2\n"
2379 "xmlns=\"http://www.ncbi.nlm.nih.gov\"\n"
2380 "xmlns:xi=\"http://www.w3.org/2003/XInclude\"\n"
2381 "xmlns:xs=\"http://www.w3.org/2001/XMLSchema-instance\"\n"
2382 "xs:schemaLocation=\"http://www.ncbi.nlm.nih.gov http://www.ncbi.nlm.nih.gov/data_specs/schema_alt/NCBI_BlastOutput2.xsd\">\n";
2383
2384 string base = s_GetBaseName(m_BaseFile, true, false);
2385 for(int i = 1; i <= m_XMLFileCount; i ++) {
2386 string file_name = base + "_" + NStr::IntToString(i) + ".xml";
2387 m_Outfile << "\t<xi:include href=\"" + file_name + "\"/>\n";
2388 }
2389 m_Outfile << "</BlastXML2>\n";
2390 }
2391
x_GenerateJSONMasterFile(void)2392 void CBlastFormat::x_GenerateJSONMasterFile(void)
2393 {
2394 if(m_FormatType == CFormattingArgs::eJson_S) {
2395 m_Outfile << "]\n}\n";
2396 return;
2397 }
2398
2399 m_Outfile << "{\n\t\"BlastJSON\": [\n";
2400
2401 string base = s_GetBaseName(m_BaseFile, true, false);
2402 for(int i = 1; i <= m_XMLFileCount; i ++) {
2403 string file_name = base + "_" + NStr::IntToString(i) + ".json";
2404 m_Outfile << "\t\t{\"File\": \"" + file_name + "\" }";
2405 if(i != m_XMLFileCount)
2406 m_Outfile << ",";
2407 m_Outfile << "\n";
2408 }
2409 m_Outfile << "\t]\n}";
2410 }
2411
x_InitSAMFormatter()2412 void CBlastFormat::x_InitSAMFormatter()
2413 {
2414 CSAM_Formatter::SProgramInfo pg("0", blast::CBlastVersion().Print(), m_Cmdline);
2415 pg.m_Name = m_Program;
2416 m_SamFormatter.reset(new CBlast_SAM_Formatter(m_Outfile, *m_Scope,
2417 m_CustomOutputFormatSpec, pg));
2418 }
2419
s_SetCompBasedStats(EProgram program)2420 bool s_SetCompBasedStats(EProgram program)
2421 {
2422 if (program == eBlastp || program == eTblastn ||
2423 program == ePSIBlast || program == ePSITblastn ||
2424 program == eRPSBlast || program == eRPSTblastn ||
2425 program == eBlastx || program == eDeltaBlast) {
2426 return true;
2427 }
2428 return false;
2429 }
2430
LogBlastSearchInfo(CBlastUsageReport & report)2431 void CBlastFormat::LogBlastSearchInfo(CBlastUsageReport & report)
2432 {
2433 if (report.IsEnabled()) {
2434 report.AddParam(CBlastUsageReport::eProgram, m_Program);
2435 EProgram task = m_Options->GetProgram();
2436 string task_str = EProgramToTaskName(task);
2437 report.AddParam(CBlastUsageReport::eTask, task_str);
2438 report.AddParam(CBlastUsageReport::eEvalueThreshold, m_Options->GetEvalueThreshold());
2439 report.AddParam(CBlastUsageReport::eHitListSize, m_Options->GetHitlistSize());
2440 report.AddParam(CBlastUsageReport::eOutputFmt, m_FormatType);
2441
2442 if (s_SetCompBasedStats(task)) {
2443 report.AddParam(CBlastUsageReport::eCompBasedStats, m_Options->GetCompositionBasedStats());
2444 }
2445
2446 int num_seqs = 0;
2447 for (size_t i = 0; i < m_DbInfo.size(); i++) {
2448 num_seqs += m_DbInfo[i].number_seqs;
2449 }
2450 if( m_IsBl2Seq) {
2451 report.AddParam(CBlastUsageReport::eBl2seq, "true");
2452 if (m_IsDbScan) {
2453 report.AddParam(CBlastUsageReport::eNumSubjects, num_seqs);
2454 report.AddParam(CBlastUsageReport::eSubjectsLength, GetDbTotalLength());
2455 }
2456 else if (m_SeqInfoSrc.NotEmpty()){
2457 report.AddParam(CBlastUsageReport::eNumSubjects, (int) m_SeqInfoSrc->Size());
2458 int total_subj_length = 0;
2459 for (size_t i = 0; i < m_SeqInfoSrc->Size(); i++) {
2460 total_subj_length += m_SeqInfoSrc->GetLength(i);
2461 }
2462 report.AddParam(CBlastUsageReport::eSubjectsLength, total_subj_length);
2463 }
2464 }
2465 else {
2466 string dir = kEmptyStr;
2467 CFile::SplitPath(m_DbName, &dir);
2468 string db_name = m_DbName;
2469 if (dir != kEmptyStr) {
2470 db_name = m_DbName.substr(dir.length());
2471 }
2472 report.AddParam(CBlastUsageReport::eDBName, db_name);
2473 report.AddParam(CBlastUsageReport::eDBLength, GetDbTotalLength());
2474 report.AddParam(CBlastUsageReport::eDBNumSeqs, num_seqs);
2475 report.AddParam(CBlastUsageReport::eDBDate, m_DbInfo[0].date);
2476 if(m_SearchDb.NotEmpty()){
2477 if(m_SearchDb->GetGiList().NotEmpty()) {
2478 CRef<CSeqDBGiList> l = m_SearchDb->GetGiList();
2479 if (l->GetNumGis()) {
2480 report.AddParam(CBlastUsageReport::eGIList, true);
2481 }
2482 if (l->GetNumSis()){
2483 report.AddParam(CBlastUsageReport::eSeqIdList, true);
2484 }
2485 if (l->GetNumTaxIds()){
2486 report.AddParam(CBlastUsageReport::eTaxIdList, true);
2487 }
2488 if (l->GetNumPigs()) {
2489 report.AddParam(CBlastUsageReport::eIPGList, true);
2490 }
2491 }
2492 if(m_SearchDb->GetNegativeGiList().NotEmpty()) {
2493 CRef<CSeqDBGiList> l = m_SearchDb->GetNegativeGiList();
2494 if (l->GetNumGis()) {
2495 report.AddParam(CBlastUsageReport::eNegGIList, true);
2496 }
2497 if (l->GetNumSis()){
2498 report.AddParam(CBlastUsageReport::eNegSeqIdList, true);
2499 }
2500 if (l->GetNumTaxIds()){
2501 report.AddParam(CBlastUsageReport::eNegTaxIdList, true);
2502 }
2503 if (l->GetNumPigs()) {
2504 report.AddParam(CBlastUsageReport::eNegIPGList, true);
2505 }
2506 }
2507 }
2508 }
2509 }
2510 }
2511