1 /*  $Id: align_format_util.cpp 631547 2021-05-19 13:51:35Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Jian Ye
27  * 12/2004
28  * File Description:
29  *   blast formatter utilities
30  *
31  */
32 #include <ncbi_pch.hpp>
33 
34 #include <math.h> // For use of ceil
35 
36 #include <objtools/align_format/align_format_util.hpp>
37 
38 #include <corelib/ncbireg.hpp>
39 #include <corelib/ncbidiag.hpp>
40 #include <corelib/ncbistre.hpp>
41 #include <corelib/ncbiutil.hpp>
42 #include <corelib/ncbiobj.hpp>
43 #include <corelib/ncbifile.hpp>
44 #include <corelib/metareg.hpp>
45 #include <html/htmlhelper.hpp>
46 #include <cgi/cgictx.hpp>
47 #include <util/tables/raw_scoremat.h>
48 
49 
50 #include <objects/seqalign/Seq_align.hpp>
51 #include <objects/seqalign/Score.hpp>
52 #include <objects/seqalign/Std_seg.hpp>
53 #include <objects/seqalign/Dense_diag.hpp>
54 #include <objects/seqalign/Dense_seg.hpp>
55 #include <objects/seqloc/Seq_id.hpp>
56 #include <objects/seqloc/Seq_loc.hpp>
57 #include <objects/seq/Seq_inst.hpp>
58 #include <objects/seq/Seq_descr.hpp>
59 #include <objects/seq/Seqdesc.hpp>
60 #include <objmgr/seqdesc_ci.hpp>
61 #include <objects/blastdb/defline_extra.hpp>
62 #include <objects/general/User_object.hpp>
63 
64 #include <objtools/blast/services/blast_services.hpp>   // for CBlastServices
65 #include <objtools/blast/seqdb_reader/seqdb.hpp>   // for CSeqDB
66 #include <objtools/blast/seqdb_reader/seqdbcommon.hpp>   // for CSeqDBException
67 
68 #include <objects/seq/seqport_util.hpp>
69 #include <objects/blastdb/defline_extra.hpp>
70 #include <objects/blastdb/Blast_def_line.hpp>
71 #include <objects/blastdb/Blast_def_line_set.hpp>
72 
73 #include <stdio.h>
74 #include <sstream>
75 #include <iomanip>
76 
77 BEGIN_NCBI_SCOPE
78 USING_SCOPE(ncbi);
79 USING_SCOPE(objects);
80 BEGIN_SCOPE(align_format)
81 
82 const char CAlignFormatUtil::kNoHitsFound[] = "No hits found";
83 
84 bool kTranslation;
85 CRef<CScope> kScope;
86 
87 const char k_PSymbol[ePMatrixSize+1] =
88 "ARNDCQEGHILKMFPSTWYVBZX";
89 
90 unique_ptr<CNcbiRegistry> CAlignFormatUtil::m_Reg;
91 string CAlignFormatUtil::m_Protocol = "";
92 bool  CAlignFormatUtil::m_geturl_debug_flag = false;
93 auto_ptr<CGeneInfoFileReader> CAlignFormatUtil::m_GeneInfoReader;
94 ///Get blast score information
95 ///@param scoreList: score container to extract score info from
96 ///@param score: place to extract the raw score to
97 ///@param bits: place to extract the bit score to
98 ///@param evalue: place to extract the e value to
99 ///@param sum_n: place to extract the sum_n to
100 ///@param num_ident: place to extract the num_ident to
101 ///@param use_this_gi: place to extract use_this_gi to
102 ///@return true if found score, false otherwise
103 ///
104 template<class container> bool
s_GetBlastScore(const container & scoreList,int & score,double & bits,double & evalue,int & sum_n,int & num_ident,list<TGi> & use_this_gi,int & comp_adj_method)105 s_GetBlastScore(const container&  scoreList,
106                 int& score,
107                 double& bits,
108                 double& evalue,
109                 int& sum_n,
110                 int& num_ident,
111                 list<TGi>& use_this_gi,
112                 int& comp_adj_method)
113 {
114     const string k_GiPrefix = "gi:";
115     bool hasScore = false;
116     ITERATE (typename container, iter, scoreList) {
117         const CObject_id& id=(*iter)->GetId();
118         if (id.IsStr()) {
119             if (id.GetStr()=="score"){
120                 score = (*iter)->GetValue().GetInt();
121             } else if (id.GetStr()=="bit_score"){
122                 bits = (*iter)->GetValue().GetReal();
123             } else if (id.GetStr()=="e_value" || id.GetStr()=="sum_e") {
124                 evalue = (*iter)->GetValue().GetReal();
125                 hasScore = true;
126             } else if (id.GetStr()=="use_this_gi"){
127                 use_this_gi.push_back(GI_FROM(int, (*iter)->GetValue().GetInt()));
128             } else if (id.GetStr()=="sum_n"){
129                 sum_n = (*iter)->GetValue().GetInt();
130             } else if (id.GetStr()=="num_ident"){
131                 num_ident = (*iter)->GetValue().GetInt();
132             } else if (id.GetStr()=="comp_adjustment_method") {
133                 comp_adj_method = (*iter)->GetValue().GetInt();
134             }
135             else if(NStr::StartsWith(id.GetStr(),k_GiPrefix)) { //will be used when switch to 64bit GIs
136                 string strGi = NStr::Replace(id.GetStr(),k_GiPrefix,"");
137                 TGi gi = NStr::StringToNumeric<TGi>(strGi);
138                 use_this_gi.push_back(gi);
139             }
140         }
141     }
142 
143     return hasScore;
144 }
145 
146 
147 ///Wrap a string to specified length.  If break happens to be in
148 /// a word, it will extend the line length until the end of the word
149 ///@param str: input string
150 ///@param line_len: length of each line desired
151 ///@param out: stream to ouput
152 ///
x_WrapOutputLine(string str,size_t line_len,CNcbiOstream & out,bool html)153 void CAlignFormatUtil::x_WrapOutputLine(string str, size_t line_len,
154                                         CNcbiOstream& out, bool html)
155 {
156     list<string> string_l;
157     NStr::TWrapFlags flags = NStr::fWrap_FlatFile;
158     if (html) {
159         flags = NStr::fWrap_HTMLPre;
160         str = CHTMLHelper::HTMLEncode(str);
161     }
162     NStr::Wrap(str, line_len, string_l, flags);
163     list<string>::iterator iter = string_l.begin();
164     while(iter != string_l.end())
165     {
166        out << *iter;
167        out << "\n";
168        iter++;
169     }
170 }
171 
BlastPrintError(list<SBlastError> & error_return,bool error_post,CNcbiOstream & out)172 void CAlignFormatUtil::BlastPrintError(list<SBlastError>&
173                                        error_return,
174                                        bool error_post, CNcbiOstream& out)
175 {
176 
177     string errsevmsg[] = { "UNKNOWN","INFO","WARNING","ERROR",
178                             "FATAL"};
179 
180     NON_CONST_ITERATE(list<SBlastError>, iter, error_return) {
181 
182         if(iter->level > 5){
183             iter->level = eDiag_Info;
184         }
185 
186         if(iter->level == 4){
187             iter->level = eDiag_Fatal;
188         } else{
189             iter->level = iter->level;
190         }
191 
192         if (error_post){
193             ERR_POST_EX(iter->level, 0, iter->message);
194         }
195         out << errsevmsg[iter->level] << ": " << iter->message << "\n";
196 
197     }
198 
199 }
200 
PrintTildeSepLines(string str,size_t line_len,CNcbiOstream & out)201 void  CAlignFormatUtil::PrintTildeSepLines(string str, size_t line_len,
202                                            CNcbiOstream& out) {
203 
204     vector<string> split_line;
205     NStr::Split(str, "~", split_line);
206     ITERATE(vector<string>, iter, split_line) {
207         x_WrapOutputLine(*iter,  line_len, out);
208     }
209 }
210 #ifdef DO_UNUSED
211 /// Initialize database statistics with data from BLAST servers
212 /// @param dbname name of a single BLAST database [in]
213 /// @param info structure to fill [in|out]
214 /// @return true if successfully filled, false otherwise (and a warning is
215 /// printed out)
s_FillDbInfoRemotely(const string & dbname,CAlignFormatUtil::SDbInfo & info)216 static bool s_FillDbInfoRemotely(const string& dbname,
217                                  CAlignFormatUtil::SDbInfo& info)
218 {
219     static CBlastServices rmt_blast_services;
220     CRef<CBlast4_database> blastdb(new CBlast4_database);
221     blastdb->SetName(dbname);
222     blastdb->SetType() = info.is_protein
223         ? eBlast4_residue_type_protein : eBlast4_residue_type_nucleotide;
224     CRef<CBlast4_database_info> dbinfo =
225         rmt_blast_services.GetDatabaseInfo(blastdb);
226 
227     info.name = dbname;
228     if ( !dbinfo ) {
229         return false;
230     }
231     info.definition = dbinfo->GetDescription();
232     if (info.definition.empty())
233         info.definition = info.name;
234     CTimeFormat tf("b d, Y H:m P", CTimeFormat::fFormat_Simple);
235     info.date = CTime(dbinfo->GetLast_updated()).AsString(tf);
236     info.total_length = dbinfo->GetTotal_length();
237     info.number_seqs = static_cast<int>(dbinfo->GetNum_sequences());
238     return true;
239 }
240 #endif
241 /// Initialize database statistics with data obtained from local BLAST
242 /// databases
243 /// @param dbname name of a single BLAST database [in]
244 /// @param info structure to fill [in|out]
245 /// @param dbfilt_algorithm filtering algorithm ID used for this search
246 /// [in]
247 /// @return true if successfully filled, false otherwise (and a warning is
248 /// printed out)
249 static bool
s_FillDbInfoLocally(const string & dbname,CAlignFormatUtil::SDbInfo & info,int dbfilt_algorithm)250 s_FillDbInfoLocally(const string& dbname,
251                     CAlignFormatUtil::SDbInfo& info,
252                     int dbfilt_algorithm)
253 {
254     CRef<CSeqDB> seqdb(new CSeqDB(dbname, info.is_protein
255                           ? CSeqDB::eProtein : CSeqDB::eNucleotide));
256     if ( !seqdb ) {
257         return false;
258     }
259     info.name = seqdb->GetDBNameList();
260     info.definition = seqdb->GetTitle();
261     if (info.definition.empty())
262         info.definition = info.name;
263     info.date = seqdb->GetDate();
264     info.total_length = seqdb->GetTotalLength();
265     info.number_seqs = seqdb->GetNumSeqs();
266 
267     // Process the filtering algorithm IDs
268     info.filt_algorithm_name.clear();
269     info.filt_algorithm_options.clear();
270     if (dbfilt_algorithm == -1) {
271         return true;
272     }
273 
274 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION  > 550)) && \
275      (!defined(NCBI_COMPILER_MIPSPRO)) )
276     string filtering_algorithm;
277     seqdb->GetMaskAlgorithmDetails(dbfilt_algorithm,
278                                    filtering_algorithm,
279                                    info.filt_algorithm_name,
280                                    info.filt_algorithm_options);
281 #endif
282     return true;
283 }
284 
285 void
FillScanModeBlastDbInfo(vector<CAlignFormatUtil::SDbInfo> & retval,bool is_protein,int numSeqs,Int8 numLetters,string & tag)286 CAlignFormatUtil::FillScanModeBlastDbInfo(vector<CAlignFormatUtil::SDbInfo>& retval,
287                            bool is_protein, int numSeqs, Int8 numLetters, string& tag)
288 {
289 	retval.clear();
290 	CAlignFormatUtil::SDbInfo info;
291 	info.is_protein = is_protein;
292 	if (tag == "")
293 		info.definition = string("User specified sequence set.");
294 	else
295 	{
296 		info.definition = string("User specified sequence set ") +
297 			string("(Input: ") + tag + string(").");
298 	}
299 	info.number_seqs = numSeqs;
300 	info.total_length = numLetters;
301 	retval.push_back(info);
302 }
303 
304 void
GetBlastDbInfo(vector<CAlignFormatUtil::SDbInfo> & retval,const string & blastdb_names,bool is_protein,int dbfilt_algorithm,bool is_remote)305 CAlignFormatUtil::GetBlastDbInfo(vector<CAlignFormatUtil::SDbInfo>& retval,
306                            const string& blastdb_names, bool is_protein,
307                            int dbfilt_algorithm /* = -1 */,
308                            bool is_remote /* = false */)
309 {
310     retval.clear();
311     if( is_remote ){
312 	bool found_all = false;
313         static CBlastServices rmt_blast_services;
314 	vector<string> missing_names;
315 	vector< CRef<objects::CBlast4_database_info> > all_db_info =
316 	    rmt_blast_services.GetDatabaseInfo(blastdb_names,is_protein,&found_all,&missing_names);
317 	if( !missing_names.empty() ){
318 	    string msg("'");
319 	    for(size_t ndx=0 ; ndx < missing_names.size(); ndx++){
320 		msg += missing_names[ndx];
321 	    }
322 	    msg += string("' not found on NCBI servers.\n");
323 	    NCBI_THROW(CSeqDBException, eFileErr, msg);
324 	}
325 	for(size_t ndx=0 ; ndx < all_db_info.size(); ndx++){
326 	    CAlignFormatUtil::SDbInfo info;
327 	    objects::CBlast4_database_info &dbinfo = *all_db_info[ndx];
328 	    info.name = dbinfo.GetDatabase().GetName();
329 	    info.definition = dbinfo.GetDescription();
330 	    if (info.definition.empty())
331 		info.definition = info.name;
332 	    CTimeFormat tf("b d, Y H:m P", CTimeFormat::fFormat_Simple);
333 	    info.date = CTime(dbinfo.GetLast_updated()).AsString(tf);
334 	    info.total_length = dbinfo.GetTotal_length();
335 	    info.number_seqs = static_cast<int>(dbinfo.GetNum_sequences());
336             if (info.total_length < 0) {
337 		const string kDbName = NStr::TruncateSpaces(info.name);
338                 if( ! s_FillDbInfoLocally(kDbName, info, dbfilt_algorithm) ){
339 		    string msg("'");
340 		    msg += kDbName;
341 		    msg += string("' has bad total length on NCBI servers.\n");
342 		    NCBI_THROW(CSeqDBException, eFileErr, msg);
343 		}
344             }
345             retval.push_back(info);
346 	}
347 	return;
348     }
349     else{
350     vector<CTempString> dbs;
351     SeqDB_SplitQuoted(blastdb_names, dbs, true);
352     retval.reserve(dbs.size());
353 
354     ITERATE(vector<CTempString>, i, dbs) {
355         CAlignFormatUtil::SDbInfo info;
356         info.is_protein = is_protein;
357         bool success = false;
358 	// Unsafe OK as kDbName only used in this loop.
359         const string kDbName = NStr::TruncateSpaces_Unsafe(*i);
360         if (kDbName.empty())
361             continue;
362 
363         success = s_FillDbInfoLocally(kDbName, info, dbfilt_algorithm);
364 
365         if (success) {
366             retval.push_back(info);
367         } else {
368             string msg("'");
369             msg += kDbName;
370             if (is_remote)
371                 msg += string("' not found on NCBI servers.\n");
372             else
373                 msg += string("' not found.\n");
374             NCBI_THROW(CSeqDBException, eFileErr, msg);
375         }
376     }
377     }
378 }
379 
PrintDbReport(const vector<SDbInfo> & dbinfo_list,size_t line_length,CNcbiOstream & out,bool top)380 void CAlignFormatUtil::PrintDbReport(const vector<SDbInfo>& dbinfo_list,
381                                      size_t line_length,
382                                      CNcbiOstream& out,
383                                      bool top)
384 {
385     if (top) {
386         const CAlignFormatUtil::SDbInfo* dbinfo = &(dbinfo_list.front());
387         out << "Database: ";
388 
389         string db_titles = dbinfo->definition;
390         Int8 tot_num_seqs = static_cast<Int8>(dbinfo->number_seqs);
391         Int8 tot_length = dbinfo->total_length;
392 
393         for (size_t i = 1; i < dbinfo_list.size(); i++) {
394             db_titles += "; " + dbinfo_list[i].definition;
395             tot_num_seqs += static_cast<Int8>(dbinfo_list[i].number_seqs);
396             tot_length += dbinfo_list[i].total_length;
397         }
398 
399         x_WrapOutputLine(db_titles, line_length, out);
400         if ( !dbinfo->filt_algorithm_name.empty() ) {
401             out << "Masked using: '" << dbinfo->filt_algorithm_name << "'";
402             if ( !dbinfo->filt_algorithm_options.empty() ) {
403                 out << ", options: '" << dbinfo->filt_algorithm_options << "'";
404             }
405             out << endl;
406         }
407         CAlignFormatUtil::AddSpace(out, 11);
408         out << NStr::Int8ToString(tot_num_seqs, NStr::fWithCommas) <<
409             " sequences; " <<
410             NStr::Int8ToString(tot_length, NStr::fWithCommas) <<
411             " total letters\n\n";
412         return;
413     }
414 
415     ITERATE(vector<SDbInfo>, dbinfo, dbinfo_list) {
416         if (dbinfo->subset == false) {
417             out << "  Database: ";
418             x_WrapOutputLine(dbinfo->definition, line_length, out);
419 
420             if ( !dbinfo->filt_algorithm_name.empty() ) {
421                 out << "  Masked using: '" << dbinfo->filt_algorithm_name << "'";
422                 if ( !dbinfo->filt_algorithm_options.empty() ) {
423                     out << ", options: '" << dbinfo->filt_algorithm_options << "'";
424                 }
425                 out << endl;
426             }
427 
428             out << "    Posted date:  ";
429             out << dbinfo->date << "\n";
430 
431             out << "  Number of letters in database: ";
432             out << NStr::Int8ToString(dbinfo->total_length,
433                                       NStr::fWithCommas) << "\n";
434             out << "  Number of sequences in database:  ";
435             out << NStr::IntToString(dbinfo->number_seqs,
436                                      NStr::fWithCommas) << "\n";
437 
438         } else {
439             out << "  Subset of the database(s) listed below" << "\n";
440             out << "  Number of letters searched: ";
441             out << NStr::Int8ToString(dbinfo->total_length,
442                                       NStr::fWithCommas) << "\n";
443             out << "  Number of sequences searched:  ";
444             out << NStr::IntToString(dbinfo->number_seqs,
445                                      NStr::fWithCommas) << "\n";
446         }
447         out << "\n";
448     }
449 
450 }
451 
PrintKAParameters(double lambda,double k,double h,size_t line_len,CNcbiOstream & out,bool gapped,const Blast_GumbelBlk * gbp)452 void CAlignFormatUtil::PrintKAParameters(double lambda, double k, double h,
453                                          size_t line_len,
454                                          CNcbiOstream& out, bool gapped,
455                                          const Blast_GumbelBlk *gbp)
456 {
457 
458     char buffer[256];
459     if (gapped) {
460         out << "Gapped" << "\n";
461     }
462     out << "Lambda      K        H";
463     if (gbp) {
464         if (gapped) {
465             out << "        a         alpha    sigma";
466         } else {
467             out << "        a         alpha";
468         }
469     }
470     out << "\n";
471     sprintf(buffer, "%#8.3g ", lambda);
472     out << buffer;
473     sprintf(buffer, "%#8.3g ", k);
474     out << buffer;
475     sprintf(buffer, "%#8.3g ", h);
476     out << buffer;
477     if (gbp) {
478         if (gapped) {
479             sprintf(buffer, "%#8.3g ", gbp->a);
480             out << buffer;
481             sprintf(buffer, "%#8.3g ", gbp->Alpha);
482             out << buffer;
483             sprintf(buffer, "%#8.3g ", gbp->Sigma);
484             out << buffer;
485         } else {
486             sprintf(buffer, "%#8.3g ", gbp->a_un);
487             out << buffer;
488             sprintf(buffer, "%#8.3g ", gbp->Alpha_un);
489             out << buffer;
490         }
491         //x_WrapOutputLine(buffer, line_len, out);
492     }
493     out << "\n";
494 }
495 
496 string
GetSeqIdString(const CBioseq & cbs,bool believe_local_id)497 CAlignFormatUtil::GetSeqIdString(const CBioseq& cbs, bool believe_local_id)
498 {
499     const CBioseq::TId& ids = cbs.GetId();
500     return CAlignFormatUtil::GetSeqIdString(ids, believe_local_id);
501 }
502 
503 string
GetSeqIdString(const list<CRef<CSeq_id>> & ids,bool believe_local_id)504 CAlignFormatUtil::GetSeqIdString(const list<CRef<CSeq_id> > & ids, bool believe_local_id)
505 {
506     string all_id_str = NcbiEmptyString;
507     CRef<CSeq_id> wid = FindBestChoice(ids, CSeq_id::WorstRank);
508 
509     if (wid && (wid->Which()!= CSeq_id::e_Local || believe_local_id)){
510         TGi gi = FindGi(ids);
511 
512         bool use_long_seqids = false;
513         CNcbiApplication* app = CNcbiApplication::Instance();
514         if (app) {
515             const CNcbiRegistry& registry = app->GetConfig();
516             use_long_seqids = (registry.Get("BLAST", "LONG_SEQID") == "1");
517         }
518         if (!use_long_seqids) {
519 
520             all_id_str = GetBareId(*wid);
521         }
522         else if (strncmp(wid->AsFastaString().c_str(), "lcl|", 4) == 0) {
523             if(gi == ZERO_GI){
524                 all_id_str =  wid->AsFastaString().substr(4);
525             } else {
526                 all_id_str = "gi|" + NStr::NumericToString(gi) +
527                     "|" + wid->AsFastaString().substr(4);
528             }
529         } else {
530             if(gi == ZERO_GI){
531                 all_id_str = wid->AsFastaString();
532             } else {
533                 all_id_str = "gi|" + NStr::NumericToString(gi) + "|" +
534                     wid->AsFastaString();
535             }
536         }
537     }
538 
539     return all_id_str;
540 }
541 
542 string
GetSeqDescrString(const CBioseq & cbs)543 CAlignFormatUtil::GetSeqDescrString(const CBioseq& cbs)
544 {
545     string all_descr_str = NcbiEmptyString;
546 
547     if (cbs.IsSetDescr()) {
548         const CBioseq::TDescr& descr = cbs.GetDescr();
549         const CBioseq::TDescr::Tdata& data = descr.Get();
550         ITERATE(CBioseq::TDescr::Tdata, iter, data) {
551             if((*iter)->IsTitle()) {
552                 all_descr_str += (*iter)->GetTitle();
553             }
554         }
555     }
556     return all_descr_str;
557 }
558 
AcknowledgeBlastQuery(const CBioseq & cbs,size_t line_len,CNcbiOstream & out,bool believe_query,bool html,bool tabular,const string & rid)559 void CAlignFormatUtil::AcknowledgeBlastQuery(const CBioseq& cbs,
560                                              size_t line_len,
561                                              CNcbiOstream& out,
562                                              bool believe_query,
563                                              bool html,
564                                              bool tabular /* = false */,
565                                              const string& rid /* = kEmptyStr*/)
566 {
567     const string label("Query");
568     CAlignFormatUtil::x_AcknowledgeBlastSequence(cbs, line_len, out,
569                                                  believe_query, html,
570                                                  label, tabular, rid);
571 }
572 
573 void
AcknowledgeBlastSubject(const CBioseq & cbs,size_t line_len,CNcbiOstream & out,bool believe_query,bool html,bool tabular)574 CAlignFormatUtil::AcknowledgeBlastSubject(const CBioseq& cbs,
575                                           size_t line_len,
576                                           CNcbiOstream& out,
577                                           bool believe_query,
578                                           bool html,
579                                           bool tabular /* = false */)
580 {
581     const string label("Subject");
582     CAlignFormatUtil::x_AcknowledgeBlastSequence(cbs, line_len, out,
583                                                  believe_query, html,
584                                                  label, tabular, kEmptyStr);
585 }
586 
587 void
x_AcknowledgeBlastSequence(const CBioseq & cbs,size_t line_len,CNcbiOstream & out,bool believe_query,bool html,const string & label,bool tabular,const string & rid)588 CAlignFormatUtil::x_AcknowledgeBlastSequence(const CBioseq& cbs,
589                                              size_t line_len,
590                                              CNcbiOstream& out,
591                                              bool believe_query,
592                                              bool html,
593                                              const string& label,
594                                              bool tabular /* = false */,
595                                              const string& rid /* = kEmptyStr*/)
596 {
597 
598     if (html) {
599         out << "<b>" << label << "=</b> ";
600     } else if (tabular) {
601         out << "# " << label << ": ";
602     } else {
603         out << label << "= ";
604     }
605 
606     string all_id_str = GetSeqIdString(cbs, believe_query);
607     all_id_str += " ";
608     all_id_str = NStr::TruncateSpaces(all_id_str + GetSeqDescrString(cbs));
609 
610     // For tabular output, there is no limit on the line length.
611     // There is also no extra line with the sequence length.
612     if (tabular) {
613         out << all_id_str;
614     } else {
615         x_WrapOutputLine(all_id_str, line_len, out, html);
616         if(cbs.IsSetInst() && cbs.GetInst().CanGetLength()){
617             out << "\nLength=";
618             out << cbs.GetInst().GetLength() <<"\n";
619         }
620     }
621 
622     if (rid != kEmptyStr) {
623         if (tabular) {
624             out << "\n" << "# RID: " << rid;
625         } else {
626             out << "\n" << "RID: " << rid << "\n";
627         }
628     }
629 }
630 
PrintPhiInfo(int num_patterns,const string & pattern,double prob,vector<int> & offsets,CNcbiOstream & out)631 void CAlignFormatUtil::PrintPhiInfo(int num_patterns,
632                                     const string& pattern,
633                                     double prob,
634                                     vector<int>& offsets,
635                                     CNcbiOstream& out)
636 {
637     out << num_patterns << " occurrence(s) of pattern: " << "\n"
638              << pattern << " at position(s) ";
639 
640     bool first = true;
641     for (vector<int>::iterator it = offsets.begin();
642       it != offsets.end(); it++)
643     {
644            if (!first)
645              out << ", ";
646 
647            out << 1 + *it ;
648 
649            first = false;
650     }
651     out << " of query sequence" << "\n";
652     out << "pattern probability=" << prob << "\n";
653 
654 }
655 
GetAlnScores(const CSeq_align & aln,int & score,double & bits,double & evalue,int & sum_n,int & num_ident,list<TGi> & use_this_gi)656 void CAlignFormatUtil::GetAlnScores(const CSeq_align& aln,
657                                     int& score,
658                                     double& bits,
659                                     double& evalue,
660                                     int& sum_n,
661                                     int& num_ident,
662                                     list<TGi>& use_this_gi)
663 {
664     int comp_adj_method = 0; // dummy variable
665 
666     CAlignFormatUtil::GetAlnScores(aln, score, bits, evalue, sum_n,
667                                  num_ident, use_this_gi, comp_adj_method);
668 }
669 
GetAlnScores(const CSeq_align & aln,int & score,double & bits,double & evalue,int & sum_n,int & num_ident,list<string> & use_this_seq)670 void CAlignFormatUtil::GetAlnScores(const CSeq_align& aln,
671                                     int& score,
672                                     double& bits,
673                                     double& evalue,
674                                     int& sum_n,
675                                     int& num_ident,
676                                     list<string>& use_this_seq)
677 {
678     int comp_adj_method = 0; // dummy variable
679 
680     CAlignFormatUtil::GetAlnScores(aln, score, bits, evalue, sum_n,
681                                  num_ident, use_this_seq, comp_adj_method);
682 }
683 
684 
GetAlnScores(const CSeq_align & aln,int & score,double & bits,double & evalue,int & sum_n,int & num_ident,list<TGi> & use_this_gi,int & comp_adj_method)685 void CAlignFormatUtil::GetAlnScores(const CSeq_align& aln,
686                                     int& score,
687                                     double& bits,
688                                     double& evalue,
689                                     int& sum_n,
690                                     int& num_ident,
691                                     list<TGi>& use_this_gi,
692                                     int& comp_adj_method)
693 {
694     bool hasScore = false;
695     score = -1;
696     bits = -1;
697     evalue = -1;
698     sum_n = -1;
699     num_ident = -1;
700     comp_adj_method = 0;
701 
702     //look for scores at seqalign level first
703     hasScore = s_GetBlastScore(aln.GetScore(), score, bits, evalue,
704                                sum_n, num_ident, use_this_gi, comp_adj_method);
705 
706     //look at the seg level
707     if(!hasScore){
708         const CSeq_align::TSegs& seg = aln.GetSegs();
709         if(seg.Which() == CSeq_align::C_Segs::e_Std){
710             s_GetBlastScore(seg.GetStd().front()->GetScores(),
711                             score, bits, evalue, sum_n, num_ident, use_this_gi, comp_adj_method);
712         } else if (seg.Which() == CSeq_align::C_Segs::e_Dendiag){
713             s_GetBlastScore(seg.GetDendiag().front()->GetScores(),
714                             score, bits, evalue, sum_n, num_ident, use_this_gi, comp_adj_method);
715         }  else if (seg.Which() == CSeq_align::C_Segs::e_Denseg){
716             s_GetBlastScore(seg.GetDenseg().GetScores(),
717                             score, bits, evalue, sum_n, num_ident, use_this_gi, comp_adj_method);
718         }
719     }
720     if(use_this_gi.size() == 0) {
721         GetUseThisSequence(aln,use_this_gi);
722     }
723 }
724 
725 //converts gi list to the list of gi:XXXXXXXX strings
s_NumGiToStringGiList(list<TGi> use_this_gi)726 static list<string> s_NumGiToStringGiList(list<TGi> use_this_gi)//for backward compatability
727 {
728     const string k_GiPrefix = "gi:";
729     list<string> use_this_seq;
730     ITERATE(list<TGi>, iter_gi, use_this_gi){
731         string strSeq = k_GiPrefix + NStr::NumericToString(*iter_gi);
732         use_this_seq.push_back(strSeq);
733     }
734     return use_this_seq;
735 }
736 
GetAlnScores(const CSeq_align & aln,int & score,double & bits,double & evalue,int & sum_n,int & num_ident,list<string> & use_this_seq,int & comp_adj_method)737 void CAlignFormatUtil::GetAlnScores(const CSeq_align& aln,
738                                     int& score,
739                                     double& bits,
740                                     double& evalue,
741                                     int& sum_n,
742                                     int& num_ident,
743                                     list<string>& use_this_seq,
744                                     int& comp_adj_method)
745 {
746     bool hasScore = false;
747     score = -1;
748     bits = -1;
749     evalue = -1;
750     sum_n = -1;
751     num_ident = -1;
752     comp_adj_method = 0;
753 
754     list<TGi> use_this_gi;
755     //look for scores at seqalign level first
756     hasScore = s_GetBlastScore(aln.GetScore(), score, bits, evalue,
757                                sum_n, num_ident, use_this_gi, comp_adj_method);
758 
759     //look at the seg level
760     if(!hasScore){
761         const CSeq_align::TSegs& seg = aln.GetSegs();
762         if(seg.Which() == CSeq_align::C_Segs::e_Std){
763             s_GetBlastScore(seg.GetStd().front()->GetScores(),
764                             score, bits, evalue, sum_n, num_ident, use_this_gi, comp_adj_method);
765         } else if (seg.Which() == CSeq_align::C_Segs::e_Dendiag){
766             s_GetBlastScore(seg.GetDendiag().front()->GetScores(),
767                             score, bits, evalue, sum_n, num_ident, use_this_gi, comp_adj_method);
768         }  else if (seg.Which() == CSeq_align::C_Segs::e_Denseg){
769             s_GetBlastScore(seg.GetDenseg().GetScores(),
770                             score, bits, evalue, sum_n, num_ident, use_this_gi, comp_adj_method);
771         }
772     }
773     if(use_this_gi.size() == 0) {
774         GetUseThisSequence(aln,use_this_seq);
775     }
776     else {
777         use_this_seq = s_NumGiToStringGiList(use_this_gi);//for backward compatability
778     }
779 }
780 
GetGnlID(const CDbtag & dtg)781 string CAlignFormatUtil::GetGnlID(const CDbtag& dtg)
782 {
783    string retval = NcbiEmptyString;
784 
785    if(dtg.GetTag().IsId())
786      retval = NStr::IntToString(dtg.GetTag().GetId());
787    else
788      retval = dtg.GetTag().GetStr();
789 
790    return retval;
791 }
792 
GetLabel(CConstRef<CSeq_id> id,bool with_version)793 string CAlignFormatUtil::GetLabel(CConstRef<CSeq_id> id,bool with_version)
794 {
795     string retval = "";
796     if (id->Which() == CSeq_id::e_General){
797         const CDbtag& dtg = id->GetGeneral();
798         retval = CAlignFormatUtil::GetGnlID(dtg);
799     }
800     if (retval == "")
801       retval = id->GetSeqIdString(with_version);
802 
803     return retval;
804 }
805 
AddSpace(CNcbiOstream & out,int number)806 void CAlignFormatUtil::AddSpace(CNcbiOstream& out, int number)
807 
808 {
809     for(int i=0; i<number; i++){
810         out<<" ";
811     }
812 
813 }
814 
GetScoreString(double evalue,double bit_score,double total_bit_score,int raw_score,string & evalue_str,string & bit_score_str,string & total_bit_score_str,string & raw_score_str)815 void CAlignFormatUtil::GetScoreString(double evalue,
816                                       double bit_score,
817                                       double total_bit_score,
818                                       int raw_score,
819                                       string& evalue_str,
820                                       string& bit_score_str,
821                                       string& total_bit_score_str,
822                                       string& raw_score_str)
823 {
824     char evalue_buf[100], bit_score_buf[100], total_bit_score_buf[100];
825 
826     /* Facilitates comparing formatted output using diff */
827     static string kBitScoreFormat("%4.1lf");
828 #ifdef CTOOLKIT_COMPATIBLE
829     static bool ctoolkit_compatible = false;
830     static bool value_set = false;
831     if ( !value_set ) {
832         if (getenv("CTOOLKIT_COMPATIBLE")) {
833             kBitScoreFormat.assign("%4.0lf");
834             ctoolkit_compatible = true;
835         }
836         value_set = true;
837     }
838 #endif /* CTOOLKIT_COMPATIBLE */
839 
840     if (evalue < 1.0e-180) {
841         snprintf(evalue_buf, sizeof(evalue_buf), "0.0");
842     } else if (evalue < 1.0e-99) {
843         snprintf(evalue_buf, sizeof(evalue_buf), "%2.0le", evalue);
844 #ifdef CTOOLKIT_COMPATIBLE
845         if (ctoolkit_compatible) {
846             strncpy(evalue_buf, evalue_buf+1, sizeof(evalue_buf-1));
847         }
848 #endif /* CTOOLKIT_COMPATIBLE */
849     } else if (evalue < 0.0009) {
850         snprintf(evalue_buf, sizeof(evalue_buf), "%3.0le", evalue);
851     } else if (evalue < 0.1) {
852         snprintf(evalue_buf, sizeof(evalue_buf), "%4.3lf", evalue);
853     } else if (evalue < 1.0) {
854         snprintf(evalue_buf, sizeof(evalue_buf), "%3.2lf", evalue);
855     } else if (evalue < 10.0) {
856         snprintf(evalue_buf, sizeof(evalue_buf), "%2.1lf", evalue);
857     } else {
858         snprintf(evalue_buf, sizeof(evalue_buf), "%2.0lf", evalue);
859     }
860 
861     if (bit_score > 99999){
862         snprintf(bit_score_buf, sizeof(bit_score_buf), "%5.3le", bit_score);
863     } else if (bit_score > 99.9){
864         snprintf(bit_score_buf, sizeof(bit_score_buf), "%3.0ld",
865             (long)bit_score);
866     } else {
867         snprintf(bit_score_buf, sizeof(bit_score_buf), kBitScoreFormat.c_str(),
868             bit_score);
869     }
870     if (total_bit_score > 99999){
871         snprintf(total_bit_score_buf, sizeof(total_bit_score_buf), "%5.3le",
872             total_bit_score);
873     } else if (total_bit_score > 99.9){
874         snprintf(total_bit_score_buf, sizeof(total_bit_score_buf), "%3.0ld",
875             (long)total_bit_score);
876     } else {
877         snprintf(total_bit_score_buf, sizeof(total_bit_score_buf), "%2.1lf",
878             total_bit_score);
879     }
880     evalue_str = evalue_buf;
881     bit_score_str = bit_score_buf;
882     total_bit_score_str = total_bit_score_buf;
883     if (raw_score <= 0)
884       raw_score = -1;
885     NStr::IntToString(raw_score_str, raw_score);
886 }
887 
888 
PruneSeqalign(const CSeq_align_set & source_aln,CSeq_align_set & new_aln,unsigned int number)889 void CAlignFormatUtil::PruneSeqalign(const CSeq_align_set& source_aln,
890                                      CSeq_align_set& new_aln,
891                                      unsigned int number)
892 {
893     CConstRef<CSeq_id> previous_id, subid;
894     bool is_first_aln = true;
895     unsigned int num_align = 0;
896     ITERATE(CSeq_align_set::Tdata, iter, source_aln.Get()){
897 
898         if ((*iter)->GetSegs().IsDisc()) {
899             ++num_align;
900         } else {
901             subid = &((*iter)->GetSeq_id(1));
902             if(is_first_aln || (!is_first_aln && !subid->Match(*previous_id))){
903                 ++num_align;
904             }
905 
906             if(num_align > number) {
907                  break;
908             }
909 
910             is_first_aln = false;
911             previous_id = subid;
912         }
913         new_aln.Set().push_back(*iter);
914     }
915 }
916 
917 
GetSubjectsNumber(const CSeq_align_set & source_aln,unsigned int number)918 unsigned int CAlignFormatUtil::GetSubjectsNumber(const CSeq_align_set& source_aln,
919                                              unsigned int number)
920 {
921     CConstRef<CSeq_id> previous_id, subid;
922     bool is_first_aln = true;
923     unsigned int num_align = 0;
924     ITERATE(CSeq_align_set::Tdata, iter, source_aln.Get()){
925 
926         if ((*iter)->GetSegs().IsDisc()) {
927             ++num_align;
928         } else {
929             subid = &((*iter)->GetSeq_id(1));
930             if(is_first_aln || (!is_first_aln && !subid->Match(*previous_id))){
931                 ++num_align;
932             }
933 
934             if(num_align >= number) {
935                  break;
936             }
937 
938             is_first_aln = false;
939             previous_id = subid;
940         }
941     }
942     return num_align;
943 }
944 
945 
PruneSeqalignAll(const CSeq_align_set & source_aln,CSeq_align_set & new_aln,unsigned int number)946 void CAlignFormatUtil::PruneSeqalignAll(const CSeq_align_set& source_aln,
947                                      CSeq_align_set& new_aln,
948                                      unsigned int number)
949 {
950     CConstRef<CSeq_id> previous_id, subid;
951     bool is_first_aln = true;
952     unsigned int num_align = 0;
953     bool finishCurrent = false;
954     ITERATE(CSeq_align_set::Tdata, iter, source_aln.Get()){
955         if ((*iter)->GetSegs().IsDisc()) {
956             ++num_align;
957         } else {
958             subid = &((*iter)->GetSeq_id(1));
959             if(is_first_aln || (!is_first_aln && !subid->Match(*previous_id))){
960                 finishCurrent = (num_align + 1 == number) ? true : false;
961                 ++num_align;
962             }
963             is_first_aln = false;
964             previous_id = subid;
965         }
966         if(num_align > number && !finishCurrent) {
967             break;
968         }
969         new_aln.Set().push_back(*iter);
970     }
971 }
972 
973 
974 void
GetAlignLengths(CAlnVec & salv,int & align_length,int & num_gaps,int & num_gap_opens)975 CAlignFormatUtil::GetAlignLengths(CAlnVec& salv, int& align_length,
976                                   int& num_gaps, int& num_gap_opens)
977 {
978     num_gaps = num_gap_opens = align_length = 0;
979 
980     for (int row = 0; row < salv.GetNumRows(); row++) {
981         CRef<CAlnMap::CAlnChunkVec> chunk_vec
982             = salv.GetAlnChunks(row, salv.GetSeqAlnRange(0));
983         for (int i=0; i<chunk_vec->size(); i++) {
984             CConstRef<CAlnMap::CAlnChunk> chunk = (*chunk_vec)[i];
985             int chunk_length = chunk->GetAlnRange().GetLength();
986             // Gaps are counted on all rows: gap can only be in one of the rows
987             // for any given segment.
988             if (chunk->IsGap()) {
989                 ++num_gap_opens;
990                 num_gaps += chunk_length;
991             }
992             // To calculate alignment length, only one row is needed.
993             if (row == 0)
994                 align_length += chunk_length;
995         }
996     }
997 }
998 
999 void
ExtractSeqalignSetFromDiscSegs(CSeq_align_set & target,const CSeq_align_set & source)1000 CAlignFormatUtil::ExtractSeqalignSetFromDiscSegs(CSeq_align_set& target,
1001                                                  const CSeq_align_set& source)
1002 {
1003     if (source.IsSet() && source.CanGet()) {
1004 
1005         for(CSeq_align_set::Tdata::const_iterator iter = source.Get().begin();
1006             iter != source.Get().end(); iter++) {
1007             if((*iter)->IsSetSegs()){
1008                 const CSeq_align::TSegs& seg = (*iter)->GetSegs();
1009                 if(seg.IsDisc()){
1010                     const CSeq_align_set& set = seg.GetDisc();
1011                     for(CSeq_align_set::Tdata::const_iterator iter2 =
1012                             set.Get().begin(); iter2 != set.Get().end();
1013                         iter2 ++) {
1014                         target.Set().push_back(*iter2);
1015                     }
1016                 } else {
1017                     target.Set().push_back(*iter);
1018                 }
1019             }
1020         }
1021     }
1022 }
1023 
1024 CRef<CSeq_align>
CreateDensegFromDendiag(const CSeq_align & aln)1025 CAlignFormatUtil::CreateDensegFromDendiag(const CSeq_align& aln)
1026 {
1027     CRef<CSeq_align> sa(new CSeq_align);
1028     if ( !aln.GetSegs().IsDendiag()) {
1029         NCBI_THROW(CException, eUnknown, "Input Seq-align should be Dendiag!");
1030     }
1031 
1032     if(aln.IsSetType()){
1033         sa->SetType(aln.GetType());
1034     }
1035     if(aln.IsSetDim()){
1036         sa->SetDim(aln.GetDim());
1037     }
1038     if(aln.IsSetScore()){
1039         sa->SetScore() = aln.GetScore();
1040     }
1041     if(aln.IsSetBounds()){
1042         sa->SetBounds() = aln.GetBounds();
1043     }
1044 
1045     CDense_seg& ds = sa->SetSegs().SetDenseg();
1046 
1047     int counter = 0;
1048     ds.SetNumseg() = 0;
1049     ITERATE (CSeq_align::C_Segs::TDendiag, iter, aln.GetSegs().GetDendiag()){
1050 
1051         if(counter == 0){//assume all dendiag segments have same dim and ids
1052             if((*iter)->IsSetDim()){
1053                 ds.SetDim((*iter)->GetDim());
1054             }
1055             if((*iter)->IsSetIds()){
1056                 ds.SetIds() = (*iter)->GetIds();
1057             }
1058         }
1059         ds.SetNumseg() ++;
1060         if((*iter)->IsSetStarts()){
1061             ITERATE(CDense_diag::TStarts, iterStarts, (*iter)->GetStarts()){
1062                 ds.SetStarts().push_back(*iterStarts);
1063             }
1064         }
1065         if((*iter)->IsSetLen()){
1066             ds.SetLens().push_back((*iter)->GetLen());
1067         }
1068         if((*iter)->IsSetStrands()){
1069             ITERATE(CDense_diag::TStrands, iterStrands, (*iter)->GetStrands()){
1070                 ds.SetStrands().push_back(*iterStrands);
1071             }
1072         }
1073         if((*iter)->IsSetScores()){
1074             ITERATE(CDense_diag::TScores, iterScores, (*iter)->GetScores()){
1075                 ds.SetScores().push_back(*iterScores); //this might not have
1076                                                        //right meaning
1077             }
1078         }
1079         counter ++;
1080     }
1081 
1082     return sa;
1083 }
1084 
GetTaxidForSeqid(const CSeq_id & id,CScope & scope)1085 TTaxId CAlignFormatUtil::GetTaxidForSeqid(const CSeq_id& id, CScope& scope)
1086 {
1087     TTaxId taxid = ZERO_TAX_ID;
1088     try{
1089         const CBioseq_Handle& handle = scope.GetBioseqHandle(id);
1090         const CRef<CBlast_def_line_set> bdlRef =
1091             CSeqDB::ExtractBlastDefline(handle);
1092         const list< CRef< CBlast_def_line > > &bdl = (bdlRef.Empty()) ? list< CRef< CBlast_def_line > >() : bdlRef->Get();
1093         ITERATE(list<CRef<CBlast_def_line> >, iter_bdl, bdl) {
1094             CConstRef<CSeq_id> bdl_id =
1095                 GetSeq_idByType((*iter_bdl)->GetSeqid(), id.Which());
1096             if(bdl_id && bdl_id->Match(id) &&
1097                (*iter_bdl)->IsSetTaxid() && (*iter_bdl)->CanGetTaxid()){
1098                 taxid = (*iter_bdl)->GetTaxid();
1099                 break;
1100             }
1101         }
1102     } catch (CException&) {
1103 
1104     }
1105     return taxid;
1106 }
1107 
GetFrame(int start,ENa_strand strand,const CBioseq_Handle & handle)1108 int CAlignFormatUtil::GetFrame (int start, ENa_strand strand,
1109                                 const CBioseq_Handle& handle)
1110 {
1111     int frame = 0;
1112     if (strand == eNa_strand_plus) {
1113         frame = (start % 3) + 1;
1114     } else if (strand == eNa_strand_minus) {
1115         frame = -(((int)handle.GetBioseqLength() - start - 1)
1116                   % 3 + 1);
1117 
1118     }
1119     return frame;
1120 }
1121 
1122 
1123 void CAlignFormatUtil::
SortHitByPercentIdentityDescending(list<CRef<CSeq_align_set>> & seqalign_hit_list,bool do_translation)1124 SortHitByPercentIdentityDescending(list< CRef<CSeq_align_set> >&
1125                                    seqalign_hit_list,
1126                                    bool do_translation
1127                                    )
1128 {
1129 
1130     kTranslation = do_translation;
1131     seqalign_hit_list.sort(SortHitByPercentIdentityDescendingEx);
1132 }
1133 
1134 
1135 bool CAlignFormatUtil::
SortHspByPercentIdentityDescending(const CRef<CSeq_align> & info1,const CRef<CSeq_align> & info2)1136 SortHspByPercentIdentityDescending(const CRef<CSeq_align>& info1,
1137                                    const CRef<CSeq_align>& info2)
1138 {
1139 
1140     int score1, sum_n1, num_ident1;
1141     double bits1, evalue1;
1142     list<TGi> use_this_gi1;
1143 
1144     int score2, sum_n2, num_ident2;
1145     double bits2, evalue2;
1146     list<TGi> use_this_gi2;
1147 
1148 
1149     GetAlnScores(*info1, score1,  bits1, evalue1, sum_n1, num_ident1, use_this_gi1);
1150     GetAlnScores(*info2, score2,  bits2, evalue2, sum_n2, num_ident2, use_this_gi2);
1151 
1152     int length1 = GetAlignmentLength(*info1, kTranslation);
1153     int length2 = GetAlignmentLength(*info2, kTranslation);
1154     bool retval = false;
1155 
1156 
1157     if(length1 > 0 && length2 > 0 && num_ident1 > 0 &&num_ident2 > 0 ) {
1158         if (((double)num_ident1)/length1 == ((double)num_ident2)/length2) {
1159 
1160             retval = evalue1 < evalue2;
1161 
1162         } else {
1163             retval = ((double)num_ident1)/length1 >= ((double)num_ident2)/length2;
1164 
1165         }
1166     } else {
1167         retval = evalue1 < evalue2;
1168     }
1169     return retval;
1170 }
1171 
1172 bool CAlignFormatUtil::
SortHitByScoreDescending(const CRef<CSeq_align_set> & info1,const CRef<CSeq_align_set> & info2)1173 SortHitByScoreDescending(const CRef<CSeq_align_set>& info1,
1174                          const CRef<CSeq_align_set>& info2)
1175 {
1176     CRef<CSeq_align_set> i1(info1), i2(info2);
1177 
1178     i1->Set().sort(SortHspByScoreDescending);
1179     i2->Set().sort(SortHspByScoreDescending);
1180 
1181 
1182     int score1, sum_n1, num_ident1;
1183     double bits1, evalue1;
1184     list<TGi> use_this_gi1;
1185 
1186     int score2, sum_n2, num_ident2;
1187     double bits2, evalue2;
1188     list<TGi> use_this_gi2;
1189 
1190     GetAlnScores(*(info1->Get().front()), score1,  bits1, evalue1, sum_n1, num_ident1, use_this_gi1);
1191     GetAlnScores(*(info2->Get().front()), score2,  bits2, evalue2, sum_n2, num_ident2, use_this_gi2);
1192     return bits1 > bits2;
1193 }
1194 
1195 bool CAlignFormatUtil::
SortHitByMasterCoverageDescending(CRef<CSeq_align_set> const & info1,CRef<CSeq_align_set> const & info2)1196 SortHitByMasterCoverageDescending(CRef<CSeq_align_set> const& info1,
1197                                   CRef<CSeq_align_set> const& info2)
1198 {
1199     int cov1 = GetMasterCoverage(*info1);
1200     int cov2 = GetMasterCoverage(*info2);
1201     bool retval = false;
1202 
1203     if (cov1 > cov2) {
1204         retval = cov1 > cov2;
1205     } else if (cov1 == cov2) {
1206         int score1, sum_n1, num_ident1;
1207         double bits1, evalue1;
1208         list<TGi> use_this_gi1;
1209 
1210         int score2, sum_n2, num_ident2;
1211         double bits2, evalue2;
1212         list<TGi> use_this_gi2;
1213         GetAlnScores(*(info1->Get().front()), score1,  bits1, evalue1, sum_n1, num_ident1, use_this_gi1);
1214         GetAlnScores(*(info2->Get().front()), score2,  bits2, evalue2, sum_n2, num_ident2, use_this_gi2);
1215         retval = evalue1 < evalue2;
1216     }
1217 
1218     return retval;
1219 }
1220 
SortHitByMasterStartAscending(CRef<CSeq_align_set> & info1,CRef<CSeq_align_set> & info2)1221 bool CAlignFormatUtil::SortHitByMasterStartAscending(CRef<CSeq_align_set>& info1,
1222                                                      CRef<CSeq_align_set>& info2)
1223 {
1224     int start1 = 0, start2 = 0;
1225 
1226 
1227     info1->Set().sort(SortHspByMasterStartAscending);
1228     info2->Set().sort(SortHspByMasterStartAscending);
1229 
1230 
1231     start1 = min(info1->Get().front()->GetSeqStart(0),
1232                   info1->Get().front()->GetSeqStop(0));
1233     start2 = min(info2->Get().front()->GetSeqStart(0),
1234                   info2->Get().front()->GetSeqStop(0));
1235 
1236     if (start1 == start2) {
1237         //same start then arrange by bits score
1238         int score1, sum_n1, num_ident1;
1239         double bits1, evalue1;
1240         list<TGi> use_this_gi1;
1241 
1242         int score2, sum_n2, num_ident2;
1243         double bits2, evalue2;
1244         list<TGi> use_this_gi2;
1245 
1246 
1247         GetAlnScores(*(info1->Get().front()), score1,  bits1, evalue1, sum_n1, num_ident1, use_this_gi1);
1248         GetAlnScores(*(info1->Get().front()), score2,  bits2, evalue2, sum_n2, num_ident2, use_this_gi2);
1249         return evalue1 < evalue2;
1250 
1251     } else {
1252         return start1 < start2;
1253     }
1254 
1255 }
1256 
1257 bool CAlignFormatUtil::
SortHspByScoreDescending(const CRef<CSeq_align> & info1,const CRef<CSeq_align> & info2)1258 SortHspByScoreDescending(const CRef<CSeq_align>& info1,
1259                          const CRef<CSeq_align>& info2)
1260 {
1261 
1262     int score1, sum_n1, num_ident1;
1263     double bits1, evalue1;
1264     list<TGi> use_this_gi1;
1265 
1266     int score2, sum_n2, num_ident2;
1267     double bits2, evalue2;
1268     list<TGi> use_this_gi2;
1269 
1270 
1271     GetAlnScores(*info1, score1,  bits1, evalue1, sum_n1, num_ident1, use_this_gi1);
1272     GetAlnScores(*info2, score2,  bits2, evalue2, sum_n2, num_ident2, use_this_gi2);
1273     return bits1 > bits2;
1274 
1275 }
1276 
1277 bool CAlignFormatUtil::
SortHspByMasterStartAscending(const CRef<CSeq_align> & info1,const CRef<CSeq_align> & info2)1278 SortHspByMasterStartAscending(const CRef<CSeq_align>& info1,
1279                               const CRef<CSeq_align>& info2)
1280 {
1281     int start1 = 0, start2 = 0;
1282 
1283     start1 = min(info1->GetSeqStart(0), info1->GetSeqStop(0));
1284     start2 = min(info2->GetSeqStart(0), info2->GetSeqStop(0)) ;
1285 
1286     if (start1 == start2) {
1287         //same start then arrange by bits score
1288         int score1, sum_n1, num_ident1;
1289         double bits1, evalue1;
1290         list<TGi> use_this_gi1;
1291 
1292         int score2, sum_n2, num_ident2;
1293         double bits2, evalue2;
1294         list<TGi> use_this_gi2;
1295 
1296 
1297         GetAlnScores(*info1, score1,  bits1, evalue1, sum_n1, num_ident1, use_this_gi1);
1298         GetAlnScores(*info2, score2,  bits2, evalue2, sum_n2, num_ident2, use_this_gi2);
1299         return evalue1 < evalue2;
1300 
1301     } else {
1302 
1303         return start1 < start2;
1304     }
1305 }
1306 
1307 bool CAlignFormatUtil::
SortHspBySubjectStartAscending(const CRef<CSeq_align> & info1,const CRef<CSeq_align> & info2)1308 SortHspBySubjectStartAscending(const CRef<CSeq_align>& info1,
1309                                const CRef<CSeq_align>& info2)
1310 {
1311     int start1 = 0, start2 = 0;
1312 
1313     start1 = min(info1->GetSeqStart(1), info1->GetSeqStop(1));
1314     start2 = min(info2->GetSeqStart(1), info2->GetSeqStop(1)) ;
1315 
1316     if (start1 == start2) {
1317         //same start then arrange by bits score
1318         int score1, sum_n1, num_ident1;
1319         double bits1, evalue1;
1320         list<TGi> use_this_gi1;
1321 
1322         int score2, sum_n2, num_ident2;
1323         double bits2, evalue2;
1324         list<TGi> use_this_gi2;
1325 
1326 
1327         GetAlnScores(*info1, score1,  bits1, evalue1, sum_n1, num_ident1, use_this_gi1);
1328         GetAlnScores(*info2, score2,  bits2, evalue2, sum_n2, num_ident2, use_this_gi2);
1329         return evalue1 < evalue2;
1330 
1331     } else {
1332 
1333         return start1 < start2;
1334     }
1335 }
1336 
GetAlignmentLength(const CSeq_align & aln,bool do_translation)1337 int CAlignFormatUtil::GetAlignmentLength(const CSeq_align& aln, bool do_translation)
1338 {
1339 
1340     CRef<CSeq_align> final_aln;
1341 
1342     // Convert Std-seg and Dense-diag alignments to Dense-seg.
1343     // Std-segs are produced only for translated searches; Dense-diags only for
1344     // ungapped, not translated searches.
1345 
1346     if (aln.GetSegs().IsStd()) {
1347         CRef<CSeq_align> denseg_aln = aln.CreateDensegFromStdseg();
1348         // When both query and subject are translated, i.e. tblastx, convert
1349         // to a special type of Dense-seg.
1350         if (do_translation) {
1351             final_aln = denseg_aln->CreateTranslatedDensegFromNADenseg();
1352         } else {
1353             final_aln = denseg_aln;
1354 
1355         }
1356     } else if (aln.GetSegs().IsDendiag()) {
1357         final_aln = CreateDensegFromDendiag(aln);
1358     }
1359 
1360     const CDense_seg& ds = (final_aln ? final_aln->GetSegs().GetDenseg() :
1361                             aln.GetSegs().GetDenseg());
1362 
1363     CAlnMap alnmap(ds);
1364     return alnmap.GetAlnStop() + 1;
1365 }
1366 
GetPercentIdentity(const CSeq_align & aln,CScope & scope,bool do_translation)1367 double CAlignFormatUtil::GetPercentIdentity(const CSeq_align& aln,
1368                                             CScope& scope,
1369                                             bool do_translation) {
1370     double identity = 0;
1371     CRef<CSeq_align> final_aln;
1372 
1373     // Convert Std-seg and Dense-diag alignments to Dense-seg.
1374     // Std-segs are produced only for translated searches; Dense-diags only for
1375     // ungapped, not translated searches.
1376 
1377     if (aln.GetSegs().IsStd()) {
1378         CRef<CSeq_align> denseg_aln = aln.CreateDensegFromStdseg();
1379         // When both query and subject are translated, i.e. tblastx, convert
1380         // to a special type of Dense-seg.
1381         if (do_translation) {
1382             final_aln = denseg_aln->CreateTranslatedDensegFromNADenseg();
1383         } else {
1384             final_aln = denseg_aln;
1385 
1386         }
1387     } else if (aln.GetSegs().IsDendiag()) {
1388         final_aln = CreateDensegFromDendiag(aln);
1389     }
1390 
1391     const CDense_seg& ds = (final_aln ? final_aln->GetSegs().GetDenseg() :
1392                             aln.GetSegs().GetDenseg());
1393 
1394     CAlnVec alnvec(ds, scope);
1395     string query, subject;
1396 
1397     alnvec.SetAaCoding(CSeq_data::e_Ncbieaa);
1398     alnvec.GetWholeAlnSeqString(0, query);
1399     alnvec.GetWholeAlnSeqString(1, subject);
1400 
1401     int num_ident = 0;
1402     int length = min(query.size(), subject.size());
1403 
1404     for (int i = 0; i < length; ++i) {
1405         if (query[i] == subject[i]) {
1406             ++num_ident;
1407         }
1408     }
1409 
1410     if (length > 0) {
1411         identity = ((double)num_ident)/length;
1412     }
1413 
1414     return identity;
1415 }
1416 
1417 
s_CalcAlnPercentIdent(const CRef<CSeq_align_set> & info1,const CRef<CSeq_align_set> & info2,double & percentIdent1,double & percentIdent2)1418 static void s_CalcAlnPercentIdent(const CRef<CSeq_align_set>& info1,
1419                                const CRef<CSeq_align_set>& info2,
1420                                double &percentIdent1,
1421                                double &percentIdent2)
1422 {
1423 
1424     CRef<CSeq_align_set> i1(info1), i2(info2);
1425     percentIdent1 = -1;
1426     percentIdent2 = -1;
1427 
1428     i1->Set().sort(CAlignFormatUtil::SortHspByPercentIdentityDescending);
1429     i2->Set().sort(CAlignFormatUtil::SortHspByPercentIdentityDescending);
1430 
1431     percentIdent1 = CAlignFormatUtil::GetSeqAlignSetCalcPercentIdent(*info1, kTranslation);
1432     percentIdent2 = CAlignFormatUtil::GetSeqAlignSetCalcPercentIdent(*info2, kTranslation);
1433     return;
1434 }
1435 
1436 
1437 bool CAlignFormatUtil::
SortHitByPercentIdentityDescendingEx(const CRef<CSeq_align_set> & info1,const CRef<CSeq_align_set> & info2)1438 SortHitByPercentIdentityDescendingEx(const CRef<CSeq_align_set>& info1,
1439                                      const CRef<CSeq_align_set>& info2)
1440 {
1441 
1442     CRef<CSeq_align_set> i1(info1), i2(info2);
1443 
1444     //i1->Set().sort(SortHspByPercentIdentityDescending);
1445     //i2->Set().sort(SortHspByPercentIdentityDescending);
1446 
1447 
1448     auto_ptr<CAlignFormatUtil::SSeqAlignSetCalcParams> seqSetInfo1( CAlignFormatUtil::GetSeqAlignSetCalcParamsFromASN(*info1));
1449     auto_ptr<CAlignFormatUtil::SSeqAlignSetCalcParams> seqSetInfo2( CAlignFormatUtil::GetSeqAlignSetCalcParamsFromASN(*info2));
1450     double evalue1 = seqSetInfo1->evalue;
1451     double evalue2 = seqSetInfo2->evalue;
1452     double percentIdent1 = seqSetInfo1->percent_identity;
1453     double  percentIdent2 = seqSetInfo2->percent_identity;
1454 
1455     bool retval = false;
1456     if(percentIdent1 < 0 || percentIdent2 < 0) {
1457         s_CalcAlnPercentIdent(info1, info2,percentIdent1,percentIdent2);
1458     }
1459     if(percentIdent1 > 0 &&percentIdent2 > 0) {
1460         if (percentIdent1 == percentIdent2) {
1461             retval = evalue1 < evalue2;
1462 
1463         } else {
1464             retval = percentIdent1 >= percentIdent2;
1465         }
1466     } else {
1467         retval = evalue1 < evalue2;
1468     }
1469     return retval;
1470 }
1471 
SortHitByTotalScoreDescending(CRef<CSeq_align_set> const & info1,CRef<CSeq_align_set> const & info2)1472 bool CAlignFormatUtil::SortHitByTotalScoreDescending(CRef<CSeq_align_set> const& info1,
1473                                                      CRef<CSeq_align_set> const& info2)
1474 {
1475     int score1,  score2, sum_n, num_ident;
1476     double bits, evalue;
1477     list<TGi> use_this_gi;
1478     double total_bits1 = 0, total_bits2 = 0;
1479 
1480     ITERATE(CSeq_align_set::Tdata, iter, info1->Get()) {
1481         CAlignFormatUtil::GetAlnScores(**iter, score1, bits, evalue,
1482                                        sum_n, num_ident, use_this_gi);
1483         total_bits1 += bits;
1484     }
1485 
1486     ITERATE(CSeq_align_set::Tdata, iter, info2->Get()) {
1487         CAlignFormatUtil::GetAlnScores(**iter, score2, bits, evalue,
1488                                        sum_n, num_ident, use_this_gi);
1489         total_bits2 += bits;
1490     }
1491 
1492 
1493     return total_bits1 >= total_bits2;
1494 
1495 }
1496 
1497 #ifndef NCBI_COMPILER_WORKSHOP
1498 /** Class to sort by linkout bit
1499  * @note this code doesn't compile under the Solaris' WorkShop, and because
1500  * this feature is only used inside NCBI (LinkoutDB), we disable this code.
1501  */
1502 class CSortHitByMolecularTypeEx
1503 {
1504 public:
CSortHitByMolecularTypeEx(ILinkoutDB * linkoutdb,const string & mv_build_name)1505     CSortHitByMolecularTypeEx(ILinkoutDB* linkoutdb,
1506                               const string& mv_build_name)
1507         : m_LinkoutDB(linkoutdb), m_MapViewerBuildName(mv_build_name) {}
1508 
operator ()(const CRef<CSeq_align_set> & info1,const CRef<CSeq_align_set> & info2)1509     bool operator() (const CRef<CSeq_align_set>& info1, const CRef<CSeq_align_set>& info2)
1510     {
1511         CConstRef<CSeq_id> id1, id2;
1512         id1 = &(info1->Get().front()->GetSeq_id(1));
1513         id2 = &(info2->Get().front()->GetSeq_id(1));
1514 
1515         int linkout1 = 0, linkout2 = 0;
1516         linkout1 = m_LinkoutDB
1517             ? m_LinkoutDB->GetLinkout(*id1, m_MapViewerBuildName)
1518             : 0;
1519         linkout2 = m_LinkoutDB
1520             ? m_LinkoutDB->GetLinkout(*id2, m_MapViewerBuildName)
1521             : 0;
1522 
1523         return (linkout1 & eGenomicSeq) <= (linkout2 & eGenomicSeq);
1524     }
1525 private:
1526     ILinkoutDB* m_LinkoutDB;
1527     string m_MapViewerBuildName;
1528 };
1529 #endif /* NCBI_COMPILER_WORKSHOP */
1530 
1531 void CAlignFormatUtil::
SortHitByMolecularType(list<CRef<CSeq_align_set>> & seqalign_hit_list,CScope & scope,ILinkoutDB * linkoutdb,const string & mv_build_name)1532 SortHitByMolecularType(list< CRef<CSeq_align_set> >& seqalign_hit_list,
1533                        CScope& scope, ILinkoutDB* linkoutdb,
1534                        const string& mv_build_name)
1535 {
1536 
1537     kScope = &scope;
1538 #ifndef NCBI_COMPILER_WORKSHOP
1539     seqalign_hit_list.sort(CSortHitByMolecularTypeEx(linkoutdb, mv_build_name));
1540 #endif /* NCBI_COMPILER_WORKSHOP */
1541 }
1542 
SortHit(list<CRef<CSeq_align_set>> & seqalign_hit_list,bool do_translation,CScope & scope,int sort_method,ILinkoutDB * linkoutdb,const string & mv_build_name)1543 void CAlignFormatUtil::SortHit(list< CRef<CSeq_align_set> >& seqalign_hit_list,
1544                                bool do_translation, CScope& scope, int
1545                                sort_method, ILinkoutDB* linkoutdb,
1546                                const string& mv_build_name)
1547 {
1548     kScope = &scope;
1549     kTranslation = do_translation;
1550 
1551     if (sort_method == 1) {
1552 #ifndef NCBI_COMPILER_WORKSHOP
1553         seqalign_hit_list.sort(CSortHitByMolecularTypeEx(linkoutdb,
1554                                                          mv_build_name));
1555 #endif /* NCBI_COMPILER_WORKSHOP */
1556     } else if (sort_method == 2) {
1557         seqalign_hit_list.sort(SortHitByTotalScoreDescending);
1558     } else if (sort_method == 3) {
1559         seqalign_hit_list.sort(SortHitByPercentIdentityDescendingEx);
1560     }
1561 }
1562 
1563 void CAlignFormatUtil::
SplitSeqalignByMolecularType(vector<CRef<CSeq_align_set>> & target,int sort_method,const CSeq_align_set & source,CScope & scope,ILinkoutDB * linkoutdb,const string & mv_build_name)1564 SplitSeqalignByMolecularType(vector< CRef<CSeq_align_set> >&
1565                              target,
1566                              int sort_method,
1567                              const CSeq_align_set& source,
1568                              CScope& scope,
1569                              ILinkoutDB* linkoutdb,
1570                              const string& mv_build_name)
1571 {
1572     CConstRef<CSeq_id> prevSubjectId;
1573     int count = 0;
1574     int linkoutPrev = 0;
1575     ITERATE(CSeq_align_set::Tdata, iter, source.Get()) {
1576 
1577         const CSeq_id& id = (*iter)->GetSeq_id(1);
1578         try {
1579             const CBioseq_Handle& handle = scope.GetBioseqHandle(id);
1580             if (handle) {
1581                 int linkout;
1582                 if(prevSubjectId.Empty() || !id.Match(*prevSubjectId)){
1583                     prevSubjectId = &id;
1584                     linkout = linkoutdb ? linkoutdb->GetLinkout(id, mv_build_name): 0;
1585                     linkoutPrev = linkout;
1586                     count++;
1587                 }
1588                 else {
1589                     linkout = linkoutPrev;
1590                 }
1591                 if (linkout & eGenomicSeq) {
1592                     if (sort_method == 1) {
1593                         target[1]->Set().push_back(*iter);
1594                     } else if (sort_method == 2){
1595                         target[0]->Set().push_back(*iter);
1596                     } else {
1597                         target[1]->Set().push_back(*iter);
1598                     }
1599                 } else {
1600                     if (sort_method == 1) {
1601                         target[0]->Set().push_back(*iter);
1602                     } else if (sort_method == 2) {
1603                         target[1]->Set().push_back(*iter);
1604                     }  else {
1605                         target[0]->Set().push_back(*iter);
1606                     }
1607                 }
1608             } else {
1609                 target[0]->Set().push_back(*iter);
1610             }
1611 
1612         } catch (const CException&){
1613             target[0]->Set().push_back(*iter); //no bioseq found, leave untouched
1614         }
1615     }
1616 }
1617 
HspListToHitList(list<CRef<CSeq_align_set>> & target,const CSeq_align_set & source)1618 void CAlignFormatUtil::HspListToHitList(list< CRef<CSeq_align_set> >& target,
1619                                         const CSeq_align_set& source)
1620 {
1621     CConstRef<CSeq_id> previous_id;
1622     CRef<CSeq_align_set> temp;
1623 
1624     ITERATE(CSeq_align_set::Tdata, iter, source.Get()) {
1625         const CSeq_id& cur_id = (*iter)->GetSeq_id(1);
1626         if(previous_id.Empty()) {
1627             temp =  new CSeq_align_set;
1628             temp->Set().push_back(*iter);
1629             target.push_back(temp);
1630         } else if (cur_id.Match(*previous_id)){
1631             temp->Set().push_back(*iter);
1632 
1633         } else {
1634             temp =  new CSeq_align_set;
1635             temp->Set().push_back(*iter);
1636             target.push_back(temp);
1637         }
1638         previous_id = &cur_id;
1639     }
1640 
1641 }
1642 
1643 CRef<CSeq_align_set>
HitListToHspList(list<CRef<CSeq_align_set>> & source)1644 CAlignFormatUtil::HitListToHspList(list< CRef<CSeq_align_set> >& source)
1645 {
1646     CRef<CSeq_align_set> align_set (new CSeq_align_set);
1647     CConstRef<CSeq_id> previous_id;
1648     CRef<CSeq_align_set> temp;
1649     // list<CRef<CSeq_align_set> >::iterator iter;
1650 
1651     for (list<CRef<CSeq_align_set> >::iterator iter = source.begin(); iter != source.end(); iter ++) {
1652         ITERATE(CSeq_align_set::Tdata, iter2, (*iter)->Get()) {
1653             align_set->Set().push_back(*iter2);
1654         }
1655     }
1656     return align_set;
1657 }
1658 
HspListToHitMap(vector<string> seqIdList,const CSeq_align_set & source)1659 map < string, CRef<CSeq_align_set>  >  CAlignFormatUtil::HspListToHitMap(vector <string> seqIdList,
1660                                        const CSeq_align_set& source)
1661 {
1662     CConstRef<CSeq_id> previous_id;
1663     CRef<CSeq_align_set> temp;
1664 
1665     map < string, CRef<CSeq_align_set>  > hitsMap;
1666 
1667     for(size_t i = 0; i < seqIdList.size();i++) {
1668         CRef<CSeq_align_set> new_aln(new CSeq_align_set);
1669         hitsMap.insert(map<string, CRef<CSeq_align_set> >::value_type(seqIdList[i],new_aln));
1670     }
1671     size_t count = 0;
1672     ITERATE(CSeq_align_set::Tdata, iter, source.Get()) {
1673         const CSeq_id& cur_id = (*iter)->GetSeq_id(1);
1674         if(previous_id.Empty() || !cur_id.Match(*previous_id)) {
1675             if(count >= seqIdList.size()) {
1676                 break;
1677             }
1678             string idString = NStr::TruncateSpaces(cur_id.AsFastaString());
1679             if(hitsMap.find(idString) != hitsMap.end()) {
1680                 temp =  new CSeq_align_set;
1681                 temp->Set().push_back(*iter);
1682                 hitsMap[idString] = temp;
1683                 count++;
1684             }
1685             else {
1686                 temp.Reset();
1687             }
1688         }
1689         else if (cur_id.Match(*previous_id)){
1690             if(!temp.Empty()) {
1691                 temp->Set().push_back(*iter);
1692             }
1693         }
1694         previous_id = &cur_id;
1695     }
1696     return hitsMap;
1697 }
1698 
ExtractSeqAlignForSeqList(CRef<CSeq_align_set> & all_aln_set,string alignSeqList)1699 void CAlignFormatUtil::ExtractSeqAlignForSeqList(CRef<CSeq_align_set> &all_aln_set, string alignSeqList)
1700 {
1701     vector <string> seqIds;
1702     NStr::Split(alignSeqList,",",seqIds);
1703 
1704     //SEQ_ALN_SET from ALIGNDB contains seq_aligns in random order
1705     //The followimg will create a map that contains seq-aln_set per gi from ALIGN_SEQ_LIST
1706     map < string, CRef<CSeq_align_set>  > hitsMap = CAlignFormatUtil::HspListToHitMap(seqIds,*all_aln_set) ;
1707 
1708     map < string, CRef<CSeq_align_set>  >::iterator it;
1709     list< CRef<CSeq_align_set> > orderedSet;
1710     //orderedSet wil have seq aligns in th order of gi list
1711     for(size_t i = 0; i < seqIds.size(); i++) {
1712         if(hitsMap.find(seqIds[i]) != hitsMap.end()) {
1713             orderedSet.push_back(hitsMap[seqIds[i]]);
1714         }
1715     }
1716     //This should contain seq align set in the order of gis in the list
1717     all_aln_set = CAlignFormatUtil::HitListToHspList(orderedSet);
1718 }
1719 
s_GetSRASeqMetadata(const CBioseq::TId & ids,string & strRun,string & strSpotId,string & strReadIndex)1720 static bool s_GetSRASeqMetadata(const CBioseq::TId& ids,string &strRun, string &strSpotId,string &strReadIndex)
1721 {
1722     bool success = false;
1723     string link = NcbiEmptyString;
1724     CConstRef<CSeq_id> seqId = GetSeq_idByType(ids, CSeq_id::e_General);
1725 
1726     if (!seqId.Empty())
1727     {
1728         // Get the SRA tag from seqId
1729         if (seqId->GetGeneral().CanGetDb() &&
1730             seqId->GetGeneral().CanGetTag() &&
1731             seqId->GetGeneral().GetTag().IsStr())
1732         {
1733             // Decode the tag to collect the SRA-specific indices
1734             string strTag = seqId->GetGeneral().GetTag().GetStr();
1735             if (!strTag.empty())
1736             {
1737                 vector<string> vecInfo;
1738                 try
1739                 {
1740                     NStr::Split(strTag, ".", vecInfo);
1741                 }
1742                 catch (...)
1743                 {
1744                     return false;
1745                 }
1746 
1747                 if (vecInfo.size() != 3)
1748                 {
1749                     return false;
1750                 }
1751 
1752                 strRun = vecInfo[0];
1753                 strSpotId = vecInfo[1];
1754                 strReadIndex = vecInfo[2];
1755                 success = true;
1756             }
1757         }
1758     }
1759     return success;
1760 }
1761 
BuildSRAUrl(const CBioseq::TId & ids,string user_url)1762 string CAlignFormatUtil::BuildSRAUrl(const CBioseq::TId& ids, string user_url)
1763 {
1764     string strRun, strSpotId,strReadIndex;
1765     string link = NcbiEmptyString;
1766 
1767     if(s_GetSRASeqMetadata(ids,strRun,strSpotId,strReadIndex))
1768     {
1769         // Generate the SRA link to the identified spot
1770         link += user_url;
1771         link += "?run=" + strRun;
1772         link += "." + strSpotId;
1773         link += "." + strReadIndex;
1774     }
1775     return link;
1776 }
1777 
s_GetBestIDForURL(CBioseq::TId & ids)1778 string s_GetBestIDForURL(CBioseq::TId& ids)
1779 {
1780     string gnl;
1781 
1782     CConstRef<CSeq_id> id_general = GetSeq_idByType(ids, CSeq_id::e_General);
1783     CConstRef<CSeq_id> id_other = GetSeq_idByType(ids, CSeq_id::e_Other);
1784     const CRef<CSeq_id> id_accession = FindBestChoice(ids, CSeq_id::WorstRank);
1785 
1786     if(!id_general.Empty()  && id_general->AsFastaString().find("gnl|BL_ORD_ID") != string::npos){
1787         return gnl;
1788     }
1789 
1790     const CSeq_id* bestid = NULL;
1791     if (id_general.Empty()){
1792         bestid = id_other;
1793         if (id_other.Empty()){
1794             bestid = id_accession;
1795         }
1796     } else {
1797         bestid = id_general;
1798     }
1799 
1800     if (bestid && bestid->Which() !=  CSeq_id::e_Gi){
1801         gnl = NStr::URLEncode(bestid->AsFastaString());
1802     }
1803     return gnl;
1804 }
1805 
BuildUserUrl(const CBioseq::TId & ids,TTaxId taxid,string user_url,string database,bool db_is_na,string rid,int query_number,bool for_alignment)1806 string CAlignFormatUtil::BuildUserUrl(const CBioseq::TId& ids, TTaxId taxid,
1807                                       string user_url, string database,
1808                                       bool db_is_na, string rid, int query_number,
1809                                       bool for_alignment) {
1810 
1811     string link = NcbiEmptyString;
1812     CConstRef<CSeq_id> id_general = GetSeq_idByType(ids, CSeq_id::e_General);
1813 
1814     if(!id_general.Empty()
1815        && id_general->AsFastaString().find("gnl|BL_ORD_ID") != string::npos){
1816         /* We do need to make security protected link to BLAST gnl */
1817         return NcbiEmptyString;
1818     }
1819     TGi gi = FindGi(ids);
1820     string bestID = s_GetBestIDForURL((CBioseq::TId &)ids);
1821 
1822 
1823     bool nodb_path =  false;
1824     /* dumpgnl.cgi need to use path  */
1825     if (user_url.find("dumpgnl.cgi") ==string::npos){
1826         nodb_path = true;
1827     }
1828     int length = (int)database.size();
1829     string str;
1830     char  *chptr, *dbtmp;
1831     char tmpbuff[256];
1832     char* dbname = new char[sizeof(char)*length + 2];
1833     strcpy(dbname, database.c_str());
1834     if(nodb_path) {
1835         int i, j;
1836         dbtmp = new char[sizeof(char)*length + 2]; /* aditional space and NULL */
1837         memset(dbtmp, '\0', sizeof(char)*length + 2);
1838         for(i = 0; i < length; i++) {
1839             if(i > 0) {
1840                 strcat(dbtmp, " ");  //space between db
1841             }
1842             if(isspace((unsigned char) dbname[i]) || dbname[i] == ',') {/* Rolling spaces */
1843                 continue;
1844             }
1845             j = 0;
1846             while (!isspace((unsigned char) dbname[i]) && j < 256  && i < length) {
1847                 tmpbuff[j] = dbname[i];
1848                 j++; i++;
1849                 if(dbname[i] == ',') { /* Comma is valid delimiter */
1850                     break;
1851                 }
1852             }
1853             tmpbuff[j] = '\0';
1854             if((chptr = strrchr(tmpbuff, '/')) != NULL) {
1855                 strcat(dbtmp, (char*)(chptr+1));
1856             } else {
1857                 strcat(dbtmp, tmpbuff);
1858             }
1859 
1860         }
1861     } else {
1862         dbtmp = dbname;
1863     }
1864 
1865     char gnl[256];
1866     if (!bestID.empty()){
1867         strcpy(gnl, bestID.c_str());
1868 
1869     } else {
1870         gnl[0] = '\0';
1871     }
1872 
1873     str = NStr::URLEncode(dbtmp == NULL ? (char*) "nr" : dbtmp);
1874 
1875     if (user_url.find("?") == string::npos){
1876         link += user_url + "?" + "db=" + str + "&na=" + (db_is_na? "1" : "0");
1877     } else {
1878         if (user_url.find("=") != string::npos) {
1879             user_url += "&";
1880         }
1881         link += user_url + "db=" + str + "&na=" + (db_is_na? "1" : "0");
1882     }
1883 
1884     if (gnl[0] != '\0'){
1885         str = gnl;
1886         link += "&gnl=";
1887         link += str;
1888     }
1889     if (gi > ZERO_GI){
1890         link += "&gi=" + NStr::NumericToString(gi);
1891         link += "&term=" + NStr::NumericToString(gi) + NStr::URLEncode("[gi]");
1892     }
1893     if(taxid > ZERO_TAX_ID){
1894         link += "&taxid=" + NStr::NumericToString(taxid);
1895     }
1896     if (rid != NcbiEmptyString){
1897         link += "&RID=" + rid;
1898     }
1899 
1900     if (query_number > 0){
1901         link += "&QUERY_NUMBER=" + NStr::IntToString(query_number);
1902     }
1903 
1904     if (user_url.find("dumpgnl.cgi") ==string::npos){
1905         if (for_alignment)
1906             link += "&log$=nuclalign";
1907         else
1908             link += "&log$=nucltop";
1909     }
1910 
1911     if(nodb_path){
1912         delete [] dbtmp;
1913     }
1914     delete [] dbname;
1915     return link;
1916 }
1917 void CAlignFormatUtil::
BuildFormatQueryString(CCgiContext & ctx,map<string,string> & parameters_to_change,string & cgi_query)1918 BuildFormatQueryString (CCgiContext& ctx,
1919                         map< string, string>& parameters_to_change,
1920                         string& cgi_query)
1921 {
1922 
1923     //add parameters to exclude
1924     parameters_to_change.insert(map<string, string>::
1925                                 value_type("service", ""));
1926     parameters_to_change.insert(map<string, string>::
1927                                 value_type("address", ""));
1928     parameters_to_change.insert(map<string, string>::
1929                                 value_type("platform", ""));
1930     parameters_to_change.insert(map<string, string>::
1931                                     value_type("_pgr", ""));
1932     parameters_to_change.insert(map<string, string>::
1933                                 value_type("client", ""));
1934     parameters_to_change.insert(map<string, string>::
1935                                 value_type("composition_based_statistics", ""));
1936 
1937     parameters_to_change.insert(map<string, string>::
1938                                 value_type("auto_format", ""));
1939     cgi_query = NcbiEmptyString;
1940     TCgiEntries& cgi_entry = ctx.GetRequest().GetEntries();
1941     bool is_first = true;
1942 
1943     for(TCgiEntriesI it=cgi_entry.begin(); it!=cgi_entry.end(); ++it) {
1944         string parameter = it->first;
1945         if (parameter != NcbiEmptyString) {
1946             if (parameters_to_change.count(NStr::ToLower(parameter)) > 0 ||
1947                 parameters_to_change.count(NStr::ToUpper(parameter)) > 0) {
1948                 if(parameters_to_change[NStr::ToLower(parameter)] !=
1949                    NcbiEmptyString &&
1950                    parameters_to_change[NStr::ToUpper(parameter)] !=
1951                    NcbiEmptyString) {
1952                     if (!is_first) {
1953                         cgi_query += "&";
1954                     }
1955                     cgi_query +=
1956                         it->first + "=" + parameters_to_change[it->first];
1957                     is_first = false;
1958                 }
1959             } else {
1960                 if (!is_first) {
1961                     cgi_query += "&";
1962                 }
1963                 cgi_query += it->first + "=" + it->second;
1964                 is_first = false;
1965             }
1966 
1967         }
1968     }
1969 }
1970 
BuildFormatQueryString(CCgiContext & ctx,string & cgi_query)1971 void CAlignFormatUtil::BuildFormatQueryString(CCgiContext& ctx, string& cgi_query) {
1972 
1973     string format_type = ctx.GetRequestValue("FORMAT_TYPE").GetValue();
1974     string ridstr = ctx.GetRequestValue("RID").GetValue();
1975     string align_view = ctx.GetRequestValue("ALIGNMENT_VIEW").GetValue();
1976 
1977     cgi_query += "RID=" + ridstr;
1978     cgi_query += "&FORMAT_TYPE=" + format_type;
1979     cgi_query += "&ALIGNMENT_VIEW=" + align_view;
1980 
1981     cgi_query += "&QUERY_NUMBER=" + ctx.GetRequestValue("QUERY_NUMBER").GetValue();
1982     cgi_query += "&FORMAT_OBJECT=" + ctx.GetRequestValue("FORMAT_OBJECT").GetValue();
1983     cgi_query += "&RUN_PSIBLAST=" + ctx.GetRequestValue("RUN_PSIBLAST").GetValue();
1984     cgi_query += "&I_THRESH=" + ctx.GetRequestValue("I_THRESH").GetValue();
1985 
1986     cgi_query += "&DESCRIPTIONS=" + ctx.GetRequestValue("DESCRIPTIONS").GetValue();
1987 
1988     cgi_query += "&ALIGNMENTS=" + ctx.GetRequestValue("ALIGNMENTS").GetValue();
1989 
1990     cgi_query += "&NUM_OVERVIEW=" + ctx.GetRequestValue("NUM_OVERVIEW").GetValue();
1991 
1992     cgi_query += "&NCBI_GI=" + ctx.GetRequestValue("NCBI_GI").GetValue();
1993 
1994     cgi_query += "&SHOW_OVERVIEW=" + ctx.GetRequestValue("SHOW_OVERVIEW").GetValue();
1995 
1996     cgi_query += "&SHOW_LINKOUT=" + ctx.GetRequestValue("SHOW_LINKOUT").GetValue();
1997 
1998     cgi_query += "&GET_SEQUENCE=" + ctx.GetRequestValue("GET_SEQUENCE").GetValue();
1999 
2000     cgi_query += "&MASK_CHAR=" + ctx.GetRequestValue("MASK_CHAR").GetValue();
2001     cgi_query += "&MASK_COLOR=" + ctx.GetRequestValue("MASK_COLOR").GetValue();
2002 
2003     cgi_query += "&SHOW_CDS_FEATURE=" + ctx.GetRequestValue("SHOW_CDS_FEATURE").GetValue();
2004 
2005     if (ctx.GetRequestValue("FORMAT_EQ_TEXT").GetValue() != NcbiEmptyString) {
2006         cgi_query += "&FORMAT_EQ_TEXT=" +
2007             NStr::URLEncode(NStr::TruncateSpaces(ctx.
2008             GetRequestValue("FORMAT_EQ_TEXT").
2009             GetValue()));
2010     }
2011 
2012     if (ctx.GetRequestValue("FORMAT_EQ_OP").GetValue() != NcbiEmptyString) {
2013         cgi_query += "&FORMAT_EQ_OP=" +
2014             NStr::URLEncode(NStr::TruncateSpaces(ctx.
2015             GetRequestValue("FORMAT_EQ_OP").
2016             GetValue()));
2017     }
2018 
2019     if (ctx.GetRequestValue("FORMAT_EQ_MENU").GetValue() != NcbiEmptyString) {
2020         cgi_query += "&FORMAT_EQ_MENU=" +
2021             NStr::URLEncode(NStr::TruncateSpaces(ctx.
2022             GetRequestValue("FORMAT_EQ_MENU").
2023             GetValue()));
2024     }
2025 
2026     cgi_query += "&EXPECT_LOW=" + ctx.GetRequestValue("EXPECT_LOW").GetValue();
2027     cgi_query += "&EXPECT_HIGH=" + ctx.GetRequestValue("EXPECT_HIGH").GetValue();
2028 
2029     cgi_query += "&BL2SEQ_LINK=" + ctx.GetRequestValue("BL2SEQ_LINK").GetValue();
2030 
2031 }
2032 
2033 
IsMixedDatabase(const CSeq_align_set & alnset,CScope & scope,ILinkoutDB * linkoutdb,const string & mv_build_name)2034 bool CAlignFormatUtil::IsMixedDatabase(const CSeq_align_set& alnset,
2035                                        CScope& scope, ILinkoutDB* linkoutdb,
2036                                        const string& mv_build_name)
2037 {
2038     bool is_mixed = false;
2039     bool is_first = true;
2040     int prev_database = 0;
2041 
2042     ITERATE(CSeq_align_set::Tdata, iter, alnset.Get()) {
2043 
2044         const CSeq_id& id = (*iter)->GetSeq_id(1);
2045         int linkout = linkoutdb
2046             ? linkoutdb->GetLinkout(id, mv_build_name)
2047             : 0;
2048         int cur_database = (linkout & eGenomicSeq);
2049         if (!is_first && cur_database != prev_database) {
2050             is_mixed = true;
2051             break;
2052         }
2053         prev_database = cur_database;
2054         is_first = false;
2055     }
2056 
2057     return is_mixed;
2058 
2059 }
2060 
2061 
IsMixedDatabase(CCgiContext & ctx)2062 bool CAlignFormatUtil::IsMixedDatabase(CCgiContext& ctx)
2063 {
2064     bool formatAsMixedDbs = false;
2065     string mixedDbs = ctx.GetRequestValue("MIXED_DATABASE").GetValue();
2066     if(!mixedDbs.empty()) {
2067         mixedDbs = NStr::ToLower(mixedDbs);
2068         formatAsMixedDbs = (mixedDbs == "on" || mixedDbs == "true" || mixedDbs == "yes") ? true : false;
2069     }
2070     return formatAsMixedDbs;
2071 }
2072 
s_MapLinkoutGenParam(string & url_link_tmpl,const string & rid,string giList,bool for_alignment,int cur_align,string & label,string & lnk_displ,string lnk_tl_info="",string lnk_title="")2073 static string s_MapLinkoutGenParam(string &url_link_tmpl,
2074                                    const string& rid,
2075                                    string giList,
2076                                    bool for_alignment,
2077                                    int cur_align,
2078                                    string &label,
2079                                    string &lnk_displ,
2080                                    string lnk_tl_info = "",
2081                                    string lnk_title = "")
2082 {
2083     const string kLinkTitle=" title=\"View <@lnk_tl_info@> for <@label@>\" ";
2084     const string kLinkTarget="target=\"lnk" + rid + "\"";
2085     string lnkTitle = (lnk_title.empty()) ? kLinkTitle : lnk_title;
2086     string url_link = CAlignFormatUtil::MapTemplate(url_link_tmpl,"gi",giList);
2087     url_link = CAlignFormatUtil::MapTemplate(url_link,"rid",rid);
2088     url_link = CAlignFormatUtil::MapTemplate(url_link,"log",for_alignment? "align" : "top");
2089     url_link = CAlignFormatUtil::MapTemplate(url_link,"blast_rank",NStr::IntToString(cur_align));
2090     lnkTitle = NStr::StartsWith(lnk_displ,"<img") ? "" : lnkTitle;
2091     string lnkTarget = NStr::StartsWith(lnk_displ,"<img") ? "" : kLinkTarget;
2092     url_link = CAlignFormatUtil::MapTemplate(url_link,"lnkTitle",lnkTitle);
2093     url_link = CAlignFormatUtil::MapTemplate(url_link,"lnkTarget",lnkTarget);
2094     url_link = CAlignFormatUtil::MapTemplate(url_link,"lnk_displ",lnk_displ);
2095     url_link = CAlignFormatUtil::MapTemplate(url_link,"lnk_tl_info",lnk_tl_info);
2096     url_link = CAlignFormatUtil::MapTemplate(url_link,"label",label);
2097     url_link = CAlignFormatUtil::MapProtocol(url_link);
2098     return url_link;
2099 }
2100 
2101 
s_GetLinkoutUrl(int linkout,string giList,string labelList,TGi first_gi,CAlignFormatUtil::SLinkoutInfo & linkoutInfo,bool textLink=true)2102 static list<string> s_GetLinkoutUrl(int linkout,
2103                                     string giList,
2104                                     string labelList,
2105                                     TGi first_gi,
2106                                     CAlignFormatUtil::SLinkoutInfo &linkoutInfo,
2107                                     bool textLink = true)
2108 
2109 {
2110     list<string> linkout_list;
2111     string url_link,lnk_displ,lnk_title,lnkTitleInfo;
2112 
2113     vector<string> accs;
2114     NStr::Split(labelList,",",accs);
2115     string firstAcc = (accs.size() > 0)? accs[0] : labelList;
2116 
2117     if (linkout & eUnigene) {
2118         url_link = CAlignFormatUtil::GetURLFromRegistry("UNIGEN");
2119         lnk_displ = textLink ? "UniGene" : kUnigeneImg;
2120 
2121         string termParam = NStr::Find(labelList,",") == NPOS ? kGeneTerm : ""; //kGeneTerm if only one seqid
2122         url_link = CAlignFormatUtil::MapTemplate(url_link,"termParam",termParam);
2123 
2124         lnkTitleInfo = "UniGene cluster";
2125         string uid = !linkoutInfo.is_na ? "[Protein Accession]" : "[Nucleotide Accession]";
2126         url_link = CAlignFormatUtil::MapTemplate(url_link,"uid",uid);
2127         url_link = s_MapLinkoutGenParam(url_link,linkoutInfo.rid,giList,linkoutInfo.for_alignment, linkoutInfo.cur_align,labelList,lnk_displ,lnkTitleInfo);
2128 
2129         if(textLink) {
2130             url_link = CAlignFormatUtil::MapTemplate(kUnigeneDispl,"lnk",url_link);
2131         }
2132         url_link = CAlignFormatUtil::MapProtocol(url_link);
2133         linkout_list.push_back(url_link);
2134     }
2135     if (linkout & eStructure){
2136         string struct_link = CAlignFormatUtil::GetURLFromRegistry(
2137                                                              "STRUCTURE_URL");
2138 
2139         url_link = struct_link.empty() ? kStructureUrl : struct_link;
2140         lnk_displ = textLink ? "Structure" : kStructureImg;
2141 
2142         string linkTitle = " title=\"View 3D structure <@label@>\"";
2143         string molID,chainID;
2144         NStr::SplitInTwo(firstAcc,"_",molID,chainID);
2145         url_link = CAlignFormatUtil::MapTemplate(url_link,"molid",molID);
2146         url_link = CAlignFormatUtil::MapTemplate(url_link,"queryID",linkoutInfo.queryID);
2147         url_link = s_MapLinkoutGenParam(url_link,linkoutInfo.rid,giList,linkoutInfo.for_alignment, linkoutInfo.cur_align,firstAcc,lnk_displ,"",linkTitle);
2148         if(textLink) {
2149             url_link = CAlignFormatUtil::MapTemplate(kStructureDispl,"lnk",url_link);
2150         }
2151         url_link = CAlignFormatUtil::MapProtocol(url_link);
2152         linkout_list.push_back(url_link);
2153     }
2154     if (linkout & eGeo){
2155         url_link = CAlignFormatUtil::GetURLFromRegistry("GEO");
2156         lnk_displ = textLink ? "GEO Profiles" : kGeoImg;
2157 
2158         lnkTitleInfo = "Expression profiles";
2159         //gilist contains comma separated gis
2160         url_link = s_MapLinkoutGenParam(url_link,linkoutInfo.rid,giList,linkoutInfo.for_alignment, linkoutInfo.cur_align,labelList,lnk_displ,lnkTitleInfo);
2161 
2162 
2163         if(textLink) {
2164             url_link = CAlignFormatUtil::MapTemplate(kGeoDispl,"lnk",url_link);
2165         }
2166         url_link = CAlignFormatUtil::MapProtocol(url_link);
2167         linkout_list.push_back(url_link);
2168     }
2169     if(linkout & eGene){
2170       url_link = CAlignFormatUtil::GetURLFromRegistry("GENE");
2171       if(textLink) {
2172         string geneSym = CAlignFormatUtil::GetGeneInfo(first_gi);
2173         lnk_displ = "Gene";
2174         lnkTitleInfo = "gene " + geneSym;
2175       }
2176       else {
2177         lnk_displ = kGeneImg;
2178       }
2179       string termParam = NStr::Find(labelList,",") == NPOS ? kGeneTerm : ""; //kGeneTerm if only one seqid
2180       url_link = CAlignFormatUtil::MapTemplate(url_link,"termParam",termParam);
2181 
2182       string uid = !linkoutInfo.is_na ? "[Protein Accession]" : "[Nucleotide Accession]";
2183       url_link = CAlignFormatUtil::MapTemplate(url_link,"uid",uid);
2184 
2185       url_link = s_MapLinkoutGenParam(url_link,linkoutInfo.rid,giList,linkoutInfo.for_alignment, linkoutInfo.cur_align,labelList,lnk_displ,lnkTitleInfo);
2186 
2187       if(textLink) {
2188             url_link = CAlignFormatUtil::MapTemplate(kGeneDispl,"lnk",url_link);
2189       }
2190       url_link = CAlignFormatUtil::MapProtocol(url_link);
2191       linkout_list.push_back(url_link);
2192     }
2193 
2194     if((linkout & eGenomicSeq)  && first_gi != ZERO_GI){  //only for advanced view -> textlink = true
2195         if(textLink) {
2196             url_link = kMapviewBlastHitParams;
2197             lnk_displ = "Map Viewer";
2198 
2199             lnkTitleInfo = "BLAST hits on the " + linkoutInfo.taxName + " genome";
2200 
2201             url_link = CAlignFormatUtil::MapTemplate(url_link,"gnl",NStr::URLEncode(linkoutInfo.gnl));
2202             url_link = CAlignFormatUtil::MapTemplate(url_link,"db",linkoutInfo.database);
2203             url_link = CAlignFormatUtil::MapTemplate(url_link,"is_na",linkoutInfo.is_na? "1" : "0");
2204             string user_url = (linkoutInfo.user_url.empty()) ? kMapviewBlastHitUrl : linkoutInfo.user_url;
2205             url_link = CAlignFormatUtil::MapTemplate(url_link,"user_url",user_url);
2206 
2207             string taxIDStr = (linkoutInfo.taxid > ZERO_TAX_ID) ? NStr::NumericToString(linkoutInfo.taxid) : "";
2208             url_link = CAlignFormatUtil::MapTemplate(url_link,"taxid",taxIDStr);
2209 
2210             string queryNumStr = (linkoutInfo.query_number > 0) ? NStr::IntToString(linkoutInfo.query_number) : "";
2211             url_link = CAlignFormatUtil::MapTemplate(url_link,"query_number",queryNumStr);  //gi,term
2212 
2213             string giStr = (first_gi > ZERO_GI)? NStr::NumericToString(first_gi) : "";
2214             url_link = s_MapLinkoutGenParam(url_link,linkoutInfo.rid,giStr,linkoutInfo.for_alignment, linkoutInfo.cur_align,labelList,lnk_displ,lnkTitleInfo);
2215 
2216             if(textLink) {
2217                 url_link = CAlignFormatUtil::MapTemplate(kMapviwerDispl,"lnk",url_link);
2218             }
2219             url_link = CAlignFormatUtil::MapProtocol(url_link);
2220             linkout_list.push_back(url_link);
2221         }
2222     }
2223     else if((linkout & eMapviewer) && first_gi != ZERO_GI){
2224         url_link = kMapviwerUrl;
2225         lnk_displ = textLink ? "Map Viewer" : kMapviwerImg;
2226 
2227         string linkTitle = " title=\"View <@label@> aligned to the "  + linkoutInfo.taxName + " genome\"";
2228         url_link = s_MapLinkoutGenParam(url_link,linkoutInfo.rid,giList,linkoutInfo.for_alignment, linkoutInfo.cur_align,labelList,lnk_displ,"",linkTitle);
2229 
2230         if(textLink) {
2231             url_link = CAlignFormatUtil::MapTemplate(kMapviwerDispl,"lnk",url_link);
2232         }
2233         url_link = CAlignFormatUtil::MapProtocol(url_link);
2234         linkout_list.push_back(url_link);
2235     }
2236     //View Bioassays involving <accession
2237     if(linkout & eBioAssay && linkoutInfo.is_na && first_gi != ZERO_GI){
2238         url_link = CAlignFormatUtil::GetURLFromRegistry("BIOASSAY_NUC");
2239         lnk_displ = textLink ? "PubChem BioAssay" : kBioAssayNucImg;
2240 
2241         string linkTitle = " title=\"View Bioassays involving <@label@>\"";
2242         //gilist contains comma separated gis, change it to the following
2243         giList = NStr::Replace(giList,",","[RNATargetGI] OR ");
2244         url_link = s_MapLinkoutGenParam(url_link,linkoutInfo.rid,giList,linkoutInfo.for_alignment, linkoutInfo.cur_align,labelList,lnk_displ,"",linkTitle);
2245 
2246         if(textLink) {
2247             url_link = CAlignFormatUtil::MapTemplate(kBioAssayDispl,"lnk",url_link);
2248         }
2249         url_link = CAlignFormatUtil::MapProtocol(url_link);
2250         linkout_list.push_back(url_link);
2251     }
2252     else if (linkout & eBioAssay && !linkoutInfo.is_na && first_gi != ZERO_GI) {
2253         url_link = CAlignFormatUtil::GetURLFromRegistry("BIOASSAY_PROT");
2254         lnk_displ = textLink ? "PubChem BioAssay" : kBioAssayProtImg;
2255 
2256         lnkTitleInfo ="Bioassay data";
2257         string linkTitle = " title=\"View Bioassays involving <@label@>\"";
2258         //gilist contains comma separated gis, change it to the following
2259         giList = NStr::Replace(giList,",","[PigGI] OR ");
2260         url_link = s_MapLinkoutGenParam(url_link,linkoutInfo.rid,giList,linkoutInfo.for_alignment, linkoutInfo.cur_align,labelList,lnk_displ,"",linkTitle);
2261 
2262         if(textLink) {
2263             url_link = CAlignFormatUtil::MapTemplate(kBioAssayDispl,"lnk",url_link);
2264         }
2265         url_link = CAlignFormatUtil::MapProtocol(url_link);
2266         linkout_list.push_back(url_link);
2267     }
2268     if(linkout & eReprMicrobialGenomes){
2269         url_link = CAlignFormatUtil::GetURLFromRegistry("REPR_MICROBIAL_GENOMES");
2270         lnk_displ = textLink ? "Genome" : kReprMicrobialGenomesImg;
2271 
2272         lnkTitleInfo = "genomic information";
2273         //gilist contains comma separated gis
2274         string uid = !linkoutInfo.is_na ? "Protein Accession" : "Nucleotide Accession";
2275         url_link = CAlignFormatUtil::MapTemplate(url_link,"uid",uid);
2276         url_link = s_MapLinkoutGenParam(url_link,linkoutInfo.rid,giList,linkoutInfo.for_alignment, linkoutInfo.cur_align,labelList,lnk_displ,lnkTitleInfo);
2277 
2278         if(textLink) {
2279             url_link = CAlignFormatUtil::MapTemplate(kReprMicrobialGenomesDispl,"lnk",url_link);
2280         }
2281         url_link = CAlignFormatUtil::MapProtocol(url_link);
2282         linkout_list.push_back(url_link);
2283     }
2284     if((linkout & eGenomeDataViewer) || (linkout & eTranscript)){
2285         string urlTag;
2286         lnk_displ = textLink ? "Genome Data Viewer" : kGenomeDataViewerImg;
2287         if(linkout & eTranscript) {
2288             urlTag = "GENOME_DATA_VIEWER_TRANSCR";
2289             lnkTitleInfo = "title=\"View the annotation of the transcript <@label@> within a genomic context in NCBI's Genome Data Viewer (GDV)- genome browser for RefSeq annotated assemblies. See other genomic features annotated at the same location as the protein annotation and browse to other regions.\"";
2290         }
2291         else {
2292             urlTag = linkoutInfo.is_na ? "GENOME_DATA_VIEWER_NUC" : "GENOME_DATA_VIEWER_PROT";
2293             lnkTitleInfo = linkoutInfo.is_na ?
2294                          "title=\"View BLAST hits for <@label@> within a genomic context in NCBI's Genome Data Viewer (GDV)- genome browser for RefSeq annotated assemblies. See other genomic features annotated at the same location as hits and browse to other regions.\""
2295                          :
2296                          "title=\"View the annotation of the protein <@label@> within a genomic context in NCBI's Genome Data Viewer (GDV)- genome browser for RefSeq annotated assemblies. See other genomic features annotated at the same location as the protein annotation and browse to other regions.\"";
2297         }
2298         url_link = CAlignFormatUtil::GetURLFromRegistry(urlTag);
2299         url_link = s_MapLinkoutGenParam(url_link,linkoutInfo.rid,giList,linkoutInfo.for_alignment, linkoutInfo.cur_align,firstAcc,lnk_displ,"",lnkTitleInfo);
2300 
2301         url_link = CAlignFormatUtil::MapTemplate(url_link,"queryID",linkoutInfo.queryID);
2302 
2303         TSeqPos seqFrom = linkoutInfo.subjRange.GetFrom();
2304         seqFrom = (seqFrom == 0) ? seqFrom : seqFrom - 1;
2305 
2306         TSeqPos seqTo = linkoutInfo.subjRange.GetTo();
2307         seqTo = (seqTo == 0) ? seqTo : seqTo - 1;
2308 
2309         url_link = CAlignFormatUtil::MapTemplate(url_link,"from",seqFrom);//-1
2310         url_link = CAlignFormatUtil::MapTemplate(url_link,"to",seqTo);//-1
2311 
2312         if(textLink) {
2313             url_link = CAlignFormatUtil::MapTemplate(kGenomeDataViewerDispl,"lnk",url_link);
2314         }
2315         url_link = CAlignFormatUtil::MapProtocol(url_link);
2316         linkout_list.push_back(url_link);
2317     }
2318     return linkout_list;
2319 }
2320 
2321 ///Get list of linkouts for one sequence
GetLinkoutUrl(int linkout,const CBioseq::TId & ids,const string & rid,const string & cdd_rid,const string & entrez_term,bool is_na,TGi first_gi,bool structure_linkout_as_group,bool for_alignment,int cur_align,string preComputedResID)2322 list<string> CAlignFormatUtil::GetLinkoutUrl(int linkout, const CBioseq::TId& ids,
2323                                              const string& rid,
2324                                              const string& cdd_rid,
2325                                              const string& entrez_term,
2326                                              bool is_na,
2327                                              TGi first_gi,
2328                                              bool structure_linkout_as_group,
2329                                              bool for_alignment, int cur_align,
2330                                              string preComputedResID)
2331 
2332 {
2333     list<string> linkout_list;
2334     TGi gi = FindGi(ids);
2335     CRef<CSeq_id> wid = FindBestChoice(ids, CSeq_id::WorstRank);
2336     string label;
2337     wid->GetLabel(&label, CSeq_id::eContent);
2338     string giString = NStr::NumericToString(gi);
2339     first_gi = (first_gi == ZERO_GI) ? gi : first_gi;
2340 
2341 
2342 
2343     SLinkoutInfo linkoutInfo;
2344     linkoutInfo.Init(rid,
2345                      cdd_rid,
2346                      entrez_term,
2347                      is_na,
2348                      "",  //database
2349                      0,  //query_number
2350                      "", //user_url
2351                      preComputedResID,
2352                      "", //linkoutOrder
2353                      structure_linkout_as_group,
2354                      for_alignment);
2355 
2356     linkoutInfo.cur_align = cur_align;
2357     linkoutInfo.taxid = ZERO_TAX_ID;
2358 
2359     linkout_list = s_GetLinkoutUrl(linkout,
2360                                    giString,
2361                                    label,
2362                                    first_gi,
2363                                    linkoutInfo,
2364                                    false); //textlink
2365 
2366     return linkout_list;
2367 }
2368 
2369 
s_LinkLetterToType(string linkLetter)2370 static int s_LinkLetterToType(string linkLetter)
2371 {
2372     int linkType = 0;
2373     if(linkLetter == "U") {
2374         linkType = eUnigene;
2375     }
2376     else if(linkLetter == "S") {
2377            linkType = eStructure;
2378     }
2379 	else if(linkLetter == "E") {
2380          linkType = eGeo;
2381     }
2382     else if(linkLetter == "G") {
2383         linkType = eGene;
2384     }
2385     else if(linkLetter == "M") {
2386         linkType = eMapviewer | eGenomicSeq;
2387     }
2388     else if(linkLetter == "N") {
2389         linkType = eGenomicSeq;
2390     }
2391     else if(linkLetter == "B") {
2392         linkType = eBioAssay;
2393     }
2394     else if(linkLetter == "R") {
2395         linkType = eReprMicrobialGenomes;
2396     }
2397     else if(linkLetter == "V") {
2398         linkType = eGenomeDataViewer;
2399     }
2400     else if(linkLetter == "T") {
2401         linkType = eTranscript;
2402     }
2403 
2404     return linkType;
2405 }
2406 
2407 
s_AddLinkoutInfo(map<int,vector<CBioseq::TId>> & linkout_map,int linkout,CBioseq::TId & cur_id)2408 static void s_AddLinkoutInfo(map<int, vector < CBioseq::TId > > &linkout_map,int linkout,CBioseq::TId &cur_id)
2409 {
2410     if(linkout_map.count(linkout) > 0){
2411         linkout_map[linkout].push_back(cur_id);
2412     }
2413     else {
2414         vector <CBioseq::TId > idList;
2415         idList.push_back(cur_id);
2416         linkout_map.insert(map<int,  vector <CBioseq::TId > >::value_type(linkout,idList));
2417     }
2418 }
2419 
GetSeqLinkoutInfo(CBioseq::TId & cur_id,ILinkoutDB ** linkoutdb,const string & mv_build_name,TGi gi)2420 int CAlignFormatUtil::GetSeqLinkoutInfo(CBioseq::TId& cur_id,
2421                                     ILinkoutDB **linkoutdb,
2422                                     const string& mv_build_name,
2423                                     TGi gi)
2424 {
2425     int linkout = 0;
2426 
2427     if(*linkoutdb) {
2428         if(gi == INVALID_GI) {
2429             gi = FindGi(cur_id);
2430         }
2431         try {
2432             if(gi > ZERO_GI) {
2433                 linkout = (*linkoutdb)->GetLinkout(gi, mv_build_name);
2434             }
2435             else if(GetTextSeqID(cur_id)){
2436                 CRef<CSeq_id> seqID = FindBestChoice(cur_id, CSeq_id::WorstRank);
2437                 linkout = (*linkoutdb)->GetLinkout(*seqID, mv_build_name);
2438             }
2439         }
2440         catch (const CException & e) {
2441             ERR_POST("Problem with linkoutdb: " + e.GetMsg());
2442             *linkoutdb = NULL;
2443         }
2444     }
2445     return linkout;
2446 }
2447 
2448 void
GetBdlLinkoutInfo(CBioseq::TId & cur_id,map<int,vector<CBioseq::TId>> & linkout_map,ILinkoutDB * linkoutdb,const string & mv_build_name)2449 CAlignFormatUtil::GetBdlLinkoutInfo(CBioseq::TId& cur_id,
2450                                     map<int, vector <CBioseq::TId > > &linkout_map,
2451                                     ILinkoutDB* linkoutdb,
2452                                     const string& mv_build_name)
2453 {
2454         if(!linkoutdb) return;
2455 
2456         int linkout = GetSeqLinkoutInfo(cur_id,
2457                                     &linkoutdb,
2458                                     mv_build_name);
2459 
2460         if(linkout & eGene){
2461             s_AddLinkoutInfo(linkout_map,eGene,cur_id);
2462         }
2463         if (linkout & eUnigene) {
2464             s_AddLinkoutInfo(linkout_map,eUnigene,cur_id);
2465         }
2466         if (linkout & eGeo){
2467             s_AddLinkoutInfo(linkout_map,eGeo,cur_id);
2468         }
2469         if (linkout & eStructure){
2470             s_AddLinkoutInfo(linkout_map,eStructure,cur_id);
2471         }
2472         //eGenomicSeq and eMapviewer cannot combine together
2473         if((linkout & eGenomicSeq) && (linkout & eMapviewer)){
2474             s_AddLinkoutInfo(linkout_map,eGenomicSeq,cur_id);
2475         }
2476         else if(linkout & eMapviewer){
2477             s_AddLinkoutInfo(linkout_map,eMapviewer,cur_id);
2478         }
2479         if(linkout & eBioAssay){
2480             s_AddLinkoutInfo(linkout_map,eBioAssay,cur_id);
2481         }
2482         if(linkout & eReprMicrobialGenomes){
2483             s_AddLinkoutInfo(linkout_map,eReprMicrobialGenomes,cur_id);
2484         }
2485 
2486         if(linkout & eGenomeDataViewer){
2487             s_AddLinkoutInfo(linkout_map,eGenomeDataViewer,cur_id);
2488         }
2489         if(linkout & eTranscript){
2490             s_AddLinkoutInfo(linkout_map,eTranscript,cur_id);
2491         }
2492 
2493 }
2494 
2495 void
GetBdlLinkoutInfo(const list<CRef<CBlast_def_line>> & bdl,map<int,vector<CBioseq::TId>> & linkout_map,ILinkoutDB * linkoutdb,const string & mv_build_name)2496 CAlignFormatUtil::GetBdlLinkoutInfo(const list< CRef< CBlast_def_line > > &bdl,
2497                                     map<int, vector <CBioseq::TId > > &linkout_map,
2498                                     ILinkoutDB* linkoutdb,
2499                                     const string& mv_build_name)
2500 {
2501     const int kMaxDeflineNum = 10;
2502     int num = 0;
2503     for(list< CRef< CBlast_def_line > >::const_iterator iter = bdl.begin();
2504             iter != bdl.end(); iter++){
2505         CBioseq::TId& cur_id = (CBioseq::TId &)(*iter)->GetSeqid();
2506 
2507         GetBdlLinkoutInfo(cur_id,
2508                           linkout_map,
2509                           linkoutdb,
2510                           mv_build_name);
2511         num++;
2512         if(num > kMaxDeflineNum) break;
2513     }
2514 }
2515 
s_GetTaxName(TTaxId taxid)2516 static string s_GetTaxName(TTaxId taxid)
2517 {
2518     string taxName;
2519     try {
2520         if(taxid != ZERO_TAX_ID) {
2521             SSeqDBTaxInfo info;
2522             CSeqDB::GetTaxInfo(taxid, info);
2523             taxName = info.common_name;
2524         }
2525     }
2526     catch (CException&) {
2527 
2528     }
2529     return taxName;
2530 }
2531 
s_AddOtherRelatedInfoLinks(CBioseq::TId & cur_id,const string & rid,bool is_na,bool for_alignment,int cur_align,list<string> & linkout_list)2532 void s_AddOtherRelatedInfoLinks(CBioseq::TId& cur_id,
2533                                 const string& rid,
2534                                 bool is_na,
2535                                 bool for_alignment,
2536                                 int cur_align,
2537                                 list<string> &linkout_list)
2538 
2539 {
2540     //Identical Proteins
2541 
2542      CRef<CSeq_id> wid = FindBestChoice(cur_id, CSeq_id::WorstRank);
2543      if (CAlignFormatUtil::GetTextSeqID(wid)) {
2544         string label;
2545         wid->GetLabel(&label, CSeq_id::eContent);
2546         string url_link = kIdenticalProteinsUrl;
2547         string lnk_displ = "Identical Proteins";
2548         url_link = s_MapLinkoutGenParam(url_link,rid,NStr::NumericToString(ZERO_GI),for_alignment, cur_align,label,lnk_displ);
2549         url_link = CAlignFormatUtil::MapTemplate(kIdenticalProteinsDispl,"lnk",url_link);
2550         url_link = CAlignFormatUtil::MapTemplate(url_link,"label",label);
2551         linkout_list.push_back(url_link);
2552     }
2553 }
2554 
2555 
2556 
2557 //reset:taxname,gnl
s_GetFullLinkoutUrl(CBioseq::TId & cur_id,CAlignFormatUtil::SLinkoutInfo & linkoutInfo,map<int,vector<CBioseq::TId>> & linkout_map,bool getIdentProteins)2558 static list<string> s_GetFullLinkoutUrl(CBioseq::TId& cur_id,
2559                                         CAlignFormatUtil::SLinkoutInfo &linkoutInfo,
2560                                         map<int, vector < CBioseq::TId > >  &linkout_map,
2561                                         bool getIdentProteins)
2562 
2563 {
2564     list<string> linkout_list;
2565 
2566     vector<string> linkLetters;
2567     NStr::Split(linkoutInfo.linkoutOrder,",",linkLetters); //linkoutOrder = "G,U,M,V,E,S,B,R,T"
2568 	for(size_t i = 0; i < linkLetters.size(); i++) {
2569         TGi first_gi = ZERO_GI;
2570         vector < CBioseq::TId > idList;
2571         int linkout = s_LinkLetterToType(linkLetters[i]);
2572         linkoutInfo.taxName.clear();
2573         if(linkout & (eMapviewer | eGenomicSeq)) {
2574             linkout = (linkout_map[eGenomicSeq].size() != 0) ? eGenomicSeq : eMapviewer;
2575             linkoutInfo.taxName = s_GetTaxName(linkoutInfo.taxid);
2576         }
2577         if(linkout_map.find(linkout) != linkout_map.end()) {
2578             idList = linkout_map[linkout];
2579         }
2580         bool disableLink = (linkout == 0 || idList.size() == 0 || ( (linkout & eStructure) && (linkoutInfo.cdd_rid == "" || linkoutInfo.cdd_rid == "0")));
2581 
2582         string giList,labelList;
2583         int seqVersion = ((linkout & eGenomeDataViewer) || (linkout & eTranscript)) ? true : false;
2584         for (size_t i = 0; i < idList.size(); i++) {
2585             const CBioseq::TId& ids = idList[i];
2586             TGi gi = FindGi(ids);
2587             if (first_gi == ZERO_GI) first_gi = gi;
2588 
2589 
2590             CRef<CSeq_id> wid = FindBestChoice(ids, CSeq_id::WorstRank);
2591             string label = CAlignFormatUtil::GetLabel(wid,seqVersion);
2592             if(!labelList.empty()) labelList += ",";
2593             labelList += label;
2594 
2595             //use only first gi for bioAssay protein
2596             if(!giList.empty() && (linkout & eBioAssay) && !linkoutInfo.is_na) continue;
2597             if(!giList.empty()) giList += ",";
2598             giList += NStr::NumericToString(gi);
2599         }
2600 
2601         linkoutInfo.gnl.clear();
2602         if(!disableLink && linkout == eGenomicSeq) {
2603             linkoutInfo.gnl = s_GetBestIDForURL(cur_id);
2604         }
2605 
2606         if(!disableLink) {//
2607             //The following list will contain only one entry for single linkout value
2608             list<string> one_linkout = s_GetLinkoutUrl(linkout,
2609                                                           giList,
2610                                                           labelList,
2611                                                           first_gi,
2612                                                           linkoutInfo);
2613             if(one_linkout.size() > 0) {
2614                 list<string>::iterator iter = one_linkout.begin();
2615                 linkout_list.push_back(*iter);
2616             }
2617         }
2618  }
2619  if(getIdentProteins) {
2620     s_AddOtherRelatedInfoLinks(cur_id,linkoutInfo.rid,linkoutInfo.is_na,linkoutInfo.for_alignment,linkoutInfo.cur_align,linkout_list);
2621  }
2622  return linkout_list;
2623 }
2624 
GetFullLinkoutUrl(const list<CRef<CBlast_def_line>> & bdl,CAlignFormatUtil::SLinkoutInfo & linkoutInfo)2625 list<string> CAlignFormatUtil::GetFullLinkoutUrl(const list< CRef< CBlast_def_line > > &bdl,
2626                                                     CAlignFormatUtil::SLinkoutInfo &linkoutInfo)
2627 {
2628     list<string> linkout_list;
2629     map<int, vector < CBioseq::TId > >  linkout_map;
2630     if(bdl.size() > 0) {
2631     	GetBdlLinkoutInfo(bdl,linkout_map, linkoutInfo.linkoutdb, linkoutInfo.mv_build_name);
2632     	list< CRef< CBlast_def_line > >::const_iterator iter = bdl.begin();
2633     	CBioseq::TId& cur_id = (CBioseq::TId &)(*iter)->GetSeqid();
2634         linkout_list = s_GetFullLinkoutUrl(cur_id,
2635                                            linkoutInfo,
2636                                            linkout_map,
2637                                            !linkoutInfo.is_na && bdl.size() > 1);
2638     }
2639     return linkout_list;
2640 }
2641 
2642 
GetFullLinkoutUrl(const list<CRef<CBlast_def_line>> & bdl,const string & rid,const string & cdd_rid,const string & entrez_term,bool is_na,bool structure_linkout_as_group,bool for_alignment,int cur_align,string & linkoutOrder,TTaxId taxid,string & database,int query_number,string & user_url,string & preComputedResID,ILinkoutDB * linkoutdb,const string & mv_build_name)2643 list<string> CAlignFormatUtil::GetFullLinkoutUrl(const list< CRef< CBlast_def_line > > &bdl,
2644                                                  const string& rid,
2645                                                  const string& cdd_rid,
2646                                                  const string& entrez_term,
2647                                                  bool is_na,
2648                                                  bool structure_linkout_as_group,
2649                                                  bool for_alignment,
2650                                                  int cur_align,
2651                                                  string& linkoutOrder,
2652                                                  TTaxId taxid,
2653                                                  string &database,
2654                                                  int query_number,
2655                                                  string &user_url,
2656                                                  string &preComputedResID,
2657                                                  ILinkoutDB* linkoutdb,
2658                                                  const string& mv_build_name)
2659 
2660 {
2661     list<string> linkout_list;
2662     map<int, vector < CBioseq::TId > >  linkout_map;
2663     if(bdl.size() > 0) {
2664     	GetBdlLinkoutInfo(bdl,linkout_map, linkoutdb, mv_build_name);
2665     	list< CRef< CBlast_def_line > >::const_iterator iter = bdl.begin();
2666     	CBioseq::TId& cur_id = (CBioseq::TId &)(*iter)->GetSeqid();
2667 
2668         SLinkoutInfo linkoutInfo;
2669         linkoutInfo.Init(rid,
2670                          cdd_rid,
2671                          entrez_term,
2672                          is_na,
2673                          database,
2674                          query_number,
2675                          user_url,
2676                          preComputedResID,
2677                          linkoutOrder,
2678                          structure_linkout_as_group,
2679                          for_alignment);
2680 
2681         linkoutInfo.cur_align = cur_align;
2682         linkoutInfo.taxid = taxid;
2683 
2684         linkout_list = s_GetFullLinkoutUrl(cur_id,
2685                                         linkoutInfo,
2686                                         linkout_map,
2687                                         !is_na && bdl.size() > 1);
2688     }
2689     return linkout_list;
2690 }
2691 
2692 
GetFullLinkoutUrl(CBioseq::TId & cur_id,CAlignFormatUtil::SLinkoutInfo & linkoutInfo,bool getIdentProteins)2693 list<string> CAlignFormatUtil::GetFullLinkoutUrl(CBioseq::TId& cur_id,
2694                                                  CAlignFormatUtil::SLinkoutInfo &linkoutInfo,
2695                                                  bool getIdentProteins)
2696 {
2697     list<string> linkout_list;
2698     map<int, vector < CBioseq::TId > >  linkout_map;
2699 
2700     GetBdlLinkoutInfo(cur_id,linkout_map, linkoutInfo.linkoutdb, linkoutInfo.mv_build_name);
2701     linkout_list = s_GetFullLinkoutUrl(cur_id,
2702                                        linkoutInfo,
2703                                        linkout_map,
2704                                        getIdentProteins);
2705     return linkout_list;
2706 }
2707 
GetFullLinkoutUrl(CBioseq::TId & cur_id,const string & rid,const string & cdd_rid,const string & entrez_term,bool is_na,bool structure_linkout_as_group,bool for_alignment,int cur_align,string & linkoutOrder,TTaxId taxid,string & database,int query_number,string & user_url,string & preComputedResID,ILinkoutDB * linkoutdb,const string & mv_build_name,bool getIdentProteins)2708 list<string> CAlignFormatUtil::GetFullLinkoutUrl(CBioseq::TId& cur_id,
2709                                                  const string& rid,
2710                                                  const string& cdd_rid,
2711                                                  const string& entrez_term,
2712                                                  bool is_na,
2713                                                  bool structure_linkout_as_group,
2714                                                  bool for_alignment,
2715                                                  int cur_align,
2716                                                  string& linkoutOrder,
2717                                                  TTaxId taxid,
2718                                                  string &database,
2719                                                  int query_number,
2720                                                  string &user_url,
2721                                                  string &preComputedResID,
2722                                                  ILinkoutDB* linkoutdb,
2723                                                  const string& mv_build_name,
2724                                                  bool getIdentProteins)
2725 
2726 {
2727     list<string> linkout_list;
2728 
2729     map<int, vector < CBioseq::TId > >  linkout_map;
2730     GetBdlLinkoutInfo(cur_id,linkout_map, linkoutdb, mv_build_name);
2731     SLinkoutInfo linkoutInfo;
2732     linkoutInfo.Init(rid,
2733                      cdd_rid,
2734                      entrez_term,
2735                      is_na,
2736                      database,
2737                      query_number,
2738                      user_url,
2739                      preComputedResID,
2740                      linkoutOrder,
2741                      structure_linkout_as_group,
2742                      for_alignment);
2743 
2744     linkoutInfo.cur_align = cur_align;
2745     linkoutInfo.taxid = taxid;
2746 
2747     linkout_list = s_GetFullLinkoutUrl(cur_id,
2748                                        linkoutInfo,
2749                                        linkout_map,
2750                                        getIdentProteins);
2751     return linkout_list;
2752 }
2753 
2754 
FromRangeAscendingSort(CRange<TSeqPos> const & info1,CRange<TSeqPos> const & info2)2755 static bool FromRangeAscendingSort(CRange<TSeqPos> const& info1,
2756                                    CRange<TSeqPos> const& info2)
2757 {
2758     return info1.GetFrom() < info2.GetFrom();
2759 }
2760 
2761 //0 for query, 1 for subject
2762 //Gets query and subject range lists,oppositeStrands param
s_ProcessAlignSet(const CSeq_align_set & alnset,list<CRange<TSeqPos>> & query_list,list<CRange<TSeqPos>> & subject_list)2763 static bool s_ProcessAlignSet(const CSeq_align_set& alnset,
2764                               list<CRange<TSeqPos> > &query_list,
2765                               list<CRange<TSeqPos> > &subject_list)
2766 {
2767     bool oppositeStrands = false;
2768     bool isFirst = false;
2769     ITERATE(CSeq_align_set::Tdata, iter, alnset.Get()) {
2770         CRange<TSeqPos> query_range = (*iter)->GetSeqRange(0);
2771         //for minus strand
2772         if(query_range.GetFrom() > query_range.GetTo()){
2773             query_range.Set(query_range.GetTo(), query_range.GetFrom());
2774         }
2775         query_list.push_back(query_range);
2776 
2777         CRange<TSeqPos> subject_range = (*iter)->GetSeqRange(1);
2778         //for minus strand
2779         if(subject_range.GetFrom() > subject_range.GetTo()){
2780             subject_range.Set(subject_range.GetTo(), subject_range.GetFrom());
2781         }
2782         subject_list.push_back(subject_range);
2783 
2784         oppositeStrands = (!isFirst) ? (*iter)->GetSeqStrand(0) != (*iter)->GetSeqStrand(1) : oppositeStrands;
2785         isFirst = true;
2786     }
2787 
2788     query_list.sort(FromRangeAscendingSort);
2789     subject_list.sort(FromRangeAscendingSort);
2790     return oppositeStrands;
2791 }
2792 
2793 
2794 
2795 //0 for query, 1 for subject
s_MergeRangeList(list<CRange<TSeqPos>> & source)2796 static list<CRange<TSeqPos> > s_MergeRangeList(list<CRange<TSeqPos> > &source)
2797 {
2798 
2799     list<CRange<TSeqPos> > merge_list;
2800 
2801     bool is_first = true;
2802     CRange<TSeqPos> prev_range (0, 0);
2803     ITERATE(list<CRange<TSeqPos> >, iter, source) {
2804 
2805         if (is_first) {
2806             merge_list.push_back(*iter);
2807             is_first= false;
2808             prev_range = *iter;
2809         } else {
2810             if (prev_range.IntersectingWith(*iter)) {
2811                 merge_list.pop_back();
2812                 CRange<TSeqPos> temp_range = prev_range.CombinationWith(*iter);
2813                 merge_list.push_back(temp_range);
2814                 prev_range = temp_range;
2815             } else {
2816                 merge_list.push_back(*iter);
2817                 prev_range = *iter;
2818             }
2819         }
2820 
2821     }
2822     return merge_list;
2823 }
2824 
GetMasterCoverage(const CSeq_align_set & alnset)2825 int CAlignFormatUtil::GetMasterCoverage(const CSeq_align_set& alnset)
2826 {
2827 
2828     list<CRange<TSeqPos> > merge_list;
2829 
2830     list<CRange<TSeqPos> > temp;
2831     ITERATE(CSeq_align_set::Tdata, iter, alnset.Get()) {
2832         CRange<TSeqPos> seq_range = (*iter)->GetSeqRange(0);
2833         //for minus strand
2834         if(seq_range.GetFrom() > seq_range.GetTo()){
2835             seq_range.Set(seq_range.GetTo(), seq_range.GetFrom());
2836         }
2837         temp.push_back(seq_range);
2838     }
2839 
2840     temp.sort(FromRangeAscendingSort);
2841 
2842     merge_list = s_MergeRangeList(temp);
2843 
2844     int master_covered_lenghth = 0;
2845     ITERATE(list<CRange<TSeqPos> >, iter, merge_list) {
2846         master_covered_lenghth += iter->GetLength();
2847     }
2848     return master_covered_lenghth;
2849 }
2850 
2851 
2852 
GetSeqAlignCoverageParams(const CSeq_align_set & alnset,int * master_covered_lenghth,bool * flip)2853 CRange<TSeqPos> CAlignFormatUtil::GetSeqAlignCoverageParams(const CSeq_align_set& alnset,int *master_covered_lenghth,bool *flip)
2854 
2855 {
2856 
2857     list<CRange<TSeqPos> > query_list;
2858     list<CRange<TSeqPos> > subject_list;
2859 
2860     *flip = s_ProcessAlignSet(alnset,query_list,subject_list);
2861     query_list = s_MergeRangeList(query_list);
2862     subject_list = s_MergeRangeList(subject_list);
2863 
2864 
2865     *master_covered_lenghth = 0;
2866     ITERATE(list<CRange<TSeqPos> >, iter, query_list) {
2867         *master_covered_lenghth += iter->GetLength();
2868     }
2869 
2870     TSeqPos from = 0,to = 0;
2871     ITERATE(list<CRange<TSeqPos> >, iter, subject_list) {
2872         from = (from == 0) ? iter->GetFrom() : min(from,iter->GetFrom());
2873         to = max(to,iter->GetTo());
2874     }
2875     //cerr << "from,to = " << from << "," << to << endl;
2876     CRange<TSeqPos> subjectRange(from + 1, to + 1);
2877     return subjectRange;
2878 }
2879 
2880 CRef<CSeq_align_set>
SortSeqalignForSortableFormat(CCgiContext & ctx,CScope & scope,CSeq_align_set & aln_set,bool nuc_to_nuc_translation,int db_sort,int hit_sort,int hsp_sort,ILinkoutDB * linkoutdb,const string & mv_build_name)2881 CAlignFormatUtil::SortSeqalignForSortableFormat(CCgiContext& ctx,
2882                                              CScope& scope,
2883                                              CSeq_align_set& aln_set,
2884                                              bool nuc_to_nuc_translation,
2885                                              int db_sort,
2886                                              int hit_sort,
2887                                              int hsp_sort,
2888                                              ILinkoutDB* linkoutdb,
2889                                              const string& mv_build_name) {
2890 
2891 
2892     if (db_sort == 0 && hit_sort < 1 && hsp_sort < 1)
2893        return (CRef<CSeq_align_set>) &aln_set;
2894 
2895     list< CRef<CSeq_align_set> > seqalign_hit_total_list;
2896     vector< CRef<CSeq_align_set> > seqalign_vec(2);
2897     seqalign_vec[0] = new CSeq_align_set;
2898     seqalign_vec[1] = new CSeq_align_set;
2899 
2900     if(IsMixedDatabase(ctx)) {
2901         SplitSeqalignByMolecularType(seqalign_vec, db_sort, aln_set, scope,
2902                                      linkoutdb, mv_build_name);
2903     }else {
2904         seqalign_vec[0] = const_cast<CSeq_align_set*>(&aln_set);
2905     }
2906 
2907 
2908     ITERATE(vector< CRef<CSeq_align_set> >, iter, seqalign_vec){
2909         list< CRef<CSeq_align_set> > one_seqalign_hit_total_list = SortOneSeqalignForSortableFormat(**iter,
2910                                                             nuc_to_nuc_translation,
2911                                                             hit_sort,
2912                                                             hsp_sort);
2913 
2914         seqalign_hit_total_list.splice(seqalign_hit_total_list.end(),one_seqalign_hit_total_list);
2915 
2916     }
2917 
2918     return HitListToHspList(seqalign_hit_total_list);
2919 }
2920 list< CRef<CSeq_align_set> >
SortOneSeqalignForSortableFormat(const CSeq_align_set & source,bool nuc_to_nuc_translation,int hit_sort,int hsp_sort)2921 CAlignFormatUtil::SortOneSeqalignForSortableFormat(const CSeq_align_set& source,
2922                                                 bool nuc_to_nuc_translation,
2923                                                 int hit_sort,
2924                                                 int hsp_sort)
2925 {
2926     list< CRef<CSeq_align_set> > seqalign_hit_total_list;
2927     list< CRef<CSeq_align_set> > seqalign_hit_list;
2928     HspListToHitList(seqalign_hit_list, source);
2929 
2930     if (hit_sort == eTotalScore) {
2931         seqalign_hit_list.sort(SortHitByTotalScoreDescending);
2932     } else if (hit_sort == eHighestScore) {
2933         seqalign_hit_list.sort(CAlignFormatUtil::SortHitByScoreDescending);
2934     } else if (hit_sort == ePercentIdentity) {
2935         SortHitByPercentIdentityDescending(seqalign_hit_list,
2936                                                nuc_to_nuc_translation);
2937     } else if (hit_sort == eQueryCoverage) {
2938         seqalign_hit_list.sort(SortHitByMasterCoverageDescending);
2939     }
2940 
2941     ITERATE(list< CRef<CSeq_align_set> >, iter2, seqalign_hit_list) {
2942         CRef<CSeq_align_set> temp(*iter2);
2943         if (hsp_sort == eQueryStart) {
2944             temp->Set().sort(SortHspByMasterStartAscending);
2945         } else if (hsp_sort == eHspPercentIdentity) {
2946             temp->Set().sort(SortHspByPercentIdentityDescending);
2947         } else if (hsp_sort == eScore) {
2948             temp->Set().sort(SortHspByScoreDescending);
2949         } else if (hsp_sort == eSubjectStart) {
2950             temp->Set().sort(SortHspBySubjectStartAscending);
2951 
2952         }
2953         seqalign_hit_total_list.push_back(temp);
2954     }
2955     return seqalign_hit_total_list;
2956 }
2957 
2958 CRef<CSeq_align_set>
SortSeqalignForSortableFormat(CSeq_align_set & aln_set,bool nuc_to_nuc_translation,int hit_sort,int hsp_sort)2959 CAlignFormatUtil::SortSeqalignForSortableFormat(CSeq_align_set& aln_set,
2960                                              bool nuc_to_nuc_translation,
2961                                              int hit_sort,
2962                                              int hsp_sort) {
2963 
2964     if (hit_sort <= eEvalue && hsp_sort <= eHspEvalue) {
2965        return (CRef<CSeq_align_set>) &aln_set;
2966     }
2967 
2968 //  seqalign_vec[0] = const_cast<CSeq_align_set*>(&aln_set);
2969     list< CRef<CSeq_align_set> > seqalign_hit_total_list = SortOneSeqalignForSortableFormat(aln_set,
2970                                                             nuc_to_nuc_translation,
2971                                                             hit_sort,
2972                                                             hsp_sort);
2973     return HitListToHspList(seqalign_hit_total_list);
2974 }
2975 
2976 
FilterSeqalignByEval(CSeq_align_set & source_aln,double evalueLow,double evalueHigh)2977 CRef<CSeq_align_set> CAlignFormatUtil::FilterSeqalignByEval(CSeq_align_set& source_aln,
2978                                      double evalueLow,
2979                                      double evalueHigh)
2980 {
2981     int score, sum_n, num_ident;
2982     double bits, evalue;
2983     list<TGi> use_this_gi;
2984 
2985     CRef<CSeq_align_set> new_aln(new CSeq_align_set);
2986 
2987     ITERATE(CSeq_align_set::Tdata, iter, source_aln.Get()){
2988         CAlignFormatUtil::GetAlnScores(**iter, score, bits, evalue,
2989                                        sum_n, num_ident, use_this_gi);
2990         //Add the next three lines to re-calculte seq align evalue to the obe that is displayed on the screen
2991 		//string evalue_buf, bit_score_buf, total_bit_buf, raw_score_buf;
2992 		//CAlignFormatUtil::GetScoreString(evalue, bits, 0, 0, evalue_buf, bit_score_buf, total_bit_buf, raw_score_buf);
2993 		//evalue = NStr::StringToDouble(evalue_buf);
2994         if(evalue >= evalueLow && evalue <= evalueHigh) {
2995             new_aln->Set().push_back(*iter);
2996         }
2997     }
2998     return new_aln;
2999 
3000 }
3001 
3002 /// Returns percent match for an alignment.
3003 /// Normally we round up the value, unless that means that an
3004 /// alignment with mismatches would be 100%.  In that case
3005 /// it becomes 99%.
3006 ///@param numerator: numerator in percent identity calculation.
3007 ///@param denominator: denominator in percent identity calculation.
GetPercentMatch(int numerator,int denominator)3008 int CAlignFormatUtil::GetPercentMatch(int numerator, int denominator)
3009 {
3010      if (numerator == denominator)
3011         return 100;
3012      else {
3013        int retval =(int) (0.5 + 100.0*((double)numerator)/((double)denominator));
3014        retval = min(99, retval);
3015        return retval;
3016      }
3017 }
3018 
GetPercentIdentity(int numerator,int denominator)3019 double CAlignFormatUtil::GetPercentIdentity(int numerator, int denominator)
3020 {
3021      if (numerator == denominator)
3022         return 100;
3023      else {
3024        double retval =100*(double)numerator/(double)denominator;
3025        return retval;
3026      }
3027 }
3028 
FilterSeqalignByPercentIdent(CSeq_align_set & source_aln,double percentIdentLow,double percentIdentHigh)3029 CRef<CSeq_align_set> CAlignFormatUtil::FilterSeqalignByPercentIdent(CSeq_align_set& source_aln,
3030                                                                     double percentIdentLow,
3031                                                                     double percentIdentHigh)
3032 {
3033     int score, sum_n, num_ident;
3034     double bits, evalue;
3035     list<TGi> use_this_gi;
3036 
3037     CRef<CSeq_align_set> new_aln(new CSeq_align_set);
3038 
3039     ITERATE(CSeq_align_set::Tdata, iter, source_aln.Get()){
3040         CAlignFormatUtil::GetAlnScores(**iter, score, bits, evalue,
3041                                         sum_n, num_ident, use_this_gi);
3042         int seqAlnLength = GetAlignmentLength(**iter, kTranslation);
3043         if(seqAlnLength > 0 && num_ident > 0) {
3044             double alnPercentIdent = GetPercentIdentity(num_ident, seqAlnLength);
3045             if(alnPercentIdent >= percentIdentLow && alnPercentIdent <= percentIdentHigh) {
3046                 new_aln->Set().push_back(*iter);
3047             }
3048         }
3049     }
3050     return new_aln;
3051 }
3052 
FilterSeqalignByScoreParams(CSeq_align_set & source_aln,double evalueLow,double evalueHigh,double percentIdentLow,double percentIdentHigh)3053 CRef<CSeq_align_set> CAlignFormatUtil::FilterSeqalignByScoreParams(CSeq_align_set& source_aln,
3054                                                                     double evalueLow,
3055                                                                     double evalueHigh,
3056                                                                     double percentIdentLow,
3057                                                                     double percentIdentHigh)
3058 {
3059     int score, sum_n, num_ident;
3060     double bits, evalue;
3061     list<TGi> use_this_gi;
3062 
3063     CRef<CSeq_align_set> new_aln(new CSeq_align_set);
3064 
3065     ITERATE(CSeq_align_set::Tdata, iter, source_aln.Get()){
3066         CAlignFormatUtil::GetAlnScores(**iter, score, bits, evalue,
3067                                        sum_n, num_ident, use_this_gi);
3068         //Add the next three lines to re-calculte seq align evalue to the one that is displayed on the screen
3069 		//string evalue_buf, bit_score_buf, total_bit_buf, raw_score_buf;
3070 		//CAlignFormatUtil::GetScoreString(evalue, bits, 0, 0, evalue_buf, bit_score_buf, total_bit_buf, raw_score_buf);
3071 		//evalue = NStr::StringToDouble(evalue_buf);
3072 		int seqAlnLength = GetAlignmentLength(**iter, kTranslation);
3073 		if(seqAlnLength > 0 && num_ident > 0) {
3074 			int alnPercentIdent = GetPercentMatch(num_ident, seqAlnLength);
3075 			if( (evalue >= evalueLow && evalue <= evalueHigh) &&
3076 				(alnPercentIdent >= percentIdentLow && alnPercentIdent <= percentIdentHigh)) {
3077 				new_aln->Set().push_back(*iter);
3078 			}
3079         }
3080     }
3081     return new_aln;
3082 }
3083 
adjustPercentIdentToDisplayValue(double value)3084 static double adjustPercentIdentToDisplayValue(double value)
3085 {
3086     char buffer[512];
3087     sprintf(buffer, "%.*f", 2, value);
3088     double newVal = NStr::StringToDouble(buffer);
3089     return newVal;
3090 }
3091 
s_isAlnInFilteringRange(double evalue,double percentIdent,int queryCover,double evalueLow,double evalueHigh,double percentIdentLow,double percentIdentHigh,int queryCoverLow,int queryCoverHigh)3092 static bool s_isAlnInFilteringRange(double evalue,
3093                                   double percentIdent,
3094                                   int queryCover,
3095                                   double evalueLow,
3096                                   double evalueHigh,
3097                                   double percentIdentLow,
3098                                   double percentIdentHigh,
3099                                   int queryCoverLow,
3100                                   int queryCoverHigh)
3101 {
3102 
3103 
3104     bool isInRange = false;
3105     //Adjust percent identity and evalue to display values
3106     percentIdent = adjustPercentIdentToDisplayValue(percentIdent);
3107     string evalue_buf, bit_score_buf, total_bit_buf, raw_score_buf;
3108     double bits = 0;
3109 	CAlignFormatUtil::GetScoreString(evalue, bits, 0, 0, evalue_buf, bit_score_buf, total_bit_buf, raw_score_buf);
3110 	evalue = NStr::StringToDouble(evalue_buf);
3111 
3112     if(evalueLow >= 0 && percentIdentLow >= 0 && queryCoverLow >= 0) {
3113         isInRange = (evalue >= evalueLow && evalue <= evalueHigh) &&
3114 				    (percentIdent >= percentIdentLow && percentIdent <= percentIdentHigh) &&
3115                     (queryCover >= queryCoverLow && queryCover <= queryCoverHigh);
3116     }
3117     else if(evalueLow >= 0 && percentIdentLow >= 0) {
3118         isInRange = (evalue >= evalueLow && evalue <= evalueHigh) &&
3119 				(percentIdent >= percentIdentLow && percentIdent <= percentIdentHigh);
3120 	}
3121     else if(evalueLow >= 0 && queryCoverLow >= 0) {
3122         isInRange = (evalue >= evalueLow && evalue <= evalueHigh) &&
3123 				(queryCover >= queryCoverLow && queryCover <= queryCoverHigh);
3124 	}
3125     else if(queryCoverLow >= 0 && 	percentIdentLow >= 0) {
3126         isInRange = (queryCover >= queryCoverLow && queryCover <= queryCoverHigh) &&
3127 				(percentIdent >= percentIdentLow && percentIdent <= percentIdentHigh);
3128 	}
3129     else if(evalueLow >= 0) {
3130         isInRange = (evalue >= evalueLow && evalue <= evalueHigh);
3131     }
3132 	else if(percentIdentLow >= 0) {
3133         isInRange = (percentIdent >= percentIdentLow && percentIdent <= percentIdentHigh);
3134 	}
3135     else if(queryCoverLow >= 0) {
3136         isInRange = (queryCover >= queryCoverLow && queryCover <= queryCoverHigh);
3137 	}
3138     return isInRange;
3139 }
3140 
FilterSeqalignByScoreParams(CSeq_align_set & source_aln,double evalueLow,double evalueHigh,double percentIdentLow,double percentIdentHigh,int queryCoverLow,int queryCoverHigh)3141 CRef<CSeq_align_set> CAlignFormatUtil::FilterSeqalignByScoreParams(CSeq_align_set& source_aln,
3142                                                                     double evalueLow,
3143                                                                     double evalueHigh,
3144                                                                     double percentIdentLow,
3145                                                                     double percentIdentHigh,
3146                                                                     int queryCoverLow,
3147                                                                     int queryCoverHigh)
3148 {
3149     list< CRef<CSeq_align_set> > seqalign_hit_total_list;
3150     list< CRef<CSeq_align_set> > seqalign_hit_list;
3151 
3152     HspListToHitList(seqalign_hit_list, source_aln);
3153 
3154     ITERATE(list< CRef<CSeq_align_set> >, iter, seqalign_hit_list) {
3155         CRef<CSeq_align_set> temp(*iter);
3156         CAlignFormatUtil::SSeqAlignSetCalcParams* seqSetInfo = CAlignFormatUtil::GetSeqAlignSetCalcParamsFromASN(*temp);
3157 
3158         if(s_isAlnInFilteringRange(seqSetInfo->evalue,
3159                                   seqSetInfo->percent_identity,
3160                                   seqSetInfo->percent_coverage,
3161                                   evalueLow,
3162                                   evalueHigh,
3163                                   percentIdentLow,
3164                                   percentIdentHigh,
3165                                   queryCoverLow,
3166                                   queryCoverHigh)) {
3167             seqalign_hit_total_list.push_back(temp);
3168         }
3169     }
3170     return HitListToHspList(seqalign_hit_total_list);
3171 }
3172 
LimitSeqalignByHsps(CSeq_align_set & source_aln,int maxAligns,int maxHsps)3173 CRef<CSeq_align_set> CAlignFormatUtil::LimitSeqalignByHsps(CSeq_align_set& source_aln,
3174                                                            int maxAligns,
3175                                                            int maxHsps)
3176 {
3177     CRef<CSeq_align_set> new_aln(new CSeq_align_set);
3178 
3179     CConstRef<CSeq_id> prevQueryId,prevSubjectId;
3180     int alignCount = 0,hspCount = 0;
3181     ITERATE(CSeq_align_set::Tdata, iter, source_aln.Get()){
3182         const CSeq_id& newQueryId = (*iter)->GetSeq_id(0);
3183         if(prevQueryId.Empty() || !newQueryId.Match(*prevQueryId)){
3184             if (hspCount >= maxHsps) {
3185                 break;
3186             }
3187             alignCount = 0;
3188             prevQueryId = &newQueryId;
3189         }
3190         if (alignCount < maxAligns) {
3191             const CSeq_id& newSubjectId = (*iter)->GetSeq_id(1);
3192             // Increment alignments count if subject sequence is different
3193             if(prevSubjectId.Empty() || !newSubjectId.Match(*prevSubjectId)){
3194                 ++alignCount;
3195                 prevSubjectId = &newSubjectId;
3196             }
3197             // Increment HSP count if the alignments limit is not reached
3198             ++hspCount;
3199             new_aln->Set().push_back(*iter);
3200         }
3201 
3202     }
3203     return new_aln;
3204 }
3205 
3206 
ExtractQuerySeqAlign(CRef<CSeq_align_set> & source_aln,int queryNumber)3207 CRef<CSeq_align_set> CAlignFormatUtil::ExtractQuerySeqAlign(CRef<CSeq_align_set> &source_aln,
3208                                                             int queryNumber)
3209 {
3210     if(queryNumber == 0) {
3211         return source_aln;
3212     }
3213     CRef<CSeq_align_set> new_aln;
3214 
3215     CConstRef<CSeq_id> prevQueryId;
3216     int currQueryNum = 0;
3217 
3218     ITERATE(CSeq_align_set::Tdata, iter, source_aln->Get()){
3219         const CSeq_id& newQueryId = (*iter)->GetSeq_id(0);
3220         if(prevQueryId.Empty() || !newQueryId.Match(*prevQueryId)){
3221             currQueryNum++;
3222             prevQueryId = &newQueryId;
3223         }
3224         //Record seq aligns corresponding to queryNumber
3225         if(currQueryNum == queryNumber) {
3226             if(new_aln.Empty()) {
3227                 new_aln.Reset(new CSeq_align_set);
3228             }
3229             new_aln->Set().push_back(*iter);
3230         }
3231         else if(currQueryNum > queryNumber) {
3232             break;
3233         }
3234     }
3235     return new_aln;
3236 }
3237 
3238 
InitConfig()3239 void CAlignFormatUtil::InitConfig()
3240 {
3241     string l_cfg_file_name;
3242     bool   l_dbg = CAlignFormatUtil::m_geturl_debug_flag;
3243     if( getenv("GETURL_DEBUG") ) CAlignFormatUtil::m_geturl_debug_flag = l_dbg = true;
3244     if( !m_Reg ) {
3245         bool cfgExists = true;
3246         string l_ncbi_env;
3247         string l_fmtcfg_env;
3248         if( NULL !=  getenv("NCBI")   ) l_ncbi_env = getenv("NCBI");
3249         if( NULL !=  getenv("FMTCFG") ) l_fmtcfg_env = getenv("FMTCFG");
3250         // config file name: value of FMTCFG or  default ( .ncbirc )
3251         if( l_fmtcfg_env.empty()  )
3252             l_cfg_file_name = ".ncbirc";
3253         else
3254             l_cfg_file_name = l_fmtcfg_env;
3255         // checkinf existance of configuration file
3256         CFile  l_fchecker( l_cfg_file_name );
3257         cfgExists = l_fchecker.Exists();
3258         if( (!cfgExists) && (!l_ncbi_env.empty()) ) {
3259             if( l_ncbi_env.rfind("/") != (l_ncbi_env.length() -1 ))
3260             l_ncbi_env.append("/");
3261             l_cfg_file_name = l_ncbi_env + l_cfg_file_name;
3262             CFile  l_fchecker2( l_cfg_file_name );
3263             cfgExists = l_fchecker2.Exists();
3264         }
3265         if(cfgExists) {
3266             CNcbiIfstream l_ConfigFile(l_cfg_file_name.c_str() );
3267             m_Reg.reset(new CNcbiRegistry(l_ConfigFile));
3268             if( l_dbg ) fprintf(stderr,"REGISTRY: %s\n",l_cfg_file_name.c_str());
3269         }
3270     }
3271     return;
3272 }
3273 
3274 //
3275 // get given url from registry file or return corresponding kNAME
3276 // value as default to preserve compatibility.
3277 //
3278 // algoritm:
3279 // 1) config file name is ".ncbirc" unless FMTCFG specifies another name
3280 // 2) try to read local configuration file before
3281 //    checking location specified by the NCBI environment.
3282 // 3) if index != -1, use it as trailing version number for a key name,
3283 //    ABCD_V0. try to read ABCD key if version variant doesn't exist.
3284 // 4) use INCLUDE_BASE_DIR key to specify base for all include files.
3285 // 5) treat "_FORMAT" key as filename first and  string in second.
3286 //    in case of existances of filename, read it starting from
3287 //    location specified by INCLUDE_BASE_DIR key
GetURLFromRegistry(const string url_name,int index)3288 string CAlignFormatUtil::GetURLFromRegistry( const string url_name, int index){
3289   string  result_url;
3290   string l_key, l_host_port, l_format;
3291   string l_secion_name = "BLASTFMTUTIL";
3292   string l_fmt_suffix = "_FORMAT";
3293   string l_host_port_suffix = "_HOST_PORT";
3294   string l_subst_pattern;
3295 
3296   if( !m_Reg ) {
3297     InitConfig();
3298   }
3299   if( !m_Reg ) return GetURLDefault(url_name,index); // can't read .ncbrc file
3300   string l_base_dir = m_Reg->Get(l_secion_name, "INCLUDE_BASE_DIR");
3301   if( !l_base_dir.empty() && ( l_base_dir.rfind("/") != (l_base_dir.length()-1)) ) {
3302     l_base_dir.append("/");
3303   }
3304 
3305 
3306   string default_host_port;
3307   string l_key_ndx;
3308   if( index >=0) {
3309     l_key_ndx = url_name + l_host_port_suffix + "_" + NStr::IntToString( index );
3310     l_subst_pattern="<@"+l_key_ndx+"@>";
3311     l_host_port = m_Reg->Get(l_secion_name, l_key_ndx); // try indexed
3312   }
3313   // next is initialization for non version/array type of settings
3314   if( l_host_port.empty()){  // not indexed or index wasn't found
3315     l_key = url_name + l_host_port_suffix; l_subst_pattern="<@"+l_key+"@>";
3316     l_host_port = m_Reg->Get(l_secion_name, l_key);
3317   }
3318   if( l_host_port.empty())   return GetURLDefault(url_name,index);
3319 
3320   // get format part
3321   l_key = url_name + l_fmt_suffix ; //"_FORMAT";
3322   l_key_ndx = l_key + "_" + NStr::IntToString( index );
3323   if( index >= 0 ){
3324     l_format = m_Reg->Get(l_secion_name, l_key_ndx);
3325   }
3326 
3327   if( l_format.empty() ) l_format = m_Reg->Get(l_secion_name, l_key);
3328   if( l_format.empty())   return GetURLDefault(url_name,index);
3329   // format found check wether this string or file name
3330   string l_format_file  = l_base_dir + l_format;
3331   CFile  l_fchecker( l_format_file );
3332   bool file_name_mode = l_fchecker.Exists();
3333   if( file_name_mode ) { // read whole content of the file to string buffer
3334     string l_inc_file_name = l_format_file;
3335     CNcbiIfstream l_file (l_inc_file_name.c_str(), ios::in|ios::binary|ios::ate);
3336     CT_POS_TYPE l_inc_size = l_file.tellg();
3337     //    size_t l_buf_sz = (size_t) l_inc_size;
3338     char *l_mem = new char [ (size_t) l_inc_size + 1];
3339     memset( l_mem,0, (size_t) l_inc_size + 1 ) ;
3340     l_file.seekg( 0, ios::beg );
3341     l_file.read(l_mem, l_inc_size);
3342     l_file.close();
3343     l_format.erase(); l_format.reserve( (size_t)l_inc_size + 1 );
3344     l_format =  l_mem;
3345     delete [] l_mem;
3346   }
3347 
3348   result_url = NStr::Replace(l_format,l_subst_pattern,l_host_port);
3349 
3350   if( result_url.empty()) return GetURLDefault(url_name,index);
3351   return result_url;
3352 }
3353 //
3354 // return default URL value for the given key.
3355 //
GetURLDefault(const string url_name,int index)3356 string  CAlignFormatUtil::GetURLDefault( const string url_name, int index) {
3357 
3358   string search_name = url_name;
3359   TTagUrlMap::const_iterator url_it;
3360   if( index >= 0 ) search_name += "_" + NStr::IntToString( index); // actual name for index value is NAME_{index}
3361 
3362   if( (url_it = sm_TagUrlMap.find( search_name ) ) != sm_TagUrlMap.end()) {
3363       string url_link = CAlignFormatUtil::MapProtocol(url_it->second);
3364       return url_link;
3365   }
3366 
3367   string error_msg = "CAlignFormatUtil::GetURLDefault:no_defualt_for"+url_name;
3368   if( index != -1 ) error_msg += "_index_"+ NStr::IntToString( index );
3369   return error_msg;
3370 }
3371 
3372 void
GetAsciiProteinMatrix(const char * matrix_name,CNcbiMatrix<int> & retval)3373 CAlignFormatUtil::GetAsciiProteinMatrix(const char* matrix_name,
3374                                         CNcbiMatrix<int>& retval)
3375 {
3376     retval.Resize(0, 0, -1);
3377     if (matrix_name == NULL ||
3378         NStr::TruncateSpaces(string(matrix_name)).empty()) {
3379         return;
3380     }
3381 
3382     const SNCBIPackedScoreMatrix* packed_mtx =
3383         NCBISM_GetStandardMatrix(matrix_name);
3384     if (packed_mtx == NULL) {
3385         return;
3386     }
3387     retval.Resize(k_NumAsciiChar, k_NumAsciiChar, -1000);
3388 
3389     SNCBIFullScoreMatrix mtx;
3390     NCBISM_Unpack(packed_mtx, &mtx);
3391 
3392     for(int i = 0; i < ePMatrixSize; ++i){
3393         for(int j = 0; j < ePMatrixSize; ++j){
3394             retval((size_t)k_PSymbol[i], (size_t)k_PSymbol[j]) =
3395                 mtx.s[(size_t)k_PSymbol[i]][(size_t)k_PSymbol[j]];
3396         }
3397     }
3398     for(int i = 0; i < ePMatrixSize; ++i) {
3399         retval((size_t)k_PSymbol[i], '*') = retval('*',(size_t)k_PSymbol[i]) = -4;
3400     }
3401     retval('*', '*') = 1;
3402     // this is to count Selenocysteine to Cysteine matches as positive
3403     retval('U', 'U') = retval('C', 'C');
3404     retval('U', 'C') = retval('C', 'C');
3405     retval('C', 'U') = retval('C', 'C');
3406 }
3407 
3408 
MapTemplate(string inpString,string tmplParamName,Int8 templParamVal)3409 string CAlignFormatUtil::MapTemplate(string inpString,string tmplParamName,Int8 templParamVal)
3410 {
3411     string outString;
3412     string tmplParam = "<@" + tmplParamName + "@>";
3413     NStr::Replace(inpString,tmplParam,NStr::NumericToString(templParamVal),outString);
3414     return outString;
3415 }
3416 
MapTemplate(string inpString,string tmplParamName,string templParamVal)3417 string CAlignFormatUtil::MapTemplate(string inpString,string tmplParamName,string templParamVal)
3418 {
3419     string outString;
3420     string tmplParam = "<@" + tmplParamName + "@>";
3421     NStr::Replace(inpString,tmplParam,templParamVal,outString);
3422     return outString;
3423 }
3424 
MapSpaceTemplate(string inpString,string tmplParamName,string templParamVal,unsigned int maxParamValLength,int spacesFormatFlag)3425 string CAlignFormatUtil::MapSpaceTemplate(string inpString,string tmplParamName,string templParamVal, unsigned int maxParamValLength, int spacesFormatFlag)
3426 {
3427     templParamVal = AddSpaces(templParamVal, maxParamValLength, spacesFormatFlag);
3428     string outString = MapTemplate(inpString,tmplParamName,templParamVal);
3429 
3430     return outString;
3431 }
3432 
3433 
AddSpaces(string paramVal,unsigned int maxParamValLength,int spacesFormatFlag)3434 string CAlignFormatUtil::AddSpaces(string paramVal, unsigned int maxParamValLength, int spacesFormatFlag)
3435 {
3436     //if(!spacePos.empty()) {
3437         string spaceString;
3438         if(maxParamValLength >= paramVal.size()) {
3439             unsigned int numSpaces = maxParamValLength - paramVal.size() + 1;
3440             if(spacesFormatFlag & eSpacePosToCenter) {
3441                 numSpaces = numSpaces/2;
3442             }
3443             for(size_t i=0; i < numSpaces ; i++){
3444                 spaceString += " ";
3445             }
3446         }
3447         else {
3448             paramVal = paramVal.substr(0, maxParamValLength - 3) + "...";
3449             spaceString += " ";
3450         }
3451         if(spacesFormatFlag & eSpacePosAtLineEnd) {
3452             paramVal = paramVal + spaceString;
3453         }
3454         else if(spacesFormatFlag & eSpacePosToCenter) {
3455             paramVal = spaceString + paramVal + spaceString;
3456         }
3457         else {
3458             paramVal = spaceString + paramVal;
3459         }
3460         if(spacesFormatFlag & eAddEOLAtLineStart) paramVal = "\n" + paramVal;
3461         if(spacesFormatFlag & eAddEOLAtLineEnd) paramVal = paramVal + "\n";
3462     //}
3463 
3464     return paramVal;
3465 }
3466 
3467 
3468 
GetProtocol()3469 string CAlignFormatUtil::GetProtocol()
3470 {
3471     CNcbiIfstream  config_file(".ncbirc");
3472     CNcbiRegistry config_reg(config_file);
3473     string httpProt = "https:";
3474     if(!config_reg.Empty()) {
3475         if(config_reg.HasEntry("BLASTFMTUTIL","PROTOCOL")) {
3476             httpProt = config_reg.Get("BLASTFMTUTIL","PROTOCOL");
3477         }
3478     }
3479     return httpProt;
3480 }
3481 
3482 /*
3483 if(no config file) protocol = "https:"
3484 if(no "BLASTFMTUTIL","PROTOCOL" entry in config file) protocol = "https:"
3485 if(there is entry in config) protocol = entry which could be blank = ""
3486 */
MapProtocol(string url_link)3487 string CAlignFormatUtil::MapProtocol(string url_link)
3488 {
3489     if(m_Protocol.empty()){
3490         if(!m_Reg) {
3491             InitConfig();
3492         }
3493         m_Protocol = (m_Reg && m_Reg->HasEntry("BLASTFMTUTIL","PROTOCOL")) ? m_Protocol = m_Reg->Get("BLASTFMTUTIL","PROTOCOL") : "https:";
3494     }
3495     url_link = CAlignFormatUtil::MapTemplate(url_link,"protocol",m_Protocol);
3496     return url_link;
3497 }
3498 
s_MapCommonUrlParams(string urlTemplate,CAlignFormatUtil::SSeqURLInfo * seqUrlInfo)3499 static string s_MapCommonUrlParams(string urlTemplate, CAlignFormatUtil::SSeqURLInfo *seqUrlInfo)
3500 {
3501     string db,logstr_moltype;
3502     if(seqUrlInfo->isDbNa) {
3503         db = "nucleotide";
3504         logstr_moltype = "nucl";
3505     } else {
3506         db = "protein";
3507         logstr_moltype ="prot";
3508     }
3509     string logstr_location = (seqUrlInfo->isAlignLink) ? "align" : "top";
3510     string url_link = CAlignFormatUtil::MapTemplate(urlTemplate,"db",db);
3511     url_link = CAlignFormatUtil::MapTemplate(url_link,"gi", GI_TO(TIntId, seqUrlInfo->gi));
3512     url_link = CAlignFormatUtil::MapTemplate(url_link,"log",logstr_moltype + logstr_location);
3513     url_link = CAlignFormatUtil::MapTemplate(url_link,"blast_rank",seqUrlInfo->blast_rank);
3514     url_link = CAlignFormatUtil::MapTemplate(url_link,"rid",seqUrlInfo->rid);
3515     url_link = CAlignFormatUtil::MapTemplate(url_link,"acc",seqUrlInfo->accession);
3516     url_link = CAlignFormatUtil::MapProtocol(url_link);
3517     return url_link;
3518 }
3519 
s_MapURLLink(string urlTemplate,CAlignFormatUtil::SSeqURLInfo * seqUrlInfo,const CBioseq::TId & ids)3520 static string s_MapURLLink(string urlTemplate, CAlignFormatUtil::SSeqURLInfo *seqUrlInfo, const CBioseq::TId& ids)
3521 {
3522     //Add specific blasttype/user_url template mapping here
3523     string url_link = urlTemplate;
3524     if (seqUrlInfo->user_url.find("sra.cgi") != string::npos) {
3525         string strRun, strSpotId,strReadIndex;
3526         if(s_GetSRASeqMetadata(ids,strRun,strSpotId,strReadIndex)) {
3527             url_link = CAlignFormatUtil::MapTemplate(url_link,"run",strRun);
3528             url_link = CAlignFormatUtil::MapTemplate(url_link,"spotid",strSpotId);
3529             url_link = CAlignFormatUtil::MapTemplate(url_link,"readindex",strReadIndex);
3530         }
3531     }
3532     //This maps generic params like log, blast_rank, rid
3533     url_link = s_MapCommonUrlParams(url_link, seqUrlInfo);
3534     return url_link;
3535 }
3536 
3537 
3538 
IsWGSPattern(string & wgsAccession)3539 bool CAlignFormatUtil::IsWGSPattern(string &wgsAccession)
3540 {
3541 	//const string  kWgsAccessionPattern = "^[A-Z]{4}[0-9]{8,10}(\.[0-9]+){0,1}$"; //example AUXO013124042 or AUXO013124042.1
3542     const unsigned int kWgsProjLength = 4;
3543     const unsigned int kWgsProjIDLengthMin = 8;
3544     const unsigned int kWgsProjIDLengthMax = 10;
3545     bool isWGS = true;
3546 
3547     if (wgsAccession.size() < 6) {
3548         return false;
3549     }
3550 
3551     if(NStr::Find(wgsAccession, ".") != NPOS) { //Accession has version AUXO013124042.1
3552 	    string version;
3553 		NStr::SplitInTwo(wgsAccession,".",wgsAccession,version);
3554 	}
3555 
3556     string wgsProj = wgsAccession.substr(0,kWgsProjLength);
3557     for (size_t i = 0; i < wgsProj.length(); i ++){
3558         if(!isalpha(wgsProj[i]&0xff)) {
3559             isWGS = false;
3560             break;
3561         }
3562     }
3563     if(isWGS) {
3564         string wgsId = wgsAccession.substr(kWgsProjLength);
3565         if(wgsId.length() >= kWgsProjIDLengthMin && wgsId.length() <= kWgsProjIDLengthMax) {
3566             for (size_t i = 0; i < wgsId.length(); i ++){
3567                 if(!isdigit(wgsId[i]&0xff)) {
3568                     isWGS = false;
3569                     break;
3570                 }
3571             }
3572         }
3573         else {
3574             isWGS = false;
3575         }
3576     }
3577     return isWGS;
3578 }
3579 
3580 
IsWGSAccession(string & wgsAccession,string & wgsProjName)3581 bool CAlignFormatUtil::IsWGSAccession(string &wgsAccession, string &wgsProjName)
3582 {
3583 	const unsigned int  kWgsProgNameLength = 6;
3584 	bool isWGS = IsWGSPattern(wgsAccession);
3585 	if(isWGS) {
3586 		wgsProjName = wgsAccession.substr(0,kWgsProgNameLength);
3587 	}
3588 	return isWGS;
3589 }
3590 
3591 
GetIDUrlGen(SSeqURLInfo * seqUrlInfo,const CBioseq::TId * ids)3592 string CAlignFormatUtil::GetIDUrlGen(SSeqURLInfo *seqUrlInfo,const CBioseq::TId* ids)
3593 {
3594     string url_link = NcbiEmptyString;
3595     CConstRef<CSeq_id> wid = FindBestChoice(*ids, CSeq_id::WorstRank);
3596 
3597     bool hasTextSeqID = GetTextSeqID(*ids);
3598     string title = "title=\"Show report for " + seqUrlInfo->accession + "\" ";
3599 
3600     string temp_class_info = kClassInfo; temp_class_info += " ";
3601 	string wgsProj;
3602 	string wgsAccession = seqUrlInfo->accession;
3603 	bool isWGS = false;
3604         if (!(wid->Which() == CSeq_id::e_Local || wid->Which() == CSeq_id::e_General)){
3605             isWGS = CAlignFormatUtil::IsWGSAccession(wgsAccession, wgsProj);
3606         }
3607 	if(isWGS && seqUrlInfo->useTemplates) {
3608 		string wgsUrl = CAlignFormatUtil::GetURLFromRegistry("WGS");
3609 		url_link = s_MapCommonUrlParams(wgsUrl, seqUrlInfo);
3610 		url_link = CAlignFormatUtil::MapTemplate(url_link,"wgsproj",wgsProj);
3611 		url_link = CAlignFormatUtil::MapTemplate(url_link,"wgsacc", wgsAccession);
3612 	}
3613     else if (hasTextSeqID) {
3614         string entrezTag = (seqUrlInfo->useTemplates) ? "ENTREZ_TM" : "ENTREZ";
3615         string l_EntrezUrl = CAlignFormatUtil::GetURLFromRegistry(entrezTag);
3616         url_link = s_MapCommonUrlParams(l_EntrezUrl, seqUrlInfo);
3617 
3618         if(!seqUrlInfo->useTemplates) {
3619 			url_link = CAlignFormatUtil::MapTemplate(url_link,"acc",seqUrlInfo->accession);
3620             temp_class_info = (!seqUrlInfo->defline.empty())? CAlignFormatUtil::MapTemplate(temp_class_info,"defline",NStr::JavaScriptEncode(seqUrlInfo->defline)):temp_class_info;
3621             url_link = CAlignFormatUtil::MapTemplate(url_link,"cssInf",(seqUrlInfo->addCssInfo) ? temp_class_info.c_str() : "");
3622             url_link = CAlignFormatUtil::MapTemplate(url_link,"target",seqUrlInfo->new_win ? "TARGET=\"EntrezView\"" : "");
3623         }
3624 
3625     } else {//seqid general, dbtag specified
3626         if(wid->Which() == CSeq_id::e_General){
3627             const CDbtag& dtg = wid->GetGeneral();
3628             const string& dbname = dtg.GetDb();
3629             if(NStr::CompareNocase(dbname, "TI") == 0){
3630                 string actual_id = CAlignFormatUtil::GetGnlID(dtg);
3631                 if(seqUrlInfo->useTemplates) {
3632                     string l_TraceUrl = CAlignFormatUtil::GetURLFromRegistry("TRACE_CGI");
3633                     url_link = l_TraceUrl + (string)"?cmd=retrieve&dopt=fasta&val=" + actual_id + "&RID=" + seqUrlInfo->rid;
3634                 }
3635                 else {
3636                     url_link = CAlignFormatUtil::MapTemplate(kTraceUrl,"val",actual_id);
3637                     temp_class_info = (!seqUrlInfo->defline.empty())? CAlignFormatUtil::MapTemplate(temp_class_info,"defline",seqUrlInfo->defline):temp_class_info;
3638                     url_link = CAlignFormatUtil::MapTemplate(url_link,"cssInf",(seqUrlInfo->addCssInfo) ? temp_class_info.c_str() : "");
3639                     url_link = CAlignFormatUtil::MapTemplate(url_link,"rid",seqUrlInfo->rid);
3640                 }
3641             }
3642         } else if (wid->Which() == CSeq_id::e_Local){
3643 
3644             string url_holder = CAlignFormatUtil::GetURLFromRegistry("LOCAL_ID");
3645 
3646             string user_url;
3647             if (m_Reg) {
3648                 user_url = (seqUrlInfo->addCssInfo) ? m_Reg->Get("LOCAL_ID","TOOL_URL_ALIGN") : m_Reg->Get("LOCAL_ID","TOOL_URL");
3649             }
3650             string id_string;
3651             wid->GetLabel(&id_string, CSeq_id::eContent);
3652             url_link = CAlignFormatUtil::MapTemplate(user_url,"seq_id", NStr::URLEncode(id_string));
3653             url_link = CAlignFormatUtil::MapTemplate(url_link,"db_name", NStr::URLEncode(seqUrlInfo->database));
3654             url_link = CAlignFormatUtil::MapTemplate(url_link,"taxid", TAX_ID_TO(int, seqUrlInfo->taxid));
3655             temp_class_info = (!seqUrlInfo->defline.empty())? CAlignFormatUtil::MapTemplate(temp_class_info,"defline",seqUrlInfo->defline):temp_class_info;
3656             url_link = CAlignFormatUtil::MapTemplate(url_link,"cssInf",(seqUrlInfo->addCssInfo) ? temp_class_info.c_str() : "");
3657             url_link = CAlignFormatUtil::MapTemplate(url_link,"title", id_string);
3658             url_link = CAlignFormatUtil::MapTemplate(url_link,"target",seqUrlInfo->new_win ? "TARGET=\"EntrezView\"" : "");
3659         }
3660     }
3661     url_link = CAlignFormatUtil::MapProtocol(url_link);
3662     seqUrlInfo->seqUrl = url_link;
3663 	return url_link;
3664 }
3665 
GetIDUrlGen(SSeqURLInfo * seqUrlInfo,const CSeq_id & id,objects::CScope & scope)3666 string CAlignFormatUtil::GetIDUrlGen(SSeqURLInfo *seqUrlInfo,const CSeq_id& id,objects::CScope &scope)
3667 {
3668     const CBioseq_Handle& handle = scope.GetBioseqHandle(id);
3669     const CBioseq::TId* ids = &handle.GetBioseqCore()->GetId();
3670 
3671     string url_link = GetIDUrlGen(seqUrlInfo,ids);
3672     return url_link;
3673 }
3674 
GetIDUrl(SSeqURLInfo * seqUrlInfo,const CBioseq::TId * ids)3675 string CAlignFormatUtil::GetIDUrl(SSeqURLInfo *seqUrlInfo,const CBioseq::TId* ids)
3676 {
3677     string url_link = NcbiEmptyString;
3678     CConstRef<CSeq_id> wid = FindBestChoice(*ids, CSeq_id::WorstRank);
3679 
3680     string title = "title=\"Show report for " + seqUrlInfo->accession + "\" ";
3681 
3682     if (seqUrlInfo->user_url != NcbiEmptyString &&
3683         !((seqUrlInfo->user_url.find("dumpgnl.cgi") != string::npos && seqUrlInfo->gi > ZERO_GI) ||
3684           (seqUrlInfo->user_url.find("maps.cgi") != string::npos))) {
3685 
3686         string url_with_parameters,toolURLParams;
3687         if(m_Reg && !seqUrlInfo->blastType.empty() && seqUrlInfo->blastType != "newblast") {
3688             toolURLParams = m_Reg->Get(seqUrlInfo->blastType, "TOOL_URL_PARAMS");
3689         }
3690         if(!toolURLParams.empty()) {
3691             string urlLinkTemplate = seqUrlInfo->user_url + toolURLParams;
3692             url_with_parameters = s_MapURLLink(urlLinkTemplate, seqUrlInfo, *ids);
3693         }
3694         else {
3695             if (seqUrlInfo->user_url.find("sra.cgi") != string::npos) {
3696                 url_with_parameters = CAlignFormatUtil::BuildSRAUrl(*ids, seqUrlInfo->user_url);
3697             }
3698             else {
3699                 url_with_parameters = CAlignFormatUtil::BuildUserUrl(*ids, seqUrlInfo->taxid, seqUrlInfo->user_url,
3700                                            seqUrlInfo->database,
3701                                            seqUrlInfo->isDbNa, seqUrlInfo->rid,
3702                                            seqUrlInfo->queryNumber,
3703                                            seqUrlInfo->isAlignLink);
3704             }
3705         }
3706         if (url_with_parameters != NcbiEmptyString) {
3707             if (!seqUrlInfo->useTemplates) {
3708                 string deflineInfo;
3709                 if(seqUrlInfo->addCssInfo) {
3710                     deflineInfo = (!seqUrlInfo->defline.empty())? CAlignFormatUtil::MapTemplate(kClassInfo,"defline",seqUrlInfo->defline):kClassInfo;
3711                 }
3712                 url_link += "<a " + title + deflineInfo + "href=\"";
3713             }
3714             url_link += url_with_parameters;
3715             if (!seqUrlInfo->useTemplates) url_link += "\">";
3716         }
3717     }
3718 	else {
3719         //use entrez or dbtag specified
3720         url_link = GetIDUrlGen(seqUrlInfo,ids);
3721     }
3722     seqUrlInfo->seqUrl = url_link;
3723     return url_link;
3724 }
3725 
3726 
GetIDUrl(SSeqURLInfo * seqUrlInfo,const CSeq_id & id,objects::CScope & scope)3727 string CAlignFormatUtil::GetIDUrl(SSeqURLInfo *seqUrlInfo,const CSeq_id& id,objects::CScope &scope)
3728 {
3729     const CBioseq_Handle& handle = scope.GetBioseqHandle(id);
3730     const CBioseq::TId* ids = &handle.GetBioseqCore()->GetId();
3731 
3732 
3733     seqUrlInfo->blastType = NStr::TruncateSpaces(NStr::ToLower(seqUrlInfo->blastType));
3734 
3735     if(seqUrlInfo->taxid == INVALID_TAX_ID) { //taxid is not set
3736         seqUrlInfo->taxid = ZERO_TAX_ID;
3737         if ((seqUrlInfo->advancedView || seqUrlInfo->blastType == "mapview" || seqUrlInfo->blastType == "mapview_prev") ||
3738             seqUrlInfo->blastType == "gsfasta" || seqUrlInfo->blastType == "gsfasta_prev") {
3739             seqUrlInfo->taxid = GetTaxidForSeqid(id, scope);
3740         }
3741     }
3742 	string url_link = GetIDUrl(seqUrlInfo,ids);
3743     return url_link;
3744 }
3745 
3746 //static const char kGenericLinkTemplate[] = "<a href=\"<@url@>\" target=\"lnk<@rid@>\" title=\"Show report for <@seqid@>\"><@gi@><@seqid@></a>";
GetFullIDLink(SSeqURLInfo * seqUrlInfo,const CBioseq::TId * ids)3747 string CAlignFormatUtil::GetFullIDLink(SSeqURLInfo *seqUrlInfo,const CBioseq::TId* ids)
3748 {
3749     string seqLink;
3750     string linkURL = GetIDUrl(seqUrlInfo,ids);
3751     if(!linkURL.empty()) {
3752         string linkTmpl = (seqUrlInfo->addCssInfo) ? kGenericLinkMouseoverTmpl : kGenericLinkTemplate;
3753         seqLink = CAlignFormatUtil::MapTemplate(linkTmpl,"url",linkURL);
3754         seqLink = CAlignFormatUtil::MapTemplate(seqLink,"rid",seqUrlInfo->rid);
3755         seqLink = CAlignFormatUtil::MapTemplate(seqLink,"seqid",seqUrlInfo->accession);
3756         seqLink = CAlignFormatUtil::MapTemplate(seqLink,"gi", GI_TO(TIntId, seqUrlInfo->gi));
3757         seqLink = CAlignFormatUtil::MapTemplate(seqLink,"target","EntrezView");
3758         if(seqUrlInfo->addCssInfo) {
3759             seqLink = CAlignFormatUtil::MapTemplate(seqLink,"defline",NStr::JavaScriptEncode(seqUrlInfo->defline));
3760         }
3761     }
3762     return seqLink;
3763 }
3764 
s_MapCustomLink(string linkUrl,string reportType,string accession,string linkText,string linktrg,string linkTitle=kCustomLinkTitle,string linkCls="")3765 static string s_MapCustomLink(string linkUrl,string reportType,string accession, string linkText, string linktrg, string linkTitle = kCustomLinkTitle,string linkCls = "")
3766 {
3767     string link = CAlignFormatUtil::MapTemplate(kCustomLinkTemplate,"custom_url",linkUrl);
3768     link = CAlignFormatUtil::MapProtocol(link);
3769     link = CAlignFormatUtil::MapTemplate(link,"custom_title",linkTitle);
3770     link = CAlignFormatUtil::MapTemplate(link,"custom_report_type",reportType);
3771     link = CAlignFormatUtil::MapTemplate(link,"seqid",accession);
3772     link = CAlignFormatUtil::MapTemplate(link,"custom_lnk_displ",linkText);
3773     link = CAlignFormatUtil::MapTemplate(link,"custom_cls",linkCls);
3774     link = CAlignFormatUtil::MapTemplate(link,"custom_trg",linktrg);
3775     return link;
3776 }
3777 
3778 
3779 
GetGiLinksList(SSeqURLInfo * seqUrlInfo,bool hspRange)3780 list<string>  CAlignFormatUtil::GetGiLinksList(SSeqURLInfo *seqUrlInfo,
3781                                                bool hspRange)
3782 {
3783     list<string> customLinksList;
3784     if (seqUrlInfo->hasTextSeqID) {
3785         //First show links to GenBank and FASTA
3786         string linkUrl,link,linkTiltle = kCustomLinkTitle;
3787 
3788         linkUrl = seqUrlInfo->seqUrl;
3789         if(NStr::Find(linkUrl, "report=genbank") == NPOS) { //Geo case
3790             linkUrl = s_MapCommonUrlParams(kEntrezTMUrl, seqUrlInfo);
3791         }
3792         string linkText = (seqUrlInfo->isDbNa) ? "GenBank" : "GenPept";
3793         if(hspRange) {
3794             linkUrl += "&from=<@fromHSP@>&to=<@toHSP@>";
3795             linkTiltle = "Aligned region spanning positions <@fromHSP@> to <@toHSP@> on <@seqid@>";
3796         }
3797 	    link = s_MapCustomLink(linkUrl,"genbank",seqUrlInfo->accession,linkText,"lnk" + seqUrlInfo->rid,linkTiltle);
3798         customLinksList.push_back(link);
3799     }
3800     return customLinksList;
3801 }
3802 
GetGraphiscLink(SSeqURLInfo * seqUrlInfo,bool hspRange)3803 string  CAlignFormatUtil::GetGraphiscLink(SSeqURLInfo *seqUrlInfo,
3804                                                bool hspRange)
3805 {
3806     //seqviewer
3807     string dbtype = (seqUrlInfo->isDbNa) ? "nuccore" : "protein";
3808     string seqViewUrl = (seqUrlInfo->gi > ZERO_GI)?kSeqViewerUrl:kSeqViewerUrlNonGi;
3809 
3810 	string linkUrl = CAlignFormatUtil::MapTemplate(seqViewUrl,"rid",seqUrlInfo->rid);
3811 
3812     string seqViewerParams;
3813     if(m_Reg && !seqUrlInfo->blastType.empty() && seqUrlInfo->blastType != "newblast") {
3814         seqViewerParams = m_Reg->Get(seqUrlInfo->blastType, "SEQVIEW_PARAMS");
3815     }
3816     seqViewerParams = seqViewerParams.empty() ? kSeqViewerParams : seqViewerParams;
3817     linkUrl = CAlignFormatUtil::MapTemplate(linkUrl,"seqViewerParams",seqViewerParams);
3818 
3819 	linkUrl = CAlignFormatUtil::MapTemplate(linkUrl,"dbtype",dbtype);
3820 	linkUrl = CAlignFormatUtil::MapTemplate(linkUrl,"gi", GI_TO(TIntId, seqUrlInfo->gi));
3821     string linkTitle = "Show alignment to <@seqid@> in <@custom_report_type@>";
3822     string link_loc;
3823     if(!hspRange) {
3824         int addToRange = (int) ((seqUrlInfo->seqRange.GetTo() - seqUrlInfo->seqRange.GetFrom()) * 0.05);//add 5% to each side
3825 		linkUrl = CAlignFormatUtil::MapTemplate(linkUrl,"from",max(0,(int)seqUrlInfo->seqRange.GetFrom() - addToRange));
3826 		linkUrl = CAlignFormatUtil::MapTemplate(linkUrl,"to",seqUrlInfo->seqRange.GetTo() + addToRange);
3827         link_loc = "fromSubj";
3828 		//linkUrl = CAlignFormatUtil::MapTemplate(linkUrl,"flip",NStr::BoolToString(seqUrlInfo->flip));
3829     }
3830     else {
3831         link_loc = "fromHSP";
3832         linkTitle += " for <@fromHSP@> to <@toHSP@> range";
3833     }
3834     linkUrl = CAlignFormatUtil::MapTemplate(linkUrl,"link_loc",link_loc);
3835 
3836     string title = (seqUrlInfo->isDbNa) ? "Nucleotide Graphics" : "Protein Graphics";
3837 
3838     string link = s_MapCustomLink(linkUrl,title,seqUrlInfo->accession, "Graphics","lnk" + seqUrlInfo->rid,linkTitle,"spr");
3839 
3840     return link;
3841 }
3842 
GetSeqLinksList(SSeqURLInfo * seqUrlInfo,bool hspRange)3843 list<string>  CAlignFormatUtil::GetSeqLinksList(SSeqURLInfo *seqUrlInfo,
3844                                                 bool hspRange)
3845 {
3846     list<string> customLinksList = GetGiLinksList(seqUrlInfo,hspRange);  //ONLY FOR genBank seqUrlInfo->seqUrl has "report=genbank"
3847     string graphicLink = GetGraphiscLink(seqUrlInfo,hspRange);
3848     if(!graphicLink.empty()) {
3849         customLinksList.push_back(graphicLink);
3850     }
3851     return customLinksList;
3852 }
3853 
SetCustomLinksTypes(SSeqURLInfo * seqUrlInfo,int customLinkTypesInp)3854 int CAlignFormatUtil::SetCustomLinksTypes(SSeqURLInfo *seqUrlInfo, int customLinkTypesInp)
3855 {
3856     int customLinkTypes = customLinkTypesInp;
3857     if ( seqUrlInfo->gi > ZERO_GI) {
3858         customLinkTypes +=eLinkTypeGenLinks;
3859     }
3860     //else if(NStr::StartsWith(seqUrlInfo->accession,"ti:")) {//seqUrlInfo->seqUrl has "trace.cgi"
3861     else if(NStr::Find(seqUrlInfo->seqUrl,"trace.cgi") != NPOS ){
3862         customLinkTypes +=eLinkTypeTraceLinks;
3863     }
3864     else if(seqUrlInfo->blastType == "sra") {//seqUrlInfo->seqUrl has sra.cgi
3865         customLinkTypes +=eLinkTypeSRALinks;
3866     }
3867     else if(seqUrlInfo->blastType == "snp") {//seqUrlInfo->seqUrl has snp_ref.cgi
3868         customLinkTypes +=eLinkTypeSNPLinks;
3869     }
3870     else if(seqUrlInfo->blastType == "gsfasta") {//seqUrlInfo->seqUrl has GSfasta.cgi
3871         customLinkTypes +=eLinkTypeGSFastaLinks;
3872     }
3873     return customLinkTypes;
3874 }
3875 
3876 
3877 //kCustomLinkTemplate:
3878 //<a href="<@custom_url@>" class="<@custom_cls@>" title="Show <@custom_report_type@> report for <@seqid@>"><@custom_lnk_displ@></a>
GetCustomLinksList(SSeqURLInfo * seqUrlInfo,const CSeq_id & id,objects::CScope & scope,int customLinkTypes)3879 list<string>  CAlignFormatUtil::GetCustomLinksList(SSeqURLInfo *seqUrlInfo,
3880                                           const CSeq_id& id,
3881                                           objects::CScope &scope,
3882                                           int customLinkTypes)
3883 {
3884     list<string> customLinksList;
3885     string linkUrl,link;
3886 
3887     customLinkTypes = SetCustomLinksTypes(seqUrlInfo, customLinkTypes);
3888     //First show links to GenBank and FASTA, then to Graphics
3889     customLinksList = GetSeqLinksList(seqUrlInfo);
3890     if(customLinkTypes & eLinkTypeTraceLinks) {
3891         linkUrl = seqUrlInfo->seqUrl;
3892 	    link = s_MapCustomLink(linkUrl,"Trace Archive FASTA",seqUrlInfo->accession, "FASTA","lnk" + seqUrlInfo->rid);
3893 	    customLinksList.push_back(link);
3894 
3895         linkUrl = NStr::Replace(seqUrlInfo->seqUrl,"fasta","trace");
3896         link = s_MapCustomLink(linkUrl,"Trace Archive Trace",seqUrlInfo->accession, "Trace","lnk" + seqUrlInfo->rid);
3897         customLinksList.push_back(link);
3898 
3899         linkUrl = NStr::Replace(seqUrlInfo->seqUrl,"fasta","quality");
3900         link = s_MapCustomLink(linkUrl,"Trace Archive Quality",seqUrlInfo->accession, "Quality","lnk" + seqUrlInfo->rid);
3901         customLinksList.push_back(link);
3902 
3903         linkUrl = NStr::Replace(seqUrlInfo->seqUrl,"fasta","info");
3904         link = s_MapCustomLink(linkUrl,"Trace Archive Info",seqUrlInfo->accession, "Info","lnk" + seqUrlInfo->rid);
3905         customLinksList.push_back(link);
3906     }
3907     else if(customLinkTypes & eLinkTypeSRALinks) {
3908         linkUrl = seqUrlInfo->seqUrl;
3909 	    link = s_MapCustomLink(linkUrl,"SRA",seqUrlInfo->accession, "SRA","lnk" + seqUrlInfo->rid);
3910 	    customLinksList.push_back(link);
3911     }
3912     else if(customLinkTypes & eLinkTypeSNPLinks) {
3913         linkUrl = seqUrlInfo->seqUrl;
3914 	    link = s_MapCustomLink(linkUrl,"SNP",seqUrlInfo->accession, "SNP","lnk" + seqUrlInfo->rid);
3915 	    customLinksList.push_back(link);
3916 
3917 
3918         //SNP accession=rs35885954
3919         string rs = NStr::Replace(seqUrlInfo->accession,"rs","");
3920 	    linkUrl = seqUrlInfo->resourcesUrl + rs + "?report=FLT";
3921 
3922 
3923         link = s_MapCustomLink(linkUrl,"Flatfile",seqUrlInfo->accession, "Flatfile","lnk" + seqUrlInfo->rid);
3924 	    customLinksList.push_back(link);
3925 
3926         linkUrl = NStr::Replace(linkUrl,"FLT","fasta");
3927         link = s_MapCustomLink(linkUrl,"FASTA",seqUrlInfo->accession, "FASTA","lnk" + seqUrlInfo->rid);
3928 	    customLinksList.push_back(link);
3929 
3930         linkUrl = NStr::Replace(linkUrl,"fasta","docsum");
3931         link = s_MapCustomLink(linkUrl,"Graphic summary ",seqUrlInfo->accession, "Graphic summary ","lnk" + seqUrlInfo->rid);
3932 	    customLinksList.push_back(link);
3933     }
3934     else if(customLinkTypes & eLinkTypeGSFastaLinks) {
3935         linkUrl = seqUrlInfo->seqUrl;
3936 	    link = s_MapCustomLink(linkUrl,"GSFASTA",seqUrlInfo->accession, "GSFASTA","lnk" + seqUrlInfo->rid);
3937 	    customLinksList.push_back(link);
3938     }
3939     return customLinksList;
3940 }
3941 
3942 
GetAlignedRegionsURL(SSeqURLInfo * seqUrlInfo,const CSeq_id & id,objects::CScope & scope)3943 string CAlignFormatUtil::GetAlignedRegionsURL(SSeqURLInfo *seqUrlInfo,
3944                                           const CSeq_id& id,
3945                                           objects::CScope &scope)
3946 {
3947     const CBioseq_Handle& handle = scope.GetBioseqHandle(id);
3948     const CBioseq::TId* ids = &handle.GetBioseqCore()->GetId();
3949     string linkUrl,link;
3950 
3951 
3952     linkUrl = CAlignFormatUtil::BuildUserUrl(*ids,
3953                                                  ZERO_TAX_ID,
3954                                                  kDownloadUrl,
3955                                                  seqUrlInfo->database,
3956                                                  seqUrlInfo->isDbNa,
3957                                                  seqUrlInfo->rid,
3958                                                  seqUrlInfo->queryNumber,
3959                                                  true);
3960         if(!linkUrl.empty()) {
3961             linkUrl += "&segs="+ seqUrlInfo->segs;
3962     }
3963 
3964     return linkUrl;
3965 }
3966 
3967 
3968 
GetFASTALinkURL(SSeqURLInfo * seqUrlInfo,const CSeq_id & id,objects::CScope & scope)3969 string  CAlignFormatUtil::GetFASTALinkURL(SSeqURLInfo *seqUrlInfo,
3970                                           const CSeq_id& id,
3971                                           objects::CScope &scope)
3972 
3973 {
3974     string linkUrl;
3975 
3976     int customLinkTypes = SetCustomLinksTypes(seqUrlInfo, CAlignFormatUtil::eLinkTypeDefault);
3977 
3978     if( (customLinkTypes & eLinkTypeGenLinks) || (customLinkTypes & eLinkTypeTraceLinks)){
3979          linkUrl = seqUrlInfo->seqUrl;
3980          linkUrl = NStr::Replace(linkUrl,"genbank","fasta");
3981     }
3982     else if(customLinkTypes & eLinkTypeSNPLinks) {
3983         linkUrl = seqUrlInfo->seqUrl;
3984         vector<string> parts;
3985         //SNP accession=dbSNP:rs35885954
3986         NStr::Split(seqUrlInfo->accession,":rs",parts,NStr::fSplit_MergeDelimiters);
3987         string rs;
3988         if(parts.size() > 1) {
3989             rs = parts[1];
3990         }
3991 	    linkUrl = seqUrlInfo->resourcesUrl + rs + "?report=fasta";
3992     }
3993     return linkUrl;
3994 }
3995 
GetGeneInfo(TGi giForGeneLookup)3996 string  CAlignFormatUtil::GetGeneInfo(TGi giForGeneLookup)
3997 {
3998     string geneSym;
3999     try
4000     {
4001         CNcbiEnvironment env;
4002         if (env.Get(GENE_INFO_PATH_ENV_VARIABLE) != kEmptyStr)
4003         {
4004 
4005             if (m_GeneInfoReader.get() == 0)
4006             {
4007                 m_GeneInfoReader.reset(new CGeneInfoFileReader(false));
4008             }
4009 
4010 
4011             CGeneInfoFileReader::TGeneInfoList infoList;
4012             m_GeneInfoReader->GetGeneInfoForGi(giForGeneLookup,infoList);
4013 
4014             CGeneInfoFileReader::TGeneInfoList::const_iterator itInfo = infoList.begin();
4015             for (; itInfo != infoList.end(); itInfo++)
4016             {
4017                 CRef<CGeneInfo> info = *itInfo;
4018                 geneSym = info->GetSymbol();
4019                 break;//???
4020             }
4021         }
4022     }
4023     catch (CException& e)
4024     {
4025         geneSym = "(Gene info extraction error: "  + e.GetMsg() +  ")";
4026     }
4027     catch (...)
4028     {
4029         geneSym = "(Gene info extraction error)";
4030     }
4031     return geneSym;
4032 }
4033 
4034 
GetDbType(const CSeq_align_set & actual_aln_list,CScope & scope)4035 CAlignFormatUtil::DbType CAlignFormatUtil::GetDbType(const CSeq_align_set& actual_aln_list, CScope & scope)
4036 {
4037     //determine if the database has gi by looking at the 1st hit.
4038     //Could be wrong but simple for now
4039     DbType type = eDbTypeNotSet;
4040     CRef<CSeq_align> first_aln = actual_aln_list.Get().front();
4041     const CSeq_id& subject_id = first_aln->GetSeq_id(1);
4042 
4043     if (subject_id.Which() != CSeq_id::e_Local){
4044         const CBioseq_Handle& handleTemp  = scope.GetBioseqHandle(subject_id);
4045         if(handleTemp){
4046             TGi giTemp = FindGi(handleTemp.GetBioseqCore()->GetId());
4047             if (giTemp > ZERO_GI || GetTextSeqID((CConstRef<CSeq_id>)&subject_id)) {
4048                 type = eDbGi;
4049             } else if (subject_id.Which() == CSeq_id::e_General){
4050                 const CDbtag& dtg = subject_id.GetGeneral();
4051                 const string& dbName = dtg.GetDb();
4052                 if(NStr::CompareNocase(dbName, "TI") == 0){
4053                     type = eDbGeneral;
4054                 }
4055             }
4056         }
4057     }
4058     return type;
4059 }
4060 
4061 CAlignFormatUtil::SSeqAlignSetCalcParams*
GetSeqAlignCalcParams(const CSeq_align & aln)4062 CAlignFormatUtil::GetSeqAlignCalcParams(const CSeq_align& aln)
4063 {
4064     int score = 0;
4065     double bits = 0;
4066     double evalue = 0;
4067     int sum_n = 0;
4068     int num_ident = 0;
4069     list<TGi> use_this_gi;
4070 
4071     use_this_gi.clear();
4072     //Gets scores directly from seq align
4073     GetAlnScores(aln, score, bits, evalue, sum_n,
4074                                        num_ident, use_this_gi);
4075 
4076     auto_ptr<SSeqAlignSetCalcParams> seqSetInfo(new SSeqAlignSetCalcParams);
4077     seqSetInfo->sum_n = sum_n == -1 ? 1:sum_n ;
4078     seqSetInfo->id = &(aln.GetSeq_id(1));
4079     seqSetInfo->use_this_gi = use_this_gi;
4080     seqSetInfo->bit_score = bits;
4081     seqSetInfo->raw_score = score;
4082     seqSetInfo->evalue = evalue;
4083     seqSetInfo->match = num_ident;
4084     seqSetInfo->id = &(aln.GetSeq_id(1));
4085     seqSetInfo->subjRange = CRange<TSeqPos>(0,0);
4086     seqSetInfo->flip = false;
4087 
4088     return seqSetInfo.release();
4089 }
4090 
4091 
4092 
4093 CAlignFormatUtil::SSeqAlignSetCalcParams*
GetSeqAlignSetCalcParams(const CSeq_align_set & aln,int queryLength,bool do_translation)4094 CAlignFormatUtil::GetSeqAlignSetCalcParams(const CSeq_align_set& aln,int queryLength, bool do_translation)
4095 {
4096     int score = 0;
4097     double bits = 0;
4098     double evalue = 0;
4099     int sum_n = 0;
4100     int num_ident = 0;
4101     SSeqAlignSetCalcParams* seqSetInfo = NULL;
4102 
4103     if(aln.Get().empty())
4104         return seqSetInfo;
4105 
4106     seqSetInfo = GetSeqAlignCalcParams(*(aln.Get().front()));
4107 
4108     double total_bits = 0;
4109     double highest_bits = 0;
4110     double lowest_evalue = 0;
4111     int highest_length = 1;
4112     int highest_ident = 0;
4113     //int highest_identity = 0;
4114     double totalLen = 0;
4115 
4116     list<TGi> use_this_gi;   // Not used here, but needed for GetAlnScores.
4117 
4118     seqSetInfo->subjRange = CAlignFormatUtil::GetSeqAlignCoverageParams(aln,&seqSetInfo->master_covered_length,&seqSetInfo->flip);
4119     seqSetInfo->percent_coverage = 100*seqSetInfo->master_covered_length/queryLength;
4120 
4121     ITERATE(CSeq_align_set::Tdata, iter, aln.Get()) {
4122         int align_length = CAlignFormatUtil::GetAlignmentLength(**iter, do_translation);
4123         totalLen += align_length;
4124 
4125         CAlignFormatUtil::GetAlnScores(**iter, score, bits, evalue, sum_n,
4126                                    num_ident, use_this_gi);
4127         use_this_gi.clear();
4128 
4129         total_bits += bits;
4130 
4131 /// IMPORTANT: based on WB-1175, the trigger for setting the highest identity
4132 ///            is not the highest identity value, but the identity value of
4133 ///            the alignment with the highest score!
4134 ///
4135 ///     if (100*num_ident/align_length > highest_identity) { -- this condition is disabled
4136 
4137         if (bits > highest_bits) {  // this is the replacement condition (WB-1175)
4138             highest_length = align_length;
4139             highest_ident = num_ident;
4140 ///         highest_identity = 100*num_ident/align_length;
4141         }
4142 
4143         if (bits > highest_bits) {
4144             highest_bits = bits;
4145             lowest_evalue = evalue;
4146         }
4147     }
4148     seqSetInfo->match = highest_ident;
4149     seqSetInfo->align_length = highest_length;
4150     seqSetInfo->percent_identity = CAlignFormatUtil::GetPercentIdentity(seqSetInfo->match, seqSetInfo->align_length);
4151 
4152     seqSetInfo->total_bit_score = total_bits;
4153     seqSetInfo->bit_score = highest_bits;
4154     seqSetInfo->evalue = lowest_evalue;
4155     seqSetInfo->hspNum = aln.Size();
4156     seqSetInfo->totalLen = (Int8)totalLen;
4157 
4158     return seqSetInfo;
4159 }
4160 
GetSeqAlignSetCalcPercentIdent(const CSeq_align_set & aln,bool do_translation)4161 double CAlignFormatUtil::GetSeqAlignSetCalcPercentIdent(const CSeq_align_set& aln, bool do_translation)
4162 {
4163     int score = 0;
4164     double bits = 0;
4165     double evalue = 0;
4166     int sum_n = 0;
4167     int num_ident = 0;
4168 
4169     if(aln.Get().empty())
4170       return -1;
4171 
4172     double highest_bits = 0;
4173     int highest_length = 1;
4174     int highest_ident = 0;
4175 
4176     list<TGi> use_this_gi;   // Not used here, but needed for GetAlnScores.
4177 
4178     ITERATE(CSeq_align_set::Tdata, iter, aln.Get()) {
4179         int align_length = CAlignFormatUtil::GetAlignmentLength(**iter, do_translation);
4180 
4181         CAlignFormatUtil::GetAlnScores(**iter, score, bits, evalue, sum_n,
4182                                    num_ident, use_this_gi);
4183 
4184 
4185 /// IMPORTANT: based on WB-1175, the trigger for setting the highest identity
4186 ///            is not the highest identity value, but the identity value of
4187 ///            the alignment with the highest score!
4188 ///
4189 ///     if (100*num_ident/align_length > highest_identity) { -- this condition is disabled
4190 
4191         if (bits > highest_bits) {  // this is the replacement condition (WB-1175)
4192             highest_length = align_length;
4193             highest_ident = num_ident;
4194 ///         highest_identity = 100*num_ident/align_length;
4195             highest_bits = bits;
4196         }
4197     }
4198 
4199     double percent_identity = CAlignFormatUtil::GetPercentIdentity(highest_ident, highest_length);
4200     return percent_identity;
4201 }
4202 
4203 
4204 template<class container> bool
s_GetBlastScore(const container & scoreList,double & evalue,double & bitScore,double & totalBitScore,int & percentCoverage,double & percentIdent,int & hspNum,double & totalLen,int & rawScore,int & sum_n,list<TGi> & use_this_gi)4205 s_GetBlastScore(const container&  scoreList,
4206                 double& evalue,
4207                 double& bitScore,
4208                 double& totalBitScore,
4209                 int& percentCoverage,
4210                 double& percentIdent,
4211                 int& hspNum,
4212                 double& totalLen,
4213                 int &rawScore,
4214                 int& sum_n,
4215                 list<TGi>& use_this_gi)
4216 {
4217     const string k_GiPrefix = "gi:";
4218     bool hasScore = false;
4219 
4220 
4221     ITERATE (typename container, iter, scoreList) {
4222         const CObject_id& id=(*iter)->GetId();
4223         if (id.IsStr()) {
4224             hasScore = true;
4225             if (id.GetStr()=="seq_evalue") {
4226                 evalue = (*iter)->GetValue().GetReal();
4227             } else if (id.GetStr()=="seq_bit_score"){
4228                 bitScore = (*iter)->GetValue().GetReal();
4229             } else if (id.GetStr()=="seq_total_bit_score"){
4230                 totalBitScore = (*iter)->GetValue().GetReal();
4231             } else if (id.GetStr()=="seq_percent_coverage"){
4232                 percentCoverage = (*iter)->GetValue().GetInt();
4233             } else if (id.GetStr()=="seq_percent_identity" && (*iter)->GetValue().IsInt()){
4234                 percentIdent = (*iter)->GetValue().GetInt();
4235             } else if (id.GetStr()=="seq_percent_identity" && (*iter)->GetValue().IsReal()){
4236                 percentIdent = (*iter)->GetValue().GetReal();
4237             } else if (id.GetStr()=="seq_hspnum"){
4238                 hspNum = (*iter)->GetValue().GetInt();
4239             } else if (id.GetStr()=="seq_align_totlen"){
4240                 totalLen = (*iter)->GetValue().GetReal();
4241             } else if (id.GetStr()=="score"){
4242                 rawScore = (*iter)->GetValue().GetInt();
4243             } else if (id.GetStr()=="use_this_gi"){
4244                 use_this_gi.push_back(GI_FROM(int, (*iter)->GetValue().GetInt()));
4245             } else if (id.GetStr()=="sum_n"){
4246                 sum_n = (*iter)->GetValue().GetInt();
4247             }
4248             else if(NStr::StartsWith(id.GetStr(),k_GiPrefix)) { //will be used when switch to 64bit GIs
4249                 string strGi = NStr::Replace(id.GetStr(),k_GiPrefix,"");
4250                 TGi gi = NStr::StringToNumeric<TGi>(strGi);
4251                 use_this_gi.push_back(gi);
4252             }
4253         }
4254     }
4255     return hasScore;
4256 }
4257 
4258 
GetUseThisSequence(const CSeq_align & aln,list<TGi> & use_this_gi)4259 void CAlignFormatUtil::GetUseThisSequence(const CSeq_align& aln,list<TGi>& use_this_gi)
4260 
4261 {
4262     const string k_GiPrefix = "gi:";
4263 
4264     if(!aln.CanGetExt() || aln.GetExt().size() == 0) return;
4265     const CUser_object &user = *(aln.GetExt().front());
4266 
4267     if (user.IsSetType() && user.GetType().IsStr() && user.GetType().GetStr() == "use_this_seqid" && user.IsSetData()) {
4268         const CUser_object::TData& fields = user.GetData();
4269         for (CUser_object::TData::const_iterator fit = fields.begin();  fit != fields.end(); ++fit) {
4270             const CUser_field& field = **fit;
4271 
4272             if (field.IsSetLabel() && field.GetLabel().IsStr() && field.GetLabel().GetStr() == "SEQIDS" &&
4273                                                                      field.IsSetData()  &&  field.GetData().IsStrs()) {
4274                 const CUser_field::C_Data::TStrs& strs = field.GetData().GetStrs();
4275                 ITERATE(CUser_field::TData::TStrs, acc_iter, strs) {
4276                     if(NStr::StartsWith(*acc_iter,k_GiPrefix)) { //will be used when switch to 64bit GIs
4277                         string strGi = NStr::Replace(*acc_iter,k_GiPrefix,"");
4278                         TGi gi = NStr::StringToNumeric<TGi>(strGi);
4279                         use_this_gi.push_back(gi);
4280                     }
4281                 }
4282             }
4283         }
4284     }
4285 }
4286 
4287 
4288 /*use_this_seq will contain gi:nnnnnn or seqid:ssssss string list*/
GetUseThisSequence(const CSeq_align & aln,list<string> & use_this_seq)4289 void CAlignFormatUtil::GetUseThisSequence(const CSeq_align& aln,list<string>& use_this_seq)
4290 
4291 {
4292     if(!aln.CanGetExt() || aln.GetExt().size() == 0) return;
4293     const CUser_object &user = *(aln.GetExt().front());
4294 
4295     if (user.IsSetType() && user.GetType().IsStr() && user.GetType().GetStr() == "use_this_seqid" && user.IsSetData()) {
4296         const CUser_object::TData& fields = user.GetData();
4297         for (CUser_object::TData::const_iterator fit = fields.begin();  fit != fields.end(); ++fit) {
4298             const CUser_field& field = **fit;
4299 
4300             if (field.IsSetLabel() && field.GetLabel().IsStr() && field.GetLabel().GetStr() == "SEQIDS" &&
4301                                                                      field.IsSetData()  &&  field.GetData().IsStrs()) {
4302                 const CUser_field::C_Data::TStrs& strs = field.GetData().GetStrs();
4303                 ITERATE(CUser_field::TData::TStrs, acc_iter, strs) {
4304                     use_this_seq.push_back(*acc_iter);
4305                 }
4306             }
4307         }
4308     }
4309 }
4310 
4311 
4312 
4313 CAlignFormatUtil::SSeqAlignSetCalcParams*
GetSeqAlignSetCalcParamsFromASN(const CSeq_align_set & alnSet)4314 CAlignFormatUtil::GetSeqAlignSetCalcParamsFromASN(const CSeq_align_set& alnSet)
4315 {
4316     bool hasScore = false;
4317     double evalue = -1;
4318     double bitScore = -1;
4319     double totalBitScore = -1;
4320     int percentCoverage = -1;
4321     double percentIdent = -1;
4322     int hspNum = 0;
4323     double totalLen = 0;
4324     int rawScore = -1;
4325     int sum_n = -1;
4326     list<TGi> use_this_gi;
4327     list<string> use_this_seq;
4328 
4329     const CSeq_align& aln = *(alnSet.Get().front());
4330 
4331     hasScore = s_GetBlastScore(aln.GetScore(),evalue,bitScore, totalBitScore,percentCoverage,percentIdent,hspNum,totalLen,rawScore,sum_n,use_this_gi);
4332 
4333     if(!hasScore){
4334         const CSeq_align::TSegs& seg = aln.GetSegs();
4335         if(seg.Which() == CSeq_align::C_Segs::e_Std){
4336             s_GetBlastScore(seg.GetStd().front()->GetScores(),
4337                             evalue,bitScore, totalBitScore,percentCoverage,percentIdent,hspNum,totalLen,rawScore,sum_n,use_this_gi);
4338         } else if (seg.Which() == CSeq_align::C_Segs::e_Dendiag){
4339             s_GetBlastScore(seg.GetDendiag().front()->GetScores(),
4340                             evalue,bitScore, totalBitScore,percentCoverage,percentIdent,hspNum,totalLen,rawScore,sum_n,use_this_gi);
4341         }  else if (seg.Which() == CSeq_align::C_Segs::e_Denseg){
4342             s_GetBlastScore(seg.GetDenseg().GetScores(),
4343                             evalue,bitScore, totalBitScore,percentCoverage,percentIdent,hspNum,totalLen,rawScore,sum_n,use_this_gi);
4344         }
4345     }
4346 
4347     if(use_this_gi.size() == 0) {
4348         GetUseThisSequence(aln,use_this_seq);
4349     }
4350     else {
4351         use_this_seq = s_NumGiToStringGiList(use_this_gi);//for backward compatability
4352     }
4353 
4354 
4355     auto_ptr<SSeqAlignSetCalcParams> seqSetInfo(new SSeqAlignSetCalcParams);
4356     seqSetInfo->evalue = evalue;
4357     seqSetInfo->bit_score = bitScore;
4358     seqSetInfo->total_bit_score = totalBitScore;
4359     seqSetInfo->percent_coverage = percentCoverage;
4360     seqSetInfo->percent_identity = percentIdent;
4361     seqSetInfo->hspNum = hspNum;
4362     seqSetInfo->totalLen = (Int8)totalLen;
4363 
4364     seqSetInfo->sum_n = sum_n == -1 ? 1:sum_n ;
4365     seqSetInfo->id = &(aln.GetSeq_id(1));
4366     seqSetInfo->use_this_gi = StringGiToNumGiList(use_this_seq);//for backward compatability
4367     seqSetInfo->use_this_seq = use_this_seq;
4368     seqSetInfo->raw_score = rawScore;//not used
4369 
4370     seqSetInfo->subjRange = CRange<TSeqPos>(0,0);
4371     seqSetInfo->flip = false;
4372 
4373     return seqSetInfo.release();
4374 }
4375 
GetDisplayIds(const CBioseq_Handle & handle,const CSeq_id & aln_id,list<TGi> & use_this_gi,TGi & gi)4376 CRef<CSeq_id> CAlignFormatUtil::GetDisplayIds(const CBioseq_Handle& handle,
4377                                 const CSeq_id& aln_id,
4378                                 list<TGi>& use_this_gi,
4379                                 TGi& gi)
4380 
4381 {
4382     TTaxId taxid = ZERO_TAX_ID;
4383     CRef<CSeq_id> wid = CAlignFormatUtil::GetDisplayIds(handle, aln_id, use_this_gi, gi, taxid);
4384     return wid;
4385 }
4386 
GetDisplayIds(const CBioseq_Handle & handle,const CSeq_id & aln_id,list<TGi> & use_this_gi,TGi & gi,TTaxId & taxid)4387 CRef<CSeq_id> CAlignFormatUtil::GetDisplayIds(const CBioseq_Handle& handle,
4388                                 const CSeq_id& aln_id,
4389                                 list<TGi>& use_this_gi,
4390                                 TGi& gi,
4391                                 TTaxId& taxid)
4392 
4393 {
4394     const CRef<CBlast_def_line_set> bdlRef = CSeqDB::ExtractBlastDefline(handle);
4395     const list< CRef< CBlast_def_line > > &bdl = (bdlRef.Empty()) ? list< CRef< CBlast_def_line > >() : bdlRef->Get();
4396 
4397     const CBioseq::TId* ids = &handle.GetBioseqCore()->GetId();
4398     CRef<CSeq_id> wid;
4399 
4400     gi = ZERO_GI;
4401     taxid = ZERO_TAX_ID;
4402     if(bdl.empty()){
4403         wid = FindBestChoice(*ids, CSeq_id::WorstRank);
4404         gi = FindGi(*ids);
4405     } else {
4406         bool found = false;
4407         for(list< CRef< CBlast_def_line > >::const_iterator iter = bdl.begin();
4408             iter != bdl.end(); iter++){
4409             const CBioseq::TId* cur_id = &((*iter)->GetSeqid());
4410             TGi cur_gi =  FindGi(*cur_id);
4411             wid = FindBestChoice(*cur_id, CSeq_id::WorstRank);
4412             if ((*iter)->IsSetTaxid() && (*iter)->CanGetTaxid()){
4413                 taxid = (*iter)->GetTaxid();
4414             }
4415             if (!use_this_gi.empty()) {
4416                 ITERATE(list<TGi>, iter_gi, use_this_gi){
4417                     if(cur_gi == *iter_gi){
4418                         found = true;
4419                         break;
4420                     }
4421                 }
4422             } else {
4423                 ITERATE(CBioseq::TId, iter_id, *cur_id) {
4424                     if ((*iter_id)->Match(aln_id)
4425                       || (aln_id.IsGeneral() && aln_id.GetGeneral().CanGetDb() &&
4426                          (*iter_id)->IsGeneral() && (*iter_id)->GetGeneral().CanGetDb() &&
4427                          aln_id.GetGeneral().GetDb() == (*iter_id)->GetGeneral().GetDb())) {
4428                         found = true;
4429                     }
4430                 }
4431             }
4432             if(found){
4433                 gi = cur_gi;
4434                 break;
4435             }
4436         }
4437     }
4438     return wid;
4439 }
4440 
4441 
4442 
4443 //removes "gi:" or "seqid:" prefix from gi:nnnnnnn or seqid:nnnnn
s_UseThisSeqToTextSeqID(string use_this_seqid,bool & isGi)4444 static string s_UseThisSeqToTextSeqID(string use_this_seqid, bool &isGi)
4445 {
4446     const string k_GiPrefix = "gi:";
4447     const string k_SeqIDPrefix = "seqid:";
4448     isGi = false;
4449     string textSeqid;
4450     if(NStr::StartsWith(use_this_seqid,k_GiPrefix)) {
4451         textSeqid = NStr::Replace(use_this_seqid,k_GiPrefix,"");
4452         isGi = true;
4453     }
4454     else if(NStr::StartsWith(use_this_seqid,k_SeqIDPrefix)) {
4455         textSeqid = NStr::Replace(use_this_seqid,k_SeqIDPrefix,"");
4456     }
4457     else  {//assume no prefix - gi
4458         if(NStr::StringToInt8(use_this_seqid,NStr::fConvErr_NoThrow)) {
4459             isGi = true;
4460         }
4461     }
4462     return textSeqid;
4463 }
4464 
4465 
4466 
4467 //assume that we have EITHER gi: OR seqid: in the list
IsGiList(list<string> & use_this_seq)4468 bool CAlignFormatUtil::IsGiList(list<string> &use_this_seq)
4469 {
4470     bool isGi = false;
4471     ITERATE(list<string>, iter_seq, use_this_seq){
4472         s_UseThisSeqToTextSeqID( *iter_seq, isGi);
4473         break;
4474     }
4475     return isGi;
4476 }
4477 
StringGiToNumGiList(list<string> & use_this_seq)4478 list<TGi> CAlignFormatUtil::StringGiToNumGiList(list<string> &use_this_seq)
4479 {
4480     list<TGi> use_this_gi;
4481     ITERATE(list<string>, iter_seq, use_this_seq){
4482         bool isGi = false;
4483         string strGI = s_UseThisSeqToTextSeqID( *iter_seq, isGi);
4484         if(isGi) use_this_gi.push_back(NStr::StringToNumeric<TGi>(strGI));
4485     }
4486     return use_this_gi;
4487 }
4488 
4489 
4490 
MatchSeqInSeqList(TGi cur_gi,CRef<CSeq_id> & seqID,list<string> & use_this_seq,bool * isGiList)4491 bool CAlignFormatUtil::MatchSeqInSeqList(TGi cur_gi, CRef<CSeq_id> &seqID, list<string> &use_this_seq,bool *isGiList)
4492 {
4493     bool found = false;
4494     bool isGi = false;
4495 
4496     string curSeqID = CAlignFormatUtil::GetLabel(seqID,true); //uses GetSeqIdString(true)
4497     ITERATE(list<string>, iter_seq, use_this_seq){
4498         isGi = false;
4499         string useThisSeq = s_UseThisSeqToTextSeqID(*iter_seq, isGi);
4500         if((isGi && cur_gi == NStr::StringToNumeric<TGi>((useThisSeq))) || (!isGi && curSeqID == useThisSeq)){
4501             found = true;
4502             break;
4503          }
4504     }
4505     if(isGiList) *isGiList = isGi;
4506     return found;
4507 }
4508 
4509 
MatchSeqInSeqList(CConstRef<CSeq_id> & alnSeqID,list<string> & use_this_seq,vector<string> & seqList)4510 bool CAlignFormatUtil::MatchSeqInSeqList(CConstRef<CSeq_id> &alnSeqID, list<string> &use_this_seq,vector <string> &seqList)
4511 {
4512     bool isGi = false;
4513     string curSeqID;
4514     if(alnSeqID->IsGi()) {
4515         curSeqID = NStr::NumericToString(alnSeqID->GetGi());
4516     }
4517     else {
4518         curSeqID = CAlignFormatUtil::GetLabel(alnSeqID,true); //uses GetSeqIdString(true)
4519     }
4520     //match with seqid in seq_align
4521     bool found = std::find(seqList.begin(), seqList.end(), curSeqID) != seqList.end();
4522     if(!found) {
4523         //match in use_this_seq list
4524         ITERATE(list<string>, iter_seq, use_this_seq){
4525             string useThisSeq = s_UseThisSeqToTextSeqID(*iter_seq, isGi);
4526             found = std::find(seqList.begin(), seqList.end(), useThisSeq) != seqList.end();
4527             if(found){
4528                 break;
4529             }
4530         }
4531     }
4532     return found;
4533 }
4534 
MatchSeqInUseThisSeqList(list<string> & use_this_seq,string textSeqIDToMatch)4535 bool CAlignFormatUtil::MatchSeqInUseThisSeqList(list<string> &use_this_seq, string textSeqIDToMatch)
4536 {
4537     bool has_match = false;
4538 
4539     ITERATE(list<string>, iter_seq, use_this_seq) {
4540         bool isGi;
4541         string useThisSeq = s_UseThisSeqToTextSeqID(*iter_seq, isGi);
4542         if(useThisSeq == textSeqIDToMatch) {
4543              has_match = true;
4544              break;
4545         }
4546     }
4547     return has_match;
4548 }
4549 
RemoveSeqsOfAccessionTypeFromSeqInUse(list<string> & use_this_seq,CSeq_id::EAccessionInfo accesionType)4550 bool CAlignFormatUtil::RemoveSeqsOfAccessionTypeFromSeqInUse(list<string> &use_this_seq, CSeq_id::EAccessionInfo accesionType)
4551 {
4552     list<string> new_use_this_seq;
4553     bool hasAccType = false;
4554     bool isGI = false;
4555 
4556     ITERATE(list<string>, iter_seq, use_this_seq) {
4557         string useThisSeq = s_UseThisSeqToTextSeqID(*iter_seq, isGI);
4558         CSeq_id::EAccessionInfo useThisSeqAccType = CSeq_id::IdentifyAccession (useThisSeq);
4559         if(useThisSeqAccType != accesionType) {
4560             new_use_this_seq.push_back(useThisSeq);
4561         }
4562         else {
4563             hasAccType = true;
4564         }
4565     }
4566     use_this_seq = new_use_this_seq;
4567     return hasAccType;
4568 }
4569 
GetDisplayIds(const CBioseq_Handle & handle,const CSeq_id & aln_id,list<string> & use_this_seq,TGi * gi,TTaxId * taxid,string * textSeqID)4570 CRef<CSeq_id> CAlignFormatUtil::GetDisplayIds(const CBioseq_Handle& handle,
4571                                 const CSeq_id& aln_id,
4572                                 list<string>& use_this_seq,
4573                                 TGi *gi,
4574                                 TTaxId *taxid,
4575                                 string *textSeqID)
4576 
4577 {
4578     const CRef<CBlast_def_line_set> bdlRef = CSeqDB::ExtractBlastDefline(handle);
4579     const list< CRef< CBlast_def_line > > &bdl = (bdlRef.Empty()) ? list< CRef< CBlast_def_line > >() : bdlRef->Get();
4580 
4581     const CBioseq::TId* ids = &handle.GetBioseqCore()->GetId();
4582     CRef<CSeq_id> wid;
4583 
4584     if(gi) *gi = ZERO_GI;
4585     if(taxid) *taxid = ZERO_TAX_ID;
4586     if(bdl.empty()){
4587         wid = FindBestChoice(*ids, CSeq_id::WorstRank);
4588         if(gi) *gi = FindGi(*ids);
4589         if(textSeqID) *textSeqID = GetLabel(wid,true);//uses GetSeqIdString(true)
4590     } else {
4591         bool found = false;
4592         for(list< CRef< CBlast_def_line > >::const_iterator iter = bdl.begin();
4593             iter != bdl.end(); iter++){
4594             const CBioseq::TId* cur_id = &((*iter)->GetSeqid());
4595             TGi cur_gi =  FindGi(*cur_id);
4596             wid = FindBestChoice(*cur_id, CSeq_id::WorstRank);
4597             string curSeqID = GetLabel(wid,true);//uses GetSeqIdString(true)
4598             if (taxid && (*iter)->IsSetTaxid() && (*iter)->CanGetTaxid()){
4599                 *taxid = (*iter)->GetTaxid();
4600             }
4601             if (!use_this_seq.empty()) {
4602                 ITERATE(list<string>, iter_seq, use_this_seq){
4603                     bool isGi = false;
4604                     string useThisSeq = s_UseThisSeqToTextSeqID( *iter_seq, isGi);
4605                     if((isGi && cur_gi == NStr::StringToNumeric<TGi>((useThisSeq))) || (!isGi && curSeqID == useThisSeq)){
4606                         found = true;
4607                         break;
4608                     }
4609                 }
4610             } else {
4611                 ITERATE(CBioseq::TId, iter_id, *cur_id) {
4612                     if ((*iter_id)->Match(aln_id)
4613                       || (aln_id.IsGeneral() && aln_id.GetGeneral().CanGetDb() &&
4614                          (*iter_id)->IsGeneral() && (*iter_id)->GetGeneral().CanGetDb() &&
4615                          aln_id.GetGeneral().GetDb() == (*iter_id)->GetGeneral().GetDb())) {
4616                         found = true;
4617                     }
4618                 }
4619             }
4620             if(found){
4621                 if(gi) *gi = cur_gi;
4622                 if(textSeqID) *textSeqID = curSeqID;
4623                 break;
4624             }
4625         }
4626     }
4627 
4628     return wid;
4629 }
4630 
4631 
GetDisplayIds(const list<CRef<CBlast_def_line>> & bdl,const CSeq_id & aln_id,list<TGi> & use_this_gi)4632 TGi CAlignFormatUtil::GetDisplayIds(const list< CRef< CBlast_def_line > > &bdl,
4633                                               const CSeq_id& aln_id,
4634                                               list<TGi>& use_this_gi)
4635 
4636 
4637 {
4638     TGi gi = ZERO_GI;
4639 
4640     if(!bdl.empty()){
4641         bool found = false;
4642         for(list< CRef< CBlast_def_line > >::const_iterator iter = bdl.begin();
4643             iter != bdl.end(); iter++){
4644             const CBioseq::TId* cur_id = &((*iter)->GetSeqid());
4645             TGi cur_gi =  FindGi(*cur_id);
4646             if (!use_this_gi.empty()) {
4647                 ITERATE(list<TGi>, iter_gi, use_this_gi){
4648                     if(cur_gi == *iter_gi){
4649                         found = true;
4650                         break;
4651                     }
4652                 }
4653             } else {
4654                 ITERATE(CBioseq::TId, iter_id, *cur_id) {
4655                     if ((*iter_id)->Match(aln_id)
4656                       || (aln_id.IsGeneral() && aln_id.GetGeneral().CanGetDb() &&
4657                          (*iter_id)->IsGeneral() && (*iter_id)->GetGeneral().CanGetDb() &&
4658                          aln_id.GetGeneral().GetDb() == (*iter_id)->GetGeneral().GetDb())) {
4659                         found = true;
4660                     }
4661                 }
4662             }
4663             if(found){
4664                 gi = cur_gi;
4665                 break;
4666             }
4667         }
4668     }
4669     return gi;
4670 }
4671 
s_FixMinusStrandRange(CRange<TSeqPos> & rng)4672 static inline CRange<TSeqPos> & s_FixMinusStrandRange(CRange<TSeqPos> & rng)
4673 {
4674 	if(rng.GetFrom() > rng.GetTo()){
4675 		rng.Set(rng.GetTo(), rng.GetFrom());
4676 	}
4677 	//cerr << "Query Rng: " << rng.GetFrom() << "-" << rng.GetTo() << endl;
4678 	return rng;
4679 }
4680 
GetUniqSeqCoverage(CSeq_align_set & alnset)4681 int CAlignFormatUtil::GetUniqSeqCoverage(CSeq_align_set & alnset)
4682 {
4683 	if(alnset.IsEmpty())
4684 		return 0;
4685 
4686 	bool isDenDiag = (alnset.Get().front()->GetSegs().Which() == CSeq_align::C_Segs::e_Dendiag) ?
4687 			          true : false;
4688 
4689 	list<CRef<CSeq_align> >::iterator mItr=alnset.Set().begin();
4690 	CRangeCollection<TSeqPos> subj_rng_coll((*mItr)->GetSeqRange(1));
4691 	CRange<TSeqPos> q_rng((*mItr)->GetSeqRange(0));
4692 	/*
4693 	cerr << MSerial_AsnText << **mItr;
4694 	cerr << (*mItr)->GetSeqRange(0).GetFrom() << endl;
4695 	cerr << (*mItr)->GetSeqRange(0).GetTo() << endl;
4696 	cerr << (*mItr)->GetSeqRange(0).GetToOpen() << endl;
4697 	cerr << (*mItr)->GetSeqRange(1).GetFrom() << endl;
4698 	cerr << (*mItr)->GetSeqRange(1).GetTo() << endl;
4699 	cerr << (*mItr)->GetSeqRange(1).GetToOpen() << endl;
4700 	*/
4701 	CRangeCollection<TSeqPos> query_rng_coll(s_FixMinusStrandRange(q_rng));
4702 	++mItr;
4703 	for(;mItr != alnset.Set().end(); ++mItr) {
4704 		const CRange<TSeqPos> align_subj_rng((*mItr)->GetSeqRange(1));
4705 		// subject range should always be on the positive strand
4706 		ASSERT(align_subj_rng.GetTo() > align_subj_rng.GetFrom());
4707 		CRangeCollection<TSeqPos> coll(align_subj_rng);
4708 		coll.Subtract(subj_rng_coll);
4709 
4710 		if (coll.empty())
4711 			continue;
4712 
4713 		if(coll[0] == align_subj_rng) {
4714 			CRange<TSeqPos> query_rng ((*mItr)->GetSeqRange(0));
4715 			//cerr << "Subj Rng :" << align_subj_rng.GetFrom() << "-" << align_subj_rng.GetTo() << endl;
4716 			query_rng_coll += s_FixMinusStrandRange(query_rng);
4717 			subj_rng_coll += align_subj_rng;
4718 		}
4719 		else {
4720 			ITERATE (CRangeCollection<TSeqPos>, uItr, coll) {
4721 				CRange<TSeqPos> query_rng;
4722 				const CRange<TSeqPos> & subj_rng = (*uItr);
4723 				CRef<CSeq_align> densegAln
4724 						= isDenDiag ? CAlignFormatUtil::CreateDensegFromDendiag(**mItr) : (*mItr);
4725 
4726 				CAlnMap map(densegAln->GetSegs().GetDenseg());
4727 				TSignedSeqPos subj_aln_start =  map.GetAlnPosFromSeqPos(1,subj_rng.GetFrom());
4728 				TSignedSeqPos subj_aln_end =  map.GetAlnPosFromSeqPos(1,subj_rng.GetTo());
4729 				query_rng.SetFrom(map.GetSeqPosFromAlnPos(0,subj_aln_start));
4730 				query_rng.SetTo(map.GetSeqPosFromAlnPos(0,subj_aln_end));
4731 
4732 				//cerr << "Subj Rng :" << subj_rng.GetFrom() << "-" << subj_rng.GetTo() << endl;
4733 				query_rng_coll += s_FixMinusStrandRange(query_rng);
4734 				subj_rng_coll += subj_rng;
4735 			}
4736 		}
4737 	}
4738 
4739 	return query_rng_coll.GetCoveredLength();
4740 }
4741 
4742 ///return id type specified or null ref
4743 ///@param ids: the input ids
4744 ///@param choice: id of choice
4745 ///@return: the id with specified type
4746 ///
s_GetSeqIdByType(const list<CRef<CSeq_id>> & ids,CSeq_id::E_Choice choice)4747 static CRef<CSeq_id> s_GetSeqIdByType(const list<CRef<CSeq_id> >& ids,
4748                                       CSeq_id::E_Choice choice)
4749 {
4750     CRef<CSeq_id> cid;
4751 
4752     for (CBioseq::TId::const_iterator iter = ids.begin(); iter != ids.end();
4753          iter ++){
4754         if ((*iter)->Which() == choice){
4755             cid = *iter;
4756             break;
4757         }
4758     }
4759 
4760     return cid;
4761 }
4762 
4763 ///return gi from id list
4764 ///@param ids: the input ids
4765 ///@return: the gi if found
4766 ///
GetGiForSeqIdList(const list<CRef<CSeq_id>> & ids)4767 TGi CAlignFormatUtil::GetGiForSeqIdList (const list<CRef<CSeq_id> >& ids)
4768 {
4769     TGi gi = ZERO_GI;
4770     CRef<CSeq_id> id = s_GetSeqIdByType(ids, CSeq_id::e_Gi);
4771     if (!(id.Empty())){
4772         return id->GetGi();
4773     }
4774     return gi;
4775 }
4776 
GetTitle(const CBioseq_Handle & bh)4777 string CAlignFormatUtil::GetTitle(const CBioseq_Handle & bh)
4778 {
4779 	CSeqdesc_CI desc_t(bh, CSeqdesc::e_Title);
4780 	string t = kEmptyStr;
4781 	for (;desc_t; ++desc_t) {
4782 		t += desc_t->GetTitle() + " ";
4783 	}
4784 	return t;
4785 }
4786 
GetBareId(const CSeq_id & id)4787 string CAlignFormatUtil::GetBareId(const CSeq_id& id)
4788 {
4789     string retval;
4790 
4791     if (id.IsGi() || id.IsPrf() || id.IsPir()) {
4792         retval = id.AsFastaString();
4793     }
4794     else {
4795         retval = id.GetSeqIdString(true);
4796     }
4797 
4798     return retval;
4799 }
4800 
4801 
GetTextSeqID(CConstRef<CSeq_id> seqID,string * textSeqID)4802 bool CAlignFormatUtil::GetTextSeqID(CConstRef<CSeq_id> seqID, string *textSeqID)
4803 {
4804     bool hasTextSeqID = true;
4805 
4806     const CTextseq_id* text_id = seqID->GetTextseq_Id();
4807     //returns non zero if e_Genbank,e_Embl,e_Ddbj,e_Pir,e_Swissprot,case e_Other,e_Prf,case e_Tpg,e_Tpe,case e_Tpd,case e_Gpipe, e_Named_annot_track
4808     if(!text_id) { //check for pdb and pat
4809         if(!(seqID->Which() == CSeq_id::e_Pdb) &&  !(seqID->Which() == CSeq_id::e_Patent) && !(seqID->Which() == CSeq_id::e_Gi)) {
4810             hasTextSeqID = false;
4811         }
4812     }
4813 
4814     if(hasTextSeqID && textSeqID) {
4815         seqID->GetLabel(textSeqID, CSeq_id::eContent);
4816     }
4817     return hasTextSeqID;
4818 }
4819 
4820 
4821 
GetTextSeqID(const list<CRef<CSeq_id>> & ids,string * textSeqID)4822 bool CAlignFormatUtil::GetTextSeqID(const list<CRef<CSeq_id> > & ids, string *textSeqID)
4823 {
4824     bool hasTextSeqID = false;
4825 
4826     CConstRef<CSeq_id> seqID = FindTextseq_id(ids);
4827     //returns non zero if e_Genbank,e_Embl,e_Ddbj,e_Pir,e_Swissprot,case e_Other,e_Prf,case e_Tpg,e_Tpe,case e_Tpd,case e_Gpipe, e_Named_annot_track
4828     if(seqID.Empty()) {
4829         seqID = GetSeq_idByType(ids, CSeq_id::e_Pdb);
4830     }
4831     if(seqID.Empty()) {
4832         seqID = GetSeq_idByType(ids, CSeq_id::e_Patent);
4833     }
4834     if(!seqID.Empty()) {
4835         hasTextSeqID = true;
4836         if(textSeqID) seqID->GetLabel(textSeqID, CSeq_id::eContent);
4837     }
4838     return hasTextSeqID;
4839 }
4840 
FilterSeqalignBySeqList(CSeq_align_set & source_aln,vector<string> & seqList)4841 CRef<CSeq_align_set> CAlignFormatUtil::FilterSeqalignBySeqList(CSeq_align_set& source_aln,
4842                                                                vector <string> &seqList)
4843 {
4844     CConstRef<CSeq_id> previous_id, subid;
4845     list<string> use_this_seq;
4846     bool match = false;
4847 
4848     CRef<CSeq_align_set> new_aln(new CSeq_align_set);
4849     ITERATE(CSeq_align_set::Tdata, iter, source_aln.Get()){
4850         subid = &((*iter)->GetSeq_id(1));
4851         if(previous_id.Empty() || !subid->Match(*previous_id)){
4852             use_this_seq.clear();
4853             CAlignFormatUtil::GetUseThisSequence(**iter,use_this_seq);
4854             match = MatchSeqInSeqList(subid, use_this_seq,seqList);
4855         }
4856 
4857         previous_id = subid;
4858         if(match) {
4859             new_aln->Set().push_back(*iter);
4860         }
4861     }
4862     return new_aln;
4863 }
4864 
4865 
4866 END_SCOPE(align_format)
4867 END_NCBI_SCOPE
4868