1 /* $Id: align_format_util.cpp 631547 2021-05-19 13:51:35Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Jian Ye
27 * 12/2004
28 * File Description:
29 * blast formatter utilities
30 *
31 */
32 #include <ncbi_pch.hpp>
33
34 #include <math.h> // For use of ceil
35
36 #include <objtools/align_format/align_format_util.hpp>
37
38 #include <corelib/ncbireg.hpp>
39 #include <corelib/ncbidiag.hpp>
40 #include <corelib/ncbistre.hpp>
41 #include <corelib/ncbiutil.hpp>
42 #include <corelib/ncbiobj.hpp>
43 #include <corelib/ncbifile.hpp>
44 #include <corelib/metareg.hpp>
45 #include <html/htmlhelper.hpp>
46 #include <cgi/cgictx.hpp>
47 #include <util/tables/raw_scoremat.h>
48
49
50 #include <objects/seqalign/Seq_align.hpp>
51 #include <objects/seqalign/Score.hpp>
52 #include <objects/seqalign/Std_seg.hpp>
53 #include <objects/seqalign/Dense_diag.hpp>
54 #include <objects/seqalign/Dense_seg.hpp>
55 #include <objects/seqloc/Seq_id.hpp>
56 #include <objects/seqloc/Seq_loc.hpp>
57 #include <objects/seq/Seq_inst.hpp>
58 #include <objects/seq/Seq_descr.hpp>
59 #include <objects/seq/Seqdesc.hpp>
60 #include <objmgr/seqdesc_ci.hpp>
61 #include <objects/blastdb/defline_extra.hpp>
62 #include <objects/general/User_object.hpp>
63
64 #include <objtools/blast/services/blast_services.hpp> // for CBlastServices
65 #include <objtools/blast/seqdb_reader/seqdb.hpp> // for CSeqDB
66 #include <objtools/blast/seqdb_reader/seqdbcommon.hpp> // for CSeqDBException
67
68 #include <objects/seq/seqport_util.hpp>
69 #include <objects/blastdb/defline_extra.hpp>
70 #include <objects/blastdb/Blast_def_line.hpp>
71 #include <objects/blastdb/Blast_def_line_set.hpp>
72
73 #include <stdio.h>
74 #include <sstream>
75 #include <iomanip>
76
77 BEGIN_NCBI_SCOPE
78 USING_SCOPE(ncbi);
79 USING_SCOPE(objects);
80 BEGIN_SCOPE(align_format)
81
82 const char CAlignFormatUtil::kNoHitsFound[] = "No hits found";
83
84 bool kTranslation;
85 CRef<CScope> kScope;
86
87 const char k_PSymbol[ePMatrixSize+1] =
88 "ARNDCQEGHILKMFPSTWYVBZX";
89
90 unique_ptr<CNcbiRegistry> CAlignFormatUtil::m_Reg;
91 string CAlignFormatUtil::m_Protocol = "";
92 bool CAlignFormatUtil::m_geturl_debug_flag = false;
93 auto_ptr<CGeneInfoFileReader> CAlignFormatUtil::m_GeneInfoReader;
94 ///Get blast score information
95 ///@param scoreList: score container to extract score info from
96 ///@param score: place to extract the raw score to
97 ///@param bits: place to extract the bit score to
98 ///@param evalue: place to extract the e value to
99 ///@param sum_n: place to extract the sum_n to
100 ///@param num_ident: place to extract the num_ident to
101 ///@param use_this_gi: place to extract use_this_gi to
102 ///@return true if found score, false otherwise
103 ///
104 template<class container> bool
s_GetBlastScore(const container & scoreList,int & score,double & bits,double & evalue,int & sum_n,int & num_ident,list<TGi> & use_this_gi,int & comp_adj_method)105 s_GetBlastScore(const container& scoreList,
106 int& score,
107 double& bits,
108 double& evalue,
109 int& sum_n,
110 int& num_ident,
111 list<TGi>& use_this_gi,
112 int& comp_adj_method)
113 {
114 const string k_GiPrefix = "gi:";
115 bool hasScore = false;
116 ITERATE (typename container, iter, scoreList) {
117 const CObject_id& id=(*iter)->GetId();
118 if (id.IsStr()) {
119 if (id.GetStr()=="score"){
120 score = (*iter)->GetValue().GetInt();
121 } else if (id.GetStr()=="bit_score"){
122 bits = (*iter)->GetValue().GetReal();
123 } else if (id.GetStr()=="e_value" || id.GetStr()=="sum_e") {
124 evalue = (*iter)->GetValue().GetReal();
125 hasScore = true;
126 } else if (id.GetStr()=="use_this_gi"){
127 use_this_gi.push_back(GI_FROM(int, (*iter)->GetValue().GetInt()));
128 } else if (id.GetStr()=="sum_n"){
129 sum_n = (*iter)->GetValue().GetInt();
130 } else if (id.GetStr()=="num_ident"){
131 num_ident = (*iter)->GetValue().GetInt();
132 } else if (id.GetStr()=="comp_adjustment_method") {
133 comp_adj_method = (*iter)->GetValue().GetInt();
134 }
135 else if(NStr::StartsWith(id.GetStr(),k_GiPrefix)) { //will be used when switch to 64bit GIs
136 string strGi = NStr::Replace(id.GetStr(),k_GiPrefix,"");
137 TGi gi = NStr::StringToNumeric<TGi>(strGi);
138 use_this_gi.push_back(gi);
139 }
140 }
141 }
142
143 return hasScore;
144 }
145
146
147 ///Wrap a string to specified length. If break happens to be in
148 /// a word, it will extend the line length until the end of the word
149 ///@param str: input string
150 ///@param line_len: length of each line desired
151 ///@param out: stream to ouput
152 ///
x_WrapOutputLine(string str,size_t line_len,CNcbiOstream & out,bool html)153 void CAlignFormatUtil::x_WrapOutputLine(string str, size_t line_len,
154 CNcbiOstream& out, bool html)
155 {
156 list<string> string_l;
157 NStr::TWrapFlags flags = NStr::fWrap_FlatFile;
158 if (html) {
159 flags = NStr::fWrap_HTMLPre;
160 str = CHTMLHelper::HTMLEncode(str);
161 }
162 NStr::Wrap(str, line_len, string_l, flags);
163 list<string>::iterator iter = string_l.begin();
164 while(iter != string_l.end())
165 {
166 out << *iter;
167 out << "\n";
168 iter++;
169 }
170 }
171
BlastPrintError(list<SBlastError> & error_return,bool error_post,CNcbiOstream & out)172 void CAlignFormatUtil::BlastPrintError(list<SBlastError>&
173 error_return,
174 bool error_post, CNcbiOstream& out)
175 {
176
177 string errsevmsg[] = { "UNKNOWN","INFO","WARNING","ERROR",
178 "FATAL"};
179
180 NON_CONST_ITERATE(list<SBlastError>, iter, error_return) {
181
182 if(iter->level > 5){
183 iter->level = eDiag_Info;
184 }
185
186 if(iter->level == 4){
187 iter->level = eDiag_Fatal;
188 } else{
189 iter->level = iter->level;
190 }
191
192 if (error_post){
193 ERR_POST_EX(iter->level, 0, iter->message);
194 }
195 out << errsevmsg[iter->level] << ": " << iter->message << "\n";
196
197 }
198
199 }
200
PrintTildeSepLines(string str,size_t line_len,CNcbiOstream & out)201 void CAlignFormatUtil::PrintTildeSepLines(string str, size_t line_len,
202 CNcbiOstream& out) {
203
204 vector<string> split_line;
205 NStr::Split(str, "~", split_line);
206 ITERATE(vector<string>, iter, split_line) {
207 x_WrapOutputLine(*iter, line_len, out);
208 }
209 }
210 #ifdef DO_UNUSED
211 /// Initialize database statistics with data from BLAST servers
212 /// @param dbname name of a single BLAST database [in]
213 /// @param info structure to fill [in|out]
214 /// @return true if successfully filled, false otherwise (and a warning is
215 /// printed out)
s_FillDbInfoRemotely(const string & dbname,CAlignFormatUtil::SDbInfo & info)216 static bool s_FillDbInfoRemotely(const string& dbname,
217 CAlignFormatUtil::SDbInfo& info)
218 {
219 static CBlastServices rmt_blast_services;
220 CRef<CBlast4_database> blastdb(new CBlast4_database);
221 blastdb->SetName(dbname);
222 blastdb->SetType() = info.is_protein
223 ? eBlast4_residue_type_protein : eBlast4_residue_type_nucleotide;
224 CRef<CBlast4_database_info> dbinfo =
225 rmt_blast_services.GetDatabaseInfo(blastdb);
226
227 info.name = dbname;
228 if ( !dbinfo ) {
229 return false;
230 }
231 info.definition = dbinfo->GetDescription();
232 if (info.definition.empty())
233 info.definition = info.name;
234 CTimeFormat tf("b d, Y H:m P", CTimeFormat::fFormat_Simple);
235 info.date = CTime(dbinfo->GetLast_updated()).AsString(tf);
236 info.total_length = dbinfo->GetTotal_length();
237 info.number_seqs = static_cast<int>(dbinfo->GetNum_sequences());
238 return true;
239 }
240 #endif
241 /// Initialize database statistics with data obtained from local BLAST
242 /// databases
243 /// @param dbname name of a single BLAST database [in]
244 /// @param info structure to fill [in|out]
245 /// @param dbfilt_algorithm filtering algorithm ID used for this search
246 /// [in]
247 /// @return true if successfully filled, false otherwise (and a warning is
248 /// printed out)
249 static bool
s_FillDbInfoLocally(const string & dbname,CAlignFormatUtil::SDbInfo & info,int dbfilt_algorithm)250 s_FillDbInfoLocally(const string& dbname,
251 CAlignFormatUtil::SDbInfo& info,
252 int dbfilt_algorithm)
253 {
254 CRef<CSeqDB> seqdb(new CSeqDB(dbname, info.is_protein
255 ? CSeqDB::eProtein : CSeqDB::eNucleotide));
256 if ( !seqdb ) {
257 return false;
258 }
259 info.name = seqdb->GetDBNameList();
260 info.definition = seqdb->GetTitle();
261 if (info.definition.empty())
262 info.definition = info.name;
263 info.date = seqdb->GetDate();
264 info.total_length = seqdb->GetTotalLength();
265 info.number_seqs = seqdb->GetNumSeqs();
266
267 // Process the filtering algorithm IDs
268 info.filt_algorithm_name.clear();
269 info.filt_algorithm_options.clear();
270 if (dbfilt_algorithm == -1) {
271 return true;
272 }
273
274 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
275 (!defined(NCBI_COMPILER_MIPSPRO)) )
276 string filtering_algorithm;
277 seqdb->GetMaskAlgorithmDetails(dbfilt_algorithm,
278 filtering_algorithm,
279 info.filt_algorithm_name,
280 info.filt_algorithm_options);
281 #endif
282 return true;
283 }
284
285 void
FillScanModeBlastDbInfo(vector<CAlignFormatUtil::SDbInfo> & retval,bool is_protein,int numSeqs,Int8 numLetters,string & tag)286 CAlignFormatUtil::FillScanModeBlastDbInfo(vector<CAlignFormatUtil::SDbInfo>& retval,
287 bool is_protein, int numSeqs, Int8 numLetters, string& tag)
288 {
289 retval.clear();
290 CAlignFormatUtil::SDbInfo info;
291 info.is_protein = is_protein;
292 if (tag == "")
293 info.definition = string("User specified sequence set.");
294 else
295 {
296 info.definition = string("User specified sequence set ") +
297 string("(Input: ") + tag + string(").");
298 }
299 info.number_seqs = numSeqs;
300 info.total_length = numLetters;
301 retval.push_back(info);
302 }
303
304 void
GetBlastDbInfo(vector<CAlignFormatUtil::SDbInfo> & retval,const string & blastdb_names,bool is_protein,int dbfilt_algorithm,bool is_remote)305 CAlignFormatUtil::GetBlastDbInfo(vector<CAlignFormatUtil::SDbInfo>& retval,
306 const string& blastdb_names, bool is_protein,
307 int dbfilt_algorithm /* = -1 */,
308 bool is_remote /* = false */)
309 {
310 retval.clear();
311 if( is_remote ){
312 bool found_all = false;
313 static CBlastServices rmt_blast_services;
314 vector<string> missing_names;
315 vector< CRef<objects::CBlast4_database_info> > all_db_info =
316 rmt_blast_services.GetDatabaseInfo(blastdb_names,is_protein,&found_all,&missing_names);
317 if( !missing_names.empty() ){
318 string msg("'");
319 for(size_t ndx=0 ; ndx < missing_names.size(); ndx++){
320 msg += missing_names[ndx];
321 }
322 msg += string("' not found on NCBI servers.\n");
323 NCBI_THROW(CSeqDBException, eFileErr, msg);
324 }
325 for(size_t ndx=0 ; ndx < all_db_info.size(); ndx++){
326 CAlignFormatUtil::SDbInfo info;
327 objects::CBlast4_database_info &dbinfo = *all_db_info[ndx];
328 info.name = dbinfo.GetDatabase().GetName();
329 info.definition = dbinfo.GetDescription();
330 if (info.definition.empty())
331 info.definition = info.name;
332 CTimeFormat tf("b d, Y H:m P", CTimeFormat::fFormat_Simple);
333 info.date = CTime(dbinfo.GetLast_updated()).AsString(tf);
334 info.total_length = dbinfo.GetTotal_length();
335 info.number_seqs = static_cast<int>(dbinfo.GetNum_sequences());
336 if (info.total_length < 0) {
337 const string kDbName = NStr::TruncateSpaces(info.name);
338 if( ! s_FillDbInfoLocally(kDbName, info, dbfilt_algorithm) ){
339 string msg("'");
340 msg += kDbName;
341 msg += string("' has bad total length on NCBI servers.\n");
342 NCBI_THROW(CSeqDBException, eFileErr, msg);
343 }
344 }
345 retval.push_back(info);
346 }
347 return;
348 }
349 else{
350 vector<CTempString> dbs;
351 SeqDB_SplitQuoted(blastdb_names, dbs, true);
352 retval.reserve(dbs.size());
353
354 ITERATE(vector<CTempString>, i, dbs) {
355 CAlignFormatUtil::SDbInfo info;
356 info.is_protein = is_protein;
357 bool success = false;
358 // Unsafe OK as kDbName only used in this loop.
359 const string kDbName = NStr::TruncateSpaces_Unsafe(*i);
360 if (kDbName.empty())
361 continue;
362
363 success = s_FillDbInfoLocally(kDbName, info, dbfilt_algorithm);
364
365 if (success) {
366 retval.push_back(info);
367 } else {
368 string msg("'");
369 msg += kDbName;
370 if (is_remote)
371 msg += string("' not found on NCBI servers.\n");
372 else
373 msg += string("' not found.\n");
374 NCBI_THROW(CSeqDBException, eFileErr, msg);
375 }
376 }
377 }
378 }
379
PrintDbReport(const vector<SDbInfo> & dbinfo_list,size_t line_length,CNcbiOstream & out,bool top)380 void CAlignFormatUtil::PrintDbReport(const vector<SDbInfo>& dbinfo_list,
381 size_t line_length,
382 CNcbiOstream& out,
383 bool top)
384 {
385 if (top) {
386 const CAlignFormatUtil::SDbInfo* dbinfo = &(dbinfo_list.front());
387 out << "Database: ";
388
389 string db_titles = dbinfo->definition;
390 Int8 tot_num_seqs = static_cast<Int8>(dbinfo->number_seqs);
391 Int8 tot_length = dbinfo->total_length;
392
393 for (size_t i = 1; i < dbinfo_list.size(); i++) {
394 db_titles += "; " + dbinfo_list[i].definition;
395 tot_num_seqs += static_cast<Int8>(dbinfo_list[i].number_seqs);
396 tot_length += dbinfo_list[i].total_length;
397 }
398
399 x_WrapOutputLine(db_titles, line_length, out);
400 if ( !dbinfo->filt_algorithm_name.empty() ) {
401 out << "Masked using: '" << dbinfo->filt_algorithm_name << "'";
402 if ( !dbinfo->filt_algorithm_options.empty() ) {
403 out << ", options: '" << dbinfo->filt_algorithm_options << "'";
404 }
405 out << endl;
406 }
407 CAlignFormatUtil::AddSpace(out, 11);
408 out << NStr::Int8ToString(tot_num_seqs, NStr::fWithCommas) <<
409 " sequences; " <<
410 NStr::Int8ToString(tot_length, NStr::fWithCommas) <<
411 " total letters\n\n";
412 return;
413 }
414
415 ITERATE(vector<SDbInfo>, dbinfo, dbinfo_list) {
416 if (dbinfo->subset == false) {
417 out << " Database: ";
418 x_WrapOutputLine(dbinfo->definition, line_length, out);
419
420 if ( !dbinfo->filt_algorithm_name.empty() ) {
421 out << " Masked using: '" << dbinfo->filt_algorithm_name << "'";
422 if ( !dbinfo->filt_algorithm_options.empty() ) {
423 out << ", options: '" << dbinfo->filt_algorithm_options << "'";
424 }
425 out << endl;
426 }
427
428 out << " Posted date: ";
429 out << dbinfo->date << "\n";
430
431 out << " Number of letters in database: ";
432 out << NStr::Int8ToString(dbinfo->total_length,
433 NStr::fWithCommas) << "\n";
434 out << " Number of sequences in database: ";
435 out << NStr::IntToString(dbinfo->number_seqs,
436 NStr::fWithCommas) << "\n";
437
438 } else {
439 out << " Subset of the database(s) listed below" << "\n";
440 out << " Number of letters searched: ";
441 out << NStr::Int8ToString(dbinfo->total_length,
442 NStr::fWithCommas) << "\n";
443 out << " Number of sequences searched: ";
444 out << NStr::IntToString(dbinfo->number_seqs,
445 NStr::fWithCommas) << "\n";
446 }
447 out << "\n";
448 }
449
450 }
451
PrintKAParameters(double lambda,double k,double h,size_t line_len,CNcbiOstream & out,bool gapped,const Blast_GumbelBlk * gbp)452 void CAlignFormatUtil::PrintKAParameters(double lambda, double k, double h,
453 size_t line_len,
454 CNcbiOstream& out, bool gapped,
455 const Blast_GumbelBlk *gbp)
456 {
457
458 char buffer[256];
459 if (gapped) {
460 out << "Gapped" << "\n";
461 }
462 out << "Lambda K H";
463 if (gbp) {
464 if (gapped) {
465 out << " a alpha sigma";
466 } else {
467 out << " a alpha";
468 }
469 }
470 out << "\n";
471 sprintf(buffer, "%#8.3g ", lambda);
472 out << buffer;
473 sprintf(buffer, "%#8.3g ", k);
474 out << buffer;
475 sprintf(buffer, "%#8.3g ", h);
476 out << buffer;
477 if (gbp) {
478 if (gapped) {
479 sprintf(buffer, "%#8.3g ", gbp->a);
480 out << buffer;
481 sprintf(buffer, "%#8.3g ", gbp->Alpha);
482 out << buffer;
483 sprintf(buffer, "%#8.3g ", gbp->Sigma);
484 out << buffer;
485 } else {
486 sprintf(buffer, "%#8.3g ", gbp->a_un);
487 out << buffer;
488 sprintf(buffer, "%#8.3g ", gbp->Alpha_un);
489 out << buffer;
490 }
491 //x_WrapOutputLine(buffer, line_len, out);
492 }
493 out << "\n";
494 }
495
496 string
GetSeqIdString(const CBioseq & cbs,bool believe_local_id)497 CAlignFormatUtil::GetSeqIdString(const CBioseq& cbs, bool believe_local_id)
498 {
499 const CBioseq::TId& ids = cbs.GetId();
500 return CAlignFormatUtil::GetSeqIdString(ids, believe_local_id);
501 }
502
503 string
GetSeqIdString(const list<CRef<CSeq_id>> & ids,bool believe_local_id)504 CAlignFormatUtil::GetSeqIdString(const list<CRef<CSeq_id> > & ids, bool believe_local_id)
505 {
506 string all_id_str = NcbiEmptyString;
507 CRef<CSeq_id> wid = FindBestChoice(ids, CSeq_id::WorstRank);
508
509 if (wid && (wid->Which()!= CSeq_id::e_Local || believe_local_id)){
510 TGi gi = FindGi(ids);
511
512 bool use_long_seqids = false;
513 CNcbiApplication* app = CNcbiApplication::Instance();
514 if (app) {
515 const CNcbiRegistry& registry = app->GetConfig();
516 use_long_seqids = (registry.Get("BLAST", "LONG_SEQID") == "1");
517 }
518 if (!use_long_seqids) {
519
520 all_id_str = GetBareId(*wid);
521 }
522 else if (strncmp(wid->AsFastaString().c_str(), "lcl|", 4) == 0) {
523 if(gi == ZERO_GI){
524 all_id_str = wid->AsFastaString().substr(4);
525 } else {
526 all_id_str = "gi|" + NStr::NumericToString(gi) +
527 "|" + wid->AsFastaString().substr(4);
528 }
529 } else {
530 if(gi == ZERO_GI){
531 all_id_str = wid->AsFastaString();
532 } else {
533 all_id_str = "gi|" + NStr::NumericToString(gi) + "|" +
534 wid->AsFastaString();
535 }
536 }
537 }
538
539 return all_id_str;
540 }
541
542 string
GetSeqDescrString(const CBioseq & cbs)543 CAlignFormatUtil::GetSeqDescrString(const CBioseq& cbs)
544 {
545 string all_descr_str = NcbiEmptyString;
546
547 if (cbs.IsSetDescr()) {
548 const CBioseq::TDescr& descr = cbs.GetDescr();
549 const CBioseq::TDescr::Tdata& data = descr.Get();
550 ITERATE(CBioseq::TDescr::Tdata, iter, data) {
551 if((*iter)->IsTitle()) {
552 all_descr_str += (*iter)->GetTitle();
553 }
554 }
555 }
556 return all_descr_str;
557 }
558
AcknowledgeBlastQuery(const CBioseq & cbs,size_t line_len,CNcbiOstream & out,bool believe_query,bool html,bool tabular,const string & rid)559 void CAlignFormatUtil::AcknowledgeBlastQuery(const CBioseq& cbs,
560 size_t line_len,
561 CNcbiOstream& out,
562 bool believe_query,
563 bool html,
564 bool tabular /* = false */,
565 const string& rid /* = kEmptyStr*/)
566 {
567 const string label("Query");
568 CAlignFormatUtil::x_AcknowledgeBlastSequence(cbs, line_len, out,
569 believe_query, html,
570 label, tabular, rid);
571 }
572
573 void
AcknowledgeBlastSubject(const CBioseq & cbs,size_t line_len,CNcbiOstream & out,bool believe_query,bool html,bool tabular)574 CAlignFormatUtil::AcknowledgeBlastSubject(const CBioseq& cbs,
575 size_t line_len,
576 CNcbiOstream& out,
577 bool believe_query,
578 bool html,
579 bool tabular /* = false */)
580 {
581 const string label("Subject");
582 CAlignFormatUtil::x_AcknowledgeBlastSequence(cbs, line_len, out,
583 believe_query, html,
584 label, tabular, kEmptyStr);
585 }
586
587 void
x_AcknowledgeBlastSequence(const CBioseq & cbs,size_t line_len,CNcbiOstream & out,bool believe_query,bool html,const string & label,bool tabular,const string & rid)588 CAlignFormatUtil::x_AcknowledgeBlastSequence(const CBioseq& cbs,
589 size_t line_len,
590 CNcbiOstream& out,
591 bool believe_query,
592 bool html,
593 const string& label,
594 bool tabular /* = false */,
595 const string& rid /* = kEmptyStr*/)
596 {
597
598 if (html) {
599 out << "<b>" << label << "=</b> ";
600 } else if (tabular) {
601 out << "# " << label << ": ";
602 } else {
603 out << label << "= ";
604 }
605
606 string all_id_str = GetSeqIdString(cbs, believe_query);
607 all_id_str += " ";
608 all_id_str = NStr::TruncateSpaces(all_id_str + GetSeqDescrString(cbs));
609
610 // For tabular output, there is no limit on the line length.
611 // There is also no extra line with the sequence length.
612 if (tabular) {
613 out << all_id_str;
614 } else {
615 x_WrapOutputLine(all_id_str, line_len, out, html);
616 if(cbs.IsSetInst() && cbs.GetInst().CanGetLength()){
617 out << "\nLength=";
618 out << cbs.GetInst().GetLength() <<"\n";
619 }
620 }
621
622 if (rid != kEmptyStr) {
623 if (tabular) {
624 out << "\n" << "# RID: " << rid;
625 } else {
626 out << "\n" << "RID: " << rid << "\n";
627 }
628 }
629 }
630
PrintPhiInfo(int num_patterns,const string & pattern,double prob,vector<int> & offsets,CNcbiOstream & out)631 void CAlignFormatUtil::PrintPhiInfo(int num_patterns,
632 const string& pattern,
633 double prob,
634 vector<int>& offsets,
635 CNcbiOstream& out)
636 {
637 out << num_patterns << " occurrence(s) of pattern: " << "\n"
638 << pattern << " at position(s) ";
639
640 bool first = true;
641 for (vector<int>::iterator it = offsets.begin();
642 it != offsets.end(); it++)
643 {
644 if (!first)
645 out << ", ";
646
647 out << 1 + *it ;
648
649 first = false;
650 }
651 out << " of query sequence" << "\n";
652 out << "pattern probability=" << prob << "\n";
653
654 }
655
GetAlnScores(const CSeq_align & aln,int & score,double & bits,double & evalue,int & sum_n,int & num_ident,list<TGi> & use_this_gi)656 void CAlignFormatUtil::GetAlnScores(const CSeq_align& aln,
657 int& score,
658 double& bits,
659 double& evalue,
660 int& sum_n,
661 int& num_ident,
662 list<TGi>& use_this_gi)
663 {
664 int comp_adj_method = 0; // dummy variable
665
666 CAlignFormatUtil::GetAlnScores(aln, score, bits, evalue, sum_n,
667 num_ident, use_this_gi, comp_adj_method);
668 }
669
GetAlnScores(const CSeq_align & aln,int & score,double & bits,double & evalue,int & sum_n,int & num_ident,list<string> & use_this_seq)670 void CAlignFormatUtil::GetAlnScores(const CSeq_align& aln,
671 int& score,
672 double& bits,
673 double& evalue,
674 int& sum_n,
675 int& num_ident,
676 list<string>& use_this_seq)
677 {
678 int comp_adj_method = 0; // dummy variable
679
680 CAlignFormatUtil::GetAlnScores(aln, score, bits, evalue, sum_n,
681 num_ident, use_this_seq, comp_adj_method);
682 }
683
684
GetAlnScores(const CSeq_align & aln,int & score,double & bits,double & evalue,int & sum_n,int & num_ident,list<TGi> & use_this_gi,int & comp_adj_method)685 void CAlignFormatUtil::GetAlnScores(const CSeq_align& aln,
686 int& score,
687 double& bits,
688 double& evalue,
689 int& sum_n,
690 int& num_ident,
691 list<TGi>& use_this_gi,
692 int& comp_adj_method)
693 {
694 bool hasScore = false;
695 score = -1;
696 bits = -1;
697 evalue = -1;
698 sum_n = -1;
699 num_ident = -1;
700 comp_adj_method = 0;
701
702 //look for scores at seqalign level first
703 hasScore = s_GetBlastScore(aln.GetScore(), score, bits, evalue,
704 sum_n, num_ident, use_this_gi, comp_adj_method);
705
706 //look at the seg level
707 if(!hasScore){
708 const CSeq_align::TSegs& seg = aln.GetSegs();
709 if(seg.Which() == CSeq_align::C_Segs::e_Std){
710 s_GetBlastScore(seg.GetStd().front()->GetScores(),
711 score, bits, evalue, sum_n, num_ident, use_this_gi, comp_adj_method);
712 } else if (seg.Which() == CSeq_align::C_Segs::e_Dendiag){
713 s_GetBlastScore(seg.GetDendiag().front()->GetScores(),
714 score, bits, evalue, sum_n, num_ident, use_this_gi, comp_adj_method);
715 } else if (seg.Which() == CSeq_align::C_Segs::e_Denseg){
716 s_GetBlastScore(seg.GetDenseg().GetScores(),
717 score, bits, evalue, sum_n, num_ident, use_this_gi, comp_adj_method);
718 }
719 }
720 if(use_this_gi.size() == 0) {
721 GetUseThisSequence(aln,use_this_gi);
722 }
723 }
724
725 //converts gi list to the list of gi:XXXXXXXX strings
s_NumGiToStringGiList(list<TGi> use_this_gi)726 static list<string> s_NumGiToStringGiList(list<TGi> use_this_gi)//for backward compatability
727 {
728 const string k_GiPrefix = "gi:";
729 list<string> use_this_seq;
730 ITERATE(list<TGi>, iter_gi, use_this_gi){
731 string strSeq = k_GiPrefix + NStr::NumericToString(*iter_gi);
732 use_this_seq.push_back(strSeq);
733 }
734 return use_this_seq;
735 }
736
GetAlnScores(const CSeq_align & aln,int & score,double & bits,double & evalue,int & sum_n,int & num_ident,list<string> & use_this_seq,int & comp_adj_method)737 void CAlignFormatUtil::GetAlnScores(const CSeq_align& aln,
738 int& score,
739 double& bits,
740 double& evalue,
741 int& sum_n,
742 int& num_ident,
743 list<string>& use_this_seq,
744 int& comp_adj_method)
745 {
746 bool hasScore = false;
747 score = -1;
748 bits = -1;
749 evalue = -1;
750 sum_n = -1;
751 num_ident = -1;
752 comp_adj_method = 0;
753
754 list<TGi> use_this_gi;
755 //look for scores at seqalign level first
756 hasScore = s_GetBlastScore(aln.GetScore(), score, bits, evalue,
757 sum_n, num_ident, use_this_gi, comp_adj_method);
758
759 //look at the seg level
760 if(!hasScore){
761 const CSeq_align::TSegs& seg = aln.GetSegs();
762 if(seg.Which() == CSeq_align::C_Segs::e_Std){
763 s_GetBlastScore(seg.GetStd().front()->GetScores(),
764 score, bits, evalue, sum_n, num_ident, use_this_gi, comp_adj_method);
765 } else if (seg.Which() == CSeq_align::C_Segs::e_Dendiag){
766 s_GetBlastScore(seg.GetDendiag().front()->GetScores(),
767 score, bits, evalue, sum_n, num_ident, use_this_gi, comp_adj_method);
768 } else if (seg.Which() == CSeq_align::C_Segs::e_Denseg){
769 s_GetBlastScore(seg.GetDenseg().GetScores(),
770 score, bits, evalue, sum_n, num_ident, use_this_gi, comp_adj_method);
771 }
772 }
773 if(use_this_gi.size() == 0) {
774 GetUseThisSequence(aln,use_this_seq);
775 }
776 else {
777 use_this_seq = s_NumGiToStringGiList(use_this_gi);//for backward compatability
778 }
779 }
780
GetGnlID(const CDbtag & dtg)781 string CAlignFormatUtil::GetGnlID(const CDbtag& dtg)
782 {
783 string retval = NcbiEmptyString;
784
785 if(dtg.GetTag().IsId())
786 retval = NStr::IntToString(dtg.GetTag().GetId());
787 else
788 retval = dtg.GetTag().GetStr();
789
790 return retval;
791 }
792
GetLabel(CConstRef<CSeq_id> id,bool with_version)793 string CAlignFormatUtil::GetLabel(CConstRef<CSeq_id> id,bool with_version)
794 {
795 string retval = "";
796 if (id->Which() == CSeq_id::e_General){
797 const CDbtag& dtg = id->GetGeneral();
798 retval = CAlignFormatUtil::GetGnlID(dtg);
799 }
800 if (retval == "")
801 retval = id->GetSeqIdString(with_version);
802
803 return retval;
804 }
805
AddSpace(CNcbiOstream & out,int number)806 void CAlignFormatUtil::AddSpace(CNcbiOstream& out, int number)
807
808 {
809 for(int i=0; i<number; i++){
810 out<<" ";
811 }
812
813 }
814
GetScoreString(double evalue,double bit_score,double total_bit_score,int raw_score,string & evalue_str,string & bit_score_str,string & total_bit_score_str,string & raw_score_str)815 void CAlignFormatUtil::GetScoreString(double evalue,
816 double bit_score,
817 double total_bit_score,
818 int raw_score,
819 string& evalue_str,
820 string& bit_score_str,
821 string& total_bit_score_str,
822 string& raw_score_str)
823 {
824 char evalue_buf[100], bit_score_buf[100], total_bit_score_buf[100];
825
826 /* Facilitates comparing formatted output using diff */
827 static string kBitScoreFormat("%4.1lf");
828 #ifdef CTOOLKIT_COMPATIBLE
829 static bool ctoolkit_compatible = false;
830 static bool value_set = false;
831 if ( !value_set ) {
832 if (getenv("CTOOLKIT_COMPATIBLE")) {
833 kBitScoreFormat.assign("%4.0lf");
834 ctoolkit_compatible = true;
835 }
836 value_set = true;
837 }
838 #endif /* CTOOLKIT_COMPATIBLE */
839
840 if (evalue < 1.0e-180) {
841 snprintf(evalue_buf, sizeof(evalue_buf), "0.0");
842 } else if (evalue < 1.0e-99) {
843 snprintf(evalue_buf, sizeof(evalue_buf), "%2.0le", evalue);
844 #ifdef CTOOLKIT_COMPATIBLE
845 if (ctoolkit_compatible) {
846 strncpy(evalue_buf, evalue_buf+1, sizeof(evalue_buf-1));
847 }
848 #endif /* CTOOLKIT_COMPATIBLE */
849 } else if (evalue < 0.0009) {
850 snprintf(evalue_buf, sizeof(evalue_buf), "%3.0le", evalue);
851 } else if (evalue < 0.1) {
852 snprintf(evalue_buf, sizeof(evalue_buf), "%4.3lf", evalue);
853 } else if (evalue < 1.0) {
854 snprintf(evalue_buf, sizeof(evalue_buf), "%3.2lf", evalue);
855 } else if (evalue < 10.0) {
856 snprintf(evalue_buf, sizeof(evalue_buf), "%2.1lf", evalue);
857 } else {
858 snprintf(evalue_buf, sizeof(evalue_buf), "%2.0lf", evalue);
859 }
860
861 if (bit_score > 99999){
862 snprintf(bit_score_buf, sizeof(bit_score_buf), "%5.3le", bit_score);
863 } else if (bit_score > 99.9){
864 snprintf(bit_score_buf, sizeof(bit_score_buf), "%3.0ld",
865 (long)bit_score);
866 } else {
867 snprintf(bit_score_buf, sizeof(bit_score_buf), kBitScoreFormat.c_str(),
868 bit_score);
869 }
870 if (total_bit_score > 99999){
871 snprintf(total_bit_score_buf, sizeof(total_bit_score_buf), "%5.3le",
872 total_bit_score);
873 } else if (total_bit_score > 99.9){
874 snprintf(total_bit_score_buf, sizeof(total_bit_score_buf), "%3.0ld",
875 (long)total_bit_score);
876 } else {
877 snprintf(total_bit_score_buf, sizeof(total_bit_score_buf), "%2.1lf",
878 total_bit_score);
879 }
880 evalue_str = evalue_buf;
881 bit_score_str = bit_score_buf;
882 total_bit_score_str = total_bit_score_buf;
883 if (raw_score <= 0)
884 raw_score = -1;
885 NStr::IntToString(raw_score_str, raw_score);
886 }
887
888
PruneSeqalign(const CSeq_align_set & source_aln,CSeq_align_set & new_aln,unsigned int number)889 void CAlignFormatUtil::PruneSeqalign(const CSeq_align_set& source_aln,
890 CSeq_align_set& new_aln,
891 unsigned int number)
892 {
893 CConstRef<CSeq_id> previous_id, subid;
894 bool is_first_aln = true;
895 unsigned int num_align = 0;
896 ITERATE(CSeq_align_set::Tdata, iter, source_aln.Get()){
897
898 if ((*iter)->GetSegs().IsDisc()) {
899 ++num_align;
900 } else {
901 subid = &((*iter)->GetSeq_id(1));
902 if(is_first_aln || (!is_first_aln && !subid->Match(*previous_id))){
903 ++num_align;
904 }
905
906 if(num_align > number) {
907 break;
908 }
909
910 is_first_aln = false;
911 previous_id = subid;
912 }
913 new_aln.Set().push_back(*iter);
914 }
915 }
916
917
GetSubjectsNumber(const CSeq_align_set & source_aln,unsigned int number)918 unsigned int CAlignFormatUtil::GetSubjectsNumber(const CSeq_align_set& source_aln,
919 unsigned int number)
920 {
921 CConstRef<CSeq_id> previous_id, subid;
922 bool is_first_aln = true;
923 unsigned int num_align = 0;
924 ITERATE(CSeq_align_set::Tdata, iter, source_aln.Get()){
925
926 if ((*iter)->GetSegs().IsDisc()) {
927 ++num_align;
928 } else {
929 subid = &((*iter)->GetSeq_id(1));
930 if(is_first_aln || (!is_first_aln && !subid->Match(*previous_id))){
931 ++num_align;
932 }
933
934 if(num_align >= number) {
935 break;
936 }
937
938 is_first_aln = false;
939 previous_id = subid;
940 }
941 }
942 return num_align;
943 }
944
945
PruneSeqalignAll(const CSeq_align_set & source_aln,CSeq_align_set & new_aln,unsigned int number)946 void CAlignFormatUtil::PruneSeqalignAll(const CSeq_align_set& source_aln,
947 CSeq_align_set& new_aln,
948 unsigned int number)
949 {
950 CConstRef<CSeq_id> previous_id, subid;
951 bool is_first_aln = true;
952 unsigned int num_align = 0;
953 bool finishCurrent = false;
954 ITERATE(CSeq_align_set::Tdata, iter, source_aln.Get()){
955 if ((*iter)->GetSegs().IsDisc()) {
956 ++num_align;
957 } else {
958 subid = &((*iter)->GetSeq_id(1));
959 if(is_first_aln || (!is_first_aln && !subid->Match(*previous_id))){
960 finishCurrent = (num_align + 1 == number) ? true : false;
961 ++num_align;
962 }
963 is_first_aln = false;
964 previous_id = subid;
965 }
966 if(num_align > number && !finishCurrent) {
967 break;
968 }
969 new_aln.Set().push_back(*iter);
970 }
971 }
972
973
974 void
GetAlignLengths(CAlnVec & salv,int & align_length,int & num_gaps,int & num_gap_opens)975 CAlignFormatUtil::GetAlignLengths(CAlnVec& salv, int& align_length,
976 int& num_gaps, int& num_gap_opens)
977 {
978 num_gaps = num_gap_opens = align_length = 0;
979
980 for (int row = 0; row < salv.GetNumRows(); row++) {
981 CRef<CAlnMap::CAlnChunkVec> chunk_vec
982 = salv.GetAlnChunks(row, salv.GetSeqAlnRange(0));
983 for (int i=0; i<chunk_vec->size(); i++) {
984 CConstRef<CAlnMap::CAlnChunk> chunk = (*chunk_vec)[i];
985 int chunk_length = chunk->GetAlnRange().GetLength();
986 // Gaps are counted on all rows: gap can only be in one of the rows
987 // for any given segment.
988 if (chunk->IsGap()) {
989 ++num_gap_opens;
990 num_gaps += chunk_length;
991 }
992 // To calculate alignment length, only one row is needed.
993 if (row == 0)
994 align_length += chunk_length;
995 }
996 }
997 }
998
999 void
ExtractSeqalignSetFromDiscSegs(CSeq_align_set & target,const CSeq_align_set & source)1000 CAlignFormatUtil::ExtractSeqalignSetFromDiscSegs(CSeq_align_set& target,
1001 const CSeq_align_set& source)
1002 {
1003 if (source.IsSet() && source.CanGet()) {
1004
1005 for(CSeq_align_set::Tdata::const_iterator iter = source.Get().begin();
1006 iter != source.Get().end(); iter++) {
1007 if((*iter)->IsSetSegs()){
1008 const CSeq_align::TSegs& seg = (*iter)->GetSegs();
1009 if(seg.IsDisc()){
1010 const CSeq_align_set& set = seg.GetDisc();
1011 for(CSeq_align_set::Tdata::const_iterator iter2 =
1012 set.Get().begin(); iter2 != set.Get().end();
1013 iter2 ++) {
1014 target.Set().push_back(*iter2);
1015 }
1016 } else {
1017 target.Set().push_back(*iter);
1018 }
1019 }
1020 }
1021 }
1022 }
1023
1024 CRef<CSeq_align>
CreateDensegFromDendiag(const CSeq_align & aln)1025 CAlignFormatUtil::CreateDensegFromDendiag(const CSeq_align& aln)
1026 {
1027 CRef<CSeq_align> sa(new CSeq_align);
1028 if ( !aln.GetSegs().IsDendiag()) {
1029 NCBI_THROW(CException, eUnknown, "Input Seq-align should be Dendiag!");
1030 }
1031
1032 if(aln.IsSetType()){
1033 sa->SetType(aln.GetType());
1034 }
1035 if(aln.IsSetDim()){
1036 sa->SetDim(aln.GetDim());
1037 }
1038 if(aln.IsSetScore()){
1039 sa->SetScore() = aln.GetScore();
1040 }
1041 if(aln.IsSetBounds()){
1042 sa->SetBounds() = aln.GetBounds();
1043 }
1044
1045 CDense_seg& ds = sa->SetSegs().SetDenseg();
1046
1047 int counter = 0;
1048 ds.SetNumseg() = 0;
1049 ITERATE (CSeq_align::C_Segs::TDendiag, iter, aln.GetSegs().GetDendiag()){
1050
1051 if(counter == 0){//assume all dendiag segments have same dim and ids
1052 if((*iter)->IsSetDim()){
1053 ds.SetDim((*iter)->GetDim());
1054 }
1055 if((*iter)->IsSetIds()){
1056 ds.SetIds() = (*iter)->GetIds();
1057 }
1058 }
1059 ds.SetNumseg() ++;
1060 if((*iter)->IsSetStarts()){
1061 ITERATE(CDense_diag::TStarts, iterStarts, (*iter)->GetStarts()){
1062 ds.SetStarts().push_back(*iterStarts);
1063 }
1064 }
1065 if((*iter)->IsSetLen()){
1066 ds.SetLens().push_back((*iter)->GetLen());
1067 }
1068 if((*iter)->IsSetStrands()){
1069 ITERATE(CDense_diag::TStrands, iterStrands, (*iter)->GetStrands()){
1070 ds.SetStrands().push_back(*iterStrands);
1071 }
1072 }
1073 if((*iter)->IsSetScores()){
1074 ITERATE(CDense_diag::TScores, iterScores, (*iter)->GetScores()){
1075 ds.SetScores().push_back(*iterScores); //this might not have
1076 //right meaning
1077 }
1078 }
1079 counter ++;
1080 }
1081
1082 return sa;
1083 }
1084
GetTaxidForSeqid(const CSeq_id & id,CScope & scope)1085 TTaxId CAlignFormatUtil::GetTaxidForSeqid(const CSeq_id& id, CScope& scope)
1086 {
1087 TTaxId taxid = ZERO_TAX_ID;
1088 try{
1089 const CBioseq_Handle& handle = scope.GetBioseqHandle(id);
1090 const CRef<CBlast_def_line_set> bdlRef =
1091 CSeqDB::ExtractBlastDefline(handle);
1092 const list< CRef< CBlast_def_line > > &bdl = (bdlRef.Empty()) ? list< CRef< CBlast_def_line > >() : bdlRef->Get();
1093 ITERATE(list<CRef<CBlast_def_line> >, iter_bdl, bdl) {
1094 CConstRef<CSeq_id> bdl_id =
1095 GetSeq_idByType((*iter_bdl)->GetSeqid(), id.Which());
1096 if(bdl_id && bdl_id->Match(id) &&
1097 (*iter_bdl)->IsSetTaxid() && (*iter_bdl)->CanGetTaxid()){
1098 taxid = (*iter_bdl)->GetTaxid();
1099 break;
1100 }
1101 }
1102 } catch (CException&) {
1103
1104 }
1105 return taxid;
1106 }
1107
GetFrame(int start,ENa_strand strand,const CBioseq_Handle & handle)1108 int CAlignFormatUtil::GetFrame (int start, ENa_strand strand,
1109 const CBioseq_Handle& handle)
1110 {
1111 int frame = 0;
1112 if (strand == eNa_strand_plus) {
1113 frame = (start % 3) + 1;
1114 } else if (strand == eNa_strand_minus) {
1115 frame = -(((int)handle.GetBioseqLength() - start - 1)
1116 % 3 + 1);
1117
1118 }
1119 return frame;
1120 }
1121
1122
1123 void CAlignFormatUtil::
SortHitByPercentIdentityDescending(list<CRef<CSeq_align_set>> & seqalign_hit_list,bool do_translation)1124 SortHitByPercentIdentityDescending(list< CRef<CSeq_align_set> >&
1125 seqalign_hit_list,
1126 bool do_translation
1127 )
1128 {
1129
1130 kTranslation = do_translation;
1131 seqalign_hit_list.sort(SortHitByPercentIdentityDescendingEx);
1132 }
1133
1134
1135 bool CAlignFormatUtil::
SortHspByPercentIdentityDescending(const CRef<CSeq_align> & info1,const CRef<CSeq_align> & info2)1136 SortHspByPercentIdentityDescending(const CRef<CSeq_align>& info1,
1137 const CRef<CSeq_align>& info2)
1138 {
1139
1140 int score1, sum_n1, num_ident1;
1141 double bits1, evalue1;
1142 list<TGi> use_this_gi1;
1143
1144 int score2, sum_n2, num_ident2;
1145 double bits2, evalue2;
1146 list<TGi> use_this_gi2;
1147
1148
1149 GetAlnScores(*info1, score1, bits1, evalue1, sum_n1, num_ident1, use_this_gi1);
1150 GetAlnScores(*info2, score2, bits2, evalue2, sum_n2, num_ident2, use_this_gi2);
1151
1152 int length1 = GetAlignmentLength(*info1, kTranslation);
1153 int length2 = GetAlignmentLength(*info2, kTranslation);
1154 bool retval = false;
1155
1156
1157 if(length1 > 0 && length2 > 0 && num_ident1 > 0 &&num_ident2 > 0 ) {
1158 if (((double)num_ident1)/length1 == ((double)num_ident2)/length2) {
1159
1160 retval = evalue1 < evalue2;
1161
1162 } else {
1163 retval = ((double)num_ident1)/length1 >= ((double)num_ident2)/length2;
1164
1165 }
1166 } else {
1167 retval = evalue1 < evalue2;
1168 }
1169 return retval;
1170 }
1171
1172 bool CAlignFormatUtil::
SortHitByScoreDescending(const CRef<CSeq_align_set> & info1,const CRef<CSeq_align_set> & info2)1173 SortHitByScoreDescending(const CRef<CSeq_align_set>& info1,
1174 const CRef<CSeq_align_set>& info2)
1175 {
1176 CRef<CSeq_align_set> i1(info1), i2(info2);
1177
1178 i1->Set().sort(SortHspByScoreDescending);
1179 i2->Set().sort(SortHspByScoreDescending);
1180
1181
1182 int score1, sum_n1, num_ident1;
1183 double bits1, evalue1;
1184 list<TGi> use_this_gi1;
1185
1186 int score2, sum_n2, num_ident2;
1187 double bits2, evalue2;
1188 list<TGi> use_this_gi2;
1189
1190 GetAlnScores(*(info1->Get().front()), score1, bits1, evalue1, sum_n1, num_ident1, use_this_gi1);
1191 GetAlnScores(*(info2->Get().front()), score2, bits2, evalue2, sum_n2, num_ident2, use_this_gi2);
1192 return bits1 > bits2;
1193 }
1194
1195 bool CAlignFormatUtil::
SortHitByMasterCoverageDescending(CRef<CSeq_align_set> const & info1,CRef<CSeq_align_set> const & info2)1196 SortHitByMasterCoverageDescending(CRef<CSeq_align_set> const& info1,
1197 CRef<CSeq_align_set> const& info2)
1198 {
1199 int cov1 = GetMasterCoverage(*info1);
1200 int cov2 = GetMasterCoverage(*info2);
1201 bool retval = false;
1202
1203 if (cov1 > cov2) {
1204 retval = cov1 > cov2;
1205 } else if (cov1 == cov2) {
1206 int score1, sum_n1, num_ident1;
1207 double bits1, evalue1;
1208 list<TGi> use_this_gi1;
1209
1210 int score2, sum_n2, num_ident2;
1211 double bits2, evalue2;
1212 list<TGi> use_this_gi2;
1213 GetAlnScores(*(info1->Get().front()), score1, bits1, evalue1, sum_n1, num_ident1, use_this_gi1);
1214 GetAlnScores(*(info2->Get().front()), score2, bits2, evalue2, sum_n2, num_ident2, use_this_gi2);
1215 retval = evalue1 < evalue2;
1216 }
1217
1218 return retval;
1219 }
1220
SortHitByMasterStartAscending(CRef<CSeq_align_set> & info1,CRef<CSeq_align_set> & info2)1221 bool CAlignFormatUtil::SortHitByMasterStartAscending(CRef<CSeq_align_set>& info1,
1222 CRef<CSeq_align_set>& info2)
1223 {
1224 int start1 = 0, start2 = 0;
1225
1226
1227 info1->Set().sort(SortHspByMasterStartAscending);
1228 info2->Set().sort(SortHspByMasterStartAscending);
1229
1230
1231 start1 = min(info1->Get().front()->GetSeqStart(0),
1232 info1->Get().front()->GetSeqStop(0));
1233 start2 = min(info2->Get().front()->GetSeqStart(0),
1234 info2->Get().front()->GetSeqStop(0));
1235
1236 if (start1 == start2) {
1237 //same start then arrange by bits score
1238 int score1, sum_n1, num_ident1;
1239 double bits1, evalue1;
1240 list<TGi> use_this_gi1;
1241
1242 int score2, sum_n2, num_ident2;
1243 double bits2, evalue2;
1244 list<TGi> use_this_gi2;
1245
1246
1247 GetAlnScores(*(info1->Get().front()), score1, bits1, evalue1, sum_n1, num_ident1, use_this_gi1);
1248 GetAlnScores(*(info1->Get().front()), score2, bits2, evalue2, sum_n2, num_ident2, use_this_gi2);
1249 return evalue1 < evalue2;
1250
1251 } else {
1252 return start1 < start2;
1253 }
1254
1255 }
1256
1257 bool CAlignFormatUtil::
SortHspByScoreDescending(const CRef<CSeq_align> & info1,const CRef<CSeq_align> & info2)1258 SortHspByScoreDescending(const CRef<CSeq_align>& info1,
1259 const CRef<CSeq_align>& info2)
1260 {
1261
1262 int score1, sum_n1, num_ident1;
1263 double bits1, evalue1;
1264 list<TGi> use_this_gi1;
1265
1266 int score2, sum_n2, num_ident2;
1267 double bits2, evalue2;
1268 list<TGi> use_this_gi2;
1269
1270
1271 GetAlnScores(*info1, score1, bits1, evalue1, sum_n1, num_ident1, use_this_gi1);
1272 GetAlnScores(*info2, score2, bits2, evalue2, sum_n2, num_ident2, use_this_gi2);
1273 return bits1 > bits2;
1274
1275 }
1276
1277 bool CAlignFormatUtil::
SortHspByMasterStartAscending(const CRef<CSeq_align> & info1,const CRef<CSeq_align> & info2)1278 SortHspByMasterStartAscending(const CRef<CSeq_align>& info1,
1279 const CRef<CSeq_align>& info2)
1280 {
1281 int start1 = 0, start2 = 0;
1282
1283 start1 = min(info1->GetSeqStart(0), info1->GetSeqStop(0));
1284 start2 = min(info2->GetSeqStart(0), info2->GetSeqStop(0)) ;
1285
1286 if (start1 == start2) {
1287 //same start then arrange by bits score
1288 int score1, sum_n1, num_ident1;
1289 double bits1, evalue1;
1290 list<TGi> use_this_gi1;
1291
1292 int score2, sum_n2, num_ident2;
1293 double bits2, evalue2;
1294 list<TGi> use_this_gi2;
1295
1296
1297 GetAlnScores(*info1, score1, bits1, evalue1, sum_n1, num_ident1, use_this_gi1);
1298 GetAlnScores(*info2, score2, bits2, evalue2, sum_n2, num_ident2, use_this_gi2);
1299 return evalue1 < evalue2;
1300
1301 } else {
1302
1303 return start1 < start2;
1304 }
1305 }
1306
1307 bool CAlignFormatUtil::
SortHspBySubjectStartAscending(const CRef<CSeq_align> & info1,const CRef<CSeq_align> & info2)1308 SortHspBySubjectStartAscending(const CRef<CSeq_align>& info1,
1309 const CRef<CSeq_align>& info2)
1310 {
1311 int start1 = 0, start2 = 0;
1312
1313 start1 = min(info1->GetSeqStart(1), info1->GetSeqStop(1));
1314 start2 = min(info2->GetSeqStart(1), info2->GetSeqStop(1)) ;
1315
1316 if (start1 == start2) {
1317 //same start then arrange by bits score
1318 int score1, sum_n1, num_ident1;
1319 double bits1, evalue1;
1320 list<TGi> use_this_gi1;
1321
1322 int score2, sum_n2, num_ident2;
1323 double bits2, evalue2;
1324 list<TGi> use_this_gi2;
1325
1326
1327 GetAlnScores(*info1, score1, bits1, evalue1, sum_n1, num_ident1, use_this_gi1);
1328 GetAlnScores(*info2, score2, bits2, evalue2, sum_n2, num_ident2, use_this_gi2);
1329 return evalue1 < evalue2;
1330
1331 } else {
1332
1333 return start1 < start2;
1334 }
1335 }
1336
GetAlignmentLength(const CSeq_align & aln,bool do_translation)1337 int CAlignFormatUtil::GetAlignmentLength(const CSeq_align& aln, bool do_translation)
1338 {
1339
1340 CRef<CSeq_align> final_aln;
1341
1342 // Convert Std-seg and Dense-diag alignments to Dense-seg.
1343 // Std-segs are produced only for translated searches; Dense-diags only for
1344 // ungapped, not translated searches.
1345
1346 if (aln.GetSegs().IsStd()) {
1347 CRef<CSeq_align> denseg_aln = aln.CreateDensegFromStdseg();
1348 // When both query and subject are translated, i.e. tblastx, convert
1349 // to a special type of Dense-seg.
1350 if (do_translation) {
1351 final_aln = denseg_aln->CreateTranslatedDensegFromNADenseg();
1352 } else {
1353 final_aln = denseg_aln;
1354
1355 }
1356 } else if (aln.GetSegs().IsDendiag()) {
1357 final_aln = CreateDensegFromDendiag(aln);
1358 }
1359
1360 const CDense_seg& ds = (final_aln ? final_aln->GetSegs().GetDenseg() :
1361 aln.GetSegs().GetDenseg());
1362
1363 CAlnMap alnmap(ds);
1364 return alnmap.GetAlnStop() + 1;
1365 }
1366
GetPercentIdentity(const CSeq_align & aln,CScope & scope,bool do_translation)1367 double CAlignFormatUtil::GetPercentIdentity(const CSeq_align& aln,
1368 CScope& scope,
1369 bool do_translation) {
1370 double identity = 0;
1371 CRef<CSeq_align> final_aln;
1372
1373 // Convert Std-seg and Dense-diag alignments to Dense-seg.
1374 // Std-segs are produced only for translated searches; Dense-diags only for
1375 // ungapped, not translated searches.
1376
1377 if (aln.GetSegs().IsStd()) {
1378 CRef<CSeq_align> denseg_aln = aln.CreateDensegFromStdseg();
1379 // When both query and subject are translated, i.e. tblastx, convert
1380 // to a special type of Dense-seg.
1381 if (do_translation) {
1382 final_aln = denseg_aln->CreateTranslatedDensegFromNADenseg();
1383 } else {
1384 final_aln = denseg_aln;
1385
1386 }
1387 } else if (aln.GetSegs().IsDendiag()) {
1388 final_aln = CreateDensegFromDendiag(aln);
1389 }
1390
1391 const CDense_seg& ds = (final_aln ? final_aln->GetSegs().GetDenseg() :
1392 aln.GetSegs().GetDenseg());
1393
1394 CAlnVec alnvec(ds, scope);
1395 string query, subject;
1396
1397 alnvec.SetAaCoding(CSeq_data::e_Ncbieaa);
1398 alnvec.GetWholeAlnSeqString(0, query);
1399 alnvec.GetWholeAlnSeqString(1, subject);
1400
1401 int num_ident = 0;
1402 int length = min(query.size(), subject.size());
1403
1404 for (int i = 0; i < length; ++i) {
1405 if (query[i] == subject[i]) {
1406 ++num_ident;
1407 }
1408 }
1409
1410 if (length > 0) {
1411 identity = ((double)num_ident)/length;
1412 }
1413
1414 return identity;
1415 }
1416
1417
s_CalcAlnPercentIdent(const CRef<CSeq_align_set> & info1,const CRef<CSeq_align_set> & info2,double & percentIdent1,double & percentIdent2)1418 static void s_CalcAlnPercentIdent(const CRef<CSeq_align_set>& info1,
1419 const CRef<CSeq_align_set>& info2,
1420 double &percentIdent1,
1421 double &percentIdent2)
1422 {
1423
1424 CRef<CSeq_align_set> i1(info1), i2(info2);
1425 percentIdent1 = -1;
1426 percentIdent2 = -1;
1427
1428 i1->Set().sort(CAlignFormatUtil::SortHspByPercentIdentityDescending);
1429 i2->Set().sort(CAlignFormatUtil::SortHspByPercentIdentityDescending);
1430
1431 percentIdent1 = CAlignFormatUtil::GetSeqAlignSetCalcPercentIdent(*info1, kTranslation);
1432 percentIdent2 = CAlignFormatUtil::GetSeqAlignSetCalcPercentIdent(*info2, kTranslation);
1433 return;
1434 }
1435
1436
1437 bool CAlignFormatUtil::
SortHitByPercentIdentityDescendingEx(const CRef<CSeq_align_set> & info1,const CRef<CSeq_align_set> & info2)1438 SortHitByPercentIdentityDescendingEx(const CRef<CSeq_align_set>& info1,
1439 const CRef<CSeq_align_set>& info2)
1440 {
1441
1442 CRef<CSeq_align_set> i1(info1), i2(info2);
1443
1444 //i1->Set().sort(SortHspByPercentIdentityDescending);
1445 //i2->Set().sort(SortHspByPercentIdentityDescending);
1446
1447
1448 auto_ptr<CAlignFormatUtil::SSeqAlignSetCalcParams> seqSetInfo1( CAlignFormatUtil::GetSeqAlignSetCalcParamsFromASN(*info1));
1449 auto_ptr<CAlignFormatUtil::SSeqAlignSetCalcParams> seqSetInfo2( CAlignFormatUtil::GetSeqAlignSetCalcParamsFromASN(*info2));
1450 double evalue1 = seqSetInfo1->evalue;
1451 double evalue2 = seqSetInfo2->evalue;
1452 double percentIdent1 = seqSetInfo1->percent_identity;
1453 double percentIdent2 = seqSetInfo2->percent_identity;
1454
1455 bool retval = false;
1456 if(percentIdent1 < 0 || percentIdent2 < 0) {
1457 s_CalcAlnPercentIdent(info1, info2,percentIdent1,percentIdent2);
1458 }
1459 if(percentIdent1 > 0 &&percentIdent2 > 0) {
1460 if (percentIdent1 == percentIdent2) {
1461 retval = evalue1 < evalue2;
1462
1463 } else {
1464 retval = percentIdent1 >= percentIdent2;
1465 }
1466 } else {
1467 retval = evalue1 < evalue2;
1468 }
1469 return retval;
1470 }
1471
SortHitByTotalScoreDescending(CRef<CSeq_align_set> const & info1,CRef<CSeq_align_set> const & info2)1472 bool CAlignFormatUtil::SortHitByTotalScoreDescending(CRef<CSeq_align_set> const& info1,
1473 CRef<CSeq_align_set> const& info2)
1474 {
1475 int score1, score2, sum_n, num_ident;
1476 double bits, evalue;
1477 list<TGi> use_this_gi;
1478 double total_bits1 = 0, total_bits2 = 0;
1479
1480 ITERATE(CSeq_align_set::Tdata, iter, info1->Get()) {
1481 CAlignFormatUtil::GetAlnScores(**iter, score1, bits, evalue,
1482 sum_n, num_ident, use_this_gi);
1483 total_bits1 += bits;
1484 }
1485
1486 ITERATE(CSeq_align_set::Tdata, iter, info2->Get()) {
1487 CAlignFormatUtil::GetAlnScores(**iter, score2, bits, evalue,
1488 sum_n, num_ident, use_this_gi);
1489 total_bits2 += bits;
1490 }
1491
1492
1493 return total_bits1 >= total_bits2;
1494
1495 }
1496
1497 #ifndef NCBI_COMPILER_WORKSHOP
1498 /** Class to sort by linkout bit
1499 * @note this code doesn't compile under the Solaris' WorkShop, and because
1500 * this feature is only used inside NCBI (LinkoutDB), we disable this code.
1501 */
1502 class CSortHitByMolecularTypeEx
1503 {
1504 public:
CSortHitByMolecularTypeEx(ILinkoutDB * linkoutdb,const string & mv_build_name)1505 CSortHitByMolecularTypeEx(ILinkoutDB* linkoutdb,
1506 const string& mv_build_name)
1507 : m_LinkoutDB(linkoutdb), m_MapViewerBuildName(mv_build_name) {}
1508
operator ()(const CRef<CSeq_align_set> & info1,const CRef<CSeq_align_set> & info2)1509 bool operator() (const CRef<CSeq_align_set>& info1, const CRef<CSeq_align_set>& info2)
1510 {
1511 CConstRef<CSeq_id> id1, id2;
1512 id1 = &(info1->Get().front()->GetSeq_id(1));
1513 id2 = &(info2->Get().front()->GetSeq_id(1));
1514
1515 int linkout1 = 0, linkout2 = 0;
1516 linkout1 = m_LinkoutDB
1517 ? m_LinkoutDB->GetLinkout(*id1, m_MapViewerBuildName)
1518 : 0;
1519 linkout2 = m_LinkoutDB
1520 ? m_LinkoutDB->GetLinkout(*id2, m_MapViewerBuildName)
1521 : 0;
1522
1523 return (linkout1 & eGenomicSeq) <= (linkout2 & eGenomicSeq);
1524 }
1525 private:
1526 ILinkoutDB* m_LinkoutDB;
1527 string m_MapViewerBuildName;
1528 };
1529 #endif /* NCBI_COMPILER_WORKSHOP */
1530
1531 void CAlignFormatUtil::
SortHitByMolecularType(list<CRef<CSeq_align_set>> & seqalign_hit_list,CScope & scope,ILinkoutDB * linkoutdb,const string & mv_build_name)1532 SortHitByMolecularType(list< CRef<CSeq_align_set> >& seqalign_hit_list,
1533 CScope& scope, ILinkoutDB* linkoutdb,
1534 const string& mv_build_name)
1535 {
1536
1537 kScope = &scope;
1538 #ifndef NCBI_COMPILER_WORKSHOP
1539 seqalign_hit_list.sort(CSortHitByMolecularTypeEx(linkoutdb, mv_build_name));
1540 #endif /* NCBI_COMPILER_WORKSHOP */
1541 }
1542
SortHit(list<CRef<CSeq_align_set>> & seqalign_hit_list,bool do_translation,CScope & scope,int sort_method,ILinkoutDB * linkoutdb,const string & mv_build_name)1543 void CAlignFormatUtil::SortHit(list< CRef<CSeq_align_set> >& seqalign_hit_list,
1544 bool do_translation, CScope& scope, int
1545 sort_method, ILinkoutDB* linkoutdb,
1546 const string& mv_build_name)
1547 {
1548 kScope = &scope;
1549 kTranslation = do_translation;
1550
1551 if (sort_method == 1) {
1552 #ifndef NCBI_COMPILER_WORKSHOP
1553 seqalign_hit_list.sort(CSortHitByMolecularTypeEx(linkoutdb,
1554 mv_build_name));
1555 #endif /* NCBI_COMPILER_WORKSHOP */
1556 } else if (sort_method == 2) {
1557 seqalign_hit_list.sort(SortHitByTotalScoreDescending);
1558 } else if (sort_method == 3) {
1559 seqalign_hit_list.sort(SortHitByPercentIdentityDescendingEx);
1560 }
1561 }
1562
1563 void CAlignFormatUtil::
SplitSeqalignByMolecularType(vector<CRef<CSeq_align_set>> & target,int sort_method,const CSeq_align_set & source,CScope & scope,ILinkoutDB * linkoutdb,const string & mv_build_name)1564 SplitSeqalignByMolecularType(vector< CRef<CSeq_align_set> >&
1565 target,
1566 int sort_method,
1567 const CSeq_align_set& source,
1568 CScope& scope,
1569 ILinkoutDB* linkoutdb,
1570 const string& mv_build_name)
1571 {
1572 CConstRef<CSeq_id> prevSubjectId;
1573 int count = 0;
1574 int linkoutPrev = 0;
1575 ITERATE(CSeq_align_set::Tdata, iter, source.Get()) {
1576
1577 const CSeq_id& id = (*iter)->GetSeq_id(1);
1578 try {
1579 const CBioseq_Handle& handle = scope.GetBioseqHandle(id);
1580 if (handle) {
1581 int linkout;
1582 if(prevSubjectId.Empty() || !id.Match(*prevSubjectId)){
1583 prevSubjectId = &id;
1584 linkout = linkoutdb ? linkoutdb->GetLinkout(id, mv_build_name): 0;
1585 linkoutPrev = linkout;
1586 count++;
1587 }
1588 else {
1589 linkout = linkoutPrev;
1590 }
1591 if (linkout & eGenomicSeq) {
1592 if (sort_method == 1) {
1593 target[1]->Set().push_back(*iter);
1594 } else if (sort_method == 2){
1595 target[0]->Set().push_back(*iter);
1596 } else {
1597 target[1]->Set().push_back(*iter);
1598 }
1599 } else {
1600 if (sort_method == 1) {
1601 target[0]->Set().push_back(*iter);
1602 } else if (sort_method == 2) {
1603 target[1]->Set().push_back(*iter);
1604 } else {
1605 target[0]->Set().push_back(*iter);
1606 }
1607 }
1608 } else {
1609 target[0]->Set().push_back(*iter);
1610 }
1611
1612 } catch (const CException&){
1613 target[0]->Set().push_back(*iter); //no bioseq found, leave untouched
1614 }
1615 }
1616 }
1617
HspListToHitList(list<CRef<CSeq_align_set>> & target,const CSeq_align_set & source)1618 void CAlignFormatUtil::HspListToHitList(list< CRef<CSeq_align_set> >& target,
1619 const CSeq_align_set& source)
1620 {
1621 CConstRef<CSeq_id> previous_id;
1622 CRef<CSeq_align_set> temp;
1623
1624 ITERATE(CSeq_align_set::Tdata, iter, source.Get()) {
1625 const CSeq_id& cur_id = (*iter)->GetSeq_id(1);
1626 if(previous_id.Empty()) {
1627 temp = new CSeq_align_set;
1628 temp->Set().push_back(*iter);
1629 target.push_back(temp);
1630 } else if (cur_id.Match(*previous_id)){
1631 temp->Set().push_back(*iter);
1632
1633 } else {
1634 temp = new CSeq_align_set;
1635 temp->Set().push_back(*iter);
1636 target.push_back(temp);
1637 }
1638 previous_id = &cur_id;
1639 }
1640
1641 }
1642
1643 CRef<CSeq_align_set>
HitListToHspList(list<CRef<CSeq_align_set>> & source)1644 CAlignFormatUtil::HitListToHspList(list< CRef<CSeq_align_set> >& source)
1645 {
1646 CRef<CSeq_align_set> align_set (new CSeq_align_set);
1647 CConstRef<CSeq_id> previous_id;
1648 CRef<CSeq_align_set> temp;
1649 // list<CRef<CSeq_align_set> >::iterator iter;
1650
1651 for (list<CRef<CSeq_align_set> >::iterator iter = source.begin(); iter != source.end(); iter ++) {
1652 ITERATE(CSeq_align_set::Tdata, iter2, (*iter)->Get()) {
1653 align_set->Set().push_back(*iter2);
1654 }
1655 }
1656 return align_set;
1657 }
1658
HspListToHitMap(vector<string> seqIdList,const CSeq_align_set & source)1659 map < string, CRef<CSeq_align_set> > CAlignFormatUtil::HspListToHitMap(vector <string> seqIdList,
1660 const CSeq_align_set& source)
1661 {
1662 CConstRef<CSeq_id> previous_id;
1663 CRef<CSeq_align_set> temp;
1664
1665 map < string, CRef<CSeq_align_set> > hitsMap;
1666
1667 for(size_t i = 0; i < seqIdList.size();i++) {
1668 CRef<CSeq_align_set> new_aln(new CSeq_align_set);
1669 hitsMap.insert(map<string, CRef<CSeq_align_set> >::value_type(seqIdList[i],new_aln));
1670 }
1671 size_t count = 0;
1672 ITERATE(CSeq_align_set::Tdata, iter, source.Get()) {
1673 const CSeq_id& cur_id = (*iter)->GetSeq_id(1);
1674 if(previous_id.Empty() || !cur_id.Match(*previous_id)) {
1675 if(count >= seqIdList.size()) {
1676 break;
1677 }
1678 string idString = NStr::TruncateSpaces(cur_id.AsFastaString());
1679 if(hitsMap.find(idString) != hitsMap.end()) {
1680 temp = new CSeq_align_set;
1681 temp->Set().push_back(*iter);
1682 hitsMap[idString] = temp;
1683 count++;
1684 }
1685 else {
1686 temp.Reset();
1687 }
1688 }
1689 else if (cur_id.Match(*previous_id)){
1690 if(!temp.Empty()) {
1691 temp->Set().push_back(*iter);
1692 }
1693 }
1694 previous_id = &cur_id;
1695 }
1696 return hitsMap;
1697 }
1698
ExtractSeqAlignForSeqList(CRef<CSeq_align_set> & all_aln_set,string alignSeqList)1699 void CAlignFormatUtil::ExtractSeqAlignForSeqList(CRef<CSeq_align_set> &all_aln_set, string alignSeqList)
1700 {
1701 vector <string> seqIds;
1702 NStr::Split(alignSeqList,",",seqIds);
1703
1704 //SEQ_ALN_SET from ALIGNDB contains seq_aligns in random order
1705 //The followimg will create a map that contains seq-aln_set per gi from ALIGN_SEQ_LIST
1706 map < string, CRef<CSeq_align_set> > hitsMap = CAlignFormatUtil::HspListToHitMap(seqIds,*all_aln_set) ;
1707
1708 map < string, CRef<CSeq_align_set> >::iterator it;
1709 list< CRef<CSeq_align_set> > orderedSet;
1710 //orderedSet wil have seq aligns in th order of gi list
1711 for(size_t i = 0; i < seqIds.size(); i++) {
1712 if(hitsMap.find(seqIds[i]) != hitsMap.end()) {
1713 orderedSet.push_back(hitsMap[seqIds[i]]);
1714 }
1715 }
1716 //This should contain seq align set in the order of gis in the list
1717 all_aln_set = CAlignFormatUtil::HitListToHspList(orderedSet);
1718 }
1719
s_GetSRASeqMetadata(const CBioseq::TId & ids,string & strRun,string & strSpotId,string & strReadIndex)1720 static bool s_GetSRASeqMetadata(const CBioseq::TId& ids,string &strRun, string &strSpotId,string &strReadIndex)
1721 {
1722 bool success = false;
1723 string link = NcbiEmptyString;
1724 CConstRef<CSeq_id> seqId = GetSeq_idByType(ids, CSeq_id::e_General);
1725
1726 if (!seqId.Empty())
1727 {
1728 // Get the SRA tag from seqId
1729 if (seqId->GetGeneral().CanGetDb() &&
1730 seqId->GetGeneral().CanGetTag() &&
1731 seqId->GetGeneral().GetTag().IsStr())
1732 {
1733 // Decode the tag to collect the SRA-specific indices
1734 string strTag = seqId->GetGeneral().GetTag().GetStr();
1735 if (!strTag.empty())
1736 {
1737 vector<string> vecInfo;
1738 try
1739 {
1740 NStr::Split(strTag, ".", vecInfo);
1741 }
1742 catch (...)
1743 {
1744 return false;
1745 }
1746
1747 if (vecInfo.size() != 3)
1748 {
1749 return false;
1750 }
1751
1752 strRun = vecInfo[0];
1753 strSpotId = vecInfo[1];
1754 strReadIndex = vecInfo[2];
1755 success = true;
1756 }
1757 }
1758 }
1759 return success;
1760 }
1761
BuildSRAUrl(const CBioseq::TId & ids,string user_url)1762 string CAlignFormatUtil::BuildSRAUrl(const CBioseq::TId& ids, string user_url)
1763 {
1764 string strRun, strSpotId,strReadIndex;
1765 string link = NcbiEmptyString;
1766
1767 if(s_GetSRASeqMetadata(ids,strRun,strSpotId,strReadIndex))
1768 {
1769 // Generate the SRA link to the identified spot
1770 link += user_url;
1771 link += "?run=" + strRun;
1772 link += "." + strSpotId;
1773 link += "." + strReadIndex;
1774 }
1775 return link;
1776 }
1777
s_GetBestIDForURL(CBioseq::TId & ids)1778 string s_GetBestIDForURL(CBioseq::TId& ids)
1779 {
1780 string gnl;
1781
1782 CConstRef<CSeq_id> id_general = GetSeq_idByType(ids, CSeq_id::e_General);
1783 CConstRef<CSeq_id> id_other = GetSeq_idByType(ids, CSeq_id::e_Other);
1784 const CRef<CSeq_id> id_accession = FindBestChoice(ids, CSeq_id::WorstRank);
1785
1786 if(!id_general.Empty() && id_general->AsFastaString().find("gnl|BL_ORD_ID") != string::npos){
1787 return gnl;
1788 }
1789
1790 const CSeq_id* bestid = NULL;
1791 if (id_general.Empty()){
1792 bestid = id_other;
1793 if (id_other.Empty()){
1794 bestid = id_accession;
1795 }
1796 } else {
1797 bestid = id_general;
1798 }
1799
1800 if (bestid && bestid->Which() != CSeq_id::e_Gi){
1801 gnl = NStr::URLEncode(bestid->AsFastaString());
1802 }
1803 return gnl;
1804 }
1805
BuildUserUrl(const CBioseq::TId & ids,TTaxId taxid,string user_url,string database,bool db_is_na,string rid,int query_number,bool for_alignment)1806 string CAlignFormatUtil::BuildUserUrl(const CBioseq::TId& ids, TTaxId taxid,
1807 string user_url, string database,
1808 bool db_is_na, string rid, int query_number,
1809 bool for_alignment) {
1810
1811 string link = NcbiEmptyString;
1812 CConstRef<CSeq_id> id_general = GetSeq_idByType(ids, CSeq_id::e_General);
1813
1814 if(!id_general.Empty()
1815 && id_general->AsFastaString().find("gnl|BL_ORD_ID") != string::npos){
1816 /* We do need to make security protected link to BLAST gnl */
1817 return NcbiEmptyString;
1818 }
1819 TGi gi = FindGi(ids);
1820 string bestID = s_GetBestIDForURL((CBioseq::TId &)ids);
1821
1822
1823 bool nodb_path = false;
1824 /* dumpgnl.cgi need to use path */
1825 if (user_url.find("dumpgnl.cgi") ==string::npos){
1826 nodb_path = true;
1827 }
1828 int length = (int)database.size();
1829 string str;
1830 char *chptr, *dbtmp;
1831 char tmpbuff[256];
1832 char* dbname = new char[sizeof(char)*length + 2];
1833 strcpy(dbname, database.c_str());
1834 if(nodb_path) {
1835 int i, j;
1836 dbtmp = new char[sizeof(char)*length + 2]; /* aditional space and NULL */
1837 memset(dbtmp, '\0', sizeof(char)*length + 2);
1838 for(i = 0; i < length; i++) {
1839 if(i > 0) {
1840 strcat(dbtmp, " "); //space between db
1841 }
1842 if(isspace((unsigned char) dbname[i]) || dbname[i] == ',') {/* Rolling spaces */
1843 continue;
1844 }
1845 j = 0;
1846 while (!isspace((unsigned char) dbname[i]) && j < 256 && i < length) {
1847 tmpbuff[j] = dbname[i];
1848 j++; i++;
1849 if(dbname[i] == ',') { /* Comma is valid delimiter */
1850 break;
1851 }
1852 }
1853 tmpbuff[j] = '\0';
1854 if((chptr = strrchr(tmpbuff, '/')) != NULL) {
1855 strcat(dbtmp, (char*)(chptr+1));
1856 } else {
1857 strcat(dbtmp, tmpbuff);
1858 }
1859
1860 }
1861 } else {
1862 dbtmp = dbname;
1863 }
1864
1865 char gnl[256];
1866 if (!bestID.empty()){
1867 strcpy(gnl, bestID.c_str());
1868
1869 } else {
1870 gnl[0] = '\0';
1871 }
1872
1873 str = NStr::URLEncode(dbtmp == NULL ? (char*) "nr" : dbtmp);
1874
1875 if (user_url.find("?") == string::npos){
1876 link += user_url + "?" + "db=" + str + "&na=" + (db_is_na? "1" : "0");
1877 } else {
1878 if (user_url.find("=") != string::npos) {
1879 user_url += "&";
1880 }
1881 link += user_url + "db=" + str + "&na=" + (db_is_na? "1" : "0");
1882 }
1883
1884 if (gnl[0] != '\0'){
1885 str = gnl;
1886 link += "&gnl=";
1887 link += str;
1888 }
1889 if (gi > ZERO_GI){
1890 link += "&gi=" + NStr::NumericToString(gi);
1891 link += "&term=" + NStr::NumericToString(gi) + NStr::URLEncode("[gi]");
1892 }
1893 if(taxid > ZERO_TAX_ID){
1894 link += "&taxid=" + NStr::NumericToString(taxid);
1895 }
1896 if (rid != NcbiEmptyString){
1897 link += "&RID=" + rid;
1898 }
1899
1900 if (query_number > 0){
1901 link += "&QUERY_NUMBER=" + NStr::IntToString(query_number);
1902 }
1903
1904 if (user_url.find("dumpgnl.cgi") ==string::npos){
1905 if (for_alignment)
1906 link += "&log$=nuclalign";
1907 else
1908 link += "&log$=nucltop";
1909 }
1910
1911 if(nodb_path){
1912 delete [] dbtmp;
1913 }
1914 delete [] dbname;
1915 return link;
1916 }
1917 void CAlignFormatUtil::
BuildFormatQueryString(CCgiContext & ctx,map<string,string> & parameters_to_change,string & cgi_query)1918 BuildFormatQueryString (CCgiContext& ctx,
1919 map< string, string>& parameters_to_change,
1920 string& cgi_query)
1921 {
1922
1923 //add parameters to exclude
1924 parameters_to_change.insert(map<string, string>::
1925 value_type("service", ""));
1926 parameters_to_change.insert(map<string, string>::
1927 value_type("address", ""));
1928 parameters_to_change.insert(map<string, string>::
1929 value_type("platform", ""));
1930 parameters_to_change.insert(map<string, string>::
1931 value_type("_pgr", ""));
1932 parameters_to_change.insert(map<string, string>::
1933 value_type("client", ""));
1934 parameters_to_change.insert(map<string, string>::
1935 value_type("composition_based_statistics", ""));
1936
1937 parameters_to_change.insert(map<string, string>::
1938 value_type("auto_format", ""));
1939 cgi_query = NcbiEmptyString;
1940 TCgiEntries& cgi_entry = ctx.GetRequest().GetEntries();
1941 bool is_first = true;
1942
1943 for(TCgiEntriesI it=cgi_entry.begin(); it!=cgi_entry.end(); ++it) {
1944 string parameter = it->first;
1945 if (parameter != NcbiEmptyString) {
1946 if (parameters_to_change.count(NStr::ToLower(parameter)) > 0 ||
1947 parameters_to_change.count(NStr::ToUpper(parameter)) > 0) {
1948 if(parameters_to_change[NStr::ToLower(parameter)] !=
1949 NcbiEmptyString &&
1950 parameters_to_change[NStr::ToUpper(parameter)] !=
1951 NcbiEmptyString) {
1952 if (!is_first) {
1953 cgi_query += "&";
1954 }
1955 cgi_query +=
1956 it->first + "=" + parameters_to_change[it->first];
1957 is_first = false;
1958 }
1959 } else {
1960 if (!is_first) {
1961 cgi_query += "&";
1962 }
1963 cgi_query += it->first + "=" + it->second;
1964 is_first = false;
1965 }
1966
1967 }
1968 }
1969 }
1970
BuildFormatQueryString(CCgiContext & ctx,string & cgi_query)1971 void CAlignFormatUtil::BuildFormatQueryString(CCgiContext& ctx, string& cgi_query) {
1972
1973 string format_type = ctx.GetRequestValue("FORMAT_TYPE").GetValue();
1974 string ridstr = ctx.GetRequestValue("RID").GetValue();
1975 string align_view = ctx.GetRequestValue("ALIGNMENT_VIEW").GetValue();
1976
1977 cgi_query += "RID=" + ridstr;
1978 cgi_query += "&FORMAT_TYPE=" + format_type;
1979 cgi_query += "&ALIGNMENT_VIEW=" + align_view;
1980
1981 cgi_query += "&QUERY_NUMBER=" + ctx.GetRequestValue("QUERY_NUMBER").GetValue();
1982 cgi_query += "&FORMAT_OBJECT=" + ctx.GetRequestValue("FORMAT_OBJECT").GetValue();
1983 cgi_query += "&RUN_PSIBLAST=" + ctx.GetRequestValue("RUN_PSIBLAST").GetValue();
1984 cgi_query += "&I_THRESH=" + ctx.GetRequestValue("I_THRESH").GetValue();
1985
1986 cgi_query += "&DESCRIPTIONS=" + ctx.GetRequestValue("DESCRIPTIONS").GetValue();
1987
1988 cgi_query += "&ALIGNMENTS=" + ctx.GetRequestValue("ALIGNMENTS").GetValue();
1989
1990 cgi_query += "&NUM_OVERVIEW=" + ctx.GetRequestValue("NUM_OVERVIEW").GetValue();
1991
1992 cgi_query += "&NCBI_GI=" + ctx.GetRequestValue("NCBI_GI").GetValue();
1993
1994 cgi_query += "&SHOW_OVERVIEW=" + ctx.GetRequestValue("SHOW_OVERVIEW").GetValue();
1995
1996 cgi_query += "&SHOW_LINKOUT=" + ctx.GetRequestValue("SHOW_LINKOUT").GetValue();
1997
1998 cgi_query += "&GET_SEQUENCE=" + ctx.GetRequestValue("GET_SEQUENCE").GetValue();
1999
2000 cgi_query += "&MASK_CHAR=" + ctx.GetRequestValue("MASK_CHAR").GetValue();
2001 cgi_query += "&MASK_COLOR=" + ctx.GetRequestValue("MASK_COLOR").GetValue();
2002
2003 cgi_query += "&SHOW_CDS_FEATURE=" + ctx.GetRequestValue("SHOW_CDS_FEATURE").GetValue();
2004
2005 if (ctx.GetRequestValue("FORMAT_EQ_TEXT").GetValue() != NcbiEmptyString) {
2006 cgi_query += "&FORMAT_EQ_TEXT=" +
2007 NStr::URLEncode(NStr::TruncateSpaces(ctx.
2008 GetRequestValue("FORMAT_EQ_TEXT").
2009 GetValue()));
2010 }
2011
2012 if (ctx.GetRequestValue("FORMAT_EQ_OP").GetValue() != NcbiEmptyString) {
2013 cgi_query += "&FORMAT_EQ_OP=" +
2014 NStr::URLEncode(NStr::TruncateSpaces(ctx.
2015 GetRequestValue("FORMAT_EQ_OP").
2016 GetValue()));
2017 }
2018
2019 if (ctx.GetRequestValue("FORMAT_EQ_MENU").GetValue() != NcbiEmptyString) {
2020 cgi_query += "&FORMAT_EQ_MENU=" +
2021 NStr::URLEncode(NStr::TruncateSpaces(ctx.
2022 GetRequestValue("FORMAT_EQ_MENU").
2023 GetValue()));
2024 }
2025
2026 cgi_query += "&EXPECT_LOW=" + ctx.GetRequestValue("EXPECT_LOW").GetValue();
2027 cgi_query += "&EXPECT_HIGH=" + ctx.GetRequestValue("EXPECT_HIGH").GetValue();
2028
2029 cgi_query += "&BL2SEQ_LINK=" + ctx.GetRequestValue("BL2SEQ_LINK").GetValue();
2030
2031 }
2032
2033
IsMixedDatabase(const CSeq_align_set & alnset,CScope & scope,ILinkoutDB * linkoutdb,const string & mv_build_name)2034 bool CAlignFormatUtil::IsMixedDatabase(const CSeq_align_set& alnset,
2035 CScope& scope, ILinkoutDB* linkoutdb,
2036 const string& mv_build_name)
2037 {
2038 bool is_mixed = false;
2039 bool is_first = true;
2040 int prev_database = 0;
2041
2042 ITERATE(CSeq_align_set::Tdata, iter, alnset.Get()) {
2043
2044 const CSeq_id& id = (*iter)->GetSeq_id(1);
2045 int linkout = linkoutdb
2046 ? linkoutdb->GetLinkout(id, mv_build_name)
2047 : 0;
2048 int cur_database = (linkout & eGenomicSeq);
2049 if (!is_first && cur_database != prev_database) {
2050 is_mixed = true;
2051 break;
2052 }
2053 prev_database = cur_database;
2054 is_first = false;
2055 }
2056
2057 return is_mixed;
2058
2059 }
2060
2061
IsMixedDatabase(CCgiContext & ctx)2062 bool CAlignFormatUtil::IsMixedDatabase(CCgiContext& ctx)
2063 {
2064 bool formatAsMixedDbs = false;
2065 string mixedDbs = ctx.GetRequestValue("MIXED_DATABASE").GetValue();
2066 if(!mixedDbs.empty()) {
2067 mixedDbs = NStr::ToLower(mixedDbs);
2068 formatAsMixedDbs = (mixedDbs == "on" || mixedDbs == "true" || mixedDbs == "yes") ? true : false;
2069 }
2070 return formatAsMixedDbs;
2071 }
2072
s_MapLinkoutGenParam(string & url_link_tmpl,const string & rid,string giList,bool for_alignment,int cur_align,string & label,string & lnk_displ,string lnk_tl_info="",string lnk_title="")2073 static string s_MapLinkoutGenParam(string &url_link_tmpl,
2074 const string& rid,
2075 string giList,
2076 bool for_alignment,
2077 int cur_align,
2078 string &label,
2079 string &lnk_displ,
2080 string lnk_tl_info = "",
2081 string lnk_title = "")
2082 {
2083 const string kLinkTitle=" title=\"View <@lnk_tl_info@> for <@label@>\" ";
2084 const string kLinkTarget="target=\"lnk" + rid + "\"";
2085 string lnkTitle = (lnk_title.empty()) ? kLinkTitle : lnk_title;
2086 string url_link = CAlignFormatUtil::MapTemplate(url_link_tmpl,"gi",giList);
2087 url_link = CAlignFormatUtil::MapTemplate(url_link,"rid",rid);
2088 url_link = CAlignFormatUtil::MapTemplate(url_link,"log",for_alignment? "align" : "top");
2089 url_link = CAlignFormatUtil::MapTemplate(url_link,"blast_rank",NStr::IntToString(cur_align));
2090 lnkTitle = NStr::StartsWith(lnk_displ,"<img") ? "" : lnkTitle;
2091 string lnkTarget = NStr::StartsWith(lnk_displ,"<img") ? "" : kLinkTarget;
2092 url_link = CAlignFormatUtil::MapTemplate(url_link,"lnkTitle",lnkTitle);
2093 url_link = CAlignFormatUtil::MapTemplate(url_link,"lnkTarget",lnkTarget);
2094 url_link = CAlignFormatUtil::MapTemplate(url_link,"lnk_displ",lnk_displ);
2095 url_link = CAlignFormatUtil::MapTemplate(url_link,"lnk_tl_info",lnk_tl_info);
2096 url_link = CAlignFormatUtil::MapTemplate(url_link,"label",label);
2097 url_link = CAlignFormatUtil::MapProtocol(url_link);
2098 return url_link;
2099 }
2100
2101
s_GetLinkoutUrl(int linkout,string giList,string labelList,TGi first_gi,CAlignFormatUtil::SLinkoutInfo & linkoutInfo,bool textLink=true)2102 static list<string> s_GetLinkoutUrl(int linkout,
2103 string giList,
2104 string labelList,
2105 TGi first_gi,
2106 CAlignFormatUtil::SLinkoutInfo &linkoutInfo,
2107 bool textLink = true)
2108
2109 {
2110 list<string> linkout_list;
2111 string url_link,lnk_displ,lnk_title,lnkTitleInfo;
2112
2113 vector<string> accs;
2114 NStr::Split(labelList,",",accs);
2115 string firstAcc = (accs.size() > 0)? accs[0] : labelList;
2116
2117 if (linkout & eUnigene) {
2118 url_link = CAlignFormatUtil::GetURLFromRegistry("UNIGEN");
2119 lnk_displ = textLink ? "UniGene" : kUnigeneImg;
2120
2121 string termParam = NStr::Find(labelList,",") == NPOS ? kGeneTerm : ""; //kGeneTerm if only one seqid
2122 url_link = CAlignFormatUtil::MapTemplate(url_link,"termParam",termParam);
2123
2124 lnkTitleInfo = "UniGene cluster";
2125 string uid = !linkoutInfo.is_na ? "[Protein Accession]" : "[Nucleotide Accession]";
2126 url_link = CAlignFormatUtil::MapTemplate(url_link,"uid",uid);
2127 url_link = s_MapLinkoutGenParam(url_link,linkoutInfo.rid,giList,linkoutInfo.for_alignment, linkoutInfo.cur_align,labelList,lnk_displ,lnkTitleInfo);
2128
2129 if(textLink) {
2130 url_link = CAlignFormatUtil::MapTemplate(kUnigeneDispl,"lnk",url_link);
2131 }
2132 url_link = CAlignFormatUtil::MapProtocol(url_link);
2133 linkout_list.push_back(url_link);
2134 }
2135 if (linkout & eStructure){
2136 string struct_link = CAlignFormatUtil::GetURLFromRegistry(
2137 "STRUCTURE_URL");
2138
2139 url_link = struct_link.empty() ? kStructureUrl : struct_link;
2140 lnk_displ = textLink ? "Structure" : kStructureImg;
2141
2142 string linkTitle = " title=\"View 3D structure <@label@>\"";
2143 string molID,chainID;
2144 NStr::SplitInTwo(firstAcc,"_",molID,chainID);
2145 url_link = CAlignFormatUtil::MapTemplate(url_link,"molid",molID);
2146 url_link = CAlignFormatUtil::MapTemplate(url_link,"queryID",linkoutInfo.queryID);
2147 url_link = s_MapLinkoutGenParam(url_link,linkoutInfo.rid,giList,linkoutInfo.for_alignment, linkoutInfo.cur_align,firstAcc,lnk_displ,"",linkTitle);
2148 if(textLink) {
2149 url_link = CAlignFormatUtil::MapTemplate(kStructureDispl,"lnk",url_link);
2150 }
2151 url_link = CAlignFormatUtil::MapProtocol(url_link);
2152 linkout_list.push_back(url_link);
2153 }
2154 if (linkout & eGeo){
2155 url_link = CAlignFormatUtil::GetURLFromRegistry("GEO");
2156 lnk_displ = textLink ? "GEO Profiles" : kGeoImg;
2157
2158 lnkTitleInfo = "Expression profiles";
2159 //gilist contains comma separated gis
2160 url_link = s_MapLinkoutGenParam(url_link,linkoutInfo.rid,giList,linkoutInfo.for_alignment, linkoutInfo.cur_align,labelList,lnk_displ,lnkTitleInfo);
2161
2162
2163 if(textLink) {
2164 url_link = CAlignFormatUtil::MapTemplate(kGeoDispl,"lnk",url_link);
2165 }
2166 url_link = CAlignFormatUtil::MapProtocol(url_link);
2167 linkout_list.push_back(url_link);
2168 }
2169 if(linkout & eGene){
2170 url_link = CAlignFormatUtil::GetURLFromRegistry("GENE");
2171 if(textLink) {
2172 string geneSym = CAlignFormatUtil::GetGeneInfo(first_gi);
2173 lnk_displ = "Gene";
2174 lnkTitleInfo = "gene " + geneSym;
2175 }
2176 else {
2177 lnk_displ = kGeneImg;
2178 }
2179 string termParam = NStr::Find(labelList,",") == NPOS ? kGeneTerm : ""; //kGeneTerm if only one seqid
2180 url_link = CAlignFormatUtil::MapTemplate(url_link,"termParam",termParam);
2181
2182 string uid = !linkoutInfo.is_na ? "[Protein Accession]" : "[Nucleotide Accession]";
2183 url_link = CAlignFormatUtil::MapTemplate(url_link,"uid",uid);
2184
2185 url_link = s_MapLinkoutGenParam(url_link,linkoutInfo.rid,giList,linkoutInfo.for_alignment, linkoutInfo.cur_align,labelList,lnk_displ,lnkTitleInfo);
2186
2187 if(textLink) {
2188 url_link = CAlignFormatUtil::MapTemplate(kGeneDispl,"lnk",url_link);
2189 }
2190 url_link = CAlignFormatUtil::MapProtocol(url_link);
2191 linkout_list.push_back(url_link);
2192 }
2193
2194 if((linkout & eGenomicSeq) && first_gi != ZERO_GI){ //only for advanced view -> textlink = true
2195 if(textLink) {
2196 url_link = kMapviewBlastHitParams;
2197 lnk_displ = "Map Viewer";
2198
2199 lnkTitleInfo = "BLAST hits on the " + linkoutInfo.taxName + " genome";
2200
2201 url_link = CAlignFormatUtil::MapTemplate(url_link,"gnl",NStr::URLEncode(linkoutInfo.gnl));
2202 url_link = CAlignFormatUtil::MapTemplate(url_link,"db",linkoutInfo.database);
2203 url_link = CAlignFormatUtil::MapTemplate(url_link,"is_na",linkoutInfo.is_na? "1" : "0");
2204 string user_url = (linkoutInfo.user_url.empty()) ? kMapviewBlastHitUrl : linkoutInfo.user_url;
2205 url_link = CAlignFormatUtil::MapTemplate(url_link,"user_url",user_url);
2206
2207 string taxIDStr = (linkoutInfo.taxid > ZERO_TAX_ID) ? NStr::NumericToString(linkoutInfo.taxid) : "";
2208 url_link = CAlignFormatUtil::MapTemplate(url_link,"taxid",taxIDStr);
2209
2210 string queryNumStr = (linkoutInfo.query_number > 0) ? NStr::IntToString(linkoutInfo.query_number) : "";
2211 url_link = CAlignFormatUtil::MapTemplate(url_link,"query_number",queryNumStr); //gi,term
2212
2213 string giStr = (first_gi > ZERO_GI)? NStr::NumericToString(first_gi) : "";
2214 url_link = s_MapLinkoutGenParam(url_link,linkoutInfo.rid,giStr,linkoutInfo.for_alignment, linkoutInfo.cur_align,labelList,lnk_displ,lnkTitleInfo);
2215
2216 if(textLink) {
2217 url_link = CAlignFormatUtil::MapTemplate(kMapviwerDispl,"lnk",url_link);
2218 }
2219 url_link = CAlignFormatUtil::MapProtocol(url_link);
2220 linkout_list.push_back(url_link);
2221 }
2222 }
2223 else if((linkout & eMapviewer) && first_gi != ZERO_GI){
2224 url_link = kMapviwerUrl;
2225 lnk_displ = textLink ? "Map Viewer" : kMapviwerImg;
2226
2227 string linkTitle = " title=\"View <@label@> aligned to the " + linkoutInfo.taxName + " genome\"";
2228 url_link = s_MapLinkoutGenParam(url_link,linkoutInfo.rid,giList,linkoutInfo.for_alignment, linkoutInfo.cur_align,labelList,lnk_displ,"",linkTitle);
2229
2230 if(textLink) {
2231 url_link = CAlignFormatUtil::MapTemplate(kMapviwerDispl,"lnk",url_link);
2232 }
2233 url_link = CAlignFormatUtil::MapProtocol(url_link);
2234 linkout_list.push_back(url_link);
2235 }
2236 //View Bioassays involving <accession
2237 if(linkout & eBioAssay && linkoutInfo.is_na && first_gi != ZERO_GI){
2238 url_link = CAlignFormatUtil::GetURLFromRegistry("BIOASSAY_NUC");
2239 lnk_displ = textLink ? "PubChem BioAssay" : kBioAssayNucImg;
2240
2241 string linkTitle = " title=\"View Bioassays involving <@label@>\"";
2242 //gilist contains comma separated gis, change it to the following
2243 giList = NStr::Replace(giList,",","[RNATargetGI] OR ");
2244 url_link = s_MapLinkoutGenParam(url_link,linkoutInfo.rid,giList,linkoutInfo.for_alignment, linkoutInfo.cur_align,labelList,lnk_displ,"",linkTitle);
2245
2246 if(textLink) {
2247 url_link = CAlignFormatUtil::MapTemplate(kBioAssayDispl,"lnk",url_link);
2248 }
2249 url_link = CAlignFormatUtil::MapProtocol(url_link);
2250 linkout_list.push_back(url_link);
2251 }
2252 else if (linkout & eBioAssay && !linkoutInfo.is_na && first_gi != ZERO_GI) {
2253 url_link = CAlignFormatUtil::GetURLFromRegistry("BIOASSAY_PROT");
2254 lnk_displ = textLink ? "PubChem BioAssay" : kBioAssayProtImg;
2255
2256 lnkTitleInfo ="Bioassay data";
2257 string linkTitle = " title=\"View Bioassays involving <@label@>\"";
2258 //gilist contains comma separated gis, change it to the following
2259 giList = NStr::Replace(giList,",","[PigGI] OR ");
2260 url_link = s_MapLinkoutGenParam(url_link,linkoutInfo.rid,giList,linkoutInfo.for_alignment, linkoutInfo.cur_align,labelList,lnk_displ,"",linkTitle);
2261
2262 if(textLink) {
2263 url_link = CAlignFormatUtil::MapTemplate(kBioAssayDispl,"lnk",url_link);
2264 }
2265 url_link = CAlignFormatUtil::MapProtocol(url_link);
2266 linkout_list.push_back(url_link);
2267 }
2268 if(linkout & eReprMicrobialGenomes){
2269 url_link = CAlignFormatUtil::GetURLFromRegistry("REPR_MICROBIAL_GENOMES");
2270 lnk_displ = textLink ? "Genome" : kReprMicrobialGenomesImg;
2271
2272 lnkTitleInfo = "genomic information";
2273 //gilist contains comma separated gis
2274 string uid = !linkoutInfo.is_na ? "Protein Accession" : "Nucleotide Accession";
2275 url_link = CAlignFormatUtil::MapTemplate(url_link,"uid",uid);
2276 url_link = s_MapLinkoutGenParam(url_link,linkoutInfo.rid,giList,linkoutInfo.for_alignment, linkoutInfo.cur_align,labelList,lnk_displ,lnkTitleInfo);
2277
2278 if(textLink) {
2279 url_link = CAlignFormatUtil::MapTemplate(kReprMicrobialGenomesDispl,"lnk",url_link);
2280 }
2281 url_link = CAlignFormatUtil::MapProtocol(url_link);
2282 linkout_list.push_back(url_link);
2283 }
2284 if((linkout & eGenomeDataViewer) || (linkout & eTranscript)){
2285 string urlTag;
2286 lnk_displ = textLink ? "Genome Data Viewer" : kGenomeDataViewerImg;
2287 if(linkout & eTranscript) {
2288 urlTag = "GENOME_DATA_VIEWER_TRANSCR";
2289 lnkTitleInfo = "title=\"View the annotation of the transcript <@label@> within a genomic context in NCBI's Genome Data Viewer (GDV)- genome browser for RefSeq annotated assemblies. See other genomic features annotated at the same location as the protein annotation and browse to other regions.\"";
2290 }
2291 else {
2292 urlTag = linkoutInfo.is_na ? "GENOME_DATA_VIEWER_NUC" : "GENOME_DATA_VIEWER_PROT";
2293 lnkTitleInfo = linkoutInfo.is_na ?
2294 "title=\"View BLAST hits for <@label@> within a genomic context in NCBI's Genome Data Viewer (GDV)- genome browser for RefSeq annotated assemblies. See other genomic features annotated at the same location as hits and browse to other regions.\""
2295 :
2296 "title=\"View the annotation of the protein <@label@> within a genomic context in NCBI's Genome Data Viewer (GDV)- genome browser for RefSeq annotated assemblies. See other genomic features annotated at the same location as the protein annotation and browse to other regions.\"";
2297 }
2298 url_link = CAlignFormatUtil::GetURLFromRegistry(urlTag);
2299 url_link = s_MapLinkoutGenParam(url_link,linkoutInfo.rid,giList,linkoutInfo.for_alignment, linkoutInfo.cur_align,firstAcc,lnk_displ,"",lnkTitleInfo);
2300
2301 url_link = CAlignFormatUtil::MapTemplate(url_link,"queryID",linkoutInfo.queryID);
2302
2303 TSeqPos seqFrom = linkoutInfo.subjRange.GetFrom();
2304 seqFrom = (seqFrom == 0) ? seqFrom : seqFrom - 1;
2305
2306 TSeqPos seqTo = linkoutInfo.subjRange.GetTo();
2307 seqTo = (seqTo == 0) ? seqTo : seqTo - 1;
2308
2309 url_link = CAlignFormatUtil::MapTemplate(url_link,"from",seqFrom);//-1
2310 url_link = CAlignFormatUtil::MapTemplate(url_link,"to",seqTo);//-1
2311
2312 if(textLink) {
2313 url_link = CAlignFormatUtil::MapTemplate(kGenomeDataViewerDispl,"lnk",url_link);
2314 }
2315 url_link = CAlignFormatUtil::MapProtocol(url_link);
2316 linkout_list.push_back(url_link);
2317 }
2318 return linkout_list;
2319 }
2320
2321 ///Get list of linkouts for one sequence
GetLinkoutUrl(int linkout,const CBioseq::TId & ids,const string & rid,const string & cdd_rid,const string & entrez_term,bool is_na,TGi first_gi,bool structure_linkout_as_group,bool for_alignment,int cur_align,string preComputedResID)2322 list<string> CAlignFormatUtil::GetLinkoutUrl(int linkout, const CBioseq::TId& ids,
2323 const string& rid,
2324 const string& cdd_rid,
2325 const string& entrez_term,
2326 bool is_na,
2327 TGi first_gi,
2328 bool structure_linkout_as_group,
2329 bool for_alignment, int cur_align,
2330 string preComputedResID)
2331
2332 {
2333 list<string> linkout_list;
2334 TGi gi = FindGi(ids);
2335 CRef<CSeq_id> wid = FindBestChoice(ids, CSeq_id::WorstRank);
2336 string label;
2337 wid->GetLabel(&label, CSeq_id::eContent);
2338 string giString = NStr::NumericToString(gi);
2339 first_gi = (first_gi == ZERO_GI) ? gi : first_gi;
2340
2341
2342
2343 SLinkoutInfo linkoutInfo;
2344 linkoutInfo.Init(rid,
2345 cdd_rid,
2346 entrez_term,
2347 is_na,
2348 "", //database
2349 0, //query_number
2350 "", //user_url
2351 preComputedResID,
2352 "", //linkoutOrder
2353 structure_linkout_as_group,
2354 for_alignment);
2355
2356 linkoutInfo.cur_align = cur_align;
2357 linkoutInfo.taxid = ZERO_TAX_ID;
2358
2359 linkout_list = s_GetLinkoutUrl(linkout,
2360 giString,
2361 label,
2362 first_gi,
2363 linkoutInfo,
2364 false); //textlink
2365
2366 return linkout_list;
2367 }
2368
2369
s_LinkLetterToType(string linkLetter)2370 static int s_LinkLetterToType(string linkLetter)
2371 {
2372 int linkType = 0;
2373 if(linkLetter == "U") {
2374 linkType = eUnigene;
2375 }
2376 else if(linkLetter == "S") {
2377 linkType = eStructure;
2378 }
2379 else if(linkLetter == "E") {
2380 linkType = eGeo;
2381 }
2382 else if(linkLetter == "G") {
2383 linkType = eGene;
2384 }
2385 else if(linkLetter == "M") {
2386 linkType = eMapviewer | eGenomicSeq;
2387 }
2388 else if(linkLetter == "N") {
2389 linkType = eGenomicSeq;
2390 }
2391 else if(linkLetter == "B") {
2392 linkType = eBioAssay;
2393 }
2394 else if(linkLetter == "R") {
2395 linkType = eReprMicrobialGenomes;
2396 }
2397 else if(linkLetter == "V") {
2398 linkType = eGenomeDataViewer;
2399 }
2400 else if(linkLetter == "T") {
2401 linkType = eTranscript;
2402 }
2403
2404 return linkType;
2405 }
2406
2407
s_AddLinkoutInfo(map<int,vector<CBioseq::TId>> & linkout_map,int linkout,CBioseq::TId & cur_id)2408 static void s_AddLinkoutInfo(map<int, vector < CBioseq::TId > > &linkout_map,int linkout,CBioseq::TId &cur_id)
2409 {
2410 if(linkout_map.count(linkout) > 0){
2411 linkout_map[linkout].push_back(cur_id);
2412 }
2413 else {
2414 vector <CBioseq::TId > idList;
2415 idList.push_back(cur_id);
2416 linkout_map.insert(map<int, vector <CBioseq::TId > >::value_type(linkout,idList));
2417 }
2418 }
2419
GetSeqLinkoutInfo(CBioseq::TId & cur_id,ILinkoutDB ** linkoutdb,const string & mv_build_name,TGi gi)2420 int CAlignFormatUtil::GetSeqLinkoutInfo(CBioseq::TId& cur_id,
2421 ILinkoutDB **linkoutdb,
2422 const string& mv_build_name,
2423 TGi gi)
2424 {
2425 int linkout = 0;
2426
2427 if(*linkoutdb) {
2428 if(gi == INVALID_GI) {
2429 gi = FindGi(cur_id);
2430 }
2431 try {
2432 if(gi > ZERO_GI) {
2433 linkout = (*linkoutdb)->GetLinkout(gi, mv_build_name);
2434 }
2435 else if(GetTextSeqID(cur_id)){
2436 CRef<CSeq_id> seqID = FindBestChoice(cur_id, CSeq_id::WorstRank);
2437 linkout = (*linkoutdb)->GetLinkout(*seqID, mv_build_name);
2438 }
2439 }
2440 catch (const CException & e) {
2441 ERR_POST("Problem with linkoutdb: " + e.GetMsg());
2442 *linkoutdb = NULL;
2443 }
2444 }
2445 return linkout;
2446 }
2447
2448 void
GetBdlLinkoutInfo(CBioseq::TId & cur_id,map<int,vector<CBioseq::TId>> & linkout_map,ILinkoutDB * linkoutdb,const string & mv_build_name)2449 CAlignFormatUtil::GetBdlLinkoutInfo(CBioseq::TId& cur_id,
2450 map<int, vector <CBioseq::TId > > &linkout_map,
2451 ILinkoutDB* linkoutdb,
2452 const string& mv_build_name)
2453 {
2454 if(!linkoutdb) return;
2455
2456 int linkout = GetSeqLinkoutInfo(cur_id,
2457 &linkoutdb,
2458 mv_build_name);
2459
2460 if(linkout & eGene){
2461 s_AddLinkoutInfo(linkout_map,eGene,cur_id);
2462 }
2463 if (linkout & eUnigene) {
2464 s_AddLinkoutInfo(linkout_map,eUnigene,cur_id);
2465 }
2466 if (linkout & eGeo){
2467 s_AddLinkoutInfo(linkout_map,eGeo,cur_id);
2468 }
2469 if (linkout & eStructure){
2470 s_AddLinkoutInfo(linkout_map,eStructure,cur_id);
2471 }
2472 //eGenomicSeq and eMapviewer cannot combine together
2473 if((linkout & eGenomicSeq) && (linkout & eMapviewer)){
2474 s_AddLinkoutInfo(linkout_map,eGenomicSeq,cur_id);
2475 }
2476 else if(linkout & eMapviewer){
2477 s_AddLinkoutInfo(linkout_map,eMapviewer,cur_id);
2478 }
2479 if(linkout & eBioAssay){
2480 s_AddLinkoutInfo(linkout_map,eBioAssay,cur_id);
2481 }
2482 if(linkout & eReprMicrobialGenomes){
2483 s_AddLinkoutInfo(linkout_map,eReprMicrobialGenomes,cur_id);
2484 }
2485
2486 if(linkout & eGenomeDataViewer){
2487 s_AddLinkoutInfo(linkout_map,eGenomeDataViewer,cur_id);
2488 }
2489 if(linkout & eTranscript){
2490 s_AddLinkoutInfo(linkout_map,eTranscript,cur_id);
2491 }
2492
2493 }
2494
2495 void
GetBdlLinkoutInfo(const list<CRef<CBlast_def_line>> & bdl,map<int,vector<CBioseq::TId>> & linkout_map,ILinkoutDB * linkoutdb,const string & mv_build_name)2496 CAlignFormatUtil::GetBdlLinkoutInfo(const list< CRef< CBlast_def_line > > &bdl,
2497 map<int, vector <CBioseq::TId > > &linkout_map,
2498 ILinkoutDB* linkoutdb,
2499 const string& mv_build_name)
2500 {
2501 const int kMaxDeflineNum = 10;
2502 int num = 0;
2503 for(list< CRef< CBlast_def_line > >::const_iterator iter = bdl.begin();
2504 iter != bdl.end(); iter++){
2505 CBioseq::TId& cur_id = (CBioseq::TId &)(*iter)->GetSeqid();
2506
2507 GetBdlLinkoutInfo(cur_id,
2508 linkout_map,
2509 linkoutdb,
2510 mv_build_name);
2511 num++;
2512 if(num > kMaxDeflineNum) break;
2513 }
2514 }
2515
s_GetTaxName(TTaxId taxid)2516 static string s_GetTaxName(TTaxId taxid)
2517 {
2518 string taxName;
2519 try {
2520 if(taxid != ZERO_TAX_ID) {
2521 SSeqDBTaxInfo info;
2522 CSeqDB::GetTaxInfo(taxid, info);
2523 taxName = info.common_name;
2524 }
2525 }
2526 catch (CException&) {
2527
2528 }
2529 return taxName;
2530 }
2531
s_AddOtherRelatedInfoLinks(CBioseq::TId & cur_id,const string & rid,bool is_na,bool for_alignment,int cur_align,list<string> & linkout_list)2532 void s_AddOtherRelatedInfoLinks(CBioseq::TId& cur_id,
2533 const string& rid,
2534 bool is_na,
2535 bool for_alignment,
2536 int cur_align,
2537 list<string> &linkout_list)
2538
2539 {
2540 //Identical Proteins
2541
2542 CRef<CSeq_id> wid = FindBestChoice(cur_id, CSeq_id::WorstRank);
2543 if (CAlignFormatUtil::GetTextSeqID(wid)) {
2544 string label;
2545 wid->GetLabel(&label, CSeq_id::eContent);
2546 string url_link = kIdenticalProteinsUrl;
2547 string lnk_displ = "Identical Proteins";
2548 url_link = s_MapLinkoutGenParam(url_link,rid,NStr::NumericToString(ZERO_GI),for_alignment, cur_align,label,lnk_displ);
2549 url_link = CAlignFormatUtil::MapTemplate(kIdenticalProteinsDispl,"lnk",url_link);
2550 url_link = CAlignFormatUtil::MapTemplate(url_link,"label",label);
2551 linkout_list.push_back(url_link);
2552 }
2553 }
2554
2555
2556
2557 //reset:taxname,gnl
s_GetFullLinkoutUrl(CBioseq::TId & cur_id,CAlignFormatUtil::SLinkoutInfo & linkoutInfo,map<int,vector<CBioseq::TId>> & linkout_map,bool getIdentProteins)2558 static list<string> s_GetFullLinkoutUrl(CBioseq::TId& cur_id,
2559 CAlignFormatUtil::SLinkoutInfo &linkoutInfo,
2560 map<int, vector < CBioseq::TId > > &linkout_map,
2561 bool getIdentProteins)
2562
2563 {
2564 list<string> linkout_list;
2565
2566 vector<string> linkLetters;
2567 NStr::Split(linkoutInfo.linkoutOrder,",",linkLetters); //linkoutOrder = "G,U,M,V,E,S,B,R,T"
2568 for(size_t i = 0; i < linkLetters.size(); i++) {
2569 TGi first_gi = ZERO_GI;
2570 vector < CBioseq::TId > idList;
2571 int linkout = s_LinkLetterToType(linkLetters[i]);
2572 linkoutInfo.taxName.clear();
2573 if(linkout & (eMapviewer | eGenomicSeq)) {
2574 linkout = (linkout_map[eGenomicSeq].size() != 0) ? eGenomicSeq : eMapviewer;
2575 linkoutInfo.taxName = s_GetTaxName(linkoutInfo.taxid);
2576 }
2577 if(linkout_map.find(linkout) != linkout_map.end()) {
2578 idList = linkout_map[linkout];
2579 }
2580 bool disableLink = (linkout == 0 || idList.size() == 0 || ( (linkout & eStructure) && (linkoutInfo.cdd_rid == "" || linkoutInfo.cdd_rid == "0")));
2581
2582 string giList,labelList;
2583 int seqVersion = ((linkout & eGenomeDataViewer) || (linkout & eTranscript)) ? true : false;
2584 for (size_t i = 0; i < idList.size(); i++) {
2585 const CBioseq::TId& ids = idList[i];
2586 TGi gi = FindGi(ids);
2587 if (first_gi == ZERO_GI) first_gi = gi;
2588
2589
2590 CRef<CSeq_id> wid = FindBestChoice(ids, CSeq_id::WorstRank);
2591 string label = CAlignFormatUtil::GetLabel(wid,seqVersion);
2592 if(!labelList.empty()) labelList += ",";
2593 labelList += label;
2594
2595 //use only first gi for bioAssay protein
2596 if(!giList.empty() && (linkout & eBioAssay) && !linkoutInfo.is_na) continue;
2597 if(!giList.empty()) giList += ",";
2598 giList += NStr::NumericToString(gi);
2599 }
2600
2601 linkoutInfo.gnl.clear();
2602 if(!disableLink && linkout == eGenomicSeq) {
2603 linkoutInfo.gnl = s_GetBestIDForURL(cur_id);
2604 }
2605
2606 if(!disableLink) {//
2607 //The following list will contain only one entry for single linkout value
2608 list<string> one_linkout = s_GetLinkoutUrl(linkout,
2609 giList,
2610 labelList,
2611 first_gi,
2612 linkoutInfo);
2613 if(one_linkout.size() > 0) {
2614 list<string>::iterator iter = one_linkout.begin();
2615 linkout_list.push_back(*iter);
2616 }
2617 }
2618 }
2619 if(getIdentProteins) {
2620 s_AddOtherRelatedInfoLinks(cur_id,linkoutInfo.rid,linkoutInfo.is_na,linkoutInfo.for_alignment,linkoutInfo.cur_align,linkout_list);
2621 }
2622 return linkout_list;
2623 }
2624
GetFullLinkoutUrl(const list<CRef<CBlast_def_line>> & bdl,CAlignFormatUtil::SLinkoutInfo & linkoutInfo)2625 list<string> CAlignFormatUtil::GetFullLinkoutUrl(const list< CRef< CBlast_def_line > > &bdl,
2626 CAlignFormatUtil::SLinkoutInfo &linkoutInfo)
2627 {
2628 list<string> linkout_list;
2629 map<int, vector < CBioseq::TId > > linkout_map;
2630 if(bdl.size() > 0) {
2631 GetBdlLinkoutInfo(bdl,linkout_map, linkoutInfo.linkoutdb, linkoutInfo.mv_build_name);
2632 list< CRef< CBlast_def_line > >::const_iterator iter = bdl.begin();
2633 CBioseq::TId& cur_id = (CBioseq::TId &)(*iter)->GetSeqid();
2634 linkout_list = s_GetFullLinkoutUrl(cur_id,
2635 linkoutInfo,
2636 linkout_map,
2637 !linkoutInfo.is_na && bdl.size() > 1);
2638 }
2639 return linkout_list;
2640 }
2641
2642
GetFullLinkoutUrl(const list<CRef<CBlast_def_line>> & bdl,const string & rid,const string & cdd_rid,const string & entrez_term,bool is_na,bool structure_linkout_as_group,bool for_alignment,int cur_align,string & linkoutOrder,TTaxId taxid,string & database,int query_number,string & user_url,string & preComputedResID,ILinkoutDB * linkoutdb,const string & mv_build_name)2643 list<string> CAlignFormatUtil::GetFullLinkoutUrl(const list< CRef< CBlast_def_line > > &bdl,
2644 const string& rid,
2645 const string& cdd_rid,
2646 const string& entrez_term,
2647 bool is_na,
2648 bool structure_linkout_as_group,
2649 bool for_alignment,
2650 int cur_align,
2651 string& linkoutOrder,
2652 TTaxId taxid,
2653 string &database,
2654 int query_number,
2655 string &user_url,
2656 string &preComputedResID,
2657 ILinkoutDB* linkoutdb,
2658 const string& mv_build_name)
2659
2660 {
2661 list<string> linkout_list;
2662 map<int, vector < CBioseq::TId > > linkout_map;
2663 if(bdl.size() > 0) {
2664 GetBdlLinkoutInfo(bdl,linkout_map, linkoutdb, mv_build_name);
2665 list< CRef< CBlast_def_line > >::const_iterator iter = bdl.begin();
2666 CBioseq::TId& cur_id = (CBioseq::TId &)(*iter)->GetSeqid();
2667
2668 SLinkoutInfo linkoutInfo;
2669 linkoutInfo.Init(rid,
2670 cdd_rid,
2671 entrez_term,
2672 is_na,
2673 database,
2674 query_number,
2675 user_url,
2676 preComputedResID,
2677 linkoutOrder,
2678 structure_linkout_as_group,
2679 for_alignment);
2680
2681 linkoutInfo.cur_align = cur_align;
2682 linkoutInfo.taxid = taxid;
2683
2684 linkout_list = s_GetFullLinkoutUrl(cur_id,
2685 linkoutInfo,
2686 linkout_map,
2687 !is_na && bdl.size() > 1);
2688 }
2689 return linkout_list;
2690 }
2691
2692
GetFullLinkoutUrl(CBioseq::TId & cur_id,CAlignFormatUtil::SLinkoutInfo & linkoutInfo,bool getIdentProteins)2693 list<string> CAlignFormatUtil::GetFullLinkoutUrl(CBioseq::TId& cur_id,
2694 CAlignFormatUtil::SLinkoutInfo &linkoutInfo,
2695 bool getIdentProteins)
2696 {
2697 list<string> linkout_list;
2698 map<int, vector < CBioseq::TId > > linkout_map;
2699
2700 GetBdlLinkoutInfo(cur_id,linkout_map, linkoutInfo.linkoutdb, linkoutInfo.mv_build_name);
2701 linkout_list = s_GetFullLinkoutUrl(cur_id,
2702 linkoutInfo,
2703 linkout_map,
2704 getIdentProteins);
2705 return linkout_list;
2706 }
2707
GetFullLinkoutUrl(CBioseq::TId & cur_id,const string & rid,const string & cdd_rid,const string & entrez_term,bool is_na,bool structure_linkout_as_group,bool for_alignment,int cur_align,string & linkoutOrder,TTaxId taxid,string & database,int query_number,string & user_url,string & preComputedResID,ILinkoutDB * linkoutdb,const string & mv_build_name,bool getIdentProteins)2708 list<string> CAlignFormatUtil::GetFullLinkoutUrl(CBioseq::TId& cur_id,
2709 const string& rid,
2710 const string& cdd_rid,
2711 const string& entrez_term,
2712 bool is_na,
2713 bool structure_linkout_as_group,
2714 bool for_alignment,
2715 int cur_align,
2716 string& linkoutOrder,
2717 TTaxId taxid,
2718 string &database,
2719 int query_number,
2720 string &user_url,
2721 string &preComputedResID,
2722 ILinkoutDB* linkoutdb,
2723 const string& mv_build_name,
2724 bool getIdentProteins)
2725
2726 {
2727 list<string> linkout_list;
2728
2729 map<int, vector < CBioseq::TId > > linkout_map;
2730 GetBdlLinkoutInfo(cur_id,linkout_map, linkoutdb, mv_build_name);
2731 SLinkoutInfo linkoutInfo;
2732 linkoutInfo.Init(rid,
2733 cdd_rid,
2734 entrez_term,
2735 is_na,
2736 database,
2737 query_number,
2738 user_url,
2739 preComputedResID,
2740 linkoutOrder,
2741 structure_linkout_as_group,
2742 for_alignment);
2743
2744 linkoutInfo.cur_align = cur_align;
2745 linkoutInfo.taxid = taxid;
2746
2747 linkout_list = s_GetFullLinkoutUrl(cur_id,
2748 linkoutInfo,
2749 linkout_map,
2750 getIdentProteins);
2751 return linkout_list;
2752 }
2753
2754
FromRangeAscendingSort(CRange<TSeqPos> const & info1,CRange<TSeqPos> const & info2)2755 static bool FromRangeAscendingSort(CRange<TSeqPos> const& info1,
2756 CRange<TSeqPos> const& info2)
2757 {
2758 return info1.GetFrom() < info2.GetFrom();
2759 }
2760
2761 //0 for query, 1 for subject
2762 //Gets query and subject range lists,oppositeStrands param
s_ProcessAlignSet(const CSeq_align_set & alnset,list<CRange<TSeqPos>> & query_list,list<CRange<TSeqPos>> & subject_list)2763 static bool s_ProcessAlignSet(const CSeq_align_set& alnset,
2764 list<CRange<TSeqPos> > &query_list,
2765 list<CRange<TSeqPos> > &subject_list)
2766 {
2767 bool oppositeStrands = false;
2768 bool isFirst = false;
2769 ITERATE(CSeq_align_set::Tdata, iter, alnset.Get()) {
2770 CRange<TSeqPos> query_range = (*iter)->GetSeqRange(0);
2771 //for minus strand
2772 if(query_range.GetFrom() > query_range.GetTo()){
2773 query_range.Set(query_range.GetTo(), query_range.GetFrom());
2774 }
2775 query_list.push_back(query_range);
2776
2777 CRange<TSeqPos> subject_range = (*iter)->GetSeqRange(1);
2778 //for minus strand
2779 if(subject_range.GetFrom() > subject_range.GetTo()){
2780 subject_range.Set(subject_range.GetTo(), subject_range.GetFrom());
2781 }
2782 subject_list.push_back(subject_range);
2783
2784 oppositeStrands = (!isFirst) ? (*iter)->GetSeqStrand(0) != (*iter)->GetSeqStrand(1) : oppositeStrands;
2785 isFirst = true;
2786 }
2787
2788 query_list.sort(FromRangeAscendingSort);
2789 subject_list.sort(FromRangeAscendingSort);
2790 return oppositeStrands;
2791 }
2792
2793
2794
2795 //0 for query, 1 for subject
s_MergeRangeList(list<CRange<TSeqPos>> & source)2796 static list<CRange<TSeqPos> > s_MergeRangeList(list<CRange<TSeqPos> > &source)
2797 {
2798
2799 list<CRange<TSeqPos> > merge_list;
2800
2801 bool is_first = true;
2802 CRange<TSeqPos> prev_range (0, 0);
2803 ITERATE(list<CRange<TSeqPos> >, iter, source) {
2804
2805 if (is_first) {
2806 merge_list.push_back(*iter);
2807 is_first= false;
2808 prev_range = *iter;
2809 } else {
2810 if (prev_range.IntersectingWith(*iter)) {
2811 merge_list.pop_back();
2812 CRange<TSeqPos> temp_range = prev_range.CombinationWith(*iter);
2813 merge_list.push_back(temp_range);
2814 prev_range = temp_range;
2815 } else {
2816 merge_list.push_back(*iter);
2817 prev_range = *iter;
2818 }
2819 }
2820
2821 }
2822 return merge_list;
2823 }
2824
GetMasterCoverage(const CSeq_align_set & alnset)2825 int CAlignFormatUtil::GetMasterCoverage(const CSeq_align_set& alnset)
2826 {
2827
2828 list<CRange<TSeqPos> > merge_list;
2829
2830 list<CRange<TSeqPos> > temp;
2831 ITERATE(CSeq_align_set::Tdata, iter, alnset.Get()) {
2832 CRange<TSeqPos> seq_range = (*iter)->GetSeqRange(0);
2833 //for minus strand
2834 if(seq_range.GetFrom() > seq_range.GetTo()){
2835 seq_range.Set(seq_range.GetTo(), seq_range.GetFrom());
2836 }
2837 temp.push_back(seq_range);
2838 }
2839
2840 temp.sort(FromRangeAscendingSort);
2841
2842 merge_list = s_MergeRangeList(temp);
2843
2844 int master_covered_lenghth = 0;
2845 ITERATE(list<CRange<TSeqPos> >, iter, merge_list) {
2846 master_covered_lenghth += iter->GetLength();
2847 }
2848 return master_covered_lenghth;
2849 }
2850
2851
2852
GetSeqAlignCoverageParams(const CSeq_align_set & alnset,int * master_covered_lenghth,bool * flip)2853 CRange<TSeqPos> CAlignFormatUtil::GetSeqAlignCoverageParams(const CSeq_align_set& alnset,int *master_covered_lenghth,bool *flip)
2854
2855 {
2856
2857 list<CRange<TSeqPos> > query_list;
2858 list<CRange<TSeqPos> > subject_list;
2859
2860 *flip = s_ProcessAlignSet(alnset,query_list,subject_list);
2861 query_list = s_MergeRangeList(query_list);
2862 subject_list = s_MergeRangeList(subject_list);
2863
2864
2865 *master_covered_lenghth = 0;
2866 ITERATE(list<CRange<TSeqPos> >, iter, query_list) {
2867 *master_covered_lenghth += iter->GetLength();
2868 }
2869
2870 TSeqPos from = 0,to = 0;
2871 ITERATE(list<CRange<TSeqPos> >, iter, subject_list) {
2872 from = (from == 0) ? iter->GetFrom() : min(from,iter->GetFrom());
2873 to = max(to,iter->GetTo());
2874 }
2875 //cerr << "from,to = " << from << "," << to << endl;
2876 CRange<TSeqPos> subjectRange(from + 1, to + 1);
2877 return subjectRange;
2878 }
2879
2880 CRef<CSeq_align_set>
SortSeqalignForSortableFormat(CCgiContext & ctx,CScope & scope,CSeq_align_set & aln_set,bool nuc_to_nuc_translation,int db_sort,int hit_sort,int hsp_sort,ILinkoutDB * linkoutdb,const string & mv_build_name)2881 CAlignFormatUtil::SortSeqalignForSortableFormat(CCgiContext& ctx,
2882 CScope& scope,
2883 CSeq_align_set& aln_set,
2884 bool nuc_to_nuc_translation,
2885 int db_sort,
2886 int hit_sort,
2887 int hsp_sort,
2888 ILinkoutDB* linkoutdb,
2889 const string& mv_build_name) {
2890
2891
2892 if (db_sort == 0 && hit_sort < 1 && hsp_sort < 1)
2893 return (CRef<CSeq_align_set>) &aln_set;
2894
2895 list< CRef<CSeq_align_set> > seqalign_hit_total_list;
2896 vector< CRef<CSeq_align_set> > seqalign_vec(2);
2897 seqalign_vec[0] = new CSeq_align_set;
2898 seqalign_vec[1] = new CSeq_align_set;
2899
2900 if(IsMixedDatabase(ctx)) {
2901 SplitSeqalignByMolecularType(seqalign_vec, db_sort, aln_set, scope,
2902 linkoutdb, mv_build_name);
2903 }else {
2904 seqalign_vec[0] = const_cast<CSeq_align_set*>(&aln_set);
2905 }
2906
2907
2908 ITERATE(vector< CRef<CSeq_align_set> >, iter, seqalign_vec){
2909 list< CRef<CSeq_align_set> > one_seqalign_hit_total_list = SortOneSeqalignForSortableFormat(**iter,
2910 nuc_to_nuc_translation,
2911 hit_sort,
2912 hsp_sort);
2913
2914 seqalign_hit_total_list.splice(seqalign_hit_total_list.end(),one_seqalign_hit_total_list);
2915
2916 }
2917
2918 return HitListToHspList(seqalign_hit_total_list);
2919 }
2920 list< CRef<CSeq_align_set> >
SortOneSeqalignForSortableFormat(const CSeq_align_set & source,bool nuc_to_nuc_translation,int hit_sort,int hsp_sort)2921 CAlignFormatUtil::SortOneSeqalignForSortableFormat(const CSeq_align_set& source,
2922 bool nuc_to_nuc_translation,
2923 int hit_sort,
2924 int hsp_sort)
2925 {
2926 list< CRef<CSeq_align_set> > seqalign_hit_total_list;
2927 list< CRef<CSeq_align_set> > seqalign_hit_list;
2928 HspListToHitList(seqalign_hit_list, source);
2929
2930 if (hit_sort == eTotalScore) {
2931 seqalign_hit_list.sort(SortHitByTotalScoreDescending);
2932 } else if (hit_sort == eHighestScore) {
2933 seqalign_hit_list.sort(CAlignFormatUtil::SortHitByScoreDescending);
2934 } else if (hit_sort == ePercentIdentity) {
2935 SortHitByPercentIdentityDescending(seqalign_hit_list,
2936 nuc_to_nuc_translation);
2937 } else if (hit_sort == eQueryCoverage) {
2938 seqalign_hit_list.sort(SortHitByMasterCoverageDescending);
2939 }
2940
2941 ITERATE(list< CRef<CSeq_align_set> >, iter2, seqalign_hit_list) {
2942 CRef<CSeq_align_set> temp(*iter2);
2943 if (hsp_sort == eQueryStart) {
2944 temp->Set().sort(SortHspByMasterStartAscending);
2945 } else if (hsp_sort == eHspPercentIdentity) {
2946 temp->Set().sort(SortHspByPercentIdentityDescending);
2947 } else if (hsp_sort == eScore) {
2948 temp->Set().sort(SortHspByScoreDescending);
2949 } else if (hsp_sort == eSubjectStart) {
2950 temp->Set().sort(SortHspBySubjectStartAscending);
2951
2952 }
2953 seqalign_hit_total_list.push_back(temp);
2954 }
2955 return seqalign_hit_total_list;
2956 }
2957
2958 CRef<CSeq_align_set>
SortSeqalignForSortableFormat(CSeq_align_set & aln_set,bool nuc_to_nuc_translation,int hit_sort,int hsp_sort)2959 CAlignFormatUtil::SortSeqalignForSortableFormat(CSeq_align_set& aln_set,
2960 bool nuc_to_nuc_translation,
2961 int hit_sort,
2962 int hsp_sort) {
2963
2964 if (hit_sort <= eEvalue && hsp_sort <= eHspEvalue) {
2965 return (CRef<CSeq_align_set>) &aln_set;
2966 }
2967
2968 // seqalign_vec[0] = const_cast<CSeq_align_set*>(&aln_set);
2969 list< CRef<CSeq_align_set> > seqalign_hit_total_list = SortOneSeqalignForSortableFormat(aln_set,
2970 nuc_to_nuc_translation,
2971 hit_sort,
2972 hsp_sort);
2973 return HitListToHspList(seqalign_hit_total_list);
2974 }
2975
2976
FilterSeqalignByEval(CSeq_align_set & source_aln,double evalueLow,double evalueHigh)2977 CRef<CSeq_align_set> CAlignFormatUtil::FilterSeqalignByEval(CSeq_align_set& source_aln,
2978 double evalueLow,
2979 double evalueHigh)
2980 {
2981 int score, sum_n, num_ident;
2982 double bits, evalue;
2983 list<TGi> use_this_gi;
2984
2985 CRef<CSeq_align_set> new_aln(new CSeq_align_set);
2986
2987 ITERATE(CSeq_align_set::Tdata, iter, source_aln.Get()){
2988 CAlignFormatUtil::GetAlnScores(**iter, score, bits, evalue,
2989 sum_n, num_ident, use_this_gi);
2990 //Add the next three lines to re-calculte seq align evalue to the obe that is displayed on the screen
2991 //string evalue_buf, bit_score_buf, total_bit_buf, raw_score_buf;
2992 //CAlignFormatUtil::GetScoreString(evalue, bits, 0, 0, evalue_buf, bit_score_buf, total_bit_buf, raw_score_buf);
2993 //evalue = NStr::StringToDouble(evalue_buf);
2994 if(evalue >= evalueLow && evalue <= evalueHigh) {
2995 new_aln->Set().push_back(*iter);
2996 }
2997 }
2998 return new_aln;
2999
3000 }
3001
3002 /// Returns percent match for an alignment.
3003 /// Normally we round up the value, unless that means that an
3004 /// alignment with mismatches would be 100%. In that case
3005 /// it becomes 99%.
3006 ///@param numerator: numerator in percent identity calculation.
3007 ///@param denominator: denominator in percent identity calculation.
GetPercentMatch(int numerator,int denominator)3008 int CAlignFormatUtil::GetPercentMatch(int numerator, int denominator)
3009 {
3010 if (numerator == denominator)
3011 return 100;
3012 else {
3013 int retval =(int) (0.5 + 100.0*((double)numerator)/((double)denominator));
3014 retval = min(99, retval);
3015 return retval;
3016 }
3017 }
3018
GetPercentIdentity(int numerator,int denominator)3019 double CAlignFormatUtil::GetPercentIdentity(int numerator, int denominator)
3020 {
3021 if (numerator == denominator)
3022 return 100;
3023 else {
3024 double retval =100*(double)numerator/(double)denominator;
3025 return retval;
3026 }
3027 }
3028
FilterSeqalignByPercentIdent(CSeq_align_set & source_aln,double percentIdentLow,double percentIdentHigh)3029 CRef<CSeq_align_set> CAlignFormatUtil::FilterSeqalignByPercentIdent(CSeq_align_set& source_aln,
3030 double percentIdentLow,
3031 double percentIdentHigh)
3032 {
3033 int score, sum_n, num_ident;
3034 double bits, evalue;
3035 list<TGi> use_this_gi;
3036
3037 CRef<CSeq_align_set> new_aln(new CSeq_align_set);
3038
3039 ITERATE(CSeq_align_set::Tdata, iter, source_aln.Get()){
3040 CAlignFormatUtil::GetAlnScores(**iter, score, bits, evalue,
3041 sum_n, num_ident, use_this_gi);
3042 int seqAlnLength = GetAlignmentLength(**iter, kTranslation);
3043 if(seqAlnLength > 0 && num_ident > 0) {
3044 double alnPercentIdent = GetPercentIdentity(num_ident, seqAlnLength);
3045 if(alnPercentIdent >= percentIdentLow && alnPercentIdent <= percentIdentHigh) {
3046 new_aln->Set().push_back(*iter);
3047 }
3048 }
3049 }
3050 return new_aln;
3051 }
3052
FilterSeqalignByScoreParams(CSeq_align_set & source_aln,double evalueLow,double evalueHigh,double percentIdentLow,double percentIdentHigh)3053 CRef<CSeq_align_set> CAlignFormatUtil::FilterSeqalignByScoreParams(CSeq_align_set& source_aln,
3054 double evalueLow,
3055 double evalueHigh,
3056 double percentIdentLow,
3057 double percentIdentHigh)
3058 {
3059 int score, sum_n, num_ident;
3060 double bits, evalue;
3061 list<TGi> use_this_gi;
3062
3063 CRef<CSeq_align_set> new_aln(new CSeq_align_set);
3064
3065 ITERATE(CSeq_align_set::Tdata, iter, source_aln.Get()){
3066 CAlignFormatUtil::GetAlnScores(**iter, score, bits, evalue,
3067 sum_n, num_ident, use_this_gi);
3068 //Add the next three lines to re-calculte seq align evalue to the one that is displayed on the screen
3069 //string evalue_buf, bit_score_buf, total_bit_buf, raw_score_buf;
3070 //CAlignFormatUtil::GetScoreString(evalue, bits, 0, 0, evalue_buf, bit_score_buf, total_bit_buf, raw_score_buf);
3071 //evalue = NStr::StringToDouble(evalue_buf);
3072 int seqAlnLength = GetAlignmentLength(**iter, kTranslation);
3073 if(seqAlnLength > 0 && num_ident > 0) {
3074 int alnPercentIdent = GetPercentMatch(num_ident, seqAlnLength);
3075 if( (evalue >= evalueLow && evalue <= evalueHigh) &&
3076 (alnPercentIdent >= percentIdentLow && alnPercentIdent <= percentIdentHigh)) {
3077 new_aln->Set().push_back(*iter);
3078 }
3079 }
3080 }
3081 return new_aln;
3082 }
3083
adjustPercentIdentToDisplayValue(double value)3084 static double adjustPercentIdentToDisplayValue(double value)
3085 {
3086 char buffer[512];
3087 sprintf(buffer, "%.*f", 2, value);
3088 double newVal = NStr::StringToDouble(buffer);
3089 return newVal;
3090 }
3091
s_isAlnInFilteringRange(double evalue,double percentIdent,int queryCover,double evalueLow,double evalueHigh,double percentIdentLow,double percentIdentHigh,int queryCoverLow,int queryCoverHigh)3092 static bool s_isAlnInFilteringRange(double evalue,
3093 double percentIdent,
3094 int queryCover,
3095 double evalueLow,
3096 double evalueHigh,
3097 double percentIdentLow,
3098 double percentIdentHigh,
3099 int queryCoverLow,
3100 int queryCoverHigh)
3101 {
3102
3103
3104 bool isInRange = false;
3105 //Adjust percent identity and evalue to display values
3106 percentIdent = adjustPercentIdentToDisplayValue(percentIdent);
3107 string evalue_buf, bit_score_buf, total_bit_buf, raw_score_buf;
3108 double bits = 0;
3109 CAlignFormatUtil::GetScoreString(evalue, bits, 0, 0, evalue_buf, bit_score_buf, total_bit_buf, raw_score_buf);
3110 evalue = NStr::StringToDouble(evalue_buf);
3111
3112 if(evalueLow >= 0 && percentIdentLow >= 0 && queryCoverLow >= 0) {
3113 isInRange = (evalue >= evalueLow && evalue <= evalueHigh) &&
3114 (percentIdent >= percentIdentLow && percentIdent <= percentIdentHigh) &&
3115 (queryCover >= queryCoverLow && queryCover <= queryCoverHigh);
3116 }
3117 else if(evalueLow >= 0 && percentIdentLow >= 0) {
3118 isInRange = (evalue >= evalueLow && evalue <= evalueHigh) &&
3119 (percentIdent >= percentIdentLow && percentIdent <= percentIdentHigh);
3120 }
3121 else if(evalueLow >= 0 && queryCoverLow >= 0) {
3122 isInRange = (evalue >= evalueLow && evalue <= evalueHigh) &&
3123 (queryCover >= queryCoverLow && queryCover <= queryCoverHigh);
3124 }
3125 else if(queryCoverLow >= 0 && percentIdentLow >= 0) {
3126 isInRange = (queryCover >= queryCoverLow && queryCover <= queryCoverHigh) &&
3127 (percentIdent >= percentIdentLow && percentIdent <= percentIdentHigh);
3128 }
3129 else if(evalueLow >= 0) {
3130 isInRange = (evalue >= evalueLow && evalue <= evalueHigh);
3131 }
3132 else if(percentIdentLow >= 0) {
3133 isInRange = (percentIdent >= percentIdentLow && percentIdent <= percentIdentHigh);
3134 }
3135 else if(queryCoverLow >= 0) {
3136 isInRange = (queryCover >= queryCoverLow && queryCover <= queryCoverHigh);
3137 }
3138 return isInRange;
3139 }
3140
FilterSeqalignByScoreParams(CSeq_align_set & source_aln,double evalueLow,double evalueHigh,double percentIdentLow,double percentIdentHigh,int queryCoverLow,int queryCoverHigh)3141 CRef<CSeq_align_set> CAlignFormatUtil::FilterSeqalignByScoreParams(CSeq_align_set& source_aln,
3142 double evalueLow,
3143 double evalueHigh,
3144 double percentIdentLow,
3145 double percentIdentHigh,
3146 int queryCoverLow,
3147 int queryCoverHigh)
3148 {
3149 list< CRef<CSeq_align_set> > seqalign_hit_total_list;
3150 list< CRef<CSeq_align_set> > seqalign_hit_list;
3151
3152 HspListToHitList(seqalign_hit_list, source_aln);
3153
3154 ITERATE(list< CRef<CSeq_align_set> >, iter, seqalign_hit_list) {
3155 CRef<CSeq_align_set> temp(*iter);
3156 CAlignFormatUtil::SSeqAlignSetCalcParams* seqSetInfo = CAlignFormatUtil::GetSeqAlignSetCalcParamsFromASN(*temp);
3157
3158 if(s_isAlnInFilteringRange(seqSetInfo->evalue,
3159 seqSetInfo->percent_identity,
3160 seqSetInfo->percent_coverage,
3161 evalueLow,
3162 evalueHigh,
3163 percentIdentLow,
3164 percentIdentHigh,
3165 queryCoverLow,
3166 queryCoverHigh)) {
3167 seqalign_hit_total_list.push_back(temp);
3168 }
3169 }
3170 return HitListToHspList(seqalign_hit_total_list);
3171 }
3172
LimitSeqalignByHsps(CSeq_align_set & source_aln,int maxAligns,int maxHsps)3173 CRef<CSeq_align_set> CAlignFormatUtil::LimitSeqalignByHsps(CSeq_align_set& source_aln,
3174 int maxAligns,
3175 int maxHsps)
3176 {
3177 CRef<CSeq_align_set> new_aln(new CSeq_align_set);
3178
3179 CConstRef<CSeq_id> prevQueryId,prevSubjectId;
3180 int alignCount = 0,hspCount = 0;
3181 ITERATE(CSeq_align_set::Tdata, iter, source_aln.Get()){
3182 const CSeq_id& newQueryId = (*iter)->GetSeq_id(0);
3183 if(prevQueryId.Empty() || !newQueryId.Match(*prevQueryId)){
3184 if (hspCount >= maxHsps) {
3185 break;
3186 }
3187 alignCount = 0;
3188 prevQueryId = &newQueryId;
3189 }
3190 if (alignCount < maxAligns) {
3191 const CSeq_id& newSubjectId = (*iter)->GetSeq_id(1);
3192 // Increment alignments count if subject sequence is different
3193 if(prevSubjectId.Empty() || !newSubjectId.Match(*prevSubjectId)){
3194 ++alignCount;
3195 prevSubjectId = &newSubjectId;
3196 }
3197 // Increment HSP count if the alignments limit is not reached
3198 ++hspCount;
3199 new_aln->Set().push_back(*iter);
3200 }
3201
3202 }
3203 return new_aln;
3204 }
3205
3206
ExtractQuerySeqAlign(CRef<CSeq_align_set> & source_aln,int queryNumber)3207 CRef<CSeq_align_set> CAlignFormatUtil::ExtractQuerySeqAlign(CRef<CSeq_align_set> &source_aln,
3208 int queryNumber)
3209 {
3210 if(queryNumber == 0) {
3211 return source_aln;
3212 }
3213 CRef<CSeq_align_set> new_aln;
3214
3215 CConstRef<CSeq_id> prevQueryId;
3216 int currQueryNum = 0;
3217
3218 ITERATE(CSeq_align_set::Tdata, iter, source_aln->Get()){
3219 const CSeq_id& newQueryId = (*iter)->GetSeq_id(0);
3220 if(prevQueryId.Empty() || !newQueryId.Match(*prevQueryId)){
3221 currQueryNum++;
3222 prevQueryId = &newQueryId;
3223 }
3224 //Record seq aligns corresponding to queryNumber
3225 if(currQueryNum == queryNumber) {
3226 if(new_aln.Empty()) {
3227 new_aln.Reset(new CSeq_align_set);
3228 }
3229 new_aln->Set().push_back(*iter);
3230 }
3231 else if(currQueryNum > queryNumber) {
3232 break;
3233 }
3234 }
3235 return new_aln;
3236 }
3237
3238
InitConfig()3239 void CAlignFormatUtil::InitConfig()
3240 {
3241 string l_cfg_file_name;
3242 bool l_dbg = CAlignFormatUtil::m_geturl_debug_flag;
3243 if( getenv("GETURL_DEBUG") ) CAlignFormatUtil::m_geturl_debug_flag = l_dbg = true;
3244 if( !m_Reg ) {
3245 bool cfgExists = true;
3246 string l_ncbi_env;
3247 string l_fmtcfg_env;
3248 if( NULL != getenv("NCBI") ) l_ncbi_env = getenv("NCBI");
3249 if( NULL != getenv("FMTCFG") ) l_fmtcfg_env = getenv("FMTCFG");
3250 // config file name: value of FMTCFG or default ( .ncbirc )
3251 if( l_fmtcfg_env.empty() )
3252 l_cfg_file_name = ".ncbirc";
3253 else
3254 l_cfg_file_name = l_fmtcfg_env;
3255 // checkinf existance of configuration file
3256 CFile l_fchecker( l_cfg_file_name );
3257 cfgExists = l_fchecker.Exists();
3258 if( (!cfgExists) && (!l_ncbi_env.empty()) ) {
3259 if( l_ncbi_env.rfind("/") != (l_ncbi_env.length() -1 ))
3260 l_ncbi_env.append("/");
3261 l_cfg_file_name = l_ncbi_env + l_cfg_file_name;
3262 CFile l_fchecker2( l_cfg_file_name );
3263 cfgExists = l_fchecker2.Exists();
3264 }
3265 if(cfgExists) {
3266 CNcbiIfstream l_ConfigFile(l_cfg_file_name.c_str() );
3267 m_Reg.reset(new CNcbiRegistry(l_ConfigFile));
3268 if( l_dbg ) fprintf(stderr,"REGISTRY: %s\n",l_cfg_file_name.c_str());
3269 }
3270 }
3271 return;
3272 }
3273
3274 //
3275 // get given url from registry file or return corresponding kNAME
3276 // value as default to preserve compatibility.
3277 //
3278 // algoritm:
3279 // 1) config file name is ".ncbirc" unless FMTCFG specifies another name
3280 // 2) try to read local configuration file before
3281 // checking location specified by the NCBI environment.
3282 // 3) if index != -1, use it as trailing version number for a key name,
3283 // ABCD_V0. try to read ABCD key if version variant doesn't exist.
3284 // 4) use INCLUDE_BASE_DIR key to specify base for all include files.
3285 // 5) treat "_FORMAT" key as filename first and string in second.
3286 // in case of existances of filename, read it starting from
3287 // location specified by INCLUDE_BASE_DIR key
GetURLFromRegistry(const string url_name,int index)3288 string CAlignFormatUtil::GetURLFromRegistry( const string url_name, int index){
3289 string result_url;
3290 string l_key, l_host_port, l_format;
3291 string l_secion_name = "BLASTFMTUTIL";
3292 string l_fmt_suffix = "_FORMAT";
3293 string l_host_port_suffix = "_HOST_PORT";
3294 string l_subst_pattern;
3295
3296 if( !m_Reg ) {
3297 InitConfig();
3298 }
3299 if( !m_Reg ) return GetURLDefault(url_name,index); // can't read .ncbrc file
3300 string l_base_dir = m_Reg->Get(l_secion_name, "INCLUDE_BASE_DIR");
3301 if( !l_base_dir.empty() && ( l_base_dir.rfind("/") != (l_base_dir.length()-1)) ) {
3302 l_base_dir.append("/");
3303 }
3304
3305
3306 string default_host_port;
3307 string l_key_ndx;
3308 if( index >=0) {
3309 l_key_ndx = url_name + l_host_port_suffix + "_" + NStr::IntToString( index );
3310 l_subst_pattern="<@"+l_key_ndx+"@>";
3311 l_host_port = m_Reg->Get(l_secion_name, l_key_ndx); // try indexed
3312 }
3313 // next is initialization for non version/array type of settings
3314 if( l_host_port.empty()){ // not indexed or index wasn't found
3315 l_key = url_name + l_host_port_suffix; l_subst_pattern="<@"+l_key+"@>";
3316 l_host_port = m_Reg->Get(l_secion_name, l_key);
3317 }
3318 if( l_host_port.empty()) return GetURLDefault(url_name,index);
3319
3320 // get format part
3321 l_key = url_name + l_fmt_suffix ; //"_FORMAT";
3322 l_key_ndx = l_key + "_" + NStr::IntToString( index );
3323 if( index >= 0 ){
3324 l_format = m_Reg->Get(l_secion_name, l_key_ndx);
3325 }
3326
3327 if( l_format.empty() ) l_format = m_Reg->Get(l_secion_name, l_key);
3328 if( l_format.empty()) return GetURLDefault(url_name,index);
3329 // format found check wether this string or file name
3330 string l_format_file = l_base_dir + l_format;
3331 CFile l_fchecker( l_format_file );
3332 bool file_name_mode = l_fchecker.Exists();
3333 if( file_name_mode ) { // read whole content of the file to string buffer
3334 string l_inc_file_name = l_format_file;
3335 CNcbiIfstream l_file (l_inc_file_name.c_str(), ios::in|ios::binary|ios::ate);
3336 CT_POS_TYPE l_inc_size = l_file.tellg();
3337 // size_t l_buf_sz = (size_t) l_inc_size;
3338 char *l_mem = new char [ (size_t) l_inc_size + 1];
3339 memset( l_mem,0, (size_t) l_inc_size + 1 ) ;
3340 l_file.seekg( 0, ios::beg );
3341 l_file.read(l_mem, l_inc_size);
3342 l_file.close();
3343 l_format.erase(); l_format.reserve( (size_t)l_inc_size + 1 );
3344 l_format = l_mem;
3345 delete [] l_mem;
3346 }
3347
3348 result_url = NStr::Replace(l_format,l_subst_pattern,l_host_port);
3349
3350 if( result_url.empty()) return GetURLDefault(url_name,index);
3351 return result_url;
3352 }
3353 //
3354 // return default URL value for the given key.
3355 //
GetURLDefault(const string url_name,int index)3356 string CAlignFormatUtil::GetURLDefault( const string url_name, int index) {
3357
3358 string search_name = url_name;
3359 TTagUrlMap::const_iterator url_it;
3360 if( index >= 0 ) search_name += "_" + NStr::IntToString( index); // actual name for index value is NAME_{index}
3361
3362 if( (url_it = sm_TagUrlMap.find( search_name ) ) != sm_TagUrlMap.end()) {
3363 string url_link = CAlignFormatUtil::MapProtocol(url_it->second);
3364 return url_link;
3365 }
3366
3367 string error_msg = "CAlignFormatUtil::GetURLDefault:no_defualt_for"+url_name;
3368 if( index != -1 ) error_msg += "_index_"+ NStr::IntToString( index );
3369 return error_msg;
3370 }
3371
3372 void
GetAsciiProteinMatrix(const char * matrix_name,CNcbiMatrix<int> & retval)3373 CAlignFormatUtil::GetAsciiProteinMatrix(const char* matrix_name,
3374 CNcbiMatrix<int>& retval)
3375 {
3376 retval.Resize(0, 0, -1);
3377 if (matrix_name == NULL ||
3378 NStr::TruncateSpaces(string(matrix_name)).empty()) {
3379 return;
3380 }
3381
3382 const SNCBIPackedScoreMatrix* packed_mtx =
3383 NCBISM_GetStandardMatrix(matrix_name);
3384 if (packed_mtx == NULL) {
3385 return;
3386 }
3387 retval.Resize(k_NumAsciiChar, k_NumAsciiChar, -1000);
3388
3389 SNCBIFullScoreMatrix mtx;
3390 NCBISM_Unpack(packed_mtx, &mtx);
3391
3392 for(int i = 0; i < ePMatrixSize; ++i){
3393 for(int j = 0; j < ePMatrixSize; ++j){
3394 retval((size_t)k_PSymbol[i], (size_t)k_PSymbol[j]) =
3395 mtx.s[(size_t)k_PSymbol[i]][(size_t)k_PSymbol[j]];
3396 }
3397 }
3398 for(int i = 0; i < ePMatrixSize; ++i) {
3399 retval((size_t)k_PSymbol[i], '*') = retval('*',(size_t)k_PSymbol[i]) = -4;
3400 }
3401 retval('*', '*') = 1;
3402 // this is to count Selenocysteine to Cysteine matches as positive
3403 retval('U', 'U') = retval('C', 'C');
3404 retval('U', 'C') = retval('C', 'C');
3405 retval('C', 'U') = retval('C', 'C');
3406 }
3407
3408
MapTemplate(string inpString,string tmplParamName,Int8 templParamVal)3409 string CAlignFormatUtil::MapTemplate(string inpString,string tmplParamName,Int8 templParamVal)
3410 {
3411 string outString;
3412 string tmplParam = "<@" + tmplParamName + "@>";
3413 NStr::Replace(inpString,tmplParam,NStr::NumericToString(templParamVal),outString);
3414 return outString;
3415 }
3416
MapTemplate(string inpString,string tmplParamName,string templParamVal)3417 string CAlignFormatUtil::MapTemplate(string inpString,string tmplParamName,string templParamVal)
3418 {
3419 string outString;
3420 string tmplParam = "<@" + tmplParamName + "@>";
3421 NStr::Replace(inpString,tmplParam,templParamVal,outString);
3422 return outString;
3423 }
3424
MapSpaceTemplate(string inpString,string tmplParamName,string templParamVal,unsigned int maxParamValLength,int spacesFormatFlag)3425 string CAlignFormatUtil::MapSpaceTemplate(string inpString,string tmplParamName,string templParamVal, unsigned int maxParamValLength, int spacesFormatFlag)
3426 {
3427 templParamVal = AddSpaces(templParamVal, maxParamValLength, spacesFormatFlag);
3428 string outString = MapTemplate(inpString,tmplParamName,templParamVal);
3429
3430 return outString;
3431 }
3432
3433
AddSpaces(string paramVal,unsigned int maxParamValLength,int spacesFormatFlag)3434 string CAlignFormatUtil::AddSpaces(string paramVal, unsigned int maxParamValLength, int spacesFormatFlag)
3435 {
3436 //if(!spacePos.empty()) {
3437 string spaceString;
3438 if(maxParamValLength >= paramVal.size()) {
3439 unsigned int numSpaces = maxParamValLength - paramVal.size() + 1;
3440 if(spacesFormatFlag & eSpacePosToCenter) {
3441 numSpaces = numSpaces/2;
3442 }
3443 for(size_t i=0; i < numSpaces ; i++){
3444 spaceString += " ";
3445 }
3446 }
3447 else {
3448 paramVal = paramVal.substr(0, maxParamValLength - 3) + "...";
3449 spaceString += " ";
3450 }
3451 if(spacesFormatFlag & eSpacePosAtLineEnd) {
3452 paramVal = paramVal + spaceString;
3453 }
3454 else if(spacesFormatFlag & eSpacePosToCenter) {
3455 paramVal = spaceString + paramVal + spaceString;
3456 }
3457 else {
3458 paramVal = spaceString + paramVal;
3459 }
3460 if(spacesFormatFlag & eAddEOLAtLineStart) paramVal = "\n" + paramVal;
3461 if(spacesFormatFlag & eAddEOLAtLineEnd) paramVal = paramVal + "\n";
3462 //}
3463
3464 return paramVal;
3465 }
3466
3467
3468
GetProtocol()3469 string CAlignFormatUtil::GetProtocol()
3470 {
3471 CNcbiIfstream config_file(".ncbirc");
3472 CNcbiRegistry config_reg(config_file);
3473 string httpProt = "https:";
3474 if(!config_reg.Empty()) {
3475 if(config_reg.HasEntry("BLASTFMTUTIL","PROTOCOL")) {
3476 httpProt = config_reg.Get("BLASTFMTUTIL","PROTOCOL");
3477 }
3478 }
3479 return httpProt;
3480 }
3481
3482 /*
3483 if(no config file) protocol = "https:"
3484 if(no "BLASTFMTUTIL","PROTOCOL" entry in config file) protocol = "https:"
3485 if(there is entry in config) protocol = entry which could be blank = ""
3486 */
MapProtocol(string url_link)3487 string CAlignFormatUtil::MapProtocol(string url_link)
3488 {
3489 if(m_Protocol.empty()){
3490 if(!m_Reg) {
3491 InitConfig();
3492 }
3493 m_Protocol = (m_Reg && m_Reg->HasEntry("BLASTFMTUTIL","PROTOCOL")) ? m_Protocol = m_Reg->Get("BLASTFMTUTIL","PROTOCOL") : "https:";
3494 }
3495 url_link = CAlignFormatUtil::MapTemplate(url_link,"protocol",m_Protocol);
3496 return url_link;
3497 }
3498
s_MapCommonUrlParams(string urlTemplate,CAlignFormatUtil::SSeqURLInfo * seqUrlInfo)3499 static string s_MapCommonUrlParams(string urlTemplate, CAlignFormatUtil::SSeqURLInfo *seqUrlInfo)
3500 {
3501 string db,logstr_moltype;
3502 if(seqUrlInfo->isDbNa) {
3503 db = "nucleotide";
3504 logstr_moltype = "nucl";
3505 } else {
3506 db = "protein";
3507 logstr_moltype ="prot";
3508 }
3509 string logstr_location = (seqUrlInfo->isAlignLink) ? "align" : "top";
3510 string url_link = CAlignFormatUtil::MapTemplate(urlTemplate,"db",db);
3511 url_link = CAlignFormatUtil::MapTemplate(url_link,"gi", GI_TO(TIntId, seqUrlInfo->gi));
3512 url_link = CAlignFormatUtil::MapTemplate(url_link,"log",logstr_moltype + logstr_location);
3513 url_link = CAlignFormatUtil::MapTemplate(url_link,"blast_rank",seqUrlInfo->blast_rank);
3514 url_link = CAlignFormatUtil::MapTemplate(url_link,"rid",seqUrlInfo->rid);
3515 url_link = CAlignFormatUtil::MapTemplate(url_link,"acc",seqUrlInfo->accession);
3516 url_link = CAlignFormatUtil::MapProtocol(url_link);
3517 return url_link;
3518 }
3519
s_MapURLLink(string urlTemplate,CAlignFormatUtil::SSeqURLInfo * seqUrlInfo,const CBioseq::TId & ids)3520 static string s_MapURLLink(string urlTemplate, CAlignFormatUtil::SSeqURLInfo *seqUrlInfo, const CBioseq::TId& ids)
3521 {
3522 //Add specific blasttype/user_url template mapping here
3523 string url_link = urlTemplate;
3524 if (seqUrlInfo->user_url.find("sra.cgi") != string::npos) {
3525 string strRun, strSpotId,strReadIndex;
3526 if(s_GetSRASeqMetadata(ids,strRun,strSpotId,strReadIndex)) {
3527 url_link = CAlignFormatUtil::MapTemplate(url_link,"run",strRun);
3528 url_link = CAlignFormatUtil::MapTemplate(url_link,"spotid",strSpotId);
3529 url_link = CAlignFormatUtil::MapTemplate(url_link,"readindex",strReadIndex);
3530 }
3531 }
3532 //This maps generic params like log, blast_rank, rid
3533 url_link = s_MapCommonUrlParams(url_link, seqUrlInfo);
3534 return url_link;
3535 }
3536
3537
3538
IsWGSPattern(string & wgsAccession)3539 bool CAlignFormatUtil::IsWGSPattern(string &wgsAccession)
3540 {
3541 //const string kWgsAccessionPattern = "^[A-Z]{4}[0-9]{8,10}(\.[0-9]+){0,1}$"; //example AUXO013124042 or AUXO013124042.1
3542 const unsigned int kWgsProjLength = 4;
3543 const unsigned int kWgsProjIDLengthMin = 8;
3544 const unsigned int kWgsProjIDLengthMax = 10;
3545 bool isWGS = true;
3546
3547 if (wgsAccession.size() < 6) {
3548 return false;
3549 }
3550
3551 if(NStr::Find(wgsAccession, ".") != NPOS) { //Accession has version AUXO013124042.1
3552 string version;
3553 NStr::SplitInTwo(wgsAccession,".",wgsAccession,version);
3554 }
3555
3556 string wgsProj = wgsAccession.substr(0,kWgsProjLength);
3557 for (size_t i = 0; i < wgsProj.length(); i ++){
3558 if(!isalpha(wgsProj[i]&0xff)) {
3559 isWGS = false;
3560 break;
3561 }
3562 }
3563 if(isWGS) {
3564 string wgsId = wgsAccession.substr(kWgsProjLength);
3565 if(wgsId.length() >= kWgsProjIDLengthMin && wgsId.length() <= kWgsProjIDLengthMax) {
3566 for (size_t i = 0; i < wgsId.length(); i ++){
3567 if(!isdigit(wgsId[i]&0xff)) {
3568 isWGS = false;
3569 break;
3570 }
3571 }
3572 }
3573 else {
3574 isWGS = false;
3575 }
3576 }
3577 return isWGS;
3578 }
3579
3580
IsWGSAccession(string & wgsAccession,string & wgsProjName)3581 bool CAlignFormatUtil::IsWGSAccession(string &wgsAccession, string &wgsProjName)
3582 {
3583 const unsigned int kWgsProgNameLength = 6;
3584 bool isWGS = IsWGSPattern(wgsAccession);
3585 if(isWGS) {
3586 wgsProjName = wgsAccession.substr(0,kWgsProgNameLength);
3587 }
3588 return isWGS;
3589 }
3590
3591
GetIDUrlGen(SSeqURLInfo * seqUrlInfo,const CBioseq::TId * ids)3592 string CAlignFormatUtil::GetIDUrlGen(SSeqURLInfo *seqUrlInfo,const CBioseq::TId* ids)
3593 {
3594 string url_link = NcbiEmptyString;
3595 CConstRef<CSeq_id> wid = FindBestChoice(*ids, CSeq_id::WorstRank);
3596
3597 bool hasTextSeqID = GetTextSeqID(*ids);
3598 string title = "title=\"Show report for " + seqUrlInfo->accession + "\" ";
3599
3600 string temp_class_info = kClassInfo; temp_class_info += " ";
3601 string wgsProj;
3602 string wgsAccession = seqUrlInfo->accession;
3603 bool isWGS = false;
3604 if (!(wid->Which() == CSeq_id::e_Local || wid->Which() == CSeq_id::e_General)){
3605 isWGS = CAlignFormatUtil::IsWGSAccession(wgsAccession, wgsProj);
3606 }
3607 if(isWGS && seqUrlInfo->useTemplates) {
3608 string wgsUrl = CAlignFormatUtil::GetURLFromRegistry("WGS");
3609 url_link = s_MapCommonUrlParams(wgsUrl, seqUrlInfo);
3610 url_link = CAlignFormatUtil::MapTemplate(url_link,"wgsproj",wgsProj);
3611 url_link = CAlignFormatUtil::MapTemplate(url_link,"wgsacc", wgsAccession);
3612 }
3613 else if (hasTextSeqID) {
3614 string entrezTag = (seqUrlInfo->useTemplates) ? "ENTREZ_TM" : "ENTREZ";
3615 string l_EntrezUrl = CAlignFormatUtil::GetURLFromRegistry(entrezTag);
3616 url_link = s_MapCommonUrlParams(l_EntrezUrl, seqUrlInfo);
3617
3618 if(!seqUrlInfo->useTemplates) {
3619 url_link = CAlignFormatUtil::MapTemplate(url_link,"acc",seqUrlInfo->accession);
3620 temp_class_info = (!seqUrlInfo->defline.empty())? CAlignFormatUtil::MapTemplate(temp_class_info,"defline",NStr::JavaScriptEncode(seqUrlInfo->defline)):temp_class_info;
3621 url_link = CAlignFormatUtil::MapTemplate(url_link,"cssInf",(seqUrlInfo->addCssInfo) ? temp_class_info.c_str() : "");
3622 url_link = CAlignFormatUtil::MapTemplate(url_link,"target",seqUrlInfo->new_win ? "TARGET=\"EntrezView\"" : "");
3623 }
3624
3625 } else {//seqid general, dbtag specified
3626 if(wid->Which() == CSeq_id::e_General){
3627 const CDbtag& dtg = wid->GetGeneral();
3628 const string& dbname = dtg.GetDb();
3629 if(NStr::CompareNocase(dbname, "TI") == 0){
3630 string actual_id = CAlignFormatUtil::GetGnlID(dtg);
3631 if(seqUrlInfo->useTemplates) {
3632 string l_TraceUrl = CAlignFormatUtil::GetURLFromRegistry("TRACE_CGI");
3633 url_link = l_TraceUrl + (string)"?cmd=retrieve&dopt=fasta&val=" + actual_id + "&RID=" + seqUrlInfo->rid;
3634 }
3635 else {
3636 url_link = CAlignFormatUtil::MapTemplate(kTraceUrl,"val",actual_id);
3637 temp_class_info = (!seqUrlInfo->defline.empty())? CAlignFormatUtil::MapTemplate(temp_class_info,"defline",seqUrlInfo->defline):temp_class_info;
3638 url_link = CAlignFormatUtil::MapTemplate(url_link,"cssInf",(seqUrlInfo->addCssInfo) ? temp_class_info.c_str() : "");
3639 url_link = CAlignFormatUtil::MapTemplate(url_link,"rid",seqUrlInfo->rid);
3640 }
3641 }
3642 } else if (wid->Which() == CSeq_id::e_Local){
3643
3644 string url_holder = CAlignFormatUtil::GetURLFromRegistry("LOCAL_ID");
3645
3646 string user_url;
3647 if (m_Reg) {
3648 user_url = (seqUrlInfo->addCssInfo) ? m_Reg->Get("LOCAL_ID","TOOL_URL_ALIGN") : m_Reg->Get("LOCAL_ID","TOOL_URL");
3649 }
3650 string id_string;
3651 wid->GetLabel(&id_string, CSeq_id::eContent);
3652 url_link = CAlignFormatUtil::MapTemplate(user_url,"seq_id", NStr::URLEncode(id_string));
3653 url_link = CAlignFormatUtil::MapTemplate(url_link,"db_name", NStr::URLEncode(seqUrlInfo->database));
3654 url_link = CAlignFormatUtil::MapTemplate(url_link,"taxid", TAX_ID_TO(int, seqUrlInfo->taxid));
3655 temp_class_info = (!seqUrlInfo->defline.empty())? CAlignFormatUtil::MapTemplate(temp_class_info,"defline",seqUrlInfo->defline):temp_class_info;
3656 url_link = CAlignFormatUtil::MapTemplate(url_link,"cssInf",(seqUrlInfo->addCssInfo) ? temp_class_info.c_str() : "");
3657 url_link = CAlignFormatUtil::MapTemplate(url_link,"title", id_string);
3658 url_link = CAlignFormatUtil::MapTemplate(url_link,"target",seqUrlInfo->new_win ? "TARGET=\"EntrezView\"" : "");
3659 }
3660 }
3661 url_link = CAlignFormatUtil::MapProtocol(url_link);
3662 seqUrlInfo->seqUrl = url_link;
3663 return url_link;
3664 }
3665
GetIDUrlGen(SSeqURLInfo * seqUrlInfo,const CSeq_id & id,objects::CScope & scope)3666 string CAlignFormatUtil::GetIDUrlGen(SSeqURLInfo *seqUrlInfo,const CSeq_id& id,objects::CScope &scope)
3667 {
3668 const CBioseq_Handle& handle = scope.GetBioseqHandle(id);
3669 const CBioseq::TId* ids = &handle.GetBioseqCore()->GetId();
3670
3671 string url_link = GetIDUrlGen(seqUrlInfo,ids);
3672 return url_link;
3673 }
3674
GetIDUrl(SSeqURLInfo * seqUrlInfo,const CBioseq::TId * ids)3675 string CAlignFormatUtil::GetIDUrl(SSeqURLInfo *seqUrlInfo,const CBioseq::TId* ids)
3676 {
3677 string url_link = NcbiEmptyString;
3678 CConstRef<CSeq_id> wid = FindBestChoice(*ids, CSeq_id::WorstRank);
3679
3680 string title = "title=\"Show report for " + seqUrlInfo->accession + "\" ";
3681
3682 if (seqUrlInfo->user_url != NcbiEmptyString &&
3683 !((seqUrlInfo->user_url.find("dumpgnl.cgi") != string::npos && seqUrlInfo->gi > ZERO_GI) ||
3684 (seqUrlInfo->user_url.find("maps.cgi") != string::npos))) {
3685
3686 string url_with_parameters,toolURLParams;
3687 if(m_Reg && !seqUrlInfo->blastType.empty() && seqUrlInfo->blastType != "newblast") {
3688 toolURLParams = m_Reg->Get(seqUrlInfo->blastType, "TOOL_URL_PARAMS");
3689 }
3690 if(!toolURLParams.empty()) {
3691 string urlLinkTemplate = seqUrlInfo->user_url + toolURLParams;
3692 url_with_parameters = s_MapURLLink(urlLinkTemplate, seqUrlInfo, *ids);
3693 }
3694 else {
3695 if (seqUrlInfo->user_url.find("sra.cgi") != string::npos) {
3696 url_with_parameters = CAlignFormatUtil::BuildSRAUrl(*ids, seqUrlInfo->user_url);
3697 }
3698 else {
3699 url_with_parameters = CAlignFormatUtil::BuildUserUrl(*ids, seqUrlInfo->taxid, seqUrlInfo->user_url,
3700 seqUrlInfo->database,
3701 seqUrlInfo->isDbNa, seqUrlInfo->rid,
3702 seqUrlInfo->queryNumber,
3703 seqUrlInfo->isAlignLink);
3704 }
3705 }
3706 if (url_with_parameters != NcbiEmptyString) {
3707 if (!seqUrlInfo->useTemplates) {
3708 string deflineInfo;
3709 if(seqUrlInfo->addCssInfo) {
3710 deflineInfo = (!seqUrlInfo->defline.empty())? CAlignFormatUtil::MapTemplate(kClassInfo,"defline",seqUrlInfo->defline):kClassInfo;
3711 }
3712 url_link += "<a " + title + deflineInfo + "href=\"";
3713 }
3714 url_link += url_with_parameters;
3715 if (!seqUrlInfo->useTemplates) url_link += "\">";
3716 }
3717 }
3718 else {
3719 //use entrez or dbtag specified
3720 url_link = GetIDUrlGen(seqUrlInfo,ids);
3721 }
3722 seqUrlInfo->seqUrl = url_link;
3723 return url_link;
3724 }
3725
3726
GetIDUrl(SSeqURLInfo * seqUrlInfo,const CSeq_id & id,objects::CScope & scope)3727 string CAlignFormatUtil::GetIDUrl(SSeqURLInfo *seqUrlInfo,const CSeq_id& id,objects::CScope &scope)
3728 {
3729 const CBioseq_Handle& handle = scope.GetBioseqHandle(id);
3730 const CBioseq::TId* ids = &handle.GetBioseqCore()->GetId();
3731
3732
3733 seqUrlInfo->blastType = NStr::TruncateSpaces(NStr::ToLower(seqUrlInfo->blastType));
3734
3735 if(seqUrlInfo->taxid == INVALID_TAX_ID) { //taxid is not set
3736 seqUrlInfo->taxid = ZERO_TAX_ID;
3737 if ((seqUrlInfo->advancedView || seqUrlInfo->blastType == "mapview" || seqUrlInfo->blastType == "mapview_prev") ||
3738 seqUrlInfo->blastType == "gsfasta" || seqUrlInfo->blastType == "gsfasta_prev") {
3739 seqUrlInfo->taxid = GetTaxidForSeqid(id, scope);
3740 }
3741 }
3742 string url_link = GetIDUrl(seqUrlInfo,ids);
3743 return url_link;
3744 }
3745
3746 //static const char kGenericLinkTemplate[] = "<a href=\"<@url@>\" target=\"lnk<@rid@>\" title=\"Show report for <@seqid@>\"><@gi@><@seqid@></a>";
GetFullIDLink(SSeqURLInfo * seqUrlInfo,const CBioseq::TId * ids)3747 string CAlignFormatUtil::GetFullIDLink(SSeqURLInfo *seqUrlInfo,const CBioseq::TId* ids)
3748 {
3749 string seqLink;
3750 string linkURL = GetIDUrl(seqUrlInfo,ids);
3751 if(!linkURL.empty()) {
3752 string linkTmpl = (seqUrlInfo->addCssInfo) ? kGenericLinkMouseoverTmpl : kGenericLinkTemplate;
3753 seqLink = CAlignFormatUtil::MapTemplate(linkTmpl,"url",linkURL);
3754 seqLink = CAlignFormatUtil::MapTemplate(seqLink,"rid",seqUrlInfo->rid);
3755 seqLink = CAlignFormatUtil::MapTemplate(seqLink,"seqid",seqUrlInfo->accession);
3756 seqLink = CAlignFormatUtil::MapTemplate(seqLink,"gi", GI_TO(TIntId, seqUrlInfo->gi));
3757 seqLink = CAlignFormatUtil::MapTemplate(seqLink,"target","EntrezView");
3758 if(seqUrlInfo->addCssInfo) {
3759 seqLink = CAlignFormatUtil::MapTemplate(seqLink,"defline",NStr::JavaScriptEncode(seqUrlInfo->defline));
3760 }
3761 }
3762 return seqLink;
3763 }
3764
s_MapCustomLink(string linkUrl,string reportType,string accession,string linkText,string linktrg,string linkTitle=kCustomLinkTitle,string linkCls="")3765 static string s_MapCustomLink(string linkUrl,string reportType,string accession, string linkText, string linktrg, string linkTitle = kCustomLinkTitle,string linkCls = "")
3766 {
3767 string link = CAlignFormatUtil::MapTemplate(kCustomLinkTemplate,"custom_url",linkUrl);
3768 link = CAlignFormatUtil::MapProtocol(link);
3769 link = CAlignFormatUtil::MapTemplate(link,"custom_title",linkTitle);
3770 link = CAlignFormatUtil::MapTemplate(link,"custom_report_type",reportType);
3771 link = CAlignFormatUtil::MapTemplate(link,"seqid",accession);
3772 link = CAlignFormatUtil::MapTemplate(link,"custom_lnk_displ",linkText);
3773 link = CAlignFormatUtil::MapTemplate(link,"custom_cls",linkCls);
3774 link = CAlignFormatUtil::MapTemplate(link,"custom_trg",linktrg);
3775 return link;
3776 }
3777
3778
3779
GetGiLinksList(SSeqURLInfo * seqUrlInfo,bool hspRange)3780 list<string> CAlignFormatUtil::GetGiLinksList(SSeqURLInfo *seqUrlInfo,
3781 bool hspRange)
3782 {
3783 list<string> customLinksList;
3784 if (seqUrlInfo->hasTextSeqID) {
3785 //First show links to GenBank and FASTA
3786 string linkUrl,link,linkTiltle = kCustomLinkTitle;
3787
3788 linkUrl = seqUrlInfo->seqUrl;
3789 if(NStr::Find(linkUrl, "report=genbank") == NPOS) { //Geo case
3790 linkUrl = s_MapCommonUrlParams(kEntrezTMUrl, seqUrlInfo);
3791 }
3792 string linkText = (seqUrlInfo->isDbNa) ? "GenBank" : "GenPept";
3793 if(hspRange) {
3794 linkUrl += "&from=<@fromHSP@>&to=<@toHSP@>";
3795 linkTiltle = "Aligned region spanning positions <@fromHSP@> to <@toHSP@> on <@seqid@>";
3796 }
3797 link = s_MapCustomLink(linkUrl,"genbank",seqUrlInfo->accession,linkText,"lnk" + seqUrlInfo->rid,linkTiltle);
3798 customLinksList.push_back(link);
3799 }
3800 return customLinksList;
3801 }
3802
GetGraphiscLink(SSeqURLInfo * seqUrlInfo,bool hspRange)3803 string CAlignFormatUtil::GetGraphiscLink(SSeqURLInfo *seqUrlInfo,
3804 bool hspRange)
3805 {
3806 //seqviewer
3807 string dbtype = (seqUrlInfo->isDbNa) ? "nuccore" : "protein";
3808 string seqViewUrl = (seqUrlInfo->gi > ZERO_GI)?kSeqViewerUrl:kSeqViewerUrlNonGi;
3809
3810 string linkUrl = CAlignFormatUtil::MapTemplate(seqViewUrl,"rid",seqUrlInfo->rid);
3811
3812 string seqViewerParams;
3813 if(m_Reg && !seqUrlInfo->blastType.empty() && seqUrlInfo->blastType != "newblast") {
3814 seqViewerParams = m_Reg->Get(seqUrlInfo->blastType, "SEQVIEW_PARAMS");
3815 }
3816 seqViewerParams = seqViewerParams.empty() ? kSeqViewerParams : seqViewerParams;
3817 linkUrl = CAlignFormatUtil::MapTemplate(linkUrl,"seqViewerParams",seqViewerParams);
3818
3819 linkUrl = CAlignFormatUtil::MapTemplate(linkUrl,"dbtype",dbtype);
3820 linkUrl = CAlignFormatUtil::MapTemplate(linkUrl,"gi", GI_TO(TIntId, seqUrlInfo->gi));
3821 string linkTitle = "Show alignment to <@seqid@> in <@custom_report_type@>";
3822 string link_loc;
3823 if(!hspRange) {
3824 int addToRange = (int) ((seqUrlInfo->seqRange.GetTo() - seqUrlInfo->seqRange.GetFrom()) * 0.05);//add 5% to each side
3825 linkUrl = CAlignFormatUtil::MapTemplate(linkUrl,"from",max(0,(int)seqUrlInfo->seqRange.GetFrom() - addToRange));
3826 linkUrl = CAlignFormatUtil::MapTemplate(linkUrl,"to",seqUrlInfo->seqRange.GetTo() + addToRange);
3827 link_loc = "fromSubj";
3828 //linkUrl = CAlignFormatUtil::MapTemplate(linkUrl,"flip",NStr::BoolToString(seqUrlInfo->flip));
3829 }
3830 else {
3831 link_loc = "fromHSP";
3832 linkTitle += " for <@fromHSP@> to <@toHSP@> range";
3833 }
3834 linkUrl = CAlignFormatUtil::MapTemplate(linkUrl,"link_loc",link_loc);
3835
3836 string title = (seqUrlInfo->isDbNa) ? "Nucleotide Graphics" : "Protein Graphics";
3837
3838 string link = s_MapCustomLink(linkUrl,title,seqUrlInfo->accession, "Graphics","lnk" + seqUrlInfo->rid,linkTitle,"spr");
3839
3840 return link;
3841 }
3842
GetSeqLinksList(SSeqURLInfo * seqUrlInfo,bool hspRange)3843 list<string> CAlignFormatUtil::GetSeqLinksList(SSeqURLInfo *seqUrlInfo,
3844 bool hspRange)
3845 {
3846 list<string> customLinksList = GetGiLinksList(seqUrlInfo,hspRange); //ONLY FOR genBank seqUrlInfo->seqUrl has "report=genbank"
3847 string graphicLink = GetGraphiscLink(seqUrlInfo,hspRange);
3848 if(!graphicLink.empty()) {
3849 customLinksList.push_back(graphicLink);
3850 }
3851 return customLinksList;
3852 }
3853
SetCustomLinksTypes(SSeqURLInfo * seqUrlInfo,int customLinkTypesInp)3854 int CAlignFormatUtil::SetCustomLinksTypes(SSeqURLInfo *seqUrlInfo, int customLinkTypesInp)
3855 {
3856 int customLinkTypes = customLinkTypesInp;
3857 if ( seqUrlInfo->gi > ZERO_GI) {
3858 customLinkTypes +=eLinkTypeGenLinks;
3859 }
3860 //else if(NStr::StartsWith(seqUrlInfo->accession,"ti:")) {//seqUrlInfo->seqUrl has "trace.cgi"
3861 else if(NStr::Find(seqUrlInfo->seqUrl,"trace.cgi") != NPOS ){
3862 customLinkTypes +=eLinkTypeTraceLinks;
3863 }
3864 else if(seqUrlInfo->blastType == "sra") {//seqUrlInfo->seqUrl has sra.cgi
3865 customLinkTypes +=eLinkTypeSRALinks;
3866 }
3867 else if(seqUrlInfo->blastType == "snp") {//seqUrlInfo->seqUrl has snp_ref.cgi
3868 customLinkTypes +=eLinkTypeSNPLinks;
3869 }
3870 else if(seqUrlInfo->blastType == "gsfasta") {//seqUrlInfo->seqUrl has GSfasta.cgi
3871 customLinkTypes +=eLinkTypeGSFastaLinks;
3872 }
3873 return customLinkTypes;
3874 }
3875
3876
3877 //kCustomLinkTemplate:
3878 //<a href="<@custom_url@>" class="<@custom_cls@>" title="Show <@custom_report_type@> report for <@seqid@>"><@custom_lnk_displ@></a>
GetCustomLinksList(SSeqURLInfo * seqUrlInfo,const CSeq_id & id,objects::CScope & scope,int customLinkTypes)3879 list<string> CAlignFormatUtil::GetCustomLinksList(SSeqURLInfo *seqUrlInfo,
3880 const CSeq_id& id,
3881 objects::CScope &scope,
3882 int customLinkTypes)
3883 {
3884 list<string> customLinksList;
3885 string linkUrl,link;
3886
3887 customLinkTypes = SetCustomLinksTypes(seqUrlInfo, customLinkTypes);
3888 //First show links to GenBank and FASTA, then to Graphics
3889 customLinksList = GetSeqLinksList(seqUrlInfo);
3890 if(customLinkTypes & eLinkTypeTraceLinks) {
3891 linkUrl = seqUrlInfo->seqUrl;
3892 link = s_MapCustomLink(linkUrl,"Trace Archive FASTA",seqUrlInfo->accession, "FASTA","lnk" + seqUrlInfo->rid);
3893 customLinksList.push_back(link);
3894
3895 linkUrl = NStr::Replace(seqUrlInfo->seqUrl,"fasta","trace");
3896 link = s_MapCustomLink(linkUrl,"Trace Archive Trace",seqUrlInfo->accession, "Trace","lnk" + seqUrlInfo->rid);
3897 customLinksList.push_back(link);
3898
3899 linkUrl = NStr::Replace(seqUrlInfo->seqUrl,"fasta","quality");
3900 link = s_MapCustomLink(linkUrl,"Trace Archive Quality",seqUrlInfo->accession, "Quality","lnk" + seqUrlInfo->rid);
3901 customLinksList.push_back(link);
3902
3903 linkUrl = NStr::Replace(seqUrlInfo->seqUrl,"fasta","info");
3904 link = s_MapCustomLink(linkUrl,"Trace Archive Info",seqUrlInfo->accession, "Info","lnk" + seqUrlInfo->rid);
3905 customLinksList.push_back(link);
3906 }
3907 else if(customLinkTypes & eLinkTypeSRALinks) {
3908 linkUrl = seqUrlInfo->seqUrl;
3909 link = s_MapCustomLink(linkUrl,"SRA",seqUrlInfo->accession, "SRA","lnk" + seqUrlInfo->rid);
3910 customLinksList.push_back(link);
3911 }
3912 else if(customLinkTypes & eLinkTypeSNPLinks) {
3913 linkUrl = seqUrlInfo->seqUrl;
3914 link = s_MapCustomLink(linkUrl,"SNP",seqUrlInfo->accession, "SNP","lnk" + seqUrlInfo->rid);
3915 customLinksList.push_back(link);
3916
3917
3918 //SNP accession=rs35885954
3919 string rs = NStr::Replace(seqUrlInfo->accession,"rs","");
3920 linkUrl = seqUrlInfo->resourcesUrl + rs + "?report=FLT";
3921
3922
3923 link = s_MapCustomLink(linkUrl,"Flatfile",seqUrlInfo->accession, "Flatfile","lnk" + seqUrlInfo->rid);
3924 customLinksList.push_back(link);
3925
3926 linkUrl = NStr::Replace(linkUrl,"FLT","fasta");
3927 link = s_MapCustomLink(linkUrl,"FASTA",seqUrlInfo->accession, "FASTA","lnk" + seqUrlInfo->rid);
3928 customLinksList.push_back(link);
3929
3930 linkUrl = NStr::Replace(linkUrl,"fasta","docsum");
3931 link = s_MapCustomLink(linkUrl,"Graphic summary ",seqUrlInfo->accession, "Graphic summary ","lnk" + seqUrlInfo->rid);
3932 customLinksList.push_back(link);
3933 }
3934 else if(customLinkTypes & eLinkTypeGSFastaLinks) {
3935 linkUrl = seqUrlInfo->seqUrl;
3936 link = s_MapCustomLink(linkUrl,"GSFASTA",seqUrlInfo->accession, "GSFASTA","lnk" + seqUrlInfo->rid);
3937 customLinksList.push_back(link);
3938 }
3939 return customLinksList;
3940 }
3941
3942
GetAlignedRegionsURL(SSeqURLInfo * seqUrlInfo,const CSeq_id & id,objects::CScope & scope)3943 string CAlignFormatUtil::GetAlignedRegionsURL(SSeqURLInfo *seqUrlInfo,
3944 const CSeq_id& id,
3945 objects::CScope &scope)
3946 {
3947 const CBioseq_Handle& handle = scope.GetBioseqHandle(id);
3948 const CBioseq::TId* ids = &handle.GetBioseqCore()->GetId();
3949 string linkUrl,link;
3950
3951
3952 linkUrl = CAlignFormatUtil::BuildUserUrl(*ids,
3953 ZERO_TAX_ID,
3954 kDownloadUrl,
3955 seqUrlInfo->database,
3956 seqUrlInfo->isDbNa,
3957 seqUrlInfo->rid,
3958 seqUrlInfo->queryNumber,
3959 true);
3960 if(!linkUrl.empty()) {
3961 linkUrl += "&segs="+ seqUrlInfo->segs;
3962 }
3963
3964 return linkUrl;
3965 }
3966
3967
3968
GetFASTALinkURL(SSeqURLInfo * seqUrlInfo,const CSeq_id & id,objects::CScope & scope)3969 string CAlignFormatUtil::GetFASTALinkURL(SSeqURLInfo *seqUrlInfo,
3970 const CSeq_id& id,
3971 objects::CScope &scope)
3972
3973 {
3974 string linkUrl;
3975
3976 int customLinkTypes = SetCustomLinksTypes(seqUrlInfo, CAlignFormatUtil::eLinkTypeDefault);
3977
3978 if( (customLinkTypes & eLinkTypeGenLinks) || (customLinkTypes & eLinkTypeTraceLinks)){
3979 linkUrl = seqUrlInfo->seqUrl;
3980 linkUrl = NStr::Replace(linkUrl,"genbank","fasta");
3981 }
3982 else if(customLinkTypes & eLinkTypeSNPLinks) {
3983 linkUrl = seqUrlInfo->seqUrl;
3984 vector<string> parts;
3985 //SNP accession=dbSNP:rs35885954
3986 NStr::Split(seqUrlInfo->accession,":rs",parts,NStr::fSplit_MergeDelimiters);
3987 string rs;
3988 if(parts.size() > 1) {
3989 rs = parts[1];
3990 }
3991 linkUrl = seqUrlInfo->resourcesUrl + rs + "?report=fasta";
3992 }
3993 return linkUrl;
3994 }
3995
GetGeneInfo(TGi giForGeneLookup)3996 string CAlignFormatUtil::GetGeneInfo(TGi giForGeneLookup)
3997 {
3998 string geneSym;
3999 try
4000 {
4001 CNcbiEnvironment env;
4002 if (env.Get(GENE_INFO_PATH_ENV_VARIABLE) != kEmptyStr)
4003 {
4004
4005 if (m_GeneInfoReader.get() == 0)
4006 {
4007 m_GeneInfoReader.reset(new CGeneInfoFileReader(false));
4008 }
4009
4010
4011 CGeneInfoFileReader::TGeneInfoList infoList;
4012 m_GeneInfoReader->GetGeneInfoForGi(giForGeneLookup,infoList);
4013
4014 CGeneInfoFileReader::TGeneInfoList::const_iterator itInfo = infoList.begin();
4015 for (; itInfo != infoList.end(); itInfo++)
4016 {
4017 CRef<CGeneInfo> info = *itInfo;
4018 geneSym = info->GetSymbol();
4019 break;//???
4020 }
4021 }
4022 }
4023 catch (CException& e)
4024 {
4025 geneSym = "(Gene info extraction error: " + e.GetMsg() + ")";
4026 }
4027 catch (...)
4028 {
4029 geneSym = "(Gene info extraction error)";
4030 }
4031 return geneSym;
4032 }
4033
4034
GetDbType(const CSeq_align_set & actual_aln_list,CScope & scope)4035 CAlignFormatUtil::DbType CAlignFormatUtil::GetDbType(const CSeq_align_set& actual_aln_list, CScope & scope)
4036 {
4037 //determine if the database has gi by looking at the 1st hit.
4038 //Could be wrong but simple for now
4039 DbType type = eDbTypeNotSet;
4040 CRef<CSeq_align> first_aln = actual_aln_list.Get().front();
4041 const CSeq_id& subject_id = first_aln->GetSeq_id(1);
4042
4043 if (subject_id.Which() != CSeq_id::e_Local){
4044 const CBioseq_Handle& handleTemp = scope.GetBioseqHandle(subject_id);
4045 if(handleTemp){
4046 TGi giTemp = FindGi(handleTemp.GetBioseqCore()->GetId());
4047 if (giTemp > ZERO_GI || GetTextSeqID((CConstRef<CSeq_id>)&subject_id)) {
4048 type = eDbGi;
4049 } else if (subject_id.Which() == CSeq_id::e_General){
4050 const CDbtag& dtg = subject_id.GetGeneral();
4051 const string& dbName = dtg.GetDb();
4052 if(NStr::CompareNocase(dbName, "TI") == 0){
4053 type = eDbGeneral;
4054 }
4055 }
4056 }
4057 }
4058 return type;
4059 }
4060
4061 CAlignFormatUtil::SSeqAlignSetCalcParams*
GetSeqAlignCalcParams(const CSeq_align & aln)4062 CAlignFormatUtil::GetSeqAlignCalcParams(const CSeq_align& aln)
4063 {
4064 int score = 0;
4065 double bits = 0;
4066 double evalue = 0;
4067 int sum_n = 0;
4068 int num_ident = 0;
4069 list<TGi> use_this_gi;
4070
4071 use_this_gi.clear();
4072 //Gets scores directly from seq align
4073 GetAlnScores(aln, score, bits, evalue, sum_n,
4074 num_ident, use_this_gi);
4075
4076 auto_ptr<SSeqAlignSetCalcParams> seqSetInfo(new SSeqAlignSetCalcParams);
4077 seqSetInfo->sum_n = sum_n == -1 ? 1:sum_n ;
4078 seqSetInfo->id = &(aln.GetSeq_id(1));
4079 seqSetInfo->use_this_gi = use_this_gi;
4080 seqSetInfo->bit_score = bits;
4081 seqSetInfo->raw_score = score;
4082 seqSetInfo->evalue = evalue;
4083 seqSetInfo->match = num_ident;
4084 seqSetInfo->id = &(aln.GetSeq_id(1));
4085 seqSetInfo->subjRange = CRange<TSeqPos>(0,0);
4086 seqSetInfo->flip = false;
4087
4088 return seqSetInfo.release();
4089 }
4090
4091
4092
4093 CAlignFormatUtil::SSeqAlignSetCalcParams*
GetSeqAlignSetCalcParams(const CSeq_align_set & aln,int queryLength,bool do_translation)4094 CAlignFormatUtil::GetSeqAlignSetCalcParams(const CSeq_align_set& aln,int queryLength, bool do_translation)
4095 {
4096 int score = 0;
4097 double bits = 0;
4098 double evalue = 0;
4099 int sum_n = 0;
4100 int num_ident = 0;
4101 SSeqAlignSetCalcParams* seqSetInfo = NULL;
4102
4103 if(aln.Get().empty())
4104 return seqSetInfo;
4105
4106 seqSetInfo = GetSeqAlignCalcParams(*(aln.Get().front()));
4107
4108 double total_bits = 0;
4109 double highest_bits = 0;
4110 double lowest_evalue = 0;
4111 int highest_length = 1;
4112 int highest_ident = 0;
4113 //int highest_identity = 0;
4114 double totalLen = 0;
4115
4116 list<TGi> use_this_gi; // Not used here, but needed for GetAlnScores.
4117
4118 seqSetInfo->subjRange = CAlignFormatUtil::GetSeqAlignCoverageParams(aln,&seqSetInfo->master_covered_length,&seqSetInfo->flip);
4119 seqSetInfo->percent_coverage = 100*seqSetInfo->master_covered_length/queryLength;
4120
4121 ITERATE(CSeq_align_set::Tdata, iter, aln.Get()) {
4122 int align_length = CAlignFormatUtil::GetAlignmentLength(**iter, do_translation);
4123 totalLen += align_length;
4124
4125 CAlignFormatUtil::GetAlnScores(**iter, score, bits, evalue, sum_n,
4126 num_ident, use_this_gi);
4127 use_this_gi.clear();
4128
4129 total_bits += bits;
4130
4131 /// IMPORTANT: based on WB-1175, the trigger for setting the highest identity
4132 /// is not the highest identity value, but the identity value of
4133 /// the alignment with the highest score!
4134 ///
4135 /// if (100*num_ident/align_length > highest_identity) { -- this condition is disabled
4136
4137 if (bits > highest_bits) { // this is the replacement condition (WB-1175)
4138 highest_length = align_length;
4139 highest_ident = num_ident;
4140 /// highest_identity = 100*num_ident/align_length;
4141 }
4142
4143 if (bits > highest_bits) {
4144 highest_bits = bits;
4145 lowest_evalue = evalue;
4146 }
4147 }
4148 seqSetInfo->match = highest_ident;
4149 seqSetInfo->align_length = highest_length;
4150 seqSetInfo->percent_identity = CAlignFormatUtil::GetPercentIdentity(seqSetInfo->match, seqSetInfo->align_length);
4151
4152 seqSetInfo->total_bit_score = total_bits;
4153 seqSetInfo->bit_score = highest_bits;
4154 seqSetInfo->evalue = lowest_evalue;
4155 seqSetInfo->hspNum = aln.Size();
4156 seqSetInfo->totalLen = (Int8)totalLen;
4157
4158 return seqSetInfo;
4159 }
4160
GetSeqAlignSetCalcPercentIdent(const CSeq_align_set & aln,bool do_translation)4161 double CAlignFormatUtil::GetSeqAlignSetCalcPercentIdent(const CSeq_align_set& aln, bool do_translation)
4162 {
4163 int score = 0;
4164 double bits = 0;
4165 double evalue = 0;
4166 int sum_n = 0;
4167 int num_ident = 0;
4168
4169 if(aln.Get().empty())
4170 return -1;
4171
4172 double highest_bits = 0;
4173 int highest_length = 1;
4174 int highest_ident = 0;
4175
4176 list<TGi> use_this_gi; // Not used here, but needed for GetAlnScores.
4177
4178 ITERATE(CSeq_align_set::Tdata, iter, aln.Get()) {
4179 int align_length = CAlignFormatUtil::GetAlignmentLength(**iter, do_translation);
4180
4181 CAlignFormatUtil::GetAlnScores(**iter, score, bits, evalue, sum_n,
4182 num_ident, use_this_gi);
4183
4184
4185 /// IMPORTANT: based on WB-1175, the trigger for setting the highest identity
4186 /// is not the highest identity value, but the identity value of
4187 /// the alignment with the highest score!
4188 ///
4189 /// if (100*num_ident/align_length > highest_identity) { -- this condition is disabled
4190
4191 if (bits > highest_bits) { // this is the replacement condition (WB-1175)
4192 highest_length = align_length;
4193 highest_ident = num_ident;
4194 /// highest_identity = 100*num_ident/align_length;
4195 highest_bits = bits;
4196 }
4197 }
4198
4199 double percent_identity = CAlignFormatUtil::GetPercentIdentity(highest_ident, highest_length);
4200 return percent_identity;
4201 }
4202
4203
4204 template<class container> bool
s_GetBlastScore(const container & scoreList,double & evalue,double & bitScore,double & totalBitScore,int & percentCoverage,double & percentIdent,int & hspNum,double & totalLen,int & rawScore,int & sum_n,list<TGi> & use_this_gi)4205 s_GetBlastScore(const container& scoreList,
4206 double& evalue,
4207 double& bitScore,
4208 double& totalBitScore,
4209 int& percentCoverage,
4210 double& percentIdent,
4211 int& hspNum,
4212 double& totalLen,
4213 int &rawScore,
4214 int& sum_n,
4215 list<TGi>& use_this_gi)
4216 {
4217 const string k_GiPrefix = "gi:";
4218 bool hasScore = false;
4219
4220
4221 ITERATE (typename container, iter, scoreList) {
4222 const CObject_id& id=(*iter)->GetId();
4223 if (id.IsStr()) {
4224 hasScore = true;
4225 if (id.GetStr()=="seq_evalue") {
4226 evalue = (*iter)->GetValue().GetReal();
4227 } else if (id.GetStr()=="seq_bit_score"){
4228 bitScore = (*iter)->GetValue().GetReal();
4229 } else if (id.GetStr()=="seq_total_bit_score"){
4230 totalBitScore = (*iter)->GetValue().GetReal();
4231 } else if (id.GetStr()=="seq_percent_coverage"){
4232 percentCoverage = (*iter)->GetValue().GetInt();
4233 } else if (id.GetStr()=="seq_percent_identity" && (*iter)->GetValue().IsInt()){
4234 percentIdent = (*iter)->GetValue().GetInt();
4235 } else if (id.GetStr()=="seq_percent_identity" && (*iter)->GetValue().IsReal()){
4236 percentIdent = (*iter)->GetValue().GetReal();
4237 } else if (id.GetStr()=="seq_hspnum"){
4238 hspNum = (*iter)->GetValue().GetInt();
4239 } else if (id.GetStr()=="seq_align_totlen"){
4240 totalLen = (*iter)->GetValue().GetReal();
4241 } else if (id.GetStr()=="score"){
4242 rawScore = (*iter)->GetValue().GetInt();
4243 } else if (id.GetStr()=="use_this_gi"){
4244 use_this_gi.push_back(GI_FROM(int, (*iter)->GetValue().GetInt()));
4245 } else if (id.GetStr()=="sum_n"){
4246 sum_n = (*iter)->GetValue().GetInt();
4247 }
4248 else if(NStr::StartsWith(id.GetStr(),k_GiPrefix)) { //will be used when switch to 64bit GIs
4249 string strGi = NStr::Replace(id.GetStr(),k_GiPrefix,"");
4250 TGi gi = NStr::StringToNumeric<TGi>(strGi);
4251 use_this_gi.push_back(gi);
4252 }
4253 }
4254 }
4255 return hasScore;
4256 }
4257
4258
GetUseThisSequence(const CSeq_align & aln,list<TGi> & use_this_gi)4259 void CAlignFormatUtil::GetUseThisSequence(const CSeq_align& aln,list<TGi>& use_this_gi)
4260
4261 {
4262 const string k_GiPrefix = "gi:";
4263
4264 if(!aln.CanGetExt() || aln.GetExt().size() == 0) return;
4265 const CUser_object &user = *(aln.GetExt().front());
4266
4267 if (user.IsSetType() && user.GetType().IsStr() && user.GetType().GetStr() == "use_this_seqid" && user.IsSetData()) {
4268 const CUser_object::TData& fields = user.GetData();
4269 for (CUser_object::TData::const_iterator fit = fields.begin(); fit != fields.end(); ++fit) {
4270 const CUser_field& field = **fit;
4271
4272 if (field.IsSetLabel() && field.GetLabel().IsStr() && field.GetLabel().GetStr() == "SEQIDS" &&
4273 field.IsSetData() && field.GetData().IsStrs()) {
4274 const CUser_field::C_Data::TStrs& strs = field.GetData().GetStrs();
4275 ITERATE(CUser_field::TData::TStrs, acc_iter, strs) {
4276 if(NStr::StartsWith(*acc_iter,k_GiPrefix)) { //will be used when switch to 64bit GIs
4277 string strGi = NStr::Replace(*acc_iter,k_GiPrefix,"");
4278 TGi gi = NStr::StringToNumeric<TGi>(strGi);
4279 use_this_gi.push_back(gi);
4280 }
4281 }
4282 }
4283 }
4284 }
4285 }
4286
4287
4288 /*use_this_seq will contain gi:nnnnnn or seqid:ssssss string list*/
GetUseThisSequence(const CSeq_align & aln,list<string> & use_this_seq)4289 void CAlignFormatUtil::GetUseThisSequence(const CSeq_align& aln,list<string>& use_this_seq)
4290
4291 {
4292 if(!aln.CanGetExt() || aln.GetExt().size() == 0) return;
4293 const CUser_object &user = *(aln.GetExt().front());
4294
4295 if (user.IsSetType() && user.GetType().IsStr() && user.GetType().GetStr() == "use_this_seqid" && user.IsSetData()) {
4296 const CUser_object::TData& fields = user.GetData();
4297 for (CUser_object::TData::const_iterator fit = fields.begin(); fit != fields.end(); ++fit) {
4298 const CUser_field& field = **fit;
4299
4300 if (field.IsSetLabel() && field.GetLabel().IsStr() && field.GetLabel().GetStr() == "SEQIDS" &&
4301 field.IsSetData() && field.GetData().IsStrs()) {
4302 const CUser_field::C_Data::TStrs& strs = field.GetData().GetStrs();
4303 ITERATE(CUser_field::TData::TStrs, acc_iter, strs) {
4304 use_this_seq.push_back(*acc_iter);
4305 }
4306 }
4307 }
4308 }
4309 }
4310
4311
4312
4313 CAlignFormatUtil::SSeqAlignSetCalcParams*
GetSeqAlignSetCalcParamsFromASN(const CSeq_align_set & alnSet)4314 CAlignFormatUtil::GetSeqAlignSetCalcParamsFromASN(const CSeq_align_set& alnSet)
4315 {
4316 bool hasScore = false;
4317 double evalue = -1;
4318 double bitScore = -1;
4319 double totalBitScore = -1;
4320 int percentCoverage = -1;
4321 double percentIdent = -1;
4322 int hspNum = 0;
4323 double totalLen = 0;
4324 int rawScore = -1;
4325 int sum_n = -1;
4326 list<TGi> use_this_gi;
4327 list<string> use_this_seq;
4328
4329 const CSeq_align& aln = *(alnSet.Get().front());
4330
4331 hasScore = s_GetBlastScore(aln.GetScore(),evalue,bitScore, totalBitScore,percentCoverage,percentIdent,hspNum,totalLen,rawScore,sum_n,use_this_gi);
4332
4333 if(!hasScore){
4334 const CSeq_align::TSegs& seg = aln.GetSegs();
4335 if(seg.Which() == CSeq_align::C_Segs::e_Std){
4336 s_GetBlastScore(seg.GetStd().front()->GetScores(),
4337 evalue,bitScore, totalBitScore,percentCoverage,percentIdent,hspNum,totalLen,rawScore,sum_n,use_this_gi);
4338 } else if (seg.Which() == CSeq_align::C_Segs::e_Dendiag){
4339 s_GetBlastScore(seg.GetDendiag().front()->GetScores(),
4340 evalue,bitScore, totalBitScore,percentCoverage,percentIdent,hspNum,totalLen,rawScore,sum_n,use_this_gi);
4341 } else if (seg.Which() == CSeq_align::C_Segs::e_Denseg){
4342 s_GetBlastScore(seg.GetDenseg().GetScores(),
4343 evalue,bitScore, totalBitScore,percentCoverage,percentIdent,hspNum,totalLen,rawScore,sum_n,use_this_gi);
4344 }
4345 }
4346
4347 if(use_this_gi.size() == 0) {
4348 GetUseThisSequence(aln,use_this_seq);
4349 }
4350 else {
4351 use_this_seq = s_NumGiToStringGiList(use_this_gi);//for backward compatability
4352 }
4353
4354
4355 auto_ptr<SSeqAlignSetCalcParams> seqSetInfo(new SSeqAlignSetCalcParams);
4356 seqSetInfo->evalue = evalue;
4357 seqSetInfo->bit_score = bitScore;
4358 seqSetInfo->total_bit_score = totalBitScore;
4359 seqSetInfo->percent_coverage = percentCoverage;
4360 seqSetInfo->percent_identity = percentIdent;
4361 seqSetInfo->hspNum = hspNum;
4362 seqSetInfo->totalLen = (Int8)totalLen;
4363
4364 seqSetInfo->sum_n = sum_n == -1 ? 1:sum_n ;
4365 seqSetInfo->id = &(aln.GetSeq_id(1));
4366 seqSetInfo->use_this_gi = StringGiToNumGiList(use_this_seq);//for backward compatability
4367 seqSetInfo->use_this_seq = use_this_seq;
4368 seqSetInfo->raw_score = rawScore;//not used
4369
4370 seqSetInfo->subjRange = CRange<TSeqPos>(0,0);
4371 seqSetInfo->flip = false;
4372
4373 return seqSetInfo.release();
4374 }
4375
GetDisplayIds(const CBioseq_Handle & handle,const CSeq_id & aln_id,list<TGi> & use_this_gi,TGi & gi)4376 CRef<CSeq_id> CAlignFormatUtil::GetDisplayIds(const CBioseq_Handle& handle,
4377 const CSeq_id& aln_id,
4378 list<TGi>& use_this_gi,
4379 TGi& gi)
4380
4381 {
4382 TTaxId taxid = ZERO_TAX_ID;
4383 CRef<CSeq_id> wid = CAlignFormatUtil::GetDisplayIds(handle, aln_id, use_this_gi, gi, taxid);
4384 return wid;
4385 }
4386
GetDisplayIds(const CBioseq_Handle & handle,const CSeq_id & aln_id,list<TGi> & use_this_gi,TGi & gi,TTaxId & taxid)4387 CRef<CSeq_id> CAlignFormatUtil::GetDisplayIds(const CBioseq_Handle& handle,
4388 const CSeq_id& aln_id,
4389 list<TGi>& use_this_gi,
4390 TGi& gi,
4391 TTaxId& taxid)
4392
4393 {
4394 const CRef<CBlast_def_line_set> bdlRef = CSeqDB::ExtractBlastDefline(handle);
4395 const list< CRef< CBlast_def_line > > &bdl = (bdlRef.Empty()) ? list< CRef< CBlast_def_line > >() : bdlRef->Get();
4396
4397 const CBioseq::TId* ids = &handle.GetBioseqCore()->GetId();
4398 CRef<CSeq_id> wid;
4399
4400 gi = ZERO_GI;
4401 taxid = ZERO_TAX_ID;
4402 if(bdl.empty()){
4403 wid = FindBestChoice(*ids, CSeq_id::WorstRank);
4404 gi = FindGi(*ids);
4405 } else {
4406 bool found = false;
4407 for(list< CRef< CBlast_def_line > >::const_iterator iter = bdl.begin();
4408 iter != bdl.end(); iter++){
4409 const CBioseq::TId* cur_id = &((*iter)->GetSeqid());
4410 TGi cur_gi = FindGi(*cur_id);
4411 wid = FindBestChoice(*cur_id, CSeq_id::WorstRank);
4412 if ((*iter)->IsSetTaxid() && (*iter)->CanGetTaxid()){
4413 taxid = (*iter)->GetTaxid();
4414 }
4415 if (!use_this_gi.empty()) {
4416 ITERATE(list<TGi>, iter_gi, use_this_gi){
4417 if(cur_gi == *iter_gi){
4418 found = true;
4419 break;
4420 }
4421 }
4422 } else {
4423 ITERATE(CBioseq::TId, iter_id, *cur_id) {
4424 if ((*iter_id)->Match(aln_id)
4425 || (aln_id.IsGeneral() && aln_id.GetGeneral().CanGetDb() &&
4426 (*iter_id)->IsGeneral() && (*iter_id)->GetGeneral().CanGetDb() &&
4427 aln_id.GetGeneral().GetDb() == (*iter_id)->GetGeneral().GetDb())) {
4428 found = true;
4429 }
4430 }
4431 }
4432 if(found){
4433 gi = cur_gi;
4434 break;
4435 }
4436 }
4437 }
4438 return wid;
4439 }
4440
4441
4442
4443 //removes "gi:" or "seqid:" prefix from gi:nnnnnnn or seqid:nnnnn
s_UseThisSeqToTextSeqID(string use_this_seqid,bool & isGi)4444 static string s_UseThisSeqToTextSeqID(string use_this_seqid, bool &isGi)
4445 {
4446 const string k_GiPrefix = "gi:";
4447 const string k_SeqIDPrefix = "seqid:";
4448 isGi = false;
4449 string textSeqid;
4450 if(NStr::StartsWith(use_this_seqid,k_GiPrefix)) {
4451 textSeqid = NStr::Replace(use_this_seqid,k_GiPrefix,"");
4452 isGi = true;
4453 }
4454 else if(NStr::StartsWith(use_this_seqid,k_SeqIDPrefix)) {
4455 textSeqid = NStr::Replace(use_this_seqid,k_SeqIDPrefix,"");
4456 }
4457 else {//assume no prefix - gi
4458 if(NStr::StringToInt8(use_this_seqid,NStr::fConvErr_NoThrow)) {
4459 isGi = true;
4460 }
4461 }
4462 return textSeqid;
4463 }
4464
4465
4466
4467 //assume that we have EITHER gi: OR seqid: in the list
IsGiList(list<string> & use_this_seq)4468 bool CAlignFormatUtil::IsGiList(list<string> &use_this_seq)
4469 {
4470 bool isGi = false;
4471 ITERATE(list<string>, iter_seq, use_this_seq){
4472 s_UseThisSeqToTextSeqID( *iter_seq, isGi);
4473 break;
4474 }
4475 return isGi;
4476 }
4477
StringGiToNumGiList(list<string> & use_this_seq)4478 list<TGi> CAlignFormatUtil::StringGiToNumGiList(list<string> &use_this_seq)
4479 {
4480 list<TGi> use_this_gi;
4481 ITERATE(list<string>, iter_seq, use_this_seq){
4482 bool isGi = false;
4483 string strGI = s_UseThisSeqToTextSeqID( *iter_seq, isGi);
4484 if(isGi) use_this_gi.push_back(NStr::StringToNumeric<TGi>(strGI));
4485 }
4486 return use_this_gi;
4487 }
4488
4489
4490
MatchSeqInSeqList(TGi cur_gi,CRef<CSeq_id> & seqID,list<string> & use_this_seq,bool * isGiList)4491 bool CAlignFormatUtil::MatchSeqInSeqList(TGi cur_gi, CRef<CSeq_id> &seqID, list<string> &use_this_seq,bool *isGiList)
4492 {
4493 bool found = false;
4494 bool isGi = false;
4495
4496 string curSeqID = CAlignFormatUtil::GetLabel(seqID,true); //uses GetSeqIdString(true)
4497 ITERATE(list<string>, iter_seq, use_this_seq){
4498 isGi = false;
4499 string useThisSeq = s_UseThisSeqToTextSeqID(*iter_seq, isGi);
4500 if((isGi && cur_gi == NStr::StringToNumeric<TGi>((useThisSeq))) || (!isGi && curSeqID == useThisSeq)){
4501 found = true;
4502 break;
4503 }
4504 }
4505 if(isGiList) *isGiList = isGi;
4506 return found;
4507 }
4508
4509
MatchSeqInSeqList(CConstRef<CSeq_id> & alnSeqID,list<string> & use_this_seq,vector<string> & seqList)4510 bool CAlignFormatUtil::MatchSeqInSeqList(CConstRef<CSeq_id> &alnSeqID, list<string> &use_this_seq,vector <string> &seqList)
4511 {
4512 bool isGi = false;
4513 string curSeqID;
4514 if(alnSeqID->IsGi()) {
4515 curSeqID = NStr::NumericToString(alnSeqID->GetGi());
4516 }
4517 else {
4518 curSeqID = CAlignFormatUtil::GetLabel(alnSeqID,true); //uses GetSeqIdString(true)
4519 }
4520 //match with seqid in seq_align
4521 bool found = std::find(seqList.begin(), seqList.end(), curSeqID) != seqList.end();
4522 if(!found) {
4523 //match in use_this_seq list
4524 ITERATE(list<string>, iter_seq, use_this_seq){
4525 string useThisSeq = s_UseThisSeqToTextSeqID(*iter_seq, isGi);
4526 found = std::find(seqList.begin(), seqList.end(), useThisSeq) != seqList.end();
4527 if(found){
4528 break;
4529 }
4530 }
4531 }
4532 return found;
4533 }
4534
MatchSeqInUseThisSeqList(list<string> & use_this_seq,string textSeqIDToMatch)4535 bool CAlignFormatUtil::MatchSeqInUseThisSeqList(list<string> &use_this_seq, string textSeqIDToMatch)
4536 {
4537 bool has_match = false;
4538
4539 ITERATE(list<string>, iter_seq, use_this_seq) {
4540 bool isGi;
4541 string useThisSeq = s_UseThisSeqToTextSeqID(*iter_seq, isGi);
4542 if(useThisSeq == textSeqIDToMatch) {
4543 has_match = true;
4544 break;
4545 }
4546 }
4547 return has_match;
4548 }
4549
RemoveSeqsOfAccessionTypeFromSeqInUse(list<string> & use_this_seq,CSeq_id::EAccessionInfo accesionType)4550 bool CAlignFormatUtil::RemoveSeqsOfAccessionTypeFromSeqInUse(list<string> &use_this_seq, CSeq_id::EAccessionInfo accesionType)
4551 {
4552 list<string> new_use_this_seq;
4553 bool hasAccType = false;
4554 bool isGI = false;
4555
4556 ITERATE(list<string>, iter_seq, use_this_seq) {
4557 string useThisSeq = s_UseThisSeqToTextSeqID(*iter_seq, isGI);
4558 CSeq_id::EAccessionInfo useThisSeqAccType = CSeq_id::IdentifyAccession (useThisSeq);
4559 if(useThisSeqAccType != accesionType) {
4560 new_use_this_seq.push_back(useThisSeq);
4561 }
4562 else {
4563 hasAccType = true;
4564 }
4565 }
4566 use_this_seq = new_use_this_seq;
4567 return hasAccType;
4568 }
4569
GetDisplayIds(const CBioseq_Handle & handle,const CSeq_id & aln_id,list<string> & use_this_seq,TGi * gi,TTaxId * taxid,string * textSeqID)4570 CRef<CSeq_id> CAlignFormatUtil::GetDisplayIds(const CBioseq_Handle& handle,
4571 const CSeq_id& aln_id,
4572 list<string>& use_this_seq,
4573 TGi *gi,
4574 TTaxId *taxid,
4575 string *textSeqID)
4576
4577 {
4578 const CRef<CBlast_def_line_set> bdlRef = CSeqDB::ExtractBlastDefline(handle);
4579 const list< CRef< CBlast_def_line > > &bdl = (bdlRef.Empty()) ? list< CRef< CBlast_def_line > >() : bdlRef->Get();
4580
4581 const CBioseq::TId* ids = &handle.GetBioseqCore()->GetId();
4582 CRef<CSeq_id> wid;
4583
4584 if(gi) *gi = ZERO_GI;
4585 if(taxid) *taxid = ZERO_TAX_ID;
4586 if(bdl.empty()){
4587 wid = FindBestChoice(*ids, CSeq_id::WorstRank);
4588 if(gi) *gi = FindGi(*ids);
4589 if(textSeqID) *textSeqID = GetLabel(wid,true);//uses GetSeqIdString(true)
4590 } else {
4591 bool found = false;
4592 for(list< CRef< CBlast_def_line > >::const_iterator iter = bdl.begin();
4593 iter != bdl.end(); iter++){
4594 const CBioseq::TId* cur_id = &((*iter)->GetSeqid());
4595 TGi cur_gi = FindGi(*cur_id);
4596 wid = FindBestChoice(*cur_id, CSeq_id::WorstRank);
4597 string curSeqID = GetLabel(wid,true);//uses GetSeqIdString(true)
4598 if (taxid && (*iter)->IsSetTaxid() && (*iter)->CanGetTaxid()){
4599 *taxid = (*iter)->GetTaxid();
4600 }
4601 if (!use_this_seq.empty()) {
4602 ITERATE(list<string>, iter_seq, use_this_seq){
4603 bool isGi = false;
4604 string useThisSeq = s_UseThisSeqToTextSeqID( *iter_seq, isGi);
4605 if((isGi && cur_gi == NStr::StringToNumeric<TGi>((useThisSeq))) || (!isGi && curSeqID == useThisSeq)){
4606 found = true;
4607 break;
4608 }
4609 }
4610 } else {
4611 ITERATE(CBioseq::TId, iter_id, *cur_id) {
4612 if ((*iter_id)->Match(aln_id)
4613 || (aln_id.IsGeneral() && aln_id.GetGeneral().CanGetDb() &&
4614 (*iter_id)->IsGeneral() && (*iter_id)->GetGeneral().CanGetDb() &&
4615 aln_id.GetGeneral().GetDb() == (*iter_id)->GetGeneral().GetDb())) {
4616 found = true;
4617 }
4618 }
4619 }
4620 if(found){
4621 if(gi) *gi = cur_gi;
4622 if(textSeqID) *textSeqID = curSeqID;
4623 break;
4624 }
4625 }
4626 }
4627
4628 return wid;
4629 }
4630
4631
GetDisplayIds(const list<CRef<CBlast_def_line>> & bdl,const CSeq_id & aln_id,list<TGi> & use_this_gi)4632 TGi CAlignFormatUtil::GetDisplayIds(const list< CRef< CBlast_def_line > > &bdl,
4633 const CSeq_id& aln_id,
4634 list<TGi>& use_this_gi)
4635
4636
4637 {
4638 TGi gi = ZERO_GI;
4639
4640 if(!bdl.empty()){
4641 bool found = false;
4642 for(list< CRef< CBlast_def_line > >::const_iterator iter = bdl.begin();
4643 iter != bdl.end(); iter++){
4644 const CBioseq::TId* cur_id = &((*iter)->GetSeqid());
4645 TGi cur_gi = FindGi(*cur_id);
4646 if (!use_this_gi.empty()) {
4647 ITERATE(list<TGi>, iter_gi, use_this_gi){
4648 if(cur_gi == *iter_gi){
4649 found = true;
4650 break;
4651 }
4652 }
4653 } else {
4654 ITERATE(CBioseq::TId, iter_id, *cur_id) {
4655 if ((*iter_id)->Match(aln_id)
4656 || (aln_id.IsGeneral() && aln_id.GetGeneral().CanGetDb() &&
4657 (*iter_id)->IsGeneral() && (*iter_id)->GetGeneral().CanGetDb() &&
4658 aln_id.GetGeneral().GetDb() == (*iter_id)->GetGeneral().GetDb())) {
4659 found = true;
4660 }
4661 }
4662 }
4663 if(found){
4664 gi = cur_gi;
4665 break;
4666 }
4667 }
4668 }
4669 return gi;
4670 }
4671
s_FixMinusStrandRange(CRange<TSeqPos> & rng)4672 static inline CRange<TSeqPos> & s_FixMinusStrandRange(CRange<TSeqPos> & rng)
4673 {
4674 if(rng.GetFrom() > rng.GetTo()){
4675 rng.Set(rng.GetTo(), rng.GetFrom());
4676 }
4677 //cerr << "Query Rng: " << rng.GetFrom() << "-" << rng.GetTo() << endl;
4678 return rng;
4679 }
4680
GetUniqSeqCoverage(CSeq_align_set & alnset)4681 int CAlignFormatUtil::GetUniqSeqCoverage(CSeq_align_set & alnset)
4682 {
4683 if(alnset.IsEmpty())
4684 return 0;
4685
4686 bool isDenDiag = (alnset.Get().front()->GetSegs().Which() == CSeq_align::C_Segs::e_Dendiag) ?
4687 true : false;
4688
4689 list<CRef<CSeq_align> >::iterator mItr=alnset.Set().begin();
4690 CRangeCollection<TSeqPos> subj_rng_coll((*mItr)->GetSeqRange(1));
4691 CRange<TSeqPos> q_rng((*mItr)->GetSeqRange(0));
4692 /*
4693 cerr << MSerial_AsnText << **mItr;
4694 cerr << (*mItr)->GetSeqRange(0).GetFrom() << endl;
4695 cerr << (*mItr)->GetSeqRange(0).GetTo() << endl;
4696 cerr << (*mItr)->GetSeqRange(0).GetToOpen() << endl;
4697 cerr << (*mItr)->GetSeqRange(1).GetFrom() << endl;
4698 cerr << (*mItr)->GetSeqRange(1).GetTo() << endl;
4699 cerr << (*mItr)->GetSeqRange(1).GetToOpen() << endl;
4700 */
4701 CRangeCollection<TSeqPos> query_rng_coll(s_FixMinusStrandRange(q_rng));
4702 ++mItr;
4703 for(;mItr != alnset.Set().end(); ++mItr) {
4704 const CRange<TSeqPos> align_subj_rng((*mItr)->GetSeqRange(1));
4705 // subject range should always be on the positive strand
4706 ASSERT(align_subj_rng.GetTo() > align_subj_rng.GetFrom());
4707 CRangeCollection<TSeqPos> coll(align_subj_rng);
4708 coll.Subtract(subj_rng_coll);
4709
4710 if (coll.empty())
4711 continue;
4712
4713 if(coll[0] == align_subj_rng) {
4714 CRange<TSeqPos> query_rng ((*mItr)->GetSeqRange(0));
4715 //cerr << "Subj Rng :" << align_subj_rng.GetFrom() << "-" << align_subj_rng.GetTo() << endl;
4716 query_rng_coll += s_FixMinusStrandRange(query_rng);
4717 subj_rng_coll += align_subj_rng;
4718 }
4719 else {
4720 ITERATE (CRangeCollection<TSeqPos>, uItr, coll) {
4721 CRange<TSeqPos> query_rng;
4722 const CRange<TSeqPos> & subj_rng = (*uItr);
4723 CRef<CSeq_align> densegAln
4724 = isDenDiag ? CAlignFormatUtil::CreateDensegFromDendiag(**mItr) : (*mItr);
4725
4726 CAlnMap map(densegAln->GetSegs().GetDenseg());
4727 TSignedSeqPos subj_aln_start = map.GetAlnPosFromSeqPos(1,subj_rng.GetFrom());
4728 TSignedSeqPos subj_aln_end = map.GetAlnPosFromSeqPos(1,subj_rng.GetTo());
4729 query_rng.SetFrom(map.GetSeqPosFromAlnPos(0,subj_aln_start));
4730 query_rng.SetTo(map.GetSeqPosFromAlnPos(0,subj_aln_end));
4731
4732 //cerr << "Subj Rng :" << subj_rng.GetFrom() << "-" << subj_rng.GetTo() << endl;
4733 query_rng_coll += s_FixMinusStrandRange(query_rng);
4734 subj_rng_coll += subj_rng;
4735 }
4736 }
4737 }
4738
4739 return query_rng_coll.GetCoveredLength();
4740 }
4741
4742 ///return id type specified or null ref
4743 ///@param ids: the input ids
4744 ///@param choice: id of choice
4745 ///@return: the id with specified type
4746 ///
s_GetSeqIdByType(const list<CRef<CSeq_id>> & ids,CSeq_id::E_Choice choice)4747 static CRef<CSeq_id> s_GetSeqIdByType(const list<CRef<CSeq_id> >& ids,
4748 CSeq_id::E_Choice choice)
4749 {
4750 CRef<CSeq_id> cid;
4751
4752 for (CBioseq::TId::const_iterator iter = ids.begin(); iter != ids.end();
4753 iter ++){
4754 if ((*iter)->Which() == choice){
4755 cid = *iter;
4756 break;
4757 }
4758 }
4759
4760 return cid;
4761 }
4762
4763 ///return gi from id list
4764 ///@param ids: the input ids
4765 ///@return: the gi if found
4766 ///
GetGiForSeqIdList(const list<CRef<CSeq_id>> & ids)4767 TGi CAlignFormatUtil::GetGiForSeqIdList (const list<CRef<CSeq_id> >& ids)
4768 {
4769 TGi gi = ZERO_GI;
4770 CRef<CSeq_id> id = s_GetSeqIdByType(ids, CSeq_id::e_Gi);
4771 if (!(id.Empty())){
4772 return id->GetGi();
4773 }
4774 return gi;
4775 }
4776
GetTitle(const CBioseq_Handle & bh)4777 string CAlignFormatUtil::GetTitle(const CBioseq_Handle & bh)
4778 {
4779 CSeqdesc_CI desc_t(bh, CSeqdesc::e_Title);
4780 string t = kEmptyStr;
4781 for (;desc_t; ++desc_t) {
4782 t += desc_t->GetTitle() + " ";
4783 }
4784 return t;
4785 }
4786
GetBareId(const CSeq_id & id)4787 string CAlignFormatUtil::GetBareId(const CSeq_id& id)
4788 {
4789 string retval;
4790
4791 if (id.IsGi() || id.IsPrf() || id.IsPir()) {
4792 retval = id.AsFastaString();
4793 }
4794 else {
4795 retval = id.GetSeqIdString(true);
4796 }
4797
4798 return retval;
4799 }
4800
4801
GetTextSeqID(CConstRef<CSeq_id> seqID,string * textSeqID)4802 bool CAlignFormatUtil::GetTextSeqID(CConstRef<CSeq_id> seqID, string *textSeqID)
4803 {
4804 bool hasTextSeqID = true;
4805
4806 const CTextseq_id* text_id = seqID->GetTextseq_Id();
4807 //returns non zero if e_Genbank,e_Embl,e_Ddbj,e_Pir,e_Swissprot,case e_Other,e_Prf,case e_Tpg,e_Tpe,case e_Tpd,case e_Gpipe, e_Named_annot_track
4808 if(!text_id) { //check for pdb and pat
4809 if(!(seqID->Which() == CSeq_id::e_Pdb) && !(seqID->Which() == CSeq_id::e_Patent) && !(seqID->Which() == CSeq_id::e_Gi)) {
4810 hasTextSeqID = false;
4811 }
4812 }
4813
4814 if(hasTextSeqID && textSeqID) {
4815 seqID->GetLabel(textSeqID, CSeq_id::eContent);
4816 }
4817 return hasTextSeqID;
4818 }
4819
4820
4821
GetTextSeqID(const list<CRef<CSeq_id>> & ids,string * textSeqID)4822 bool CAlignFormatUtil::GetTextSeqID(const list<CRef<CSeq_id> > & ids, string *textSeqID)
4823 {
4824 bool hasTextSeqID = false;
4825
4826 CConstRef<CSeq_id> seqID = FindTextseq_id(ids);
4827 //returns non zero if e_Genbank,e_Embl,e_Ddbj,e_Pir,e_Swissprot,case e_Other,e_Prf,case e_Tpg,e_Tpe,case e_Tpd,case e_Gpipe, e_Named_annot_track
4828 if(seqID.Empty()) {
4829 seqID = GetSeq_idByType(ids, CSeq_id::e_Pdb);
4830 }
4831 if(seqID.Empty()) {
4832 seqID = GetSeq_idByType(ids, CSeq_id::e_Patent);
4833 }
4834 if(!seqID.Empty()) {
4835 hasTextSeqID = true;
4836 if(textSeqID) seqID->GetLabel(textSeqID, CSeq_id::eContent);
4837 }
4838 return hasTextSeqID;
4839 }
4840
FilterSeqalignBySeqList(CSeq_align_set & source_aln,vector<string> & seqList)4841 CRef<CSeq_align_set> CAlignFormatUtil::FilterSeqalignBySeqList(CSeq_align_set& source_aln,
4842 vector <string> &seqList)
4843 {
4844 CConstRef<CSeq_id> previous_id, subid;
4845 list<string> use_this_seq;
4846 bool match = false;
4847
4848 CRef<CSeq_align_set> new_aln(new CSeq_align_set);
4849 ITERATE(CSeq_align_set::Tdata, iter, source_aln.Get()){
4850 subid = &((*iter)->GetSeq_id(1));
4851 if(previous_id.Empty() || !subid->Match(*previous_id)){
4852 use_this_seq.clear();
4853 CAlignFormatUtil::GetUseThisSequence(**iter,use_this_seq);
4854 match = MatchSeqInSeqList(subid, use_this_seq,seqList);
4855 }
4856
4857 previous_id = subid;
4858 if(match) {
4859 new_aln->Set().push_back(*iter);
4860 }
4861 }
4862 return new_aln;
4863 }
4864
4865
4866 END_SCOPE(align_format)
4867 END_NCBI_SCOPE
4868