1 /*
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Azat Badretdin
27 *
28 * File Description:
29 *
30 * ===========================================================================
31 */
32 #include <ncbi_pch.hpp>
33 #include "read_blast_result.hpp"
34 #include <objects/seqloc/Seq_point.hpp>
35 
GetGenomeLen()36 void CReadBlastApp::GetGenomeLen()
37 {
38     for (CTypeIterator<CBioseq> seq = Begin();  seq;  ++seq)
39       {
40 // check if na
41       if(seq->GetInst().GetMol()!=CSeq_inst::eMol_dna) continue;
42       if(PrintDetails()) NcbiCerr << "GetGenomeLen: found DNA" << NcbiEndl;
43       m_length = seq->GetInst().GetLength(); // let the toolkit take care of exception
44       } // end iteration over all genomic sequences
45 }
46 
CheckUniqLocusTag()47 void CReadBlastApp::CheckUniqLocusTag()
48 {
49   typedef map<string, int> THaveIt;
50   THaveIt locuses;
51   for(CTypeIterator< CSeq_feat > f = Begin(); f; ++f)
52     {
53     // const CSeq_loc&  loc = f->GetLocation();
54     if(f->GetData().IsGene())
55       {
56       if (f->GetData().GetGene().CanGetLocus_tag())
57         {
58         string locus_tag =  f->GetData().GetGene().GetLocus_tag();
59         locuses[locus_tag]++;
60         }
61       }
62     }
63 
64   bool bad=false;
65   ITERATE(THaveIt, locus, locuses)
66     {
67     if(locus->second<2) continue;
68     bad=true;
69     NcbiCerr << "ERROR: CReadBlastApp::CheckUniqLocusTag : more than one gene with the same locus_tag: "
70              << locus->first
71              << " = "
72              << locus->second
73              << NcbiEndl;
74     }
75   if(bad)
76     {
77     NcbiCerr << "FATAL: sequences or genes with the same locus_tag are not allowed" << NcbiEndl;
78     throw;
79     }
80 }
81 
GetProtName(const CBioseq & seq)82 string CReadBlastApp::GetProtName(const CBioseq& seq)
83 {
84   ITERATE(CBioseq::TAnnot, annot, seq.GetAnnot())
85     {
86     if( ! (*annot)->GetData().IsFtable() ) continue;
87     ITERATE(CSeq_annot::TData::TFtable, feat, (*annot)->GetData().GetFtable())
88       {
89       if ( ! (*feat)->GetData().IsProt() ) continue;
90       if ( ! (*feat)->GetData().GetProt().CanGetName() ) continue;
91       return *( (*feat)->GetData().GetProt().GetName().begin() );
92       }
93     }
94   return "default name";
95 }
getQueryLen(const CBioseq & seq)96 int CReadBlastApp::getQueryLen(const CBioseq& seq)
97 {
98   return seq.GetInst().GetLength();
99 }
100 
GetRRNAtype(const CRNA_ref & rna)101 string GetRRNAtype(const CRNA_ref& rna)
102 {
103    string type =  "unknown rRNA";
104    if(!rna.CanGetExt()) return type;
105    if(!rna.GetExt().IsName()) return type;
106    string name = rna.GetExt().GetName();
107    if(name.find("5S") != string::npos) return "5S";
108    if(name.find("16S") != string::npos) return "16S";
109    if(name.find("23S") != string::npos) return "23S";
110    if(name.find("SSU") != string::npos) return "16S";
111    if(name.find("LSU") != string::npos) return "23S";
112    return type;
113 }
114 
Get3type(const CRNA_ref & rna)115 string Get3type(const CRNA_ref& rna)
116 {
117   string type;
118 
119   if(!rna.CanGetExt()) {
120       ERR_POST(Error<<MSerial_AsnText << rna);
121       NCBI_USER_THROW("!rna.CanGetExt()");
122   }
123   if(!rna.GetExt().IsTRNA()) {
124       ERR_POST(Error<<MSerial_AsnText << rna);
125       NCBI_USER_THROW("!rna.GetExt().IsTRNA()");
126   }
127   if(!rna.GetExt().GetTRNA().CanGetAa()) {
128       ERR_POST(Error<<MSerial_AsnText << rna);
129       NCBI_USER_THROW("!rna.GetExt().GetTRNA().CanGetAa()");
130   }
131   CTrna_ext::C_Aa::E_Choice choice = rna.GetExt().GetTRNA().GetAa().Which();
132 
133   char let1;
134   switch(choice)
135     {
136     case CTrna_ext::C_Aa::e_Ncbieaa:
137       let1 = rna.GetExt().GetTRNA().GetAa().GetNcbieaa();
138       type = let1_2_let3(let1);
139       break;
140     case CTrna_ext::C_Aa::e_Iupacaa:
141       let1 = rna.GetExt().GetTRNA().GetAa().GetIupacaa();
142       type = let1_2_let3(let1);
143       break;
144     case CTrna_ext::C_Aa::e_Ncbi8aa:
145       NcbiCerr << "Get3type(FATAL)::type e_Ncbi8aa(" << rna.GetExt().GetTRNA().GetAa().GetNcbi8aa() << ") is not handled now" << NcbiEndl;
146       throw;
147       break;
148     case CTrna_ext::C_Aa::e_Ncbistdaa:
149       NcbiCerr << "Get3type(FATAL)::type e_Ncbistdaa (" << rna.GetExt().GetTRNA().GetAa().GetNcbistdaa() << ") is not handled now" << NcbiEndl;
150       throw;
151       break;
152     default:
153       NcbiCerr << "Get3type: INTERNAL FATAL:: you should never be here" << NcbiEndl;
154       break;
155     }
156   return type;
157 }
158 
get_title(const CBioseq & seq)159 string get_title(const CBioseq& seq)
160 {
161   string descr="";
162   if(seq.CanGetDescr())
163     {
164     ITERATE(CSeq_descr::Tdata, desc, seq.GetDescr().Get())
165       {
166       if( (*desc)->IsTitle() )
167         {
168         descr=(*desc)->GetTitle(); break;
169         }
170       }
171     }
172 
173  return descr;
174 }
175 
get_my_seq_type(const CBioseq & seq)176 EMyFeatureType get_my_seq_type(const CBioseq& seq)
177 {
178   string name = GetStringDescr(seq);
179   EMyFeatureType type = eMyFeatureType_unknown;
180 
181   string descr="";
182   descr=get_title(seq);
183   if( descr.find("hypothetical") != string::npos)
184     {
185     type = eMyFeatureType_hypo_CDS;
186     }
187   else
188     {
189     type = eMyFeatureType_normal_CDS;
190     }
191   if(CReadBlastApp::PrintDetails()) NcbiCerr << "get_my_seq_type: " << name << ", "
192                               << "descr " << descr << ", "
193                               << "type= " << type
194                               << NcbiEndl;
195   return type;
196 }
get_trna_string(const CSeq_feat & feat)197 string get_trna_string(const CSeq_feat& feat)
198 {
199   string trna_string="";
200   bool isPseudo=false;
201   char aa=0;
202 
203   if(!feat.GetData().IsRna()) return trna_string;
204   if(!feat.GetData().GetRna().CanGetType()) return trna_string;
205   CRNA_ref::EType rna_type = feat.GetData().GetRna().GetType();
206   if(rna_type != CRNA_ref::eType_tRNA) return trna_string;
207   if(feat.GetData().GetRna().CanGetPseudo())
208      {
209      isPseudo=feat.GetData().GetRna().GetPseudo();
210      }
211   if(feat.GetData().GetRna().CanGetExt() &&
212      feat.GetData().GetRna().GetExt().IsTRNA() &&
213      feat.GetData().GetRna().GetExt().GetTRNA().CanGetAa() &&
214      feat.GetData().GetRna().GetExt().GetTRNA().GetAa().IsNcbieaa())
215      {
216      aa  = feat.GetData().GetRna().GetExt().GetTRNA().GetAa().GetNcbieaa();
217      }
218   trna_string = isPseudo ? "pseudo-tRNA-":"tRNA-";
219   switch(aa)
220     {
221     case 0: trna_string+="unknown"; break;
222     case 'X': trna_string+="other"; break;
223     default: trna_string+=aa; break;
224     }
225   return trna_string;
226 
227 }
228 
GetRNAname(const CSeq_feat & feat)229 string GetRNAname(const CSeq_feat& feat)
230 {
231   string trna_string="";
232 
233   if(!feat.GetData().IsRna()) return trna_string;
234   if(!feat.GetData().GetRna().CanGetType()) return trna_string;
235   // CRNA_ref::EType rna_type = feat.GetData().GetRna().GetType();
236 //  if(rna_type != CRNA_ref::eType_rRNA) return trna_string;
237   if(feat.GetData().GetRna().CanGetExt() &&
238      feat.GetData().GetRna().GetExt().IsName())
239      {
240      trna_string = feat.GetData().GetRna().GetExt().GetName();
241      }
242   return trna_string;
243 
244 }
245 
get_my_feat_type(const CSeq_feat & feat,const LocMap & loc_map)246 EMyFeatureType get_my_feat_type(const CSeq_feat& feat, const LocMap& loc_map)
247 {
248   EMyFeatureType feat_type = eMyFeatureType_unknown;
249   string name = GetLocusTag(feat, loc_map);
250 
251 //  need to get product name from sequence. Bummer.
252 
253   if(feat.GetData().IsRna())
254     {
255     if(feat.GetData().GetRna().CanGetType())
256       {
257       CRNA_ref::EType rna_type = feat.GetData().GetRna().GetType();
258       if(CReadBlastApp::PrintDetails()) NcbiCerr << "get_my_feat_type: " << name
259                               << ", rna_type= " << rna_type
260                               << NcbiEndl;
261       if(rna_type == CRNA_ref::eType_tRNA)
262         {
263         if(feat.GetData().GetRna().CanGetPseudo() && feat.GetData().GetRna().GetPseudo() == true)
264           {
265           feat_type = eMyFeatureType_pseudo_tRNA;
266           }
267         else if(feat.GetData().GetRna().CanGetExt() &&
268                 feat.GetData().GetRna().GetExt().IsTRNA() &&
269                 feat.GetData().GetRna().GetExt().GetTRNA().CanGetAa() &&
270                 feat.GetData().GetRna().GetExt().GetTRNA().GetAa().IsNcbieaa())
271           {
272           int aa = feat.GetData().GetRna().GetExt().GetTRNA().GetAa().GetNcbieaa();
273           if( aa=='A' ||
274               (aa>='C' && aa<='I') ||
275               (aa>='K' && aa<='N') ||
276               (aa>='P' && aa<='T') ||
277               aa=='V' ||
278               aa=='W' ||
279               aa=='Y'
280             )
281             {
282             feat_type = eMyFeatureType_normal_tRNA;
283             }
284           else
285             {
286             feat_type = eMyFeatureType_atypical_tRNA;
287             }
288           if(CReadBlastApp::PrintDetails()) NcbiCerr << "get_my_feat_type: " << name
289                               << ", aa= " << aa
290                               << ", type= " << feat_type
291                               << NcbiEndl;
292           }
293         } // if(rna_type == CRNA_ref::eType_tRNA)
294       else if (rna_type == CRNA_ref::eType_rRNA)
295         {
296         feat_type = eMyFeatureType_rRNA;
297         }
298       else if (rna_type == CRNA_ref::eType_miscRNA)
299         {
300         feat_type = eMyFeatureType_miscRNA;
301         }
302       }
303     }
304   if(CReadBlastApp::PrintDetails()) NcbiCerr << "get_my_feat_type: " << name
305                               << ", type= " << feat_type
306                               << NcbiEndl;
307   return feat_type;
308 }
GetStringDescr(const CBioseq & bioseq)309 string GetStringDescr(const CBioseq& bioseq)
310 {
311 
312   string result = CSeq_id::GetStringDescr (bioseq, CSeq_id::eFormat_FastA);
313   string locus_tag = CReadBlastApp::getLocusTag(bioseq);
314 // make sure locus_tag does not match result
315 
316   if(locus_tag != "" && result.find(locus_tag) == string::npos) result += "|" + locus_tag;
317   return result;
318 }
319 
printed_range(const TSeqPos from2,const TSeqPos to2)320 string printed_range(const TSeqPos from2, const TSeqPos to2) // Mother of All Printed_Ranges
321 {
322    CNcbiStrstream rstr;
323    rstr << from2 << "..." << to2 << '\0';
324    string r  = rstr.str();
325    return r;
326 }
327 
printed_range(const CSeq_feat & feat)328 string printed_range(const CSeq_feat& feat)
329 {
330    return printed_range(feat.GetLocation());
331 }
332 
printed_range(const CSeq_loc & seq_interval)333 string printed_range(const CSeq_loc& seq_interval)
334 {
335    TSeqPos from2, to2;
336    from2 = seq_interval.GetStart(eExtreme_Positional);
337    to2   = seq_interval.GetStop (eExtreme_Positional);
338    return printed_range(from2, to2);
339 }
340 
printed_ranges(const CSeq_loc & seq_interval)341 string printed_ranges(const CSeq_loc& seq_interval)
342 {
343   typedef list<pair<int,int> > Tlist;
344   Tlist ints;
345   for(CTypeConstIterator<CSeq_interval> inter=::ConstBegin(seq_interval); inter; ++inter)
346     {
347     pair<int,int> apair(inter->GetFrom(), inter->GetTo());
348     ints.push_back(apair);
349     }
350   for(CTypeConstIterator<CSeq_point> pnt=::ConstBegin(seq_interval); pnt; ++pnt)
351     {
352     pair<int,int> apair(pnt->GetPoint(), pnt->GetPoint());
353     ints.push_back(apair);
354     }
355   ints.sort(CReadBlastApp::less_pair);
356 
357   string ranges="";
358   NON_CONST_ITERATE(Tlist, interval, ints)
359     {
360     if(!ranges.empty()) ranges+=",";
361     ranges+=printed_range(interval->first, interval->second);
362     }
363 
364   return ranges;
365 }
366 
printed_range(const CBioseq & seq)367 string printed_range(const CBioseq& seq)
368 {
369    return printed_range(CReadBlastApp::getGenomicLocation(seq));
370 }
371 
printed_range(const TSimpleSeqs::iterator & ext_rna)372 string printed_range(const TSimpleSeqs::iterator& ext_rna)
373 {
374   return printed_range(*ext_rna);
375 /*
376    int from = ext_rna->exons[0].from;
377    int to   = ext_rna->exons[ext_rna->exons.size()-1].to;
378    CNcbiStrstream ext_rna_range_stream; ext_rna_range_stream << from << "..." << to << '\0';
379    string ext_rna_range = ext_rna_range_stream.str();
380    return ext_rna_range;
381 */
382 }
383 
printed_range(const TSimplePair & apair)384 string printed_range(const TSimplePair& apair)
385 {
386    return printed_range(apair.from, apair.to);
387 
388 }
389 
390 
printed_range(const TSimpleSeq & ext_rna)391 string printed_range(const TSimpleSeq& ext_rna)
392 {
393    int from,to;
394    if(ext_rna.exons[0].strand == eNa_strand_plus)
395      {
396      from = ext_rna.exons[0].from;
397      to   = ext_rna.exons[ext_rna.exons.size()-1].to;
398      }
399    else
400     {
401      to   = ext_rna.exons[0].to;
402      from = ext_rna.exons[ext_rna.exons.size()-1].from;
403      }
404 
405    return printed_range(from, to);
406 }
407 
printed_range(const TSimpleSeqs::iterator & ext_rna,const TSimpleSeqs::iterator & end)408 string printed_range(const TSimpleSeqs::iterator& ext_rna, const TSimpleSeqs::iterator& end)
409 {
410    if(ext_rna==end) return "beyond end...beyond end";
411    return printed_range(ext_rna);
412 }
413 
printed_range(const TSimpleSeqs::iterator & ext_rna,TSimpleSeqs & seqs)414 string printed_range(const TSimpleSeqs::iterator& ext_rna, TSimpleSeqs& seqs)
415 {
416    return printed_range(ext_rna, seqs.end());
417 }
418 
419 
getLenScore(CBioseq::TAnnot::const_iterator & annot)420 int CReadBlastApp::getLenScore( CBioseq::TAnnot::const_iterator& annot)
421 {
422   int len=-1;
423   if(!(*annot)->GetData().IsAlign()) return len;
424 
425   ITERATE(CSeq_align::TScore, score, (*(*annot)->GetData().GetAlign().begin())->GetScore())
426     {
427     string name = (*score)->GetId().GetStr();
428     if(name!="sbjLen") continue;
429     len = (*score)->GetValue().GetInt();
430     return len;
431     }
432   return len;
433 }
getBounds(CBioseq::TAnnot::const_iterator & annot,int * qFrom,int * qTo,int * sFrom,int * sTo)434 void CReadBlastApp::getBounds
435   (
436   CBioseq::TAnnot::const_iterator& annot,
437   int* qFrom, int* qTo, int* sFrom, int* sTo
438   )
439 {
440   int i=0;
441   ITERATE(CSeq_align::TBounds, bounds, (*(*annot)->GetData().GetAlign().begin())->GetBounds() )
442     {
443     if(!i)
444       {
445       *qFrom = (*bounds)->GetInt().GetFrom();
446       *qTo   = (*bounds)->GetInt().GetTo();
447       }
448     else if (i==1)
449       {
450       *sFrom = (*bounds)->GetInt().GetFrom();
451       *sTo   = (*bounds)->GetInt().GetTo();
452       break;
453       }
454     i++;
455     }
456 
457 }
458 
getAnnotName(CBioseq::TAnnot::const_iterator & annot)459 string CReadBlastApp::getAnnotName(CBioseq::TAnnot::const_iterator& annot)
460 {
461  ITERATE(CAnnot_descr::Tdata, desc, (*annot)->GetDesc().Get())
462     {
463     if( (*desc)->IsName() )
464       {
465       return  (*desc)->GetName();
466       }
467     }
468  return "cannot get name";
469 }
470 
getAnnotComment(CBioseq::TAnnot::const_iterator & annot)471 string CReadBlastApp::getAnnotComment(CBioseq::TAnnot::const_iterator& annot)
472 {
473  ITERATE(CAnnot_descr::Tdata, desc, (*annot)->GetDesc().Get())
474     {
475     if( (*desc)->IsComment() )
476       {
477       return  (*desc)->GetComment();
478       }
479     }
480  return "cannot get comment";
481 }
482 
483