1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Azat Badretdin
27 *
28 * File Description:
29 *
30 * ===========================================================================
31 */
32 #include <ncbi_pch.hpp>
33 #include "read_blast_result.hpp"
34 #include <objects/seqloc/Seq_point.hpp>
35
GetGenomeLen()36 void CReadBlastApp::GetGenomeLen()
37 {
38 for (CTypeIterator<CBioseq> seq = Begin(); seq; ++seq)
39 {
40 // check if na
41 if(seq->GetInst().GetMol()!=CSeq_inst::eMol_dna) continue;
42 if(PrintDetails()) NcbiCerr << "GetGenomeLen: found DNA" << NcbiEndl;
43 m_length = seq->GetInst().GetLength(); // let the toolkit take care of exception
44 } // end iteration over all genomic sequences
45 }
46
CheckUniqLocusTag()47 void CReadBlastApp::CheckUniqLocusTag()
48 {
49 typedef map<string, int> THaveIt;
50 THaveIt locuses;
51 for(CTypeIterator< CSeq_feat > f = Begin(); f; ++f)
52 {
53 // const CSeq_loc& loc = f->GetLocation();
54 if(f->GetData().IsGene())
55 {
56 if (f->GetData().GetGene().CanGetLocus_tag())
57 {
58 string locus_tag = f->GetData().GetGene().GetLocus_tag();
59 locuses[locus_tag]++;
60 }
61 }
62 }
63
64 bool bad=false;
65 ITERATE(THaveIt, locus, locuses)
66 {
67 if(locus->second<2) continue;
68 bad=true;
69 NcbiCerr << "ERROR: CReadBlastApp::CheckUniqLocusTag : more than one gene with the same locus_tag: "
70 << locus->first
71 << " = "
72 << locus->second
73 << NcbiEndl;
74 }
75 if(bad)
76 {
77 NcbiCerr << "FATAL: sequences or genes with the same locus_tag are not allowed" << NcbiEndl;
78 throw;
79 }
80 }
81
GetProtName(const CBioseq & seq)82 string CReadBlastApp::GetProtName(const CBioseq& seq)
83 {
84 ITERATE(CBioseq::TAnnot, annot, seq.GetAnnot())
85 {
86 if( ! (*annot)->GetData().IsFtable() ) continue;
87 ITERATE(CSeq_annot::TData::TFtable, feat, (*annot)->GetData().GetFtable())
88 {
89 if ( ! (*feat)->GetData().IsProt() ) continue;
90 if ( ! (*feat)->GetData().GetProt().CanGetName() ) continue;
91 return *( (*feat)->GetData().GetProt().GetName().begin() );
92 }
93 }
94 return "default name";
95 }
getQueryLen(const CBioseq & seq)96 int CReadBlastApp::getQueryLen(const CBioseq& seq)
97 {
98 return seq.GetInst().GetLength();
99 }
100
GetRRNAtype(const CRNA_ref & rna)101 string GetRRNAtype(const CRNA_ref& rna)
102 {
103 string type = "unknown rRNA";
104 if(!rna.CanGetExt()) return type;
105 if(!rna.GetExt().IsName()) return type;
106 string name = rna.GetExt().GetName();
107 if(name.find("5S") != string::npos) return "5S";
108 if(name.find("16S") != string::npos) return "16S";
109 if(name.find("23S") != string::npos) return "23S";
110 if(name.find("SSU") != string::npos) return "16S";
111 if(name.find("LSU") != string::npos) return "23S";
112 return type;
113 }
114
Get3type(const CRNA_ref & rna)115 string Get3type(const CRNA_ref& rna)
116 {
117 string type;
118
119 if(!rna.CanGetExt()) {
120 ERR_POST(Error<<MSerial_AsnText << rna);
121 NCBI_USER_THROW("!rna.CanGetExt()");
122 }
123 if(!rna.GetExt().IsTRNA()) {
124 ERR_POST(Error<<MSerial_AsnText << rna);
125 NCBI_USER_THROW("!rna.GetExt().IsTRNA()");
126 }
127 if(!rna.GetExt().GetTRNA().CanGetAa()) {
128 ERR_POST(Error<<MSerial_AsnText << rna);
129 NCBI_USER_THROW("!rna.GetExt().GetTRNA().CanGetAa()");
130 }
131 CTrna_ext::C_Aa::E_Choice choice = rna.GetExt().GetTRNA().GetAa().Which();
132
133 char let1;
134 switch(choice)
135 {
136 case CTrna_ext::C_Aa::e_Ncbieaa:
137 let1 = rna.GetExt().GetTRNA().GetAa().GetNcbieaa();
138 type = let1_2_let3(let1);
139 break;
140 case CTrna_ext::C_Aa::e_Iupacaa:
141 let1 = rna.GetExt().GetTRNA().GetAa().GetIupacaa();
142 type = let1_2_let3(let1);
143 break;
144 case CTrna_ext::C_Aa::e_Ncbi8aa:
145 NcbiCerr << "Get3type(FATAL)::type e_Ncbi8aa(" << rna.GetExt().GetTRNA().GetAa().GetNcbi8aa() << ") is not handled now" << NcbiEndl;
146 throw;
147 break;
148 case CTrna_ext::C_Aa::e_Ncbistdaa:
149 NcbiCerr << "Get3type(FATAL)::type e_Ncbistdaa (" << rna.GetExt().GetTRNA().GetAa().GetNcbistdaa() << ") is not handled now" << NcbiEndl;
150 throw;
151 break;
152 default:
153 NcbiCerr << "Get3type: INTERNAL FATAL:: you should never be here" << NcbiEndl;
154 break;
155 }
156 return type;
157 }
158
get_title(const CBioseq & seq)159 string get_title(const CBioseq& seq)
160 {
161 string descr="";
162 if(seq.CanGetDescr())
163 {
164 ITERATE(CSeq_descr::Tdata, desc, seq.GetDescr().Get())
165 {
166 if( (*desc)->IsTitle() )
167 {
168 descr=(*desc)->GetTitle(); break;
169 }
170 }
171 }
172
173 return descr;
174 }
175
get_my_seq_type(const CBioseq & seq)176 EMyFeatureType get_my_seq_type(const CBioseq& seq)
177 {
178 string name = GetStringDescr(seq);
179 EMyFeatureType type = eMyFeatureType_unknown;
180
181 string descr="";
182 descr=get_title(seq);
183 if( descr.find("hypothetical") != string::npos)
184 {
185 type = eMyFeatureType_hypo_CDS;
186 }
187 else
188 {
189 type = eMyFeatureType_normal_CDS;
190 }
191 if(CReadBlastApp::PrintDetails()) NcbiCerr << "get_my_seq_type: " << name << ", "
192 << "descr " << descr << ", "
193 << "type= " << type
194 << NcbiEndl;
195 return type;
196 }
get_trna_string(const CSeq_feat & feat)197 string get_trna_string(const CSeq_feat& feat)
198 {
199 string trna_string="";
200 bool isPseudo=false;
201 char aa=0;
202
203 if(!feat.GetData().IsRna()) return trna_string;
204 if(!feat.GetData().GetRna().CanGetType()) return trna_string;
205 CRNA_ref::EType rna_type = feat.GetData().GetRna().GetType();
206 if(rna_type != CRNA_ref::eType_tRNA) return trna_string;
207 if(feat.GetData().GetRna().CanGetPseudo())
208 {
209 isPseudo=feat.GetData().GetRna().GetPseudo();
210 }
211 if(feat.GetData().GetRna().CanGetExt() &&
212 feat.GetData().GetRna().GetExt().IsTRNA() &&
213 feat.GetData().GetRna().GetExt().GetTRNA().CanGetAa() &&
214 feat.GetData().GetRna().GetExt().GetTRNA().GetAa().IsNcbieaa())
215 {
216 aa = feat.GetData().GetRna().GetExt().GetTRNA().GetAa().GetNcbieaa();
217 }
218 trna_string = isPseudo ? "pseudo-tRNA-":"tRNA-";
219 switch(aa)
220 {
221 case 0: trna_string+="unknown"; break;
222 case 'X': trna_string+="other"; break;
223 default: trna_string+=aa; break;
224 }
225 return trna_string;
226
227 }
228
GetRNAname(const CSeq_feat & feat)229 string GetRNAname(const CSeq_feat& feat)
230 {
231 string trna_string="";
232
233 if(!feat.GetData().IsRna()) return trna_string;
234 if(!feat.GetData().GetRna().CanGetType()) return trna_string;
235 // CRNA_ref::EType rna_type = feat.GetData().GetRna().GetType();
236 // if(rna_type != CRNA_ref::eType_rRNA) return trna_string;
237 if(feat.GetData().GetRna().CanGetExt() &&
238 feat.GetData().GetRna().GetExt().IsName())
239 {
240 trna_string = feat.GetData().GetRna().GetExt().GetName();
241 }
242 return trna_string;
243
244 }
245
get_my_feat_type(const CSeq_feat & feat,const LocMap & loc_map)246 EMyFeatureType get_my_feat_type(const CSeq_feat& feat, const LocMap& loc_map)
247 {
248 EMyFeatureType feat_type = eMyFeatureType_unknown;
249 string name = GetLocusTag(feat, loc_map);
250
251 // need to get product name from sequence. Bummer.
252
253 if(feat.GetData().IsRna())
254 {
255 if(feat.GetData().GetRna().CanGetType())
256 {
257 CRNA_ref::EType rna_type = feat.GetData().GetRna().GetType();
258 if(CReadBlastApp::PrintDetails()) NcbiCerr << "get_my_feat_type: " << name
259 << ", rna_type= " << rna_type
260 << NcbiEndl;
261 if(rna_type == CRNA_ref::eType_tRNA)
262 {
263 if(feat.GetData().GetRna().CanGetPseudo() && feat.GetData().GetRna().GetPseudo() == true)
264 {
265 feat_type = eMyFeatureType_pseudo_tRNA;
266 }
267 else if(feat.GetData().GetRna().CanGetExt() &&
268 feat.GetData().GetRna().GetExt().IsTRNA() &&
269 feat.GetData().GetRna().GetExt().GetTRNA().CanGetAa() &&
270 feat.GetData().GetRna().GetExt().GetTRNA().GetAa().IsNcbieaa())
271 {
272 int aa = feat.GetData().GetRna().GetExt().GetTRNA().GetAa().GetNcbieaa();
273 if( aa=='A' ||
274 (aa>='C' && aa<='I') ||
275 (aa>='K' && aa<='N') ||
276 (aa>='P' && aa<='T') ||
277 aa=='V' ||
278 aa=='W' ||
279 aa=='Y'
280 )
281 {
282 feat_type = eMyFeatureType_normal_tRNA;
283 }
284 else
285 {
286 feat_type = eMyFeatureType_atypical_tRNA;
287 }
288 if(CReadBlastApp::PrintDetails()) NcbiCerr << "get_my_feat_type: " << name
289 << ", aa= " << aa
290 << ", type= " << feat_type
291 << NcbiEndl;
292 }
293 } // if(rna_type == CRNA_ref::eType_tRNA)
294 else if (rna_type == CRNA_ref::eType_rRNA)
295 {
296 feat_type = eMyFeatureType_rRNA;
297 }
298 else if (rna_type == CRNA_ref::eType_miscRNA)
299 {
300 feat_type = eMyFeatureType_miscRNA;
301 }
302 }
303 }
304 if(CReadBlastApp::PrintDetails()) NcbiCerr << "get_my_feat_type: " << name
305 << ", type= " << feat_type
306 << NcbiEndl;
307 return feat_type;
308 }
GetStringDescr(const CBioseq & bioseq)309 string GetStringDescr(const CBioseq& bioseq)
310 {
311
312 string result = CSeq_id::GetStringDescr (bioseq, CSeq_id::eFormat_FastA);
313 string locus_tag = CReadBlastApp::getLocusTag(bioseq);
314 // make sure locus_tag does not match result
315
316 if(locus_tag != "" && result.find(locus_tag) == string::npos) result += "|" + locus_tag;
317 return result;
318 }
319
printed_range(const TSeqPos from2,const TSeqPos to2)320 string printed_range(const TSeqPos from2, const TSeqPos to2) // Mother of All Printed_Ranges
321 {
322 CNcbiStrstream rstr;
323 rstr << from2 << "..." << to2 << '\0';
324 string r = rstr.str();
325 return r;
326 }
327
printed_range(const CSeq_feat & feat)328 string printed_range(const CSeq_feat& feat)
329 {
330 return printed_range(feat.GetLocation());
331 }
332
printed_range(const CSeq_loc & seq_interval)333 string printed_range(const CSeq_loc& seq_interval)
334 {
335 TSeqPos from2, to2;
336 from2 = seq_interval.GetStart(eExtreme_Positional);
337 to2 = seq_interval.GetStop (eExtreme_Positional);
338 return printed_range(from2, to2);
339 }
340
printed_ranges(const CSeq_loc & seq_interval)341 string printed_ranges(const CSeq_loc& seq_interval)
342 {
343 typedef list<pair<int,int> > Tlist;
344 Tlist ints;
345 for(CTypeConstIterator<CSeq_interval> inter=::ConstBegin(seq_interval); inter; ++inter)
346 {
347 pair<int,int> apair(inter->GetFrom(), inter->GetTo());
348 ints.push_back(apair);
349 }
350 for(CTypeConstIterator<CSeq_point> pnt=::ConstBegin(seq_interval); pnt; ++pnt)
351 {
352 pair<int,int> apair(pnt->GetPoint(), pnt->GetPoint());
353 ints.push_back(apair);
354 }
355 ints.sort(CReadBlastApp::less_pair);
356
357 string ranges="";
358 NON_CONST_ITERATE(Tlist, interval, ints)
359 {
360 if(!ranges.empty()) ranges+=",";
361 ranges+=printed_range(interval->first, interval->second);
362 }
363
364 return ranges;
365 }
366
printed_range(const CBioseq & seq)367 string printed_range(const CBioseq& seq)
368 {
369 return printed_range(CReadBlastApp::getGenomicLocation(seq));
370 }
371
printed_range(const TSimpleSeqs::iterator & ext_rna)372 string printed_range(const TSimpleSeqs::iterator& ext_rna)
373 {
374 return printed_range(*ext_rna);
375 /*
376 int from = ext_rna->exons[0].from;
377 int to = ext_rna->exons[ext_rna->exons.size()-1].to;
378 CNcbiStrstream ext_rna_range_stream; ext_rna_range_stream << from << "..." << to << '\0';
379 string ext_rna_range = ext_rna_range_stream.str();
380 return ext_rna_range;
381 */
382 }
383
printed_range(const TSimplePair & apair)384 string printed_range(const TSimplePair& apair)
385 {
386 return printed_range(apair.from, apair.to);
387
388 }
389
390
printed_range(const TSimpleSeq & ext_rna)391 string printed_range(const TSimpleSeq& ext_rna)
392 {
393 int from,to;
394 if(ext_rna.exons[0].strand == eNa_strand_plus)
395 {
396 from = ext_rna.exons[0].from;
397 to = ext_rna.exons[ext_rna.exons.size()-1].to;
398 }
399 else
400 {
401 to = ext_rna.exons[0].to;
402 from = ext_rna.exons[ext_rna.exons.size()-1].from;
403 }
404
405 return printed_range(from, to);
406 }
407
printed_range(const TSimpleSeqs::iterator & ext_rna,const TSimpleSeqs::iterator & end)408 string printed_range(const TSimpleSeqs::iterator& ext_rna, const TSimpleSeqs::iterator& end)
409 {
410 if(ext_rna==end) return "beyond end...beyond end";
411 return printed_range(ext_rna);
412 }
413
printed_range(const TSimpleSeqs::iterator & ext_rna,TSimpleSeqs & seqs)414 string printed_range(const TSimpleSeqs::iterator& ext_rna, TSimpleSeqs& seqs)
415 {
416 return printed_range(ext_rna, seqs.end());
417 }
418
419
getLenScore(CBioseq::TAnnot::const_iterator & annot)420 int CReadBlastApp::getLenScore( CBioseq::TAnnot::const_iterator& annot)
421 {
422 int len=-1;
423 if(!(*annot)->GetData().IsAlign()) return len;
424
425 ITERATE(CSeq_align::TScore, score, (*(*annot)->GetData().GetAlign().begin())->GetScore())
426 {
427 string name = (*score)->GetId().GetStr();
428 if(name!="sbjLen") continue;
429 len = (*score)->GetValue().GetInt();
430 return len;
431 }
432 return len;
433 }
getBounds(CBioseq::TAnnot::const_iterator & annot,int * qFrom,int * qTo,int * sFrom,int * sTo)434 void CReadBlastApp::getBounds
435 (
436 CBioseq::TAnnot::const_iterator& annot,
437 int* qFrom, int* qTo, int* sFrom, int* sTo
438 )
439 {
440 int i=0;
441 ITERATE(CSeq_align::TBounds, bounds, (*(*annot)->GetData().GetAlign().begin())->GetBounds() )
442 {
443 if(!i)
444 {
445 *qFrom = (*bounds)->GetInt().GetFrom();
446 *qTo = (*bounds)->GetInt().GetTo();
447 }
448 else if (i==1)
449 {
450 *sFrom = (*bounds)->GetInt().GetFrom();
451 *sTo = (*bounds)->GetInt().GetTo();
452 break;
453 }
454 i++;
455 }
456
457 }
458
getAnnotName(CBioseq::TAnnot::const_iterator & annot)459 string CReadBlastApp::getAnnotName(CBioseq::TAnnot::const_iterator& annot)
460 {
461 ITERATE(CAnnot_descr::Tdata, desc, (*annot)->GetDesc().Get())
462 {
463 if( (*desc)->IsName() )
464 {
465 return (*desc)->GetName();
466 }
467 }
468 return "cannot get name";
469 }
470
getAnnotComment(CBioseq::TAnnot::const_iterator & annot)471 string CReadBlastApp::getAnnotComment(CBioseq::TAnnot::const_iterator& annot)
472 {
473 ITERATE(CAnnot_descr::Tdata, desc, (*annot)->GetDesc().Get())
474 {
475 if( (*desc)->IsComment() )
476 {
477 return (*desc)->GetComment();
478 }
479 }
480 return "cannot get comment";
481 }
482
483