1 /* $Id: gnomon_objmgr.cpp 477875 2015-09-02 14:23:40Z souvorov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Mike DiCuccio
27 *
28 * File Description: gnomon library parts requiring object manager support
29 * to allow apps not using these to be linked w/o object manager libs
30 *
31 */
32
33 #include <ncbi_pch.hpp>
34 #include <algo/gnomon/gnomon.hpp>
35 #include <algo/gnomon/chainer.hpp>
36 #include <algo/gnomon/gnomon_exception.hpp>
37 #include <algo/gnomon/id_handler.hpp>
38 #include "hmm.hpp"
39 #include "hmm_inlines.hpp"
40
41 #include <objects/seqloc/Seq_loc.hpp>
42 #include <objects/seqloc/Seq_id.hpp>
43 #include <objects/seqloc/Packed_seqint.hpp>
44 #include <objects/seqloc/Seq_interval.hpp>
45 #include <objects/seqfeat/Seq_feat.hpp>
46 #include <objects/seqfeat/RNA_ref.hpp>
47 #include <objects/seqfeat/Cdregion.hpp>
48 #include <objmgr/util/sequence.hpp>
49 #include <objects/seqalign/seqalign__.hpp>
50 #include <objects/general/general__.hpp>
51 #include <objmgr/object_manager.hpp>
52
53 #include <stdio.h>
54
55 #include <functional>
56
57 BEGIN_NCBI_SCOPE
58 BEGIN_SCOPE(gnomon)
59 USING_SCOPE(ncbi::objects);
60
debug()61 void debug()
62 {
63 }
64
GetModelId(const CSeq_align & seq_align)65 Int8 GetModelId(const CSeq_align& seq_align)
66 {
67 Int8 id = 0;
68 if(seq_align.CanGetId())
69 seq_align.GetId().back()->GetIdType(id); // 0 if no integer id
70
71 return id;
72 // return CIdHandler::GetId(*seq_align.GetId().back());
73 }
74
CAlignModel(const CSeq_align & seq_align)75 CAlignModel::CAlignModel(const CSeq_align& seq_align) :
76 CGeneModel(seq_align.GetSegs().GetSpliced().GetGenomic_strand()==eNa_strand_minus?eMinus:ePlus, GetModelId(seq_align), emRNA)
77 {
78 #ifdef _DEBUG
79 debug();
80 #endif
81
82 const CSpliced_seg& sps = seq_align.GetSegs().GetSpliced();
83
84 bool is_protein = false;
85 if(sps.CanGetProduct_type() && sps.GetProduct_type()==CSpliced_seg::eProduct_type_protein) {
86 SetType(eProt);
87 is_protein = true;
88 }
89
90 bool is_product_reversed = false;
91 if(sps.CanGetProduct_strand() && sps.GetProduct_strand()==eNa_strand_minus) {
92 Status() |= CGeneModel::eReversed;
93 is_product_reversed = true;
94 }
95
96 SetTargetId(sps.GetProduct_id());
97
98 int product_len = sps.CanGetProduct_length()?sps.GetProduct_length():0;
99 if (is_protein)
100 product_len *=3;
101 int prod_prev = -1;
102 bool prev_3_prime_splice = false;
103
104 int target_len = product_len;
105
106 vector<TSignedSeqRange> transcript_exons;
107 TInDels indels;
108
109 if(!sps.CanGetGenomic_id())
110 NCBI_THROW(CGnomonException, eGenericError, "CSpliced_seg must have genomic id");
111
112 const CSeq_id& genomic_id = sps.GetGenomic_id();
113
114 string mismatches;
115 string mismstatus;
116
117 if(seq_align.CanGetExt()) {
118 int count = 0;
119 ITERATE(CSeq_align::TExt, i, seq_align.GetExt()) {
120 if((*i)->IsSetType() && (*i)->GetType().IsStr()) {
121 string type = (*i)->GetType().GetStr();
122 if(type == "RNASeq-Counts") {
123 ITERATE(CUser_object::TData, j, (*i)->GetData()) {
124 if(*j && (*j)->CanGetLabel() && (*j)->GetLabel().IsStr()) {
125 string label = (*j)->GetLabel().GetStr();
126 if(NStr::EndsWith(label, "alignments") || label.find(' ') == string::npos)
127 count += (*j)->GetData().GetInt();
128 }
129 }
130 } else if(type == "MismatchedBases") {
131 mismatches = (*i)->GetData().front()->GetData().GetStr();
132 } else if(type == "MismatchedBasesStatus") {
133 mismstatus = (*i)->GetData().front()->GetData().GetStr();
134 }
135 }
136 }
137 if(count > 0)
138 SetWeight(count);
139 if(Strand() == eMinus) {
140 reverse(mismatches.begin(),mismatches.end());
141 reverse(mismstatus.begin(),mismstatus.end());
142 }
143 }
144
145 #ifdef _DEBUG
146 bool ggap_model= false;
147 #endif
148
149 ITERATE(CSpliced_seg::TExons, e_it, sps.GetExons()) {
150 const CSpliced_exon& exon = **e_it;
151 int prod_cur_start = exon.GetProduct_start().AsSeqPos();
152 int prod_cur_end = exon.GetProduct_end().AsSeqPos();
153 if (is_product_reversed) {
154 int tmp = prod_cur_start;
155 prod_cur_start = product_len - prod_cur_end -1;
156 prod_cur_end = product_len - tmp -1;
157 }
158 int nuc_cur_start = exon.GetGenomic_start();
159 int nuc_cur_end = exon.GetGenomic_end();
160
161 // bool cur_5_prime_splice = exon.CanGetAcceptor_before_exon() && exon.GetAcceptor_before_exon().CanGetBases() && exon.GetAcceptor_before_exon().GetBases().size()==2;
162 bool cur_5_prime_splice = exon.CanGetAcceptor_before_exon() && exon.GetAcceptor_before_exon().CanGetBases();
163
164 if (prod_prev+1 != prod_cur_start || !prev_3_prime_splice || !cur_5_prime_splice) {
165 AddHole();
166 if(!mismatches.empty()) // also will take care about first exon
167 mismatches = mismatches.substr(prod_cur_start - prod_prev -1);
168 }
169
170 // prev_3_prime_splice = exon.CanGetDonor_after_exon() && exon.GetDonor_after_exon().CanGetBases() && exon.GetDonor_after_exon().GetBases().size()==2;
171 prev_3_prime_splice = exon.CanGetDonor_after_exon() && exon.GetDonor_after_exon().CanGetBases();
172
173 string fs, ss;
174 if(exon.CanGetAcceptor_before_exon() && exon.GetAcceptor_before_exon().CanGetBases()) {
175 fs = exon.GetAcceptor_before_exon().GetBases();
176 }
177 if(exon.CanGetDonor_after_exon() && exon.GetDonor_after_exon().CanGetBases()) {
178 ss = exon.GetDonor_after_exon().GetBases();
179 }
180 if(Strand() == eMinus) {
181 swap(fs,ss);
182 }
183
184 double eident = 0;
185 if(exon.CanGetScores() && exon.GetScores().CanGet()) {
186 CScore_set::Tdata scores = exon.GetScores().Get();
187 ITERATE(CScore_set::Tdata, it, scores) {
188 if((*it)->CanGetId() && (*it)->GetId().IsStr()) {
189 if((*it)->GetId().GetStr() == "idty") {
190 eident = (*it)->GetValue().GetReal();
191 break;
192 }
193 }
194 }
195 }
196
197 if(!exon.CanGetGenomic_id() || !(exon.GetGenomic_id() < genomic_id || genomic_id < exon.GetGenomic_id())) { // normal exon
198 AddNormalExon(TSignedSeqRange(nuc_cur_start,nuc_cur_end), fs, ss, eident, Strand() == eMinus);
199 } else { // genomic gap
200 if(!exon.CanGetGenomic_strand())
201 NCBI_THROW(CGnomonException, eGenericError, "CSpliced_exon for gap filling must have genomic strand");
202 CConstRef<objects::CSeq_id> fill_id(&exon.GetGenomic_id());
203 CScope scope(*CObjectManager::GetInstance());
204 scope.AddDefaults();
205
206 string transcript = GetDNASequence(fill_id,scope);
207 string fill_seq = transcript.substr(nuc_cur_start, nuc_cur_end-nuc_cur_start+1);
208
209 CInDelInfo::SSource fill_src;
210 fill_src.m_acc = CIdHandler::ToString(*fill_id);
211 fill_src.m_strand = ePlus;
212 fill_src.m_range = TSignedSeqRange(nuc_cur_start, nuc_cur_end);
213
214 if(exon.GetGenomic_strand() == eNa_strand_minus) {
215 fill_src.m_strand = eMinus;
216 ReverseComplement(fill_seq.begin(),fill_seq.end());
217 }
218
219 AddGgapExon(eident, fill_seq, fill_src, Strand() == eMinus);
220 #ifdef _DEBUG
221 ggap_model = true;
222 #endif
223 }
224 transcript_exons.push_back(TSignedSeqRange(exon.GetProduct_start().AsSeqPos(), exon.GetProduct_end().AsSeqPos()));
225
226 _ASSERT(transcript_exons.back().NotEmpty());
227
228 int pos = 0;
229 int prod_pos = prod_cur_start;
230
231 ITERATE(CSpliced_exon::TParts, p_it, exon.GetParts()) {
232 const CSpliced_exon_chunk& chunk = **p_it;
233 CInDelInfo::EStatus indelstatus = CInDelInfo::eUnknown;
234 if(!mismstatus.empty() && (chunk.IsProduct_ins() || chunk.IsGenomic_ins() || chunk.IsMismatch())) {
235 if(mismstatus[0] == 'n')
236 indelstatus = CInDelInfo::eGenomeNotCorrect;
237 else if(mismstatus[0] == 'c')
238 indelstatus = CInDelInfo::eGenomeCorrect;
239 mismstatus = mismstatus.substr(1);
240 }
241 if (chunk.IsProduct_ins()) {
242 string v = kEmptyStr;
243 int product_ins = chunk.GetProduct_ins();
244 if(!mismatches.empty()) {
245 v = mismatches.substr(0,product_ins);
246 mismatches = mismatches.substr(product_ins);
247 }
248 if(Strand() == eMinus)
249 reverse(v.begin(),v.end());
250 CInDelInfo fs(Strand()==ePlus?nuc_cur_start+pos:nuc_cur_end-pos+1, product_ins, CInDelInfo::eDel, v);
251 if (Strand() == ePlus)
252 indels.push_back(fs);
253 else
254 indels.insert(indels.begin(), fs);
255 prod_pos += chunk.GetProduct_ins();
256 } else if (chunk.IsGenomic_ins()) {
257 const int genomic_ins = chunk.GetGenomic_ins();
258 if (Strand()==ePlus)
259 indels.push_back(CInDelInfo(nuc_cur_start+pos, genomic_ins, CInDelInfo::eIns));
260 else
261 indels.insert(indels.begin(), CInDelInfo(nuc_cur_end-pos-genomic_ins+1, genomic_ins, CInDelInfo::eIns));
262 pos += genomic_ins;
263 } else if (chunk.IsMatch()) {
264 pos += chunk.GetMatch();
265 prod_pos += chunk.GetMatch();
266 } else if (chunk.IsMismatch()) {
267 int mismatch_len = chunk.GetMismatch();
268 string v(mismatch_len,'N');
269 if(!mismatches.empty()) {
270 _ASSERT(mismatch_len <= (int)mismatches.length());
271 v = mismatches.substr(0,mismatch_len);
272 mismatches = mismatches.substr(mismatch_len);
273 }
274 if(Strand() == ePlus) {
275 indels.push_back(CInDelInfo(nuc_cur_start+pos, mismatch_len, CInDelInfo::eMism, v));
276 } else {
277 reverse(v.begin(),v.end());
278 indels.insert(indels.begin(), CInDelInfo(nuc_cur_end-pos-mismatch_len+1, mismatch_len, CInDelInfo::eMism, v));
279 }
280 pos += mismatch_len;
281 prod_pos += mismatch_len;
282 } else { // if (chunk.IsDiag())
283 pos += chunk.GetDiag();
284 prod_pos += chunk.GetDiag();
285 }
286 if(indelstatus != CInDelInfo::eUnknown) {
287 if(Strand() == ePlus)
288 indels.back().SetStatus(indelstatus);
289 else
290 indels.front().SetStatus(indelstatus);
291 }
292 }
293
294 prod_prev = prod_cur_end;
295 }
296
297 _ASSERT(mismatches.empty() || (product_len - prod_prev - 1 == (int)mismatches.length()));
298
299 sort(transcript_exons.begin(),transcript_exons.end());
300 bool minusstrand = Strand() == eMinus;
301 EStrand orientation = (is_product_reversed == minusstrand) ? ePlus : eMinus;
302 if(orientation == eMinus)
303 reverse(transcript_exons.begin(),transcript_exons.end());
304
305 if (is_protein) {
306 _ASSERT(orientation == Strand());
307 if (sps.CanGetModifiers()) {
308 ITERATE(CSpliced_seg::TModifiers, m, sps.GetModifiers()) {
309 if ((*m)->IsStop_codon_found()) {
310 target_len += 3;
311 if (Strand() == ePlus) {
312 ExtendRight( 3 );
313 _ASSERT((transcript_exons.back().GetTo()+1)%3 == 0);
314 transcript_exons.back().SetTo(transcript_exons.back().GetTo()+3);
315 } else {
316 ExtendLeft( 3 );
317 _ASSERT((transcript_exons.front().GetTo()+1)%3 == 0);
318 transcript_exons.front().SetTo(transcript_exons.front().GetTo()+3);
319 }
320 }
321 }
322 }
323 }
324
325 //convert tandem indels into indel+mism
326 bool keepdoing = true;
327 while(keepdoing) {
328 keepdoing = false;
329 NON_CONST_ITERATE(TInDels, indl, indels) {
330 TInDels::iterator indl_next = indl;
331 if(++indl_next == indels.end())
332 break;
333
334 if(indl->InDelEnd() == indl_next->Loc()) { // tandem
335 string new_seq = indl->GetInDelV()+indl_next->GetInDelV();
336 int new_seq_len = new_seq.size();
337 if(indl->GetType() == indl_next->GetType()) { // combine same
338 *indl = CInDelInfo(indl->Loc(), indl->Len()+indl_next->Len(), indl->GetType(), new_seq);
339 indels.erase(indl_next);
340 keepdoing = true;
341 } else if(!indl->IsMismatch() && !indl_next->IsMismatch()) { // tandem indels
342 if(indl->Len() == indl_next->Len()) {
343 *indl = CInDelInfo(indl->Loc(), indl->Len(), CInDelInfo::eMism, new_seq);
344 indels.erase(indl_next);
345 } else if(indl->Len() < indl_next->Len()) {
346 *indl = CInDelInfo(indl->Loc(), indl->Len(), CInDelInfo::eMism, new_seq.substr(0,indl->Len()));
347 *indl_next = CInDelInfo(indl->InDelEnd(), indl_next->Len()-indl->Len(), indl_next->GetType(), new_seq.substr(indl->Len()));
348 } else { // indl_next->Len() < indl->Len()
349 *indl = CInDelInfo(indl->Loc(), indl->Len()-indl_next->Len(), indl->GetType(), new_seq.substr(0,new_seq_len-indl_next->Len()));
350 *indl_next = CInDelInfo(indl->InDelEnd(), indl_next->Len(), CInDelInfo::eMism, new_seq.substr(new_seq_len-indl_next->Len()));
351 }
352 keepdoing = true;
353 }
354 }
355 }
356 }
357
358 m_alignmap = CAlignMap(Exons(), transcript_exons, indels, orientation, target_len );
359 FrameShifts() = indels;
360
361 TSignedSeqRange newlimits = m_alignmap.ShrinkToRealPoints(Limits(),is_protein);
362 if(newlimits != Limits()) {
363 Clip(newlimits,CAlignModel::eRemoveExons);
364 }
365
366 for (CGeneModel::TExons::const_iterator piece_begin = Exons().begin(); piece_begin != Exons().end(); ++piece_begin) {
367 _ASSERT( !piece_begin->m_fsplice );
368
369 if(piece_begin->Limits().Empty()) { // ggap
370 _ASSERT(piece_begin->m_ssplice);
371 ++piece_begin;
372 _ASSERT(piece_begin->Limits().NotEmpty());
373 }
374
375 CGeneModel::TExons::const_iterator piece_end;
376 for (piece_end = piece_begin; piece_end != Exons().end() && piece_end->m_ssplice; ++piece_end) ;
377 _ASSERT( piece_end != Exons().end() );
378
379 CGeneModel::TExons::const_iterator piece_end_g = piece_end;
380 if(piece_end_g->Limits().Empty()) { // ggap
381 _ASSERT(piece_end_g->m_fsplice);
382 --piece_end_g;
383 _ASSERT(piece_end_g->Limits().NotEmpty());
384 }
385
386 TSignedSeqRange piece_range(piece_begin->GetFrom(),piece_end_g->GetTo());
387
388 piece_range = m_alignmap.ShrinkToRealPoints(piece_range, is_protein); // finds first projectable interval (on codon boundaries for proteins)
389
390 /*
391 TSignedSeqRange pr;
392 while(pr != piece_range) {
393 pr = piece_range;
394 ITERATE(TInDels, i, indels) { // here we check that no indels touch our interval from outside
395 if((i->IsDeletion() && i->Loc() == pr.GetFrom()) || ((i->IsInsertion() || i->IsMismatch()) && i->Loc()+i->Len() == pr.GetFrom()))
396 pr.SetFrom(pr.GetFrom()+1);
397 else if(i->Loc() == pr.GetTo()+1)
398 pr.SetTo(pr.GetTo()-1);
399 }
400 if(pr != piece_range)
401 piece_range = m_alignmap.ShrinkToRealPoints(pr, is_protein);
402 }
403 */
404
405 _ASSERT(piece_range.NotEmpty());
406 _ASSERT(piece_range.IntersectingWith(piece_begin->Limits()) && piece_range.IntersectingWith(piece_end_g->Limits()));
407
408 if(piece_range.GetFrom() != piece_begin->GetFrom() || piece_range.GetTo() != piece_end_g->GetTo()) {
409 _ASSERT(!ggap_model);
410 Clip(piece_range, CGeneModel::eDontRemoveExons);
411 }
412
413 piece_begin = piece_end;
414 }
415
416 if (is_protein) {
417 TSignedSeqRange reading_frame = m_alignmap.MapRangeOrigToEdited(Limits(), true);
418 TSignedSeqRange start, stop;
419 if (sps.CanGetModifiers()) {
420 ITERATE(CSpliced_seg::TModifiers, m, sps.GetModifiers()) {
421 if ((*m)->IsStart_codon_found()) {
422 start = TSignedSeqRange(reading_frame.GetFrom(),reading_frame.GetFrom()+2);
423 reading_frame.SetFrom(start.GetTo()+1);
424 } else if ((*m)->IsStop_codon_found()) {
425 stop = TSignedSeqRange(reading_frame.GetTo()-2,reading_frame.GetTo());
426 reading_frame.SetTo(stop.GetFrom()-1);
427 }
428 }
429 }
430
431 CCDSInfo cds_info_t;
432 cds_info_t.SetReadingFrame(reading_frame, true);
433 if (start.NotEmpty()) {
434 cds_info_t.SetStart(start, false);
435 }
436 if (stop.NotEmpty()) {
437 cds_info_t.SetStop(stop, false);
438 }
439 CCDSInfo cds_info_g = cds_info_t.MapFromEditedToOrig(m_alignmap);
440 if(cds_info_g.ReadingFrame().NotEmpty()) // successful projection
441 SetCdsInfo(cds_info_g);
442 else
443 SetCdsInfo(cds_info_t);
444 }
445
446 if (sps.IsSetPoly_a()) {
447 Status() |= CGeneModel::ePolyA;
448 }
449
450 if(seq_align.CanGetScore()) {
451 const CSeq_align::TScore& score = seq_align.GetScore();
452 ITERATE(CSeq_align::TScore, it, score) {
453 if((*it)->CanGetId() && (*it)->GetId().IsStr()) {
454 string scr = (*it)->GetId().GetStr();
455 if((scr == "N of matches") || (scr == "num_ident") || (scr == "matches")) {
456 double ident = (*it)->GetValue().GetInt();
457 ident /= seq_align.GetAlignLength();
458 SetIdent(ident);
459 } else if(scr == "rank" && (*it)->GetValue().GetInt() == 1) {
460 Status() |= CGeneModel::eBestPlacement;
461 } else if(scr == "ambiguous_orientation") {
462 Status() |= CGeneModel::eUnknownOrientation;
463 } else if(scr == "count") {
464 _ASSERT(Weight() == 1 || Weight() == (*it)->GetValue().GetInt());
465 SetWeight((*it)->GetValue().GetInt());
466 }
467 }
468 }
469 }
470 }
471
472
473
GetCdsDnaSequence(const CResidueVec & contig_sequence) const474 string CGeneModel::GetCdsDnaSequence (const CResidueVec& contig_sequence) const
475 {
476 if(ReadingFrame().Empty())
477 return kEmptyStr;
478
479 CAlignMap amap = CGeneModel::GetAlignMap();
480 CCDSInfo cds_info = GetCdsInfo();
481 if(cds_info.IsMappedToGenome())
482 cds_info = cds_info.MapFromOrigToEdited(amap);
483 int cds_len = cds_info.Cds().GetLength();
484 int cds_start = cds_info.Cds().GetFrom()-TranscriptLimits().GetFrom();
485
486 CResidueVec mrna;
487 amap.EditedSequence(contig_sequence,mrna);
488
489 string cds_seq(cds_len,'A');
490 copy(mrna.begin()+cds_start, mrna.begin()+cds_start+cds_len, cds_seq.begin());
491
492 if(Status() & eReversed)
493 ReverseComplement(cds_seq.begin(),cds_seq.end());
494
495 return cds_seq;
496 }
497
GetProtein(const CResidueVec & contig_sequence) const498 string CGeneModel::GetProtein (const CResidueVec& contig_sequence) const
499 {
500 string cds_seq = GetCdsDnaSequence(contig_sequence);
501 string prot_seq;
502
503 objects::CSeqTranslator::Translate(cds_seq, prot_seq, objects::CSeqTranslator::fIs5PrimePartial);
504
505 if(PStop()) {
506 CCDSInfo cds_info = GetCdsInfo();
507 if(cds_info.IsMappedToGenome()) {
508 CAlignMap amap = GetAlignMap();
509 cds_info = cds_info.MapFromOrigToEdited(amap);
510 }
511 ITERATE(CCDSInfo::TPStops, stp, cds_info.PStops()) {
512 if(stp->m_status == CCDSInfo::eSelenocysteine)
513 prot_seq[(stp->GetFrom()- cds_info.Cds().GetFrom())/3] = 'U';
514 }
515 }
516
517 return prot_seq;
518 }
519
GetProtein(const CResidueVec & contig_sequence,const CGenetic_code * gencode) const520 string CGeneModel::GetProtein (const CResidueVec& contig_sequence, const CGenetic_code* gencode) const
521 {
522 string cds_seq = GetCdsDnaSequence(contig_sequence);
523 string prot_seq;
524
525 objects::CSeqTranslator::Translate(cds_seq, prot_seq, objects::CSeqTranslator::fDefault, gencode );
526 if (prot_seq[0] == '-') {
527 string first_triplet = cds_seq.substr(0, 3);
528 string first_aa;
529 objects::CSeqTranslator::Translate(first_triplet, first_aa, objects::CSeqTranslator::fIs5PrimePartial, gencode );
530 prot_seq = first_aa+prot_seq.substr(1);
531 }
532
533 if(PStop()) {
534 CCDSInfo cds_info = GetCdsInfo();
535 if(cds_info.IsMappedToGenome()) {
536 CAlignMap amap = GetAlignMap();
537 cds_info = cds_info.MapFromOrigToEdited(amap);
538 }
539 ITERATE(CCDSInfo::TPStops, stp, cds_info.PStops()) {
540 if(stp->m_status == CCDSInfo::eSelenocysteine)
541 prot_seq[(stp->GetFrom()- cds_info.Cds().GetFrom())/3] = 'U';
542 }
543 }
544
545 return prot_seq;
546 }
547
548 //
549 // helper function - convert a vector<TSignedSeqRange> into a compact CSeq_loc
550 //
551 namespace {
552
s_ExonDataToLoc(const vector<TSignedSeqRange> & vec,ENa_strand strand,const CSeq_id & id)553 CRef<CSeq_loc> s_ExonDataToLoc(const vector<TSignedSeqRange>& vec,
554 ENa_strand strand, const CSeq_id& id)
555 {
556 CRef<CSeq_loc> loc(new CSeq_loc());
557
558 CPacked_seqint::Tdata data;
559 ITERATE (vector<TSignedSeqRange>, iter, vec) {
560 CRef<CSeq_interval> ival(new CSeq_interval);
561 ival->SetFrom(iter->GetFrom());
562 ival->SetTo(iter->GetTo());
563 ival->SetStrand(strand);
564 ival->SetId().Assign(id);
565
566 data.push_back(ival);
567 }
568
569 if (data.size() == 1) {
570 loc->SetInt(*data.front());
571 } else {
572 loc->SetPacked_int().Set().swap(data);
573 }
574
575 return loc;
576 }
577
578 } //end unnamed namespace
579
GetAnnot(const CSeq_id & id)580 CRef<CSeq_annot> CGnomonEngine::GetAnnot(const CSeq_id& id)
581 {
582 TGeneModelList genes = GetGenes();
583
584 CRef<objects::CSeq_annot> annot(new CSeq_annot());
585
586 annot->SetNameDesc("Gnomon gene scan output");
587
588 CSeq_annot::C_Data::TFtable& ftable = annot->SetData().SetFtable();
589
590 unsigned int counter = 0;
591 string locus_tag_base("GNOMON_");
592 ITERATE (TGeneModelList, it, genes) {
593 const CGeneModel& igene = *it;
594 int strand = igene.Strand();
595 TSignedSeqRange cds_limits = igene.RealCdsLimits();
596
597 vector<TSignedSeqRange> mrna_vec;
598 copy(igene.Exons().begin(), igene.Exons().end(), back_inserter(mrna_vec));
599 vector<TSignedSeqRange> cds_vec;
600
601 for (size_t j = 0; j < mrna_vec.size(); ++j) {
602 TSignedSeqRange intersect(mrna_vec[j] & cds_limits);
603 if (!intersect.Empty()) {
604 cds_vec.push_back(intersect);
605 }
606 }
607
608 // stop-codon removal from cds
609 if (igene.HasStop()) {
610 if (strand == ePlus) {
611 _ASSERT(cds_vec.back().GetLength()>=3);
612 cds_vec.back().SetTo(cds_vec.back().GetTo() - 3);
613 } else {
614 _ASSERT(cds_vec.front().GetLength()>=3);
615 cds_vec.front().SetFrom(cds_vec.front().GetFrom() + 3);
616 }
617 }
618
619 //
620 // create our mRNA
621 CRef<CSeq_feat> feat_mrna;
622 if (mrna_vec.size()) {
623 feat_mrna = new CSeq_feat();
624 feat_mrna->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
625 feat_mrna->SetLocation
626 (*s_ExonDataToLoc(mrna_vec,
627 (strand == ePlus ? eNa_strand_plus : eNa_strand_minus), id));
628 } else {
629 continue;
630 }
631
632 //
633 // create the accompanying CDS
634 CRef<CSeq_feat> feat_cds;
635 if (!cds_vec.empty()) {
636 feat_cds = new CSeq_feat();
637 feat_cds->SetData().SetCdregion().SetFrame(CCdregion::TFrame(1+(strand == ePlus?(igene.ReadingFrame().GetFrom()-cds_limits.GetFrom())%3:(cds_limits.GetTo()-igene.ReadingFrame().GetTo())%3)));
638
639 feat_cds->SetLocation
640 (*s_ExonDataToLoc(cds_vec,
641 (strand == ePlus ? eNa_strand_plus : eNa_strand_minus), id));
642 }
643
644 //
645 // create a dummy gene feature as well
646 CRef<CSeq_feat> feat_gene(new CSeq_feat());
647
648 char buf[32];
649 sprintf(buf, "%04u", ++counter);
650 string name(locus_tag_base);
651 name += buf;
652 feat_gene->SetData().SetGene().SetLocus_tag(name);
653 feat_gene->SetLocation().SetInt()
654 .SetFrom(feat_mrna->GetLocation().GetTotalRange().GetFrom());
655 feat_gene->SetLocation().SetInt()
656 .SetTo(feat_mrna->GetLocation().GetTotalRange().GetTo());
657 feat_gene->SetLocation().SetInt().SetStrand
658 (strand == ePlus ? eNa_strand_plus : eNa_strand_minus);
659
660 feat_gene->SetLocation().SetId
661 (sequence::GetId(feat_mrna->GetLocation(), 0));
662
663 ftable.push_back(feat_gene);
664 ftable.push_back(feat_mrna);
665 if (feat_cds) {
666 ftable.push_back(feat_cds);
667 }
668 }
669 return annot;
670 }
671
672
673 //
674 //
675 // This function deduces the frame from 5p coordinate of the CDS which should be on the 5p codon boundary
676 //
677 //
GetScore(CConstRef<CHMMParameters> hmm_params,const CSeq_loc & cds,CScope & scope,int * const gccontent,double * const startscore)678 double CCodingPropensity::GetScore(CConstRef<CHMMParameters> hmm_params, const CSeq_loc& cds, CScope& scope, int *const gccontent, double *const startscore)
679 {
680 *gccontent = 0;
681 const CSeq_id* seq_id = cds.GetId();
682 if (seq_id == NULL)
683 NCBI_THROW(CGnomonException, eGenericError, "CCodingPropensity: CDS has multiple ids or no id at all");
684
685 // Need to know GC content in order to load correct models.
686 // Compute on the whole transcript, not just CDS.
687 CBioseq_Handle xcript_hand = scope.GetBioseqHandle(*seq_id);
688 CSeqVector xcript_vec = xcript_hand.GetSeqVector();
689 xcript_vec.SetIupacCoding();
690 unsigned int gc_count = 0;
691 CSeqVector_CI xcript_iter(xcript_vec);
692 for( ; xcript_iter; ++xcript_iter) {
693 if (*xcript_iter == 'G' || *xcript_iter == 'C') {
694 ++gc_count;
695 }
696 }
697 *gccontent = static_cast<unsigned int>(100.0 * gc_count / xcript_vec.size() + 0.5);
698
699 const CMC3_CodingRegion<5>& cdr = dynamic_cast<const CMC3_CodingRegion<5>&>(hmm_params->GetParameter(CMC3_CodingRegion<5>::class_id(), *gccontent));
700 const CMC_NonCodingRegion<5>& ncdr = dynamic_cast<const CMC_NonCodingRegion<5>&>(hmm_params->GetParameter(CMC_NonCodingRegion<5>::class_id(), *gccontent));
701
702 // Represent coding sequence as enum Nucleotides
703 CSeqVector vec(cds, scope);
704 vec.SetIupacCoding();
705 CEResidueVec seq;
706 seq.reserve(vec.size());
707 CSeqVector_CI iter(vec);
708 for( ; iter; ++iter) {
709 seq.push_back(fromACGT(*iter));
710 }
711
712 // Sum coding and non-coding scores across coding sequence.
713 // Don't include stop codon!
714 double score = 0;
715 for (unsigned int i = 5; i < seq.size() - 3; ++i)
716 score += cdr.Score(seq, i, i % 3) - ncdr.Score(seq, i);
717
718 if(startscore) {
719 //
720 // start evalustion needs 17 bases total (11 before ATG and 3 after)
721 // if there is not enough sequence it will be substituted by Ns which will degrade the score
722 //
723
724 const CWMM_Start& start = dynamic_cast<const CWMM_Start&>(hmm_params->GetParameter(CWMM_Start::class_id(), *gccontent));
725
726 int totallen = xcript_vec.size();
727 int left, right;
728 int extraNs5p = 0;
729 int extrabases = start.Left()+2; // (11) extrabases left of ATG needed for start (6) and noncoding (5) evaluation
730 if(cds.GetStrand() == eNa_strand_plus) {
731 int startposition = cds.GetTotalRange().GetFrom();
732 if(startposition < extrabases) {
733 left = 0;
734 extraNs5p = extrabases-startposition;
735 } else {
736 left = startposition-extrabases;
737 }
738 right = min(startposition+2+start.Right(),totallen-1);
739 } else {
740 int startposition = cds.GetTotalRange().GetTo();
741 if(startposition+extrabases >= totallen) {
742 right = totallen-1;
743 extraNs5p = startposition+extrabases-totallen+1;
744 } else {
745 right = startposition+extrabases;
746 }
747 left = max(0,startposition-2-start.Right());
748 }
749
750
751 CRef<CSeq_loc> startarea(new CSeq_loc());
752 CSeq_interval& d = startarea->SetInt();
753 d.SetStrand(cds.GetStrand());
754 CRef<CSeq_id> id(new CSeq_id());
755 id->Assign(*seq_id);
756 d.SetId(*id);
757 d.SetFrom(left);
758 d.SetTo(right);
759
760 CSeqVector sttvec(*startarea, scope);
761 sttvec.SetIupacCoding();
762 CEResidueVec sttseq;
763 sttseq.resize(extraNs5p, enN); // fill with Ns if necessery
764 for(unsigned int i = 0; i < sttvec.size(); ++i) {
765 sttseq.push_back(fromACGT(sttvec[i]));
766 }
767 sttseq.resize(5+start.Left()+start.Right(), enN); // add Ns if necessery
768
769 *startscore = start.Score(sttseq, extrabases+2);
770 if(*startscore != BadScore()) {
771 for(unsigned int i = 5; i < sttseq.size(); ++i) {
772 *startscore -= ncdr.Score(sttseq, i);
773 }
774 }
775 }
776
777 return score;
778 }
779
780 END_SCOPE(gnomon)
781 END_NCBI_SCOPE
782