1 /*  $Id: select_alignments_alt.cpp 475782 2015-08-11 20:18:41Z souvorov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Alexandre Souvorov
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 
34 #include <algo/gnomon/gnomon_model.hpp>
35 #include <algo/gnomon/annot.hpp>
36 
37 #include <map>
38 #include <list>
39 #include <sstream>
40 
41 
42 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(gnomon)43 BEGIN_SCOPE(gnomon)
44 
45 bool CModelCompare::CanBeConnectedIntoOne(const CGeneModel& a, const CGeneModel& b)
46 {
47     if (a.Strand() != b.Strand())
48         return false;
49     if (Precede(a.Limits(),b.Limits())) {
50         if(a.OpenRightEnd() && b.OpenLeftEnd())
51             return true;
52     } else if (Precede(b.Limits(),a.Limits())) {
53         if(b.OpenRightEnd() && a.OpenLeftEnd())
54             return true;
55     }
56     return false;
57 }
58 
59 
CountCommonSplices(const CGeneModel & a,const CGeneModel & b)60 size_t CModelCompare::CountCommonSplices(const CGeneModel& a, const CGeneModel& b) {
61     size_t commonspl = 0;
62     if(a.Strand() != b.Strand() || !a.IntersectingWith(b)) return commonspl;
63     for(size_t i = 1; i < a.Exons().size(); ++i) {
64         for(size_t j = 1; j < b.Exons().size(); ++j) {
65             if(a.Exons()[i-1].GetTo() == b.Exons()[j-1].GetTo())
66                 ++commonspl;
67             if(a.Exons()[i].GetFrom() == b.Exons()[j].GetFrom() )
68                 ++commonspl;
69         }
70     }
71 
72     return commonspl;
73 }
74 
75 
AreSimilar(const CGeneModel & a,const CGeneModel & b,int tolerance)76 bool CModelCompare::AreSimilar(const CGeneModel& a, const CGeneModel& b, int tolerance)
77 {
78     if(a.Strand() != b.Strand())
79         return false;
80 
81     if(a.ReadingFrame().NotEmpty() && b.ReadingFrame().NotEmpty()) {
82         if(!a.ReadingFrame().IntersectingWith(b.ReadingFrame()) || a.GetCdsInfo().PStops() != b.GetCdsInfo().PStops())
83            return false;
84 
85         if(a.Exons().size() == 1 && b.Exons().size()==1) {         // both coding; reading frames intersecting
86             TSignedSeqRange acds = a.GetCdsInfo().Cds();
87             TSignedSeqRange bcds = b.GetCdsInfo().Cds();
88             int common_point = (acds & bcds).GetFrom();
89             if(a.FShiftedLen(acds.GetFrom(),common_point,false)%3 != b.FShiftedLen(bcds.GetFrom(),common_point,false)%3)  // different frames
90                 return false;
91         }
92     }
93 
94     TSignedSeqRange intersection = a.Limits() & b.Limits();
95     TSignedSeqPos mutual_min = intersection.GetFrom();
96     TSignedSeqPos mutual_max = intersection.GetTo();
97 
98     int amin = 0;
99     while(amin < (int)a.Exons().size() && a.Exons()[amin].GetTo() < mutual_min) ++amin;
100     if(amin == (int)a.Exons().size()) return false;
101 
102     int amax = (int)a.Exons().size()-1;
103     while(amax >=0 && a.Exons()[amax].GetFrom() > mutual_max) --amax;
104     if(amax < 0) return false;
105 
106     int bmin = 0;
107     while(bmin < (int)b.Exons().size() && b.Exons()[bmin].GetTo() < mutual_min) ++bmin;
108     if(bmin == (int)b.Exons().size()) return false;
109 
110     int bmax = (int)b.Exons().size()-1;
111     while(bmax >=0 && b.Exons()[bmax].GetFrom() > mutual_max) --bmax;
112     if(bmax < 0) return false;
113 
114     if(amax-amin != bmax-bmin) return false;
115 
116 //  head-to-tail overlap
117     if (amin != 0 || size_t(amax) != a.Exons().size()-1 || bmin != 0 || size_t(bmax) != b.Exons().size()-1)
118         return false;
119 
120     for( ; amin <= amax; ++amin, ++bmin) {
121         if(abs(max(mutual_min,a.Exons()[amin].GetFrom())-max(mutual_min,b.Exons()[bmin].GetFrom())) >= tolerance)
122             return false;
123         if(abs(min(mutual_max,a.Exons()[amin].GetTo())-min(mutual_max,b.Exons()[bmin].GetTo())) >= tolerance)
124             return false;
125     }
126 
127     return true;
128 }
129 
130 
BadOverlapTest(const CGeneModel & a,const CGeneModel & b)131 bool CModelCompare::BadOverlapTest(const CGeneModel& a, const CGeneModel& b) {     // returns true for bad overlap
132     if((!a.TrustedmRNA().empty() || !a.TrustedProt().empty()) && (!b.TrustedmRNA().empty() || !b.TrustedProt().empty()))
133         return false;
134     //    else if(a.Limits().IntersectingWith(b.Limits()) && (a.Exons().size() == 1 || b.Exons().size() == 1))
135     //        return true;
136     else
137         return CountCommonSplices(a,b) > 0;
138 }
139 
140 
RangeNestedInIntron(TSignedSeqRange r,const CGeneModel & algn,bool check_in_holes)141 bool CModelCompare::RangeNestedInIntron(TSignedSeqRange r, const CGeneModel& algn, bool check_in_holes) {
142     for(int i = 1; i < (int)algn.Exons().size(); ++i) {
143         if(check_in_holes || (algn.Exons()[i-1].m_ssplice && algn.Exons()[i].m_fsplice)) {
144             TSignedSeqRange intron(algn.Exons()[i-1].GetTo()+1,algn.Exons()[i].GetFrom()-1);
145             if(Include(intron, r))
146                 return true;
147         }
148     }
149     return false;
150 }
151 
152 
HaveCommonExonOrIntron(const CGeneModel & a,const CGeneModel & b)153 bool CModelCompare::HaveCommonExonOrIntron(const CGeneModel& a, const CGeneModel& b) {
154     if(a.Strand() != b.Strand() || !a.IntersectingWith(b)) return false;
155 
156     for(unsigned int i = 0; i < a.Exons().size(); ++i) {
157         for(unsigned int j = 0; j < b.Exons().size(); ++j) {
158             if(a.Exons()[i] == b.Exons()[j])
159                 return true;
160         }
161     }
162 
163     for(unsigned int i = 1; i < a.Exons().size(); ++i) {
164         TSignedSeqRange introna(a.Exons()[i-1].GetTo()+1,a.Exons()[i].GetFrom()-1);
165         for(unsigned int j = 1; j < b.Exons().size(); ++j) {
166             TSignedSeqRange intronb(b.Exons()[j-1].GetTo()+1,b.Exons()[j].GetFrom()-1);
167             if(introna == intronb)
168                 return true;
169         }
170     }
171 
172     return false;
173 }
174 
175 
FilterGenes(TGeneModelList & chains,TGeneModelList & bad_aligns,TGeneModelList & dest)176 void CGeneSelector::FilterGenes(TGeneModelList& chains, TGeneModelList& bad_aligns,
177                                 TGeneModelList& dest)
178 {
179     NON_CONST_ITERATE(TGeneModelList, im, chains) {
180         CGeneModel& model = *im;
181         CCDSInfo cds_info = model.GetCdsInfo();
182         if(cds_info.ReadingFrame().Empty())
183             continue;
184 
185         if(cds_info.IsMappedToGenome())
186             cds_info = cds_info.MapFromOrigToEdited(model.GetAlignMap());
187         TSignedSeqRange cds = cds_info.Cds();
188 
189         bool gapfilled = false;
190         int genome_cds = 0;
191         for(int ie = 0; ie < (int)model.Exons().size(); ++ie) {
192             if(model.Exons()[ie].Limits().Empty())
193                 gapfilled = true;
194             else
195                 genome_cds += (cds&model.TranscriptExon(ie)).GetLength();
196         }
197 
198         if(gapfilled && genome_cds < 45) {
199             model.Status() |= CGeneModel::eSkipped;
200             model.AddComment("Most CDS in genomic gap");
201         }
202     }
203 
204     ITERATE(TGeneModelList, it, chains) {
205         if(it->Status()&CGeneModel::eSkipped) {
206             bad_aligns.push_back(*it);
207         } else {
208             dest.push_back(*it);
209         }
210     }
211 }
212 
213 
FilterGenes(TGeneModelList & chains,TGeneModelList & bad_aligns)214 TGeneModelList CGeneSelector::FilterGenes(TGeneModelList& chains, TGeneModelList& bad_aligns)
215 {
216     TGeneModelList models;
217     FilterGenes(chains, bad_aligns, models);
218     return models;
219 }
220 
221 END_SCOPE(gnomon)
222 END_NCBI_SCOPE
223