1 /* $Id: select_alignments_alt.cpp 475782 2015-08-11 20:18:41Z souvorov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Alexandre Souvorov
27 *
28 * File Description:
29 *
30 */
31
32 #include <ncbi_pch.hpp>
33
34 #include <algo/gnomon/gnomon_model.hpp>
35 #include <algo/gnomon/annot.hpp>
36
37 #include <map>
38 #include <list>
39 #include <sstream>
40
41
42 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(gnomon)43 BEGIN_SCOPE(gnomon)
44
45 bool CModelCompare::CanBeConnectedIntoOne(const CGeneModel& a, const CGeneModel& b)
46 {
47 if (a.Strand() != b.Strand())
48 return false;
49 if (Precede(a.Limits(),b.Limits())) {
50 if(a.OpenRightEnd() && b.OpenLeftEnd())
51 return true;
52 } else if (Precede(b.Limits(),a.Limits())) {
53 if(b.OpenRightEnd() && a.OpenLeftEnd())
54 return true;
55 }
56 return false;
57 }
58
59
CountCommonSplices(const CGeneModel & a,const CGeneModel & b)60 size_t CModelCompare::CountCommonSplices(const CGeneModel& a, const CGeneModel& b) {
61 size_t commonspl = 0;
62 if(a.Strand() != b.Strand() || !a.IntersectingWith(b)) return commonspl;
63 for(size_t i = 1; i < a.Exons().size(); ++i) {
64 for(size_t j = 1; j < b.Exons().size(); ++j) {
65 if(a.Exons()[i-1].GetTo() == b.Exons()[j-1].GetTo())
66 ++commonspl;
67 if(a.Exons()[i].GetFrom() == b.Exons()[j].GetFrom() )
68 ++commonspl;
69 }
70 }
71
72 return commonspl;
73 }
74
75
AreSimilar(const CGeneModel & a,const CGeneModel & b,int tolerance)76 bool CModelCompare::AreSimilar(const CGeneModel& a, const CGeneModel& b, int tolerance)
77 {
78 if(a.Strand() != b.Strand())
79 return false;
80
81 if(a.ReadingFrame().NotEmpty() && b.ReadingFrame().NotEmpty()) {
82 if(!a.ReadingFrame().IntersectingWith(b.ReadingFrame()) || a.GetCdsInfo().PStops() != b.GetCdsInfo().PStops())
83 return false;
84
85 if(a.Exons().size() == 1 && b.Exons().size()==1) { // both coding; reading frames intersecting
86 TSignedSeqRange acds = a.GetCdsInfo().Cds();
87 TSignedSeqRange bcds = b.GetCdsInfo().Cds();
88 int common_point = (acds & bcds).GetFrom();
89 if(a.FShiftedLen(acds.GetFrom(),common_point,false)%3 != b.FShiftedLen(bcds.GetFrom(),common_point,false)%3) // different frames
90 return false;
91 }
92 }
93
94 TSignedSeqRange intersection = a.Limits() & b.Limits();
95 TSignedSeqPos mutual_min = intersection.GetFrom();
96 TSignedSeqPos mutual_max = intersection.GetTo();
97
98 int amin = 0;
99 while(amin < (int)a.Exons().size() && a.Exons()[amin].GetTo() < mutual_min) ++amin;
100 if(amin == (int)a.Exons().size()) return false;
101
102 int amax = (int)a.Exons().size()-1;
103 while(amax >=0 && a.Exons()[amax].GetFrom() > mutual_max) --amax;
104 if(amax < 0) return false;
105
106 int bmin = 0;
107 while(bmin < (int)b.Exons().size() && b.Exons()[bmin].GetTo() < mutual_min) ++bmin;
108 if(bmin == (int)b.Exons().size()) return false;
109
110 int bmax = (int)b.Exons().size()-1;
111 while(bmax >=0 && b.Exons()[bmax].GetFrom() > mutual_max) --bmax;
112 if(bmax < 0) return false;
113
114 if(amax-amin != bmax-bmin) return false;
115
116 // head-to-tail overlap
117 if (amin != 0 || size_t(amax) != a.Exons().size()-1 || bmin != 0 || size_t(bmax) != b.Exons().size()-1)
118 return false;
119
120 for( ; amin <= amax; ++amin, ++bmin) {
121 if(abs(max(mutual_min,a.Exons()[amin].GetFrom())-max(mutual_min,b.Exons()[bmin].GetFrom())) >= tolerance)
122 return false;
123 if(abs(min(mutual_max,a.Exons()[amin].GetTo())-min(mutual_max,b.Exons()[bmin].GetTo())) >= tolerance)
124 return false;
125 }
126
127 return true;
128 }
129
130
BadOverlapTest(const CGeneModel & a,const CGeneModel & b)131 bool CModelCompare::BadOverlapTest(const CGeneModel& a, const CGeneModel& b) { // returns true for bad overlap
132 if((!a.TrustedmRNA().empty() || !a.TrustedProt().empty()) && (!b.TrustedmRNA().empty() || !b.TrustedProt().empty()))
133 return false;
134 // else if(a.Limits().IntersectingWith(b.Limits()) && (a.Exons().size() == 1 || b.Exons().size() == 1))
135 // return true;
136 else
137 return CountCommonSplices(a,b) > 0;
138 }
139
140
RangeNestedInIntron(TSignedSeqRange r,const CGeneModel & algn,bool check_in_holes)141 bool CModelCompare::RangeNestedInIntron(TSignedSeqRange r, const CGeneModel& algn, bool check_in_holes) {
142 for(int i = 1; i < (int)algn.Exons().size(); ++i) {
143 if(check_in_holes || (algn.Exons()[i-1].m_ssplice && algn.Exons()[i].m_fsplice)) {
144 TSignedSeqRange intron(algn.Exons()[i-1].GetTo()+1,algn.Exons()[i].GetFrom()-1);
145 if(Include(intron, r))
146 return true;
147 }
148 }
149 return false;
150 }
151
152
HaveCommonExonOrIntron(const CGeneModel & a,const CGeneModel & b)153 bool CModelCompare::HaveCommonExonOrIntron(const CGeneModel& a, const CGeneModel& b) {
154 if(a.Strand() != b.Strand() || !a.IntersectingWith(b)) return false;
155
156 for(unsigned int i = 0; i < a.Exons().size(); ++i) {
157 for(unsigned int j = 0; j < b.Exons().size(); ++j) {
158 if(a.Exons()[i] == b.Exons()[j])
159 return true;
160 }
161 }
162
163 for(unsigned int i = 1; i < a.Exons().size(); ++i) {
164 TSignedSeqRange introna(a.Exons()[i-1].GetTo()+1,a.Exons()[i].GetFrom()-1);
165 for(unsigned int j = 1; j < b.Exons().size(); ++j) {
166 TSignedSeqRange intronb(b.Exons()[j-1].GetTo()+1,b.Exons()[j].GetFrom()-1);
167 if(introna == intronb)
168 return true;
169 }
170 }
171
172 return false;
173 }
174
175
FilterGenes(TGeneModelList & chains,TGeneModelList & bad_aligns,TGeneModelList & dest)176 void CGeneSelector::FilterGenes(TGeneModelList& chains, TGeneModelList& bad_aligns,
177 TGeneModelList& dest)
178 {
179 NON_CONST_ITERATE(TGeneModelList, im, chains) {
180 CGeneModel& model = *im;
181 CCDSInfo cds_info = model.GetCdsInfo();
182 if(cds_info.ReadingFrame().Empty())
183 continue;
184
185 if(cds_info.IsMappedToGenome())
186 cds_info = cds_info.MapFromOrigToEdited(model.GetAlignMap());
187 TSignedSeqRange cds = cds_info.Cds();
188
189 bool gapfilled = false;
190 int genome_cds = 0;
191 for(int ie = 0; ie < (int)model.Exons().size(); ++ie) {
192 if(model.Exons()[ie].Limits().Empty())
193 gapfilled = true;
194 else
195 genome_cds += (cds&model.TranscriptExon(ie)).GetLength();
196 }
197
198 if(gapfilled && genome_cds < 45) {
199 model.Status() |= CGeneModel::eSkipped;
200 model.AddComment("Most CDS in genomic gap");
201 }
202 }
203
204 ITERATE(TGeneModelList, it, chains) {
205 if(it->Status()&CGeneModel::eSkipped) {
206 bad_aligns.push_back(*it);
207 } else {
208 dest.push_back(*it);
209 }
210 }
211 }
212
213
FilterGenes(TGeneModelList & chains,TGeneModelList & bad_aligns)214 TGeneModelList CGeneSelector::FilterGenes(TGeneModelList& chains, TGeneModelList& bad_aligns)
215 {
216 TGeneModelList models;
217 FilterGenes(chains, bad_aligns, models);
218 return models;
219 }
220
221 END_SCOPE(gnomon)
222 END_NCBI_SCOPE
223