1 #include <ncbi_pch.hpp>
2
3 #include <corelib/ncbistd.hpp>
4 #include <corelib/ncbiobj.hpp>
5
6 #include <objtools/validator/feature_match.hpp>
7 #include <objmgr/mapped_feat.hpp>
8
9 #include <objects/seq/Bioseq.hpp>
10
11 //#include <objmgr/bioseq_handle.hpp>
12
13 #include <objects/seqfeat/seqfeat_macros.hpp>
14 #include <objmgr/util/sequence.hpp>
15 #include <objmgr/util/feature.hpp>
16
17 #include <objtools/validator/utilities.hpp>
18
19
20 BEGIN_NCBI_SCOPE
21 BEGIN_SCOPE(objects)
22
23 //using namespace sequence;
24 //LCOV_EXCL_START
25 //not currently using
26 bool CMatchFeat::operator < (const CMatchFeat& o) const
27 {
28 const CMatchFeat& l = *this;
29 const CMatchFeat& r = o;
30
31 const CSeq_feat& lf = l.GetFeat();
32 const CSeq_feat& rf = r.GetFeat();
33
34 if (l.m_pos_start != r.m_pos_start)
35 return l.m_pos_start < r.m_pos_start;
36
37 if (l.m_pos_stop != r.m_pos_stop)
38 return l.m_pos_stop < r.m_pos_stop;
39
40 if (lf.Compare(rf) < 0)
41 return true;
42
43 return false;
44 }
45
46 struct feat_loc_compare
47 {
48 template<class _T>
operator ()feat_loc_compare49 bool operator()(const CRef<_T>& l, const CRef<_T>& r) const
50 {
51 return *l < *r;
52 }
53 };
54
55 template<typename TFeatList>
s_SetUpXrefPairs(vector<CRef<CMatchCDS>> & cds_list,vector<CRef<CMatchmRNA>> & mrna_list,const TFeatList & feat_list,CScope * scope,ENoteCDS eNoteCDS)56 void s_SetUpXrefPairs(
57 vector < CRef<CMatchCDS> > & cds_list,
58 vector < CRef<CMatchmRNA> > & mrna_list,
59 const TFeatList & feat_list, CScope* scope, ENoteCDS eNoteCDS)
60 {
61 // predict the size of the final arrays
62 cds_list.reserve(feat_list.size() / 2);
63 mrna_list.reserve(feat_list.size() / 2);
64
65 // fill in cds_list, mrna_list
66 for (auto& feat_it : feat_list) {
67 const CSeq_feat& feature = feat_it.GetOriginalFeature();
68 if (feature.IsSetData()) {
69 if (feature.GetData().IsCdregion()) {
70 cds_list.push_back(Ref(new CMatchCDS(feat_it)));
71 }
72 else
73 if (feature.GetData().IsRna() &&
74 feature.GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA) {
75 mrna_list.push_back(Ref(new CMatchmRNA(feat_it)));
76 }
77 }
78 }
79
80 // sort them
81 sort(cds_list.begin(), cds_list.end(), feat_loc_compare());
82 sort(mrna_list.begin(), mrna_list.end(), feat_loc_compare());
83
84 if (cds_list.empty() || mrna_list.empty())
85 return;
86
87 auto mrna_begin = mrna_list.begin();
88
89 size_t compare_attempts = 0;
90 for (auto& cds_it : cds_list)
91 {
92 CMatchCDS& cds_match = *cds_it;
93 const CSeq_feat& cds = cds_match.GetFeat();
94
95 sequence::EOverlapType overlap_type = sequence::eOverlap_CheckIntRev;
96 if (cds.IsSetExcept_text()
97 && (NStr::FindNoCase(cds.GetExcept_text(), "ribosomal slippage") != string::npos
98 || NStr::FindNoCase(cds.GetExcept_text(), "trans-splicing") != string::npos)) {
99 overlap_type = sequence::eOverlap_SubsetRev;
100 }
101
102
103 while (mrna_begin != mrna_list.end() &&
104 (**mrna_begin).maxpos() < cds_match.minpos())
105 mrna_begin++;
106
107 for (auto mrna_it = mrna_begin;
108 mrna_it != mrna_list.end() &&
109 (**mrna_it).minpos() <= cds_match.maxpos() &&
110 cds_match.minpos() <= (**mrna_it).maxpos();
111 ++mrna_it)
112 {
113 CMatchmRNA& mrna_match = **mrna_it;
114 const CSeq_feat& mrna = mrna_match.GetFeat();
115 if (!mrna_match.IsAccountedFor()) {
116 compare_attempts++;
117 if (TestForOverlapEx(cds.GetLocation(), mrna.GetLocation(), overlap_type, scope) >= 0) {
118 cds_match.AddmRNA(mrna_match);
119 if (eNoteCDS == eNoteCDS_No) {
120 mrna_match.AddCDS(cds_match);
121 }
122 if (validator::s_IdXrefsAreReciprocal(cds, mrna)) {
123 cds_match.SetXrefMatch(mrna_match);
124 if (eNoteCDS == eNoteCDS_No) {
125 mrna_match.SetCDS(cds);
126 }
127 else {
128 mrna_match.SetAccountedFor(true);
129 }
130 mrna_match.SetCDS(cds);
131 }
132 }
133 else
134 {
135 //cout << "problem" << endl;
136 }
137 }
138 }
139
140 if (!cds_match.HasmRNA()) {
141 cds_match.AssignSinglemRNA();
142 }
143 }
144 // cout << "Totals CDS, mRNAs, compare attempts:" << cds_list.size() << " " << mrna_list.size() << " " << compare_attempts << endl;
145 }
146
CMatchFeat(const CMappedFeat & feat)147 CMatchFeat::CMatchFeat(const CMappedFeat &feat) : m_feat(feat.GetSeq_feat())
148 {
149 m_pos_start = m_feat->GetLocation().GetStart(eExtreme_Positional);
150 m_pos_stop = m_feat->GetLocation().GetStop(eExtreme_Positional);
151 }
152
153
HasCDSMatch(void)154 bool CMatchmRNA::HasCDSMatch(void)
155 {
156 bool rval = false;
157
158 if (m_Cds) {
159 return true;
160 }
161 else {
162 // iterate through underlying cdss that aren't accounted for
163 vector < CRef<CMatchCDS> >::iterator cds_it =
164 m_UnderlyingCDSs.begin();
165 for (; cds_it != m_UnderlyingCDSs.end() && !rval; ++cds_it) {
166 if (!(*cds_it)->HasmRNA()) {
167 return true;
168 }
169 }
170 }
171 return false;
172 }
173
174
MatchesUnderlyingCDS(unsigned int partial_type) const175 bool CMatchmRNA::MatchesUnderlyingCDS(unsigned int partial_type) const
176 {
177 bool rval = false;
178
179 TSeqPos mrna_start = m_feat->GetLocation().GetStart(eExtreme_Biological);
180 TSeqPos mrna_stop = m_feat->GetLocation().GetStop(eExtreme_Biological);
181
182 if (m_Cds) {
183 if (partial_type == sequence::eSeqlocPartial_Nostart) {
184 if (m_Cds->GetLocation().GetStart(eExtreme_Biological) == mrna_start) {
185 rval = true;
186 }
187 else {
188 rval = false;
189 }
190 }
191 else if (partial_type == sequence::eSeqlocPartial_Nostop) {
192 if (m_Cds->GetLocation().GetStop(eExtreme_Biological) == mrna_stop) {
193 rval = true;
194 }
195 else {
196 rval = false;
197 }
198 }
199 #if 0
200 }
201 else {
202 // iterate through underlying cdss that aren't accounted for
203 vector < CRef<CMatchCDS> >::iterator cds_it =
204 m_UnderlyingCDSs.begin();
205 while (cds_it != m_UnderlyingCDSs.end() && !rval) {
206 if (!(*cds_it)->HasmRNA()) {
207 if (partial_type == eSeqlocPartial_Nostart) {
208 if ((*cds_it)->m_Cds->GetLocation().GetStart(eExtreme_Biological) == mrna_start) {
209 rval = true;
210 }
211 else {
212 rval = false;
213 }
214 }
215 else if (partial_type == eSeqlocPartial_Nostop) {
216 if ((*cds_it)->m_Cds->GetLocation().GetStop(eExtreme_Biological) == mrna_stop) {
217 rval = true;
218 }
219 else {
220 rval = false;
221 }
222 }
223 }
224 ++cds_it;
225 }
226 #endif
227 }
228 return rval;
229 }
230
231
MatchAnyUnderlyingCDS(unsigned int partial_type) const232 bool CMatchmRNA::MatchAnyUnderlyingCDS(unsigned int partial_type) const
233 {
234 bool rval = false;
235
236 TSeqPos mrna_start = m_feat->GetLocation().GetStart(eExtreme_Biological);
237 TSeqPos mrna_stop = m_feat->GetLocation().GetStop(eExtreme_Biological);
238
239 // iterate through underlying cdss that aren't accounted for
240 vector < CRef<CMatchCDS> >::const_iterator cds_it = m_UnderlyingCDSs.begin();
241 while (cds_it != m_UnderlyingCDSs.end() && !rval) {
242 if (partial_type == sequence::eSeqlocPartial_Nostart) {
243 if ((*cds_it)->GetFeat().GetLocation().GetStart(eExtreme_Biological) == mrna_start) {
244 rval = true;
245 }
246 else {
247 rval = false;
248 }
249 }
250 else if (partial_type == sequence::eSeqlocPartial_Nostop) {
251 if ((*cds_it)->GetFeat().GetLocation().GetStop(eExtreme_Biological) == mrna_stop) {
252 rval = true;
253 }
254 else {
255 rval = false;
256 }
257 }
258 ++cds_it;
259 }
260 return rval;
261 }
262
263
264 // only assign if the coding region has only one overlapping unaccounted for mRNA
AssignSinglemRNA(void)265 void CMatchCDS::AssignSinglemRNA(void)
266 {
267 CRef<CMatchmRNA> match;
268
269 vector < CRef<CMatchmRNA> >::iterator mrna_it =
270 m_OverlappingmRNAs.begin();
271 for (; mrna_it != m_OverlappingmRNAs.end(); ++mrna_it) {
272 if (!(*mrna_it)->IsAccountedFor()) {
273 if (match == NULL) {
274 match.Reset(*mrna_it);
275 }
276 else {
277 // found more than one, can't assign either
278 match.Reset();
279 break;
280 }
281 }
282
283 }
284 if (match != NULL) {
285 m_AssignedMrna = match;
286 match->SetCDS(*m_feat);
287 }
288 }
289
290
GetNummRNA(bool & loc_unique)291 int CMatchCDS::GetNummRNA(bool &loc_unique)
292 {
293 int num = 0;
294 loc_unique = true;
295
296 if (HasmRNA()) {
297 // count the one assigned
298 num++;
299 }
300 vector < CRef<CMatchmRNA> >::iterator mrna_it =
301 m_OverlappingmRNAs.begin();
302 vector < string > product_list;
303 while (mrna_it != m_OverlappingmRNAs.end()) {
304 if (!(*mrna_it)->IsAccountedFor()) {
305 // count overlapping unassigned mRNAS
306 if ((*mrna_it)->GetFeat().IsSetProduct()) {
307 string label;
308 (*mrna_it)->GetFeat().GetProduct().GetLabel(&label);
309 product_list.push_back(label);
310 }
311 num++;
312 }
313 ++mrna_it;
314 }
315 if (product_list.size() > 1) {
316 stable_sort(product_list.begin(), product_list.end());
317 vector < string >::iterator s1 = product_list.begin();
318 vector < string >::iterator s2 = s1;
319 s2++;
320 while (s2 != product_list.end()) {
321 if (NStr::Equal(*s1, *s2)) {
322 loc_unique = false;
323 break;
324 }
325 ++s1;
326 ++s2;
327 }
328 }
329 return num;
330 }
331
CmRNAAndCDSIndex()332 CmRNAAndCDSIndex::CmRNAAndCDSIndex()
333 {
334 // nothing needed yet
335 }
336
337
~CmRNAAndCDSIndex(void)338 CmRNAAndCDSIndex::~CmRNAAndCDSIndex(void)
339 {
340 // nothing needed yet
341 }
342
343
SetBioseq(const std::vector<CMappedFeat> * feat_list,CScope * scope)344 void CmRNAAndCDSIndex::SetBioseq(const std::vector<CMappedFeat> *feat_list, CScope* scope)
345 {
346 m_CdsList.clear();
347 m_mRNAList.clear();
348
349 if (!feat_list) {
350 return;
351 }
352
353 s_SetUpXrefPairs(
354 m_CdsList, m_mRNAList, *feat_list, scope, eNoteCDS_Yes);
355 }
356
357
FindMatchmRNA(const CMappedFeat & mrna)358 CRef<CMatchmRNA> CmRNAAndCDSIndex::FindMatchmRNA(const CMappedFeat& mrna)
359 {
360 vector < CRef<CMatchmRNA> >::iterator mrna_it = m_mRNAList.begin();
361 for (; mrna_it != m_mRNAList.end(); ++mrna_it) {
362 if (mrna.GetOriginalFeature().Equals((*mrna_it)->GetFeat())) {
363 return (*mrna_it);
364 }
365 }
366 return CRef<CMatchmRNA>();
367 }
368
369
MatchmRNAToCDSEnd(const CMappedFeat & mrna,unsigned int partial_type)370 bool CmRNAAndCDSIndex::MatchmRNAToCDSEnd(const CMappedFeat& mrna, unsigned int partial_type)
371 {
372 bool rval = false;
373
374 CRef<CMatchmRNA> match_mrna = FindMatchmRNA(mrna);
375 if (match_mrna && match_mrna->MatchesUnderlyingCDS(partial_type)) {
376 rval = true;
377 }
378 return rval;
379 }
380
381 //LCOV_EXCL_STOP
382
383
384
385
386 END_SCOPE(objects)
387 END_NCBI_SCOPE
388