1 #include <ncbi_pch.hpp>
2 
3 #include <corelib/ncbistd.hpp>
4 #include <corelib/ncbiobj.hpp>
5 
6 #include <objtools/validator/feature_match.hpp>
7 #include <objmgr/mapped_feat.hpp>
8 
9 #include <objects/seq/Bioseq.hpp>
10 
11 //#include <objmgr/bioseq_handle.hpp>
12 
13 #include <objects/seqfeat/seqfeat_macros.hpp>
14 #include <objmgr/util/sequence.hpp>
15 #include <objmgr/util/feature.hpp>
16 
17 #include <objtools/validator/utilities.hpp>
18 
19 
20 BEGIN_NCBI_SCOPE
21 BEGIN_SCOPE(objects)
22 
23 //using namespace sequence;
24 //LCOV_EXCL_START
25 //not currently using
26 bool CMatchFeat::operator < (const CMatchFeat& o) const
27 {
28     const CMatchFeat& l = *this;
29     const CMatchFeat& r = o;
30 
31     const CSeq_feat& lf = l.GetFeat();
32     const CSeq_feat& rf = r.GetFeat();
33 
34     if (l.m_pos_start != r.m_pos_start)
35         return l.m_pos_start < r.m_pos_start;
36 
37     if (l.m_pos_stop != r.m_pos_stop)
38         return l.m_pos_stop < r.m_pos_stop;
39 
40     if (lf.Compare(rf) < 0)
41         return true;
42 
43     return false;
44 }
45 
46 struct feat_loc_compare
47 {
48     template<class _T>
operator ()feat_loc_compare49     bool operator()(const CRef<_T>& l, const CRef<_T>& r) const
50     {
51         return *l < *r;
52     }
53 };
54 
55 template<typename TFeatList>
s_SetUpXrefPairs(vector<CRef<CMatchCDS>> & cds_list,vector<CRef<CMatchmRNA>> & mrna_list,const TFeatList & feat_list,CScope * scope,ENoteCDS eNoteCDS)56 void s_SetUpXrefPairs(
57     vector < CRef<CMatchCDS> > & cds_list,
58     vector < CRef<CMatchmRNA> > & mrna_list,
59     const TFeatList & feat_list, CScope* scope, ENoteCDS eNoteCDS)
60 {
61     // predict the size of the final arrays
62     cds_list.reserve(feat_list.size() / 2);
63     mrna_list.reserve(feat_list.size() / 2);
64 
65     // fill in cds_list, mrna_list
66     for (auto& feat_it : feat_list) {
67         const CSeq_feat& feature = feat_it.GetOriginalFeature();
68         if (feature.IsSetData()) {
69             if (feature.GetData().IsCdregion()) {
70                 cds_list.push_back(Ref(new CMatchCDS(feat_it)));
71             }
72             else
73                 if (feature.GetData().IsRna() &&
74                     feature.GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA) {
75                     mrna_list.push_back(Ref(new CMatchmRNA(feat_it)));
76                 }
77         }
78     }
79 
80     // sort them
81     sort(cds_list.begin(), cds_list.end(), feat_loc_compare());
82     sort(mrna_list.begin(), mrna_list.end(), feat_loc_compare());
83 
84     if (cds_list.empty() || mrna_list.empty())
85         return;
86 
87     auto mrna_begin = mrna_list.begin();
88 
89     size_t compare_attempts = 0;
90     for (auto& cds_it : cds_list)
91     {
92         CMatchCDS& cds_match = *cds_it;
93         const CSeq_feat& cds = cds_match.GetFeat();
94 
95         sequence::EOverlapType overlap_type = sequence::eOverlap_CheckIntRev;
96         if (cds.IsSetExcept_text()
97             && (NStr::FindNoCase(cds.GetExcept_text(), "ribosomal slippage") != string::npos
98             || NStr::FindNoCase(cds.GetExcept_text(), "trans-splicing") != string::npos)) {
99             overlap_type = sequence::eOverlap_SubsetRev;
100         }
101 
102 
103         while (mrna_begin != mrna_list.end() &&
104             (**mrna_begin).maxpos() < cds_match.minpos())
105             mrna_begin++;
106 
107         for (auto mrna_it = mrna_begin;
108             mrna_it != mrna_list.end() &&
109             (**mrna_it).minpos() <= cds_match.maxpos() &&
110             cds_match.minpos() <= (**mrna_it).maxpos();
111         ++mrna_it)
112         {
113             CMatchmRNA& mrna_match = **mrna_it;
114             const CSeq_feat& mrna = mrna_match.GetFeat();
115             if (!mrna_match.IsAccountedFor()) {
116                 compare_attempts++;
117                 if (TestForOverlapEx(cds.GetLocation(), mrna.GetLocation(), overlap_type, scope) >= 0) {
118                     cds_match.AddmRNA(mrna_match);
119                     if (eNoteCDS == eNoteCDS_No) {
120                         mrna_match.AddCDS(cds_match);
121                     }
122                     if (validator::s_IdXrefsAreReciprocal(cds, mrna)) {
123                         cds_match.SetXrefMatch(mrna_match);
124                         if (eNoteCDS == eNoteCDS_No) {
125                             mrna_match.SetCDS(cds);
126                         }
127                         else {
128                             mrna_match.SetAccountedFor(true);
129                         }
130                         mrna_match.SetCDS(cds);
131                     }
132                 }
133                 else
134                 {
135                     //cout << "problem" << endl;
136                 }
137             }
138         }
139 
140         if (!cds_match.HasmRNA()) {
141             cds_match.AssignSinglemRNA();
142         }
143     }
144     // cout << "Totals CDS, mRNAs, compare attempts:" << cds_list.size() << " " << mrna_list.size() << " " << compare_attempts << endl;
145 }
146 
CMatchFeat(const CMappedFeat & feat)147 CMatchFeat::CMatchFeat(const CMappedFeat &feat) : m_feat(feat.GetSeq_feat())
148 {
149     m_pos_start = m_feat->GetLocation().GetStart(eExtreme_Positional);
150     m_pos_stop = m_feat->GetLocation().GetStop(eExtreme_Positional);
151 }
152 
153 
HasCDSMatch(void)154 bool CMatchmRNA::HasCDSMatch(void)
155 {
156     bool rval = false;
157 
158     if (m_Cds) {
159         return true;
160     }
161     else {
162         // iterate through underlying cdss that aren't accounted for
163         vector < CRef<CMatchCDS> >::iterator cds_it =
164             m_UnderlyingCDSs.begin();
165         for (; cds_it != m_UnderlyingCDSs.end() && !rval; ++cds_it) {
166             if (!(*cds_it)->HasmRNA()) {
167                 return true;
168             }
169         }
170     }
171     return false;
172 }
173 
174 
MatchesUnderlyingCDS(unsigned int partial_type) const175 bool CMatchmRNA::MatchesUnderlyingCDS(unsigned int partial_type) const
176 {
177     bool rval = false;
178 
179     TSeqPos mrna_start = m_feat->GetLocation().GetStart(eExtreme_Biological);
180     TSeqPos mrna_stop = m_feat->GetLocation().GetStop(eExtreme_Biological);
181 
182     if (m_Cds) {
183         if (partial_type == sequence::eSeqlocPartial_Nostart) {
184             if (m_Cds->GetLocation().GetStart(eExtreme_Biological) == mrna_start) {
185                 rval = true;
186             }
187             else {
188                 rval = false;
189             }
190         }
191         else if (partial_type == sequence::eSeqlocPartial_Nostop) {
192             if (m_Cds->GetLocation().GetStop(eExtreme_Biological) == mrna_stop) {
193                 rval = true;
194             }
195             else {
196                 rval = false;
197             }
198         }
199 #if 0
200     }
201     else {
202         // iterate through underlying cdss that aren't accounted for
203         vector < CRef<CMatchCDS> >::iterator cds_it =
204             m_UnderlyingCDSs.begin();
205         while (cds_it != m_UnderlyingCDSs.end() && !rval) {
206             if (!(*cds_it)->HasmRNA()) {
207                 if (partial_type == eSeqlocPartial_Nostart) {
208                     if ((*cds_it)->m_Cds->GetLocation().GetStart(eExtreme_Biological) == mrna_start) {
209                         rval = true;
210                     }
211                     else {
212                         rval = false;
213                     }
214                 }
215                 else if (partial_type == eSeqlocPartial_Nostop) {
216                     if ((*cds_it)->m_Cds->GetLocation().GetStop(eExtreme_Biological) == mrna_stop) {
217                         rval = true;
218                     }
219                     else {
220                         rval = false;
221                     }
222                 }
223             }
224             ++cds_it;
225         }
226 #endif
227     }
228     return rval;
229 }
230 
231 
MatchAnyUnderlyingCDS(unsigned int partial_type) const232 bool CMatchmRNA::MatchAnyUnderlyingCDS(unsigned int partial_type) const
233 {
234     bool rval = false;
235 
236     TSeqPos mrna_start = m_feat->GetLocation().GetStart(eExtreme_Biological);
237     TSeqPos mrna_stop = m_feat->GetLocation().GetStop(eExtreme_Biological);
238 
239     // iterate through underlying cdss that aren't accounted for
240     vector < CRef<CMatchCDS> >::const_iterator cds_it = m_UnderlyingCDSs.begin();
241     while (cds_it != m_UnderlyingCDSs.end() && !rval) {
242         if (partial_type == sequence::eSeqlocPartial_Nostart) {
243             if ((*cds_it)->GetFeat().GetLocation().GetStart(eExtreme_Biological) == mrna_start) {
244                 rval = true;
245             }
246             else {
247                 rval = false;
248             }
249         }
250         else if (partial_type == sequence::eSeqlocPartial_Nostop) {
251             if ((*cds_it)->GetFeat().GetLocation().GetStop(eExtreme_Biological) == mrna_stop) {
252                 rval = true;
253             }
254             else {
255                 rval = false;
256             }
257         }
258         ++cds_it;
259     }
260     return rval;
261 }
262 
263 
264 // only assign if the coding region has only one overlapping unaccounted for mRNA
AssignSinglemRNA(void)265 void CMatchCDS::AssignSinglemRNA(void)
266 {
267     CRef<CMatchmRNA> match;
268 
269     vector < CRef<CMatchmRNA> >::iterator mrna_it =
270         m_OverlappingmRNAs.begin();
271     for (; mrna_it != m_OverlappingmRNAs.end(); ++mrna_it) {
272         if (!(*mrna_it)->IsAccountedFor()) {
273             if (match == NULL) {
274                 match.Reset(*mrna_it);
275             }
276             else {
277                 // found more than one, can't assign either
278                 match.Reset();
279                 break;
280             }
281         }
282 
283     }
284     if (match != NULL) {
285         m_AssignedMrna = match;
286         match->SetCDS(*m_feat);
287     }
288 }
289 
290 
GetNummRNA(bool & loc_unique)291 int CMatchCDS::GetNummRNA(bool &loc_unique)
292 {
293     int num = 0;
294     loc_unique = true;
295 
296     if (HasmRNA()) {
297         // count the one assigned
298         num++;
299     }
300     vector < CRef<CMatchmRNA> >::iterator mrna_it =
301         m_OverlappingmRNAs.begin();
302     vector < string > product_list;
303     while (mrna_it != m_OverlappingmRNAs.end()) {
304         if (!(*mrna_it)->IsAccountedFor()) {
305             // count overlapping unassigned mRNAS
306             if ((*mrna_it)->GetFeat().IsSetProduct()) {
307                 string label;
308                 (*mrna_it)->GetFeat().GetProduct().GetLabel(&label);
309                 product_list.push_back(label);
310             }
311             num++;
312         }
313         ++mrna_it;
314     }
315     if (product_list.size() > 1) {
316         stable_sort(product_list.begin(), product_list.end());
317         vector < string >::iterator s1 = product_list.begin();
318         vector < string >::iterator s2 = s1;
319         s2++;
320         while (s2 != product_list.end()) {
321             if (NStr::Equal(*s1, *s2)) {
322                 loc_unique = false;
323                 break;
324             }
325             ++s1;
326             ++s2;
327         }
328     }
329     return num;
330 }
331 
CmRNAAndCDSIndex()332 CmRNAAndCDSIndex::CmRNAAndCDSIndex()
333 {
334     // nothing needed yet
335 }
336 
337 
~CmRNAAndCDSIndex(void)338 CmRNAAndCDSIndex::~CmRNAAndCDSIndex(void)
339 {
340     // nothing needed yet
341 }
342 
343 
SetBioseq(const std::vector<CMappedFeat> * feat_list,CScope * scope)344 void CmRNAAndCDSIndex::SetBioseq(const std::vector<CMappedFeat> *feat_list, CScope* scope)
345 {
346     m_CdsList.clear();
347     m_mRNAList.clear();
348 
349     if (!feat_list) {
350         return;
351     }
352 
353     s_SetUpXrefPairs(
354         m_CdsList, m_mRNAList, *feat_list, scope, eNoteCDS_Yes);
355 }
356 
357 
FindMatchmRNA(const CMappedFeat & mrna)358 CRef<CMatchmRNA> CmRNAAndCDSIndex::FindMatchmRNA(const CMappedFeat& mrna)
359 {
360     vector < CRef<CMatchmRNA> >::iterator mrna_it = m_mRNAList.begin();
361     for (; mrna_it != m_mRNAList.end(); ++mrna_it) {
362         if (mrna.GetOriginalFeature().Equals((*mrna_it)->GetFeat())) {
363             return (*mrna_it);
364         }
365     }
366     return CRef<CMatchmRNA>();
367 }
368 
369 
MatchmRNAToCDSEnd(const CMappedFeat & mrna,unsigned int partial_type)370 bool CmRNAAndCDSIndex::MatchmRNAToCDSEnd(const CMappedFeat& mrna, unsigned int partial_type)
371 {
372     bool rval = false;
373 
374     CRef<CMatchmRNA> match_mrna = FindMatchmRNA(mrna);
375     if (match_mrna && match_mrna->MatchesUnderlyingCDS(partial_type)) {
376         rval = true;
377     }
378     return rval;
379 }
380 
381 //LCOV_EXCL_STOP
382 
383 
384 
385 
386 END_SCOPE(objects)
387 END_NCBI_SCOPE
388