1 /*  $Id: internal_stops.cpp 627020 2021-03-08 17:25:43Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Vyacheslav Chetvernin
27  *
28  * File Description:
29  *
30  */
31 #include <ncbi_pch.hpp>
32 
33 #include <algo/sequence/internal_stops.hpp>
34 #include "feature_generator.hpp"
35 
36 #include <objects/seqfeat/Genetic_code.hpp>
37 #include <objects/seqalign/Spliced_seg.hpp>
38 #include <objects/seqalign/Product_pos.hpp>
39 #include <objects/seqalign/Spliced_exon_chunk.hpp>
40 #include <objects/seqfeat/Org_ref.hpp>
41 #include <objects/seqfeat/Genetic_code_table.hpp>
42 #include <objmgr/scope.hpp>
43 #include <objmgr/seq_vector.hpp>
44 #include <objmgr/util/sequence.hpp>
45 
46 BEGIN_NCBI_SCOPE
47 
CInternalStopFinder(CScope & a_scope)48 CInternalStopFinder::CInternalStopFinder(CScope& a_scope)
49     : scope(a_scope), generator(a_scope)
50 {
51     generator.SetFlags(CFeatureGenerator::fTrimEnds);
52     generator.SetAllowedUnaligned(10);
53 }
54 
55 typedef map<TSeqRange, string> TStarts;
56 
FindStartsStops(const CSeq_align & align,int padding)57 pair<set<TSeqPos>, set<TSeqPos> > CInternalStopFinder::FindStartsStops(const CSeq_align& align, int padding)
58 {
59     pair<TStarts, set<TSeqRange> > start_stop_ranges = FindStartStopRanges(align, padding);
60     set<TSeqPos> starts, stops;
61     ITERATE (TStarts, r, start_stop_ranges.first) {
62         starts.insert(r->first.GetFrom());
63     }
64     ITERATE (set<TSeqRange>, r, start_stop_ranges.second) {
65         stops.insert(r->GetFrom());
66     }
67     return make_pair(starts, stops);
68 }
FindStartStopRanges(const CSeq_align & align,int padding,set<TSignedSeqRange> * gaps)69 pair<TStarts, set<TSeqRange> > CInternalStopFinder::FindStartStopRanges(const CSeq_align& align, int padding,
70                                                                         set<TSignedSeqRange>* gaps)
71 {
72     CBioseq_Handle bsh = scope.GetBioseqHandle(align.GetSeq_id(1));
73     int genomic_length = bsh.GetBioseqLength();
74 
75     CConstRef<CSeq_align> clean_align;
76     pair<bool, bool> trim_by_contig(false, false);
77     {{
78         CConstRef<CSeq_align> padded_align(&align);
79         if (padding > 0) {
80             CRef<CSeq_loc> loc = align.CreateRowSeq_loc(1);
81             int start = loc->GetStart(eExtreme_Positional);
82             int stop = loc->GetStop(eExtreme_Positional);
83 
84             bool is_circular = (bsh.GetInst_Topology() == CSeq_inst::eTopology_circular);
85 
86             if (is_circular) {
87                 //prevent self overlap
88                 padding = min(padding, ((stop > start ? genomic_length : 0) - (stop - start +1))/2);
89             }
90 
91             start -= padding;
92             stop += padding;
93 
94             if (start <= 2 && !is_circular) {
95                 trim_by_contig.first = true;
96             }
97             if (stop >= genomic_length-3 && !is_circular) {
98                 trim_by_contig.second = true;
99             }
100 
101             if (start < 0) {
102                 start = is_circular ? start + genomic_length : 0;
103             }
104             if (stop >= genomic_length) {
105                 stop = is_circular ? stop - genomic_length : genomic_length-1;
106             }
107             padded_align = generator.AdjustAlignment(align, TSeqRange(start, stop));
108             //cerr << MSerial_AsnText << *padded_align;
109         }
110 
111         clean_align = generator.CleanAlignment(*padded_align);
112         //cerr << MSerial_AsnText << *clean_align;
113     }}
114 
115     CSeq_loc_Mapper mapper(*clean_align, 1);
116 
117     CRef<CSeq_loc> query_loc(new CSeq_loc);
118     const CSpliced_seg& spl = clean_align->GetSegs().GetSpliced();
119     query_loc->SetInt(*spl.GetExons().front()->CreateRowSeq_interval(0, spl));
120 
121     const bool is_protein = (spl.GetProduct_type() == CSpliced_seg::eProduct_type_protein);
122 
123     string seq = GetCDSNucleotideSequence(*clean_align);
124     if (seq.size()%3 != 0) {
125         cerr << MSerial_AsnText << align << endl;
126         _ASSERT(seq.size()%3 == 0);
127         NCBI_USER_THROW("CDSNucleotideSequence not divisible by 3");
128     }
129 
130     int gcode = fg::GetGeneticCode(bsh);
131 
132     CRef<CGenetic_code::C_E> c_e(new CGenetic_code::C_E); c_e->SetId(gcode);
133     CRef<CGenetic_code> code(new CGenetic_code);
134     code->Set().push_back(c_e);
135 
136     const CTrans_table & tbl = CGen_code_table::GetTransTable(*code);
137     const size_t kUnknownState = tbl.SetCodonState('N', 'N', 'N');
138 
139     TStarts starts;
140     set<TSeqRange> stops;
141 
142     size_t state = 0;
143     int k = 0;
144     string codon = "NNN";
145 
146     ITERATE(string, s, seq) {
147         state = tbl.NextCodonState(state, *s);
148         codon[k%3] = *s;
149 
150         if (++k%3)
151             continue;
152 
153         if (state == kUnknownState)
154             continue;
155 
156         if (tbl.IsOrfStart(state) || tbl.IsOrfStop(state)) {
157             if (is_protein) {
158                 query_loc->SetInt().SetFrom((k-3)/3);
159                 query_loc->SetInt().SetTo((k-3)/3);
160             } else {
161                 query_loc->SetInt().SetFrom(k-3);
162                 query_loc->SetInt().SetTo(k-1);
163             }
164             TSeqPos mapped_pos = mapper.Map(*query_loc)->GetStart(eExtreme_Biological);
165             if (mapped_pos == kInvalidSeqPos)
166                 continue;
167             TSeqPos mapped_pos2 = mapper.Map(*query_loc)->GetStop(eExtreme_Biological);
168 
169             if (tbl.IsOrfStart(state)) {
170                 starts[TSeqRange(mapped_pos, mapped_pos2)] = codon;
171             }
172             if (tbl.IsOrfStop(state)) {
173                 stops.insert(TSeqRange(mapped_pos, mapped_pos2));
174             }
175         }
176     }
177 
178     if (gaps != nullptr) {
179 
180         CRef<CSeq_loc> region_loc = clean_align->CreateRowSeq_loc(1);
181         if (trim_by_contig.first && region_loc->GetStart(eExtreme_Positional) < 3) {
182             region_loc = region_loc->Merge(CSeq_loc::fMerge_SingleRange, nullptr);
183             region_loc->SetInt().SetFrom(0);
184         }
185         if (trim_by_contig.second && int(region_loc->GetStop(eExtreme_Positional)) > genomic_length -3) {
186             region_loc = region_loc->Merge(CSeq_loc::fMerge_SingleRange, nullptr);
187             region_loc->SetInt().SetTo(genomic_length-1);
188         }
189 //         cerr << MSerial_AsnText << *region_loc;
190 
191         CSeqVector region_vec(*region_loc, scope,
192                                CBioseq_Handle::eCoding_Iupac);
193         string region_seq;
194         region_vec.GetSeqData(region_vec.begin(), region_vec.end(), region_seq);
195 
196         region_seq += 'X'; // to finish last run of Ns
197 
198         int gap_begin = -1;
199         int gap_end = -1;
200         int k = 0;
201 
202         CRef<CSeq_id> id(new CSeq_id);
203         id->Assign(*region_loc->GetId());
204         CRef<CSeq_loc> query_loc(new CSeq_loc(*id, 0, region_vec.size()-1));
205         CSeq_loc_Mapper mapper(*query_loc, *region_loc);
206 
207         for (auto s: region_seq) {
208             if (s == 'N') {
209                 if (gap_end != k) {
210                     gap_begin = k;
211                 }
212                 gap_end = k+1;
213             } else if (gap_end == k) {
214                 query_loc->SetInt().SetFrom(gap_begin);
215                 query_loc->SetInt().SetTo(gap_end-1);
216 
217                 auto mapped_loc = mapper.Map(*query_loc);
218                 TSeqPos mapped_pos = mapped_loc->GetStart(eExtreme_Biological);
219                 TSeqPos mapped_pos2 = mapped_loc->GetStop(eExtreme_Biological);
220                 if (mapped_pos == kInvalidSeqPos || mapped_pos2 == kInvalidSeqPos) {
221                     NCBI_USER_THROW("Cannot map Ns run");
222                 }
223                 gaps->insert(TSignedSeqRange(mapped_pos, mapped_pos2));
224             }
225             ++k;
226         }
227 
228         auto strand = region_loc->GetStrand();
229 
230         if (trim_by_contig.first) {
231             int gap_stop = -1;
232             if (strand != eNa_strand_minus) {
233                 if (!gaps->empty() && gaps->begin()->GetFrom()==0) {
234                     gap_stop = gaps->begin()->GetTo();
235                     gaps->erase(gaps->begin());
236                 }
237                 gaps->insert(TSignedSeqRange(gap_stop -9, gap_stop));
238             } else {
239                 if (!gaps->empty() && gaps->begin()->GetTo()==0) {
240                     gap_stop = gaps->begin()->GetFrom();
241                     gaps->erase(gaps->begin());
242                 }
243                 gaps->insert(TSignedSeqRange(gap_stop, gap_stop -9));
244             }
245         }
246         if (trim_by_contig.second) {
247             int gap_start = genomic_length;
248             if (strand != eNa_strand_minus) {
249                 if (!gaps->empty() && gaps->rbegin()->GetTo()==genomic_length-1) {
250                     gap_start = gaps->rbegin()->GetFrom();
251                     gaps->erase(prev(gaps->end()));
252                 }
253                 gaps->insert(TSignedSeqRange(gap_start, gap_start +9));
254             } else {
255                 if (!gaps->empty() && gaps->rbegin()->GetFrom()==genomic_length-1) {
256                     gap_start = gaps->rbegin()->GetTo();
257                     gaps->erase(prev(gaps->end()));
258                 }
259                 gaps->insert(TSignedSeqRange(gap_start +9, gap_start));
260             }
261         }
262     }
263 
264     return make_pair(starts, stops);
265 }
266 
FindStops(const CSeq_align & align)267 set<TSeqPos> CInternalStopFinder::FindStops(const CSeq_align& align)
268 {
269     return FindStartsStops(align).second;
270 }
271 
HasInternalStops(const CSeq_align & align)272 bool CInternalStopFinder::HasInternalStops(const CSeq_align& align)
273 {
274     return !FindStops(align).empty();
275 }
276 
GetCDSNucleotideSequence(const CSeq_align & align)277 string CInternalStopFinder::GetCDSNucleotideSequence(const CSeq_align& align)
278 {
279     if (!align.GetSegs().IsSpliced()) {
280         NCBI_THROW(CException, eUnknown, "CInternalStopFinder supports Spliced-seg alignments only");
281     }
282 
283     string mRNA;
284 
285     const CSpliced_seg& spliced_seg = align.GetSegs().GetSpliced();
286 
287     int next_prod_start = 0;
288     int cds_len = spliced_seg.GetProduct_length();
289 
290     if (spliced_seg.GetProduct_type() == CSpliced_seg::eProduct_type_transcript) {
291         const CSeq_id& product_id = spliced_seg.GetProduct_id();
292         CMappedFeat cds_on_rna = GetCdsOnMrna(product_id, scope);
293         if (!cds_on_rna) {
294             /// No CDS
295             return kEmptyStr;
296         }
297         if (cds_on_rna.GetLocation().GetStrand() == eNa_strand_minus) {
298             NCBI_THROW(CException, eUnknown, "minus strand cdregion on mrna is not supported");
299         }
300         next_prod_start = cds_on_rna.GetLocation().GetStart(eExtreme_Biological);
301         cds_len = cds_on_rna.GetLocation().GetTotalRange().GetLength();
302 
303         if (!cds_on_rna.GetLocation().IsPartialStop(eExtreme_Biological)) {
304             cds_len -= 3;
305         }
306     } else {
307         cds_len *=3;
308     }
309 
310     ITERATE( CSpliced_seg::TExons, exon, spliced_seg.GetExons() ) {
311 
312         int prod_pos_start = (*exon)->GetProduct_start().AsSeqPos();
313 
314         CRef<CSeq_loc> subject_loc(new CSeq_loc);
315         subject_loc->SetInt(*(*exon)->CreateRowSeq_interval(1, spliced_seg));
316         CSeqVector subject_vec(*subject_loc, scope,
317                                CBioseq_Handle::eCoding_Iupac);
318         string subject_seq;
319         subject_vec.GetSeqData(subject_vec.begin(), subject_vec.end(), subject_seq);
320         int subj_pos = 0;
321 
322         if (next_prod_start < prod_pos_start) {
323             mRNA.append(prod_pos_start - next_prod_start, 'N');
324             next_prod_start = prod_pos_start;
325         }
326 
327         if ((*exon)->IsSetParts()) {
328             ITERATE (CSpliced_exon::TParts, part_it, (*exon)->GetParts()) {
329                 pair<int, int> chunk = ChunkSize(**part_it);
330                 prod_pos_start += chunk.second;
331                 if (chunk.first == 0) {
332                     if (next_prod_start < prod_pos_start) {
333                         mRNA.append(prod_pos_start - next_prod_start, 'N');
334                         next_prod_start = prod_pos_start;
335                     }
336                 } else if (chunk.second > 0) {
337                     if (next_prod_start < prod_pos_start) {
338                         mRNA.append(subject_seq, subj_pos+chunk.second-(prod_pos_start - next_prod_start), prod_pos_start - next_prod_start);
339                         next_prod_start = prod_pos_start;
340                     }
341                 }
342                 subj_pos += chunk.first;
343             }
344         } else {
345             mRNA.append(subject_seq);
346             next_prod_start += subject_seq.size();
347         }
348     }
349 
350     return mRNA.substr(0, cds_len);
351 }
352 
353 
354 // pair(genomic, product)
ChunkSize(const CSpliced_exon_chunk & chunk)355 pair<int, int> ChunkSize(const CSpliced_exon_chunk& chunk)
356 {
357     int len = 0;
358     switch (chunk.Which()) {
359     case CSpliced_exon_chunk::e_Genomic_ins:
360         len = chunk.GetGenomic_ins();
361         return make_pair(len, 0);
362     case CSpliced_exon_chunk::e_Product_ins:
363         len = chunk.GetProduct_ins();
364         return make_pair(0, len);
365     case CSpliced_exon_chunk::e_Match:
366         len = chunk.GetMatch();
367         break;
368     case CSpliced_exon_chunk::e_Mismatch:
369         len = chunk.GetMismatch();
370         break;
371     case CSpliced_exon_chunk::e_Diag:
372         len = chunk.GetDiag();
373         break;
374     default:
375         NCBI_THROW(CException, eUnknown, "Spliced_exon_chunk type not set");
376     }
377     return make_pair(len, len);
378 }
379 
380 END_NCBI_SCOPE
381