1 /* $Id: internal_stops.cpp 627020 2021-03-08 17:25:43Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Vyacheslav Chetvernin
27 *
28 * File Description:
29 *
30 */
31 #include <ncbi_pch.hpp>
32
33 #include <algo/sequence/internal_stops.hpp>
34 #include "feature_generator.hpp"
35
36 #include <objects/seqfeat/Genetic_code.hpp>
37 #include <objects/seqalign/Spliced_seg.hpp>
38 #include <objects/seqalign/Product_pos.hpp>
39 #include <objects/seqalign/Spliced_exon_chunk.hpp>
40 #include <objects/seqfeat/Org_ref.hpp>
41 #include <objects/seqfeat/Genetic_code_table.hpp>
42 #include <objmgr/scope.hpp>
43 #include <objmgr/seq_vector.hpp>
44 #include <objmgr/util/sequence.hpp>
45
46 BEGIN_NCBI_SCOPE
47
CInternalStopFinder(CScope & a_scope)48 CInternalStopFinder::CInternalStopFinder(CScope& a_scope)
49 : scope(a_scope), generator(a_scope)
50 {
51 generator.SetFlags(CFeatureGenerator::fTrimEnds);
52 generator.SetAllowedUnaligned(10);
53 }
54
55 typedef map<TSeqRange, string> TStarts;
56
FindStartsStops(const CSeq_align & align,int padding)57 pair<set<TSeqPos>, set<TSeqPos> > CInternalStopFinder::FindStartsStops(const CSeq_align& align, int padding)
58 {
59 pair<TStarts, set<TSeqRange> > start_stop_ranges = FindStartStopRanges(align, padding);
60 set<TSeqPos> starts, stops;
61 ITERATE (TStarts, r, start_stop_ranges.first) {
62 starts.insert(r->first.GetFrom());
63 }
64 ITERATE (set<TSeqRange>, r, start_stop_ranges.second) {
65 stops.insert(r->GetFrom());
66 }
67 return make_pair(starts, stops);
68 }
FindStartStopRanges(const CSeq_align & align,int padding,set<TSignedSeqRange> * gaps)69 pair<TStarts, set<TSeqRange> > CInternalStopFinder::FindStartStopRanges(const CSeq_align& align, int padding,
70 set<TSignedSeqRange>* gaps)
71 {
72 CBioseq_Handle bsh = scope.GetBioseqHandle(align.GetSeq_id(1));
73 int genomic_length = bsh.GetBioseqLength();
74
75 CConstRef<CSeq_align> clean_align;
76 pair<bool, bool> trim_by_contig(false, false);
77 {{
78 CConstRef<CSeq_align> padded_align(&align);
79 if (padding > 0) {
80 CRef<CSeq_loc> loc = align.CreateRowSeq_loc(1);
81 int start = loc->GetStart(eExtreme_Positional);
82 int stop = loc->GetStop(eExtreme_Positional);
83
84 bool is_circular = (bsh.GetInst_Topology() == CSeq_inst::eTopology_circular);
85
86 if (is_circular) {
87 //prevent self overlap
88 padding = min(padding, ((stop > start ? genomic_length : 0) - (stop - start +1))/2);
89 }
90
91 start -= padding;
92 stop += padding;
93
94 if (start <= 2 && !is_circular) {
95 trim_by_contig.first = true;
96 }
97 if (stop >= genomic_length-3 && !is_circular) {
98 trim_by_contig.second = true;
99 }
100
101 if (start < 0) {
102 start = is_circular ? start + genomic_length : 0;
103 }
104 if (stop >= genomic_length) {
105 stop = is_circular ? stop - genomic_length : genomic_length-1;
106 }
107 padded_align = generator.AdjustAlignment(align, TSeqRange(start, stop));
108 //cerr << MSerial_AsnText << *padded_align;
109 }
110
111 clean_align = generator.CleanAlignment(*padded_align);
112 //cerr << MSerial_AsnText << *clean_align;
113 }}
114
115 CSeq_loc_Mapper mapper(*clean_align, 1);
116
117 CRef<CSeq_loc> query_loc(new CSeq_loc);
118 const CSpliced_seg& spl = clean_align->GetSegs().GetSpliced();
119 query_loc->SetInt(*spl.GetExons().front()->CreateRowSeq_interval(0, spl));
120
121 const bool is_protein = (spl.GetProduct_type() == CSpliced_seg::eProduct_type_protein);
122
123 string seq = GetCDSNucleotideSequence(*clean_align);
124 if (seq.size()%3 != 0) {
125 cerr << MSerial_AsnText << align << endl;
126 _ASSERT(seq.size()%3 == 0);
127 NCBI_USER_THROW("CDSNucleotideSequence not divisible by 3");
128 }
129
130 int gcode = fg::GetGeneticCode(bsh);
131
132 CRef<CGenetic_code::C_E> c_e(new CGenetic_code::C_E); c_e->SetId(gcode);
133 CRef<CGenetic_code> code(new CGenetic_code);
134 code->Set().push_back(c_e);
135
136 const CTrans_table & tbl = CGen_code_table::GetTransTable(*code);
137 const size_t kUnknownState = tbl.SetCodonState('N', 'N', 'N');
138
139 TStarts starts;
140 set<TSeqRange> stops;
141
142 size_t state = 0;
143 int k = 0;
144 string codon = "NNN";
145
146 ITERATE(string, s, seq) {
147 state = tbl.NextCodonState(state, *s);
148 codon[k%3] = *s;
149
150 if (++k%3)
151 continue;
152
153 if (state == kUnknownState)
154 continue;
155
156 if (tbl.IsOrfStart(state) || tbl.IsOrfStop(state)) {
157 if (is_protein) {
158 query_loc->SetInt().SetFrom((k-3)/3);
159 query_loc->SetInt().SetTo((k-3)/3);
160 } else {
161 query_loc->SetInt().SetFrom(k-3);
162 query_loc->SetInt().SetTo(k-1);
163 }
164 TSeqPos mapped_pos = mapper.Map(*query_loc)->GetStart(eExtreme_Biological);
165 if (mapped_pos == kInvalidSeqPos)
166 continue;
167 TSeqPos mapped_pos2 = mapper.Map(*query_loc)->GetStop(eExtreme_Biological);
168
169 if (tbl.IsOrfStart(state)) {
170 starts[TSeqRange(mapped_pos, mapped_pos2)] = codon;
171 }
172 if (tbl.IsOrfStop(state)) {
173 stops.insert(TSeqRange(mapped_pos, mapped_pos2));
174 }
175 }
176 }
177
178 if (gaps != nullptr) {
179
180 CRef<CSeq_loc> region_loc = clean_align->CreateRowSeq_loc(1);
181 if (trim_by_contig.first && region_loc->GetStart(eExtreme_Positional) < 3) {
182 region_loc = region_loc->Merge(CSeq_loc::fMerge_SingleRange, nullptr);
183 region_loc->SetInt().SetFrom(0);
184 }
185 if (trim_by_contig.second && int(region_loc->GetStop(eExtreme_Positional)) > genomic_length -3) {
186 region_loc = region_loc->Merge(CSeq_loc::fMerge_SingleRange, nullptr);
187 region_loc->SetInt().SetTo(genomic_length-1);
188 }
189 // cerr << MSerial_AsnText << *region_loc;
190
191 CSeqVector region_vec(*region_loc, scope,
192 CBioseq_Handle::eCoding_Iupac);
193 string region_seq;
194 region_vec.GetSeqData(region_vec.begin(), region_vec.end(), region_seq);
195
196 region_seq += 'X'; // to finish last run of Ns
197
198 int gap_begin = -1;
199 int gap_end = -1;
200 int k = 0;
201
202 CRef<CSeq_id> id(new CSeq_id);
203 id->Assign(*region_loc->GetId());
204 CRef<CSeq_loc> query_loc(new CSeq_loc(*id, 0, region_vec.size()-1));
205 CSeq_loc_Mapper mapper(*query_loc, *region_loc);
206
207 for (auto s: region_seq) {
208 if (s == 'N') {
209 if (gap_end != k) {
210 gap_begin = k;
211 }
212 gap_end = k+1;
213 } else if (gap_end == k) {
214 query_loc->SetInt().SetFrom(gap_begin);
215 query_loc->SetInt().SetTo(gap_end-1);
216
217 auto mapped_loc = mapper.Map(*query_loc);
218 TSeqPos mapped_pos = mapped_loc->GetStart(eExtreme_Biological);
219 TSeqPos mapped_pos2 = mapped_loc->GetStop(eExtreme_Biological);
220 if (mapped_pos == kInvalidSeqPos || mapped_pos2 == kInvalidSeqPos) {
221 NCBI_USER_THROW("Cannot map Ns run");
222 }
223 gaps->insert(TSignedSeqRange(mapped_pos, mapped_pos2));
224 }
225 ++k;
226 }
227
228 auto strand = region_loc->GetStrand();
229
230 if (trim_by_contig.first) {
231 int gap_stop = -1;
232 if (strand != eNa_strand_minus) {
233 if (!gaps->empty() && gaps->begin()->GetFrom()==0) {
234 gap_stop = gaps->begin()->GetTo();
235 gaps->erase(gaps->begin());
236 }
237 gaps->insert(TSignedSeqRange(gap_stop -9, gap_stop));
238 } else {
239 if (!gaps->empty() && gaps->begin()->GetTo()==0) {
240 gap_stop = gaps->begin()->GetFrom();
241 gaps->erase(gaps->begin());
242 }
243 gaps->insert(TSignedSeqRange(gap_stop, gap_stop -9));
244 }
245 }
246 if (trim_by_contig.second) {
247 int gap_start = genomic_length;
248 if (strand != eNa_strand_minus) {
249 if (!gaps->empty() && gaps->rbegin()->GetTo()==genomic_length-1) {
250 gap_start = gaps->rbegin()->GetFrom();
251 gaps->erase(prev(gaps->end()));
252 }
253 gaps->insert(TSignedSeqRange(gap_start, gap_start +9));
254 } else {
255 if (!gaps->empty() && gaps->rbegin()->GetFrom()==genomic_length-1) {
256 gap_start = gaps->rbegin()->GetTo();
257 gaps->erase(prev(gaps->end()));
258 }
259 gaps->insert(TSignedSeqRange(gap_start +9, gap_start));
260 }
261 }
262 }
263
264 return make_pair(starts, stops);
265 }
266
FindStops(const CSeq_align & align)267 set<TSeqPos> CInternalStopFinder::FindStops(const CSeq_align& align)
268 {
269 return FindStartsStops(align).second;
270 }
271
HasInternalStops(const CSeq_align & align)272 bool CInternalStopFinder::HasInternalStops(const CSeq_align& align)
273 {
274 return !FindStops(align).empty();
275 }
276
GetCDSNucleotideSequence(const CSeq_align & align)277 string CInternalStopFinder::GetCDSNucleotideSequence(const CSeq_align& align)
278 {
279 if (!align.GetSegs().IsSpliced()) {
280 NCBI_THROW(CException, eUnknown, "CInternalStopFinder supports Spliced-seg alignments only");
281 }
282
283 string mRNA;
284
285 const CSpliced_seg& spliced_seg = align.GetSegs().GetSpliced();
286
287 int next_prod_start = 0;
288 int cds_len = spliced_seg.GetProduct_length();
289
290 if (spliced_seg.GetProduct_type() == CSpliced_seg::eProduct_type_transcript) {
291 const CSeq_id& product_id = spliced_seg.GetProduct_id();
292 CMappedFeat cds_on_rna = GetCdsOnMrna(product_id, scope);
293 if (!cds_on_rna) {
294 /// No CDS
295 return kEmptyStr;
296 }
297 if (cds_on_rna.GetLocation().GetStrand() == eNa_strand_minus) {
298 NCBI_THROW(CException, eUnknown, "minus strand cdregion on mrna is not supported");
299 }
300 next_prod_start = cds_on_rna.GetLocation().GetStart(eExtreme_Biological);
301 cds_len = cds_on_rna.GetLocation().GetTotalRange().GetLength();
302
303 if (!cds_on_rna.GetLocation().IsPartialStop(eExtreme_Biological)) {
304 cds_len -= 3;
305 }
306 } else {
307 cds_len *=3;
308 }
309
310 ITERATE( CSpliced_seg::TExons, exon, spliced_seg.GetExons() ) {
311
312 int prod_pos_start = (*exon)->GetProduct_start().AsSeqPos();
313
314 CRef<CSeq_loc> subject_loc(new CSeq_loc);
315 subject_loc->SetInt(*(*exon)->CreateRowSeq_interval(1, spliced_seg));
316 CSeqVector subject_vec(*subject_loc, scope,
317 CBioseq_Handle::eCoding_Iupac);
318 string subject_seq;
319 subject_vec.GetSeqData(subject_vec.begin(), subject_vec.end(), subject_seq);
320 int subj_pos = 0;
321
322 if (next_prod_start < prod_pos_start) {
323 mRNA.append(prod_pos_start - next_prod_start, 'N');
324 next_prod_start = prod_pos_start;
325 }
326
327 if ((*exon)->IsSetParts()) {
328 ITERATE (CSpliced_exon::TParts, part_it, (*exon)->GetParts()) {
329 pair<int, int> chunk = ChunkSize(**part_it);
330 prod_pos_start += chunk.second;
331 if (chunk.first == 0) {
332 if (next_prod_start < prod_pos_start) {
333 mRNA.append(prod_pos_start - next_prod_start, 'N');
334 next_prod_start = prod_pos_start;
335 }
336 } else if (chunk.second > 0) {
337 if (next_prod_start < prod_pos_start) {
338 mRNA.append(subject_seq, subj_pos+chunk.second-(prod_pos_start - next_prod_start), prod_pos_start - next_prod_start);
339 next_prod_start = prod_pos_start;
340 }
341 }
342 subj_pos += chunk.first;
343 }
344 } else {
345 mRNA.append(subject_seq);
346 next_prod_start += subject_seq.size();
347 }
348 }
349
350 return mRNA.substr(0, cds_len);
351 }
352
353
354 // pair(genomic, product)
ChunkSize(const CSpliced_exon_chunk & chunk)355 pair<int, int> ChunkSize(const CSpliced_exon_chunk& chunk)
356 {
357 int len = 0;
358 switch (chunk.Which()) {
359 case CSpliced_exon_chunk::e_Genomic_ins:
360 len = chunk.GetGenomic_ins();
361 return make_pair(len, 0);
362 case CSpliced_exon_chunk::e_Product_ins:
363 len = chunk.GetProduct_ins();
364 return make_pair(0, len);
365 case CSpliced_exon_chunk::e_Match:
366 len = chunk.GetMatch();
367 break;
368 case CSpliced_exon_chunk::e_Mismatch:
369 len = chunk.GetMismatch();
370 break;
371 case CSpliced_exon_chunk::e_Diag:
372 len = chunk.GetDiag();
373 break;
374 default:
375 NCBI_THROW(CException, eUnknown, "Spliced_exon_chunk type not set");
376 }
377 return make_pair(len, len);
378 }
379
380 END_NCBI_SCOPE
381