1 /* $Id: translation_problems.cpp 632625 2021-06-03 17:38:33Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Colleen Bollin
27 *
28 * File Description:
29 * detecting translation problems
30 * .......
31 *
32 */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbistr.hpp>
36 #include <objtools/validator/translation_problems.hpp>
37 #include <objtools/validator/utilities.hpp>
38
39 #include <objmgr/seq_vector.hpp>
40 #include <objmgr/util/sequence.hpp>
41 #include <objects/seqfeat/Cdregion.hpp>
42 #include <objects/seqfeat/Code_break.hpp>
43 #include <objects/seqfeat/Gb_qual.hpp>
44 #include <objects/seqfeat/Genetic_code.hpp>
45 #include <util/sequtil/sequtil_convert.hpp>
46
47 BEGIN_NCBI_SCOPE
48 BEGIN_SCOPE(objects)
49 BEGIN_SCOPE(validator)
50 using namespace sequence;
51
52
CCDSTranslationProblems()53 CCDSTranslationProblems::CCDSTranslationProblems()
54 {
55 x_Reset();
56 }
57
58
x_Reset()59 void CCDSTranslationProblems::x_Reset()
60 {
61 m_ProblemFlags = 0;
62 m_RaggedLength = 0;
63 m_TransLen = 0;
64 m_ProtLen = 0;
65 m_HasDashXStart = false;
66 m_TranslStart = 0;
67 m_InternalStopCodons = 0;
68 m_TranslTerminalX = 0;
69 m_ProdTerminalX = 0;
70 m_UnableToTranslate = false;
71 m_AltStart = false;
72 m_UnparsedTranslExcept = false;
73 m_NumNonsenseIntrons = 0;
74 m_HasException = false;
75 }
76
77
CalculateTranslationProblems(const CSeq_feat & feat,CBioseq_Handle loc_handle,CBioseq_Handle prot_handle,bool ignore_exceptions,bool far_fetch_cds,bool standalone_annot,bool single_seq,bool is_gpipe,bool is_genomic,bool is_refseq,bool is_nt_or_ng_or_nw,bool is_nc,bool has_accession,CScope * scope)78 void CCDSTranslationProblems::CalculateTranslationProblems
79 (const CSeq_feat& feat,
80 CBioseq_Handle loc_handle,
81 CBioseq_Handle prot_handle,
82 bool ignore_exceptions,
83 bool far_fetch_cds,
84 bool standalone_annot,
85 bool single_seq,
86 bool is_gpipe,
87 bool is_genomic,
88 bool is_refseq,
89 bool is_nt_or_ng_or_nw,
90 bool is_nc,
91 bool has_accession,
92 CScope* scope)
93 {
94 x_Reset();
95 // bail if not CDS
96 if (!feat.GetData().IsCdregion()) {
97 return;
98 }
99
100 // do not validate for pseudo gene
101 if (feat.IsSetQual()) {
102 for (auto it = feat.GetQual().begin(); it != feat.GetQual().end(); it++) {
103 if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "pseudo")) {
104 return;
105 }
106 }
107 }
108
109 bool has_errors = false, unclassified_except = false,
110 mismatch_except = false, frameshift_except = false,
111 rearrange_except = false, product_replaced = false,
112 mixed_population = false, low_quality = false,
113 report_errors = true, other_than_mismatch = false,
114 rna_editing = false, transcript_or_proteomic = false;
115 string farstr;
116
117 if (!ignore_exceptions &&
118 feat.IsSetExcept() && feat.GetExcept() &&
119 feat.IsSetExcept_text()) {
120 const string& except_text = feat.GetExcept_text();
121 report_errors = ReportTranslationErrors(except_text);
122 x_GetExceptionFlags(except_text,
123 unclassified_except,
124 mismatch_except,
125 frameshift_except,
126 rearrange_except,
127 product_replaced,
128 mixed_population,
129 low_quality,
130 rna_editing,
131 transcript_or_proteomic);
132 }
133
134 m_HasException = !report_errors;
135
136 // check frame
137 m_ProblemFlags |= x_CheckCDSFrame(feat, scope);
138
139 // check for unparsed transl_except
140 if (feat.IsSetQual()) {
141 for (auto it = feat.GetQual().begin(); it != feat.GetQual().end(); it++) {
142 if ((*it)->IsSetQual() && NStr::Equal((*it)->GetQual(), "transl_except")) {
143 m_UnparsedTranslExcept = true;
144 }
145 }
146 }
147
148 string transl_prot; // translated protein
149 bool got_stop = false;
150
151 try {
152 transl_prot = TranslateCodingRegionForValidation(feat, *scope, m_AltStart);
153 } catch (CException&) {
154 m_UnableToTranslate = true;
155 }
156
157 if (NStr::EndsWith(transl_prot, "*")) {
158 got_stop = true;
159 }
160
161 if (HasBadStartCodon(feat.GetLocation(), transl_prot)) {
162 m_TranslStart = transl_prot.c_str()[0];
163 m_ProblemFlags |= eCDSTranslationProblem_BadStart;
164 }
165
166 bool no_product = true;
167
168 if (!m_UnableToTranslate) {
169
170 // check for code break not on a codon
171 m_TranslExceptProblems = x_GetTranslExceptProblems(feat, loc_handle, scope, is_refseq);
172
173 m_NumNonsenseIntrons = x_CountNonsenseIntrons(feat, scope);
174 if (x_ProteinHasTooManyXs(transl_prot)) {
175 m_ProblemFlags |= eCDSTranslationProblem_TooManyX;
176 }
177
178 m_InternalStopCodons = CountInternalStopCodons(transl_prot);
179 if (m_InternalStopCodons > 5) {
180 // stop checking if too many stop codons
181 return;
182 }
183
184 // get protein sequence
185
186 if (!prot_handle) {
187 const CSeq_id* protid = 0;
188 try {
189 protid = &GetId(feat.GetProduct(), scope);
190 } catch (CException&) {}
191 if (protid && far_fetch_cds && feat.IsSetProduct()) {
192 m_ProblemFlags |= eCDSTranslationProblem_UnableToFetch;
193 } else if (protid && (!far_fetch_cds || feat.IsSetProduct())) {
194 // don't report missing protein sequence
195 } else if (!standalone_annot && transl_prot.length() > 6) {
196 if (!is_nt_or_ng_or_nw && (!is_nc || !single_seq)) {
197 m_ProblemFlags |= eCDSTranslationProblem_NoProtein;
198 }
199 }
200 }
201
202 bool show_stop = true;
203
204 if (prot_handle && prot_handle.IsAa()) {
205 no_product = false;
206
207 CSeqVector prot_vec = prot_handle.GetSeqVector();
208 prot_vec.SetCoding(CSeq_data::e_Ncbieaa);
209
210
211 CalculateEffectiveTranslationLengths(transl_prot, prot_vec, m_TransLen, m_ProtLen);
212
213 if (m_TransLen == m_ProtLen || has_accession) { // could be identical
214 if (prot_vec.size() > 0 && transl_prot.length() > 0 &&
215 prot_vec[0] != transl_prot[0]) {
216 bool no_beg, no_end;
217 FeatureHasEnds(feat, scope, no_beg, no_end);
218 if (feat.IsSetPartial() && feat.GetPartial() && (!no_beg) && (!no_end)) {
219 m_ProblemFlags |= eCDSTranslationProblem_ShouldStartPartial;
220 } else if (transl_prot[0] == '-' || transl_prot[0] == 'X') {
221 m_HasDashXStart = true;
222 }
223 }
224 m_Mismatches = x_GetTranslationMismatches(feat, prot_vec, transl_prot, has_accession);
225 }
226
227 if (feat.CanGetPartial() && feat.GetPartial() &&
228 m_Mismatches.size() == 0) {
229 bool no_beg, no_end;
230 FeatureHasEnds(feat, scope, no_beg, no_end);
231 if (!no_beg && !no_end) {
232 if (report_errors) {
233 if (is_gpipe && is_genomic) {
234 // suppress in gpipe genomic
235 } else {
236 if (!got_stop) {
237 m_ProblemFlags |= eCDSTranslationProblem_ShouldBePartialButIsnt;
238 } else {
239 m_ProblemFlags |= eCDSTranslationProblem_ShouldNotBePartialButIs;
240 }
241 }
242 }
243 show_stop = false;
244 }
245 }
246
247 if (!transl_prot.empty()) {
248 // look for discrepancy in number of terminal Xs between product and translation
249 m_TranslTerminalX = x_CountTerminalXs(transl_prot, (got_stop && (transl_prot.length() == prot_vec.size() + 1)));
250 m_ProdTerminalX = x_CountTerminalXs(prot_vec);
251
252 }
253
254 }
255
256 x_GetCdTransErrors(feat, prot_handle, show_stop, got_stop, scope);
257
258 }
259
260 if (x_JustifiesException()) {
261 has_errors = true;
262 other_than_mismatch = true;
263 } else if (m_Mismatches.size() > 0) {
264 has_errors = true;
265 }
266
267 if (!report_errors && !no_product) {
268 if (!has_errors) {
269 if (!frameshift_except && !rearrange_except && !mixed_population && !low_quality) {
270 m_ProblemFlags |= eCDSTranslationProblem_UnnecessaryException;
271 }
272 } else if (unclassified_except && !other_than_mismatch) {
273 if (m_Mismatches.size() * 50 <= m_ProtLen) {
274 m_ProblemFlags |= eCDSTranslationProblem_ErroneousException;
275 }
276 } else if (product_replaced) {
277 m_ProblemFlags |= eCDSTranslationProblem_UnqualifiedException;
278 }
279 }
280
281 }
282
283
x_GetCdTransErrors(const CSeq_feat & feat,CBioseq_Handle product,bool show_stop,bool got_stop,CScope * scope)284 void CCDSTranslationProblems::x_GetCdTransErrors(const CSeq_feat& feat, CBioseq_Handle product, bool show_stop, bool got_stop, CScope* scope)
285 {
286 bool no_beg, no_end;
287 FeatureHasEnds(feat, product ? &(product.GetScope()) : scope, no_beg, no_end);
288 if (show_stop) {
289 if (!got_stop && !no_end) {
290 m_ProblemFlags |= eCDSTranslationProblem_NoStop;
291 } else if (got_stop && no_end) {
292 m_ProblemFlags |= eCDSTranslationProblem_StopPartial;
293 } else if (got_stop && !no_end) {
294 m_RaggedLength = x_CheckForRaggedEnd(feat, scope);
295 }
296 }
297 }
298
x_IsThreeBaseNonsense(const CSeq_feat & feat,const CSeq_id & id,const CCdregion & cdr,TSeqPos start,TSeqPos stop,ENa_strand strand,CScope * scope)299 bool CCDSTranslationProblems::x_IsThreeBaseNonsense(
300 const CSeq_feat& feat,
301 const CSeq_id& id,
302 const CCdregion& cdr,
303 TSeqPos start,
304 TSeqPos stop,
305 ENa_strand strand,
306 CScope *scope
307 )
308
309 {
310 bool nonsense_intron = false;
311 CRef<CSeq_feat> tmp_cds (new CSeq_feat());
312
313 tmp_cds->SetLocation().SetInt().SetFrom(start);
314 tmp_cds->SetLocation().SetInt().SetTo(stop);
315 tmp_cds->SetLocation().SetInt().SetStrand(strand);
316 tmp_cds->SetLocation().SetInt().SetId().Assign(id);
317
318 tmp_cds->SetLocation().SetPartialStart(true, eExtreme_Biological);
319 tmp_cds->SetLocation().SetPartialStop(true, eExtreme_Biological);
320 tmp_cds->SetData().SetCdregion();
321 if ( cdr.IsSetCode()) {
322 tmp_cds->SetData().SetCdregion().SetCode().Assign(cdr.GetCode());
323 }
324
325 bool alt_start = false;
326 try {
327 string transl_prot = TranslateCodingRegionForValidation(*tmp_cds, *scope, alt_start);
328
329 if (NStr::Equal(transl_prot, "*")) {
330 nonsense_intron = true;
331 }
332 } catch (CException&) {
333 }
334 return nonsense_intron;
335 }
336
337
x_CountNonsenseIntrons(const CSeq_feat & feat,CScope * scope)338 size_t CCDSTranslationProblems::x_CountNonsenseIntrons(const CSeq_feat& feat, CScope* scope)
339 {
340 TSeqPos last_start = 0, last_stop = 0, start, stop;
341
342 if (!feat.IsSetData() && !feat.GetData().IsCdregion()) {
343 return 0;
344 }
345 if (feat.IsSetExcept() || feat.IsSetExcept_text()) return 0;
346 const CCdregion& cdr = feat.GetData().GetCdregion();
347 if (cdr.IsSetCode_break()) return 0;
348
349 size_t count = 0;
350 const CSeq_loc& loc = feat.GetLocation();
351
352 CSeq_loc_CI prev;
353 for (CSeq_loc_CI curr(loc); curr; ++curr) {
354 start = curr.GetRange().GetFrom();
355 stop = curr.GetRange().GetTo();
356 if (prev && curr && IsSameBioseq(curr.GetSeq_id(), prev.GetSeq_id(), scope)) {
357 ENa_strand strand = curr.GetStrand();
358 if (strand == eNa_strand_minus) {
359 if (last_start - stop == 4) {
360 if (x_IsThreeBaseNonsense(feat, curr.GetSeq_id(), cdr, stop + 1, last_start - 1, strand, scope)) {
361 count++;
362 }
363 }
364 } else {
365 if (start - last_stop == 4) {
366 if (x_IsThreeBaseNonsense(feat, curr.GetSeq_id(), cdr, last_stop + 1, start - 1, strand, scope)) {
367 count++;
368 }
369 }
370 }
371 }
372 last_start = start;
373 last_stop = stop;
374 prev = curr;
375 }
376
377 if (count > 0 && sequence::IsPseudo(feat, *scope)) return 0;
378 else return count;
379 }
380
381
GetNonsenseIntrons(const CSeq_feat & feat,CScope & scope)382 vector<CRef<CSeq_loc> > CCDSTranslationProblems::GetNonsenseIntrons(const CSeq_feat& feat, CScope& scope)
383 {
384 vector<CRef<CSeq_loc> > intron_locs;
385
386 TSeqPos last_start = 0, last_stop = 0, start, stop;
387
388 if (!feat.IsSetData() && !feat.GetData().IsCdregion()) {
389 return intron_locs;
390 }
391 if (feat.IsSetExcept() || feat.IsSetExcept_text()) return intron_locs;
392 const CCdregion& cdr = feat.GetData().GetCdregion();
393 if (cdr.IsSetCode_break()) return intron_locs;
394
395 const CSeq_loc& loc = feat.GetLocation();
396
397 CSeq_loc_CI prev;
398 for (CSeq_loc_CI curr(loc); curr; ++curr) {
399 start = curr.GetRange().GetFrom();
400 stop = curr.GetRange().GetTo();
401 if (prev && curr && IsSameBioseq(curr.GetSeq_id(), prev.GetSeq_id(), &scope)) {
402 ENa_strand strand = curr.GetStrand();
403 if (strand == eNa_strand_minus) {
404 if (last_start - stop == 4) {
405 if (x_IsThreeBaseNonsense(feat, curr.GetSeq_id(), cdr, stop + 1, last_start - 1, strand, &scope)) {
406 CRef<CSeq_id> id(new CSeq_id());
407 id->Assign(curr.GetSeq_id());
408 CRef<CSeq_loc> intron_loc(new CSeq_loc(*id, stop + 1, last_start - 1, strand));
409 intron_locs.push_back(intron_loc);
410 }
411 }
412 } else {
413 if (start - last_stop == 4) {
414 if (x_IsThreeBaseNonsense(feat, curr.GetSeq_id(), cdr, last_stop + 1, start - 1, strand, &scope)) {
415 CRef<CSeq_id> id(new CSeq_id());
416 id->Assign(curr.GetSeq_id());
417 CRef<CSeq_loc> intron_loc(new CSeq_loc(*id, last_stop + 1, start - 1, strand));
418 intron_locs.push_back(intron_loc);
419 }
420 }
421 }
422 }
423 last_start = start;
424 last_stop = stop;
425 prev = curr;
426 }
427
428 if (intron_locs.size() > 0 && sequence::IsPseudo(feat, scope)) {
429 intron_locs.clear();
430 }
431 return intron_locs;
432
433 }
434
435
x_ProteinHasTooManyXs(const string & transl_prot)436 bool CCDSTranslationProblems::x_ProteinHasTooManyXs(const string& transl_prot)
437 {
438 size_t num_x = 0, num_nonx = 0;
439
440 ITERATE(string, it, transl_prot) {
441 if (*it == 'X') {
442 num_x++;
443 } else {
444 num_nonx++;
445 }
446 }
447
448 // report too many Xs
449 if (num_x > num_nonx) {
450 return true;
451 } else {
452 return false;
453 }
454 }
455
456
x_GetExceptionFlags(const string & except_text,bool & unclassified_except,bool & mismatch_except,bool & frameshift_except,bool & rearrange_except,bool & product_replaced,bool & mixed_population,bool & low_quality,bool & rna_editing,bool & transcript_or_proteomic)457 void CCDSTranslationProblems::x_GetExceptionFlags
458 (const string& except_text,
459 bool& unclassified_except,
460 bool& mismatch_except,
461 bool& frameshift_except,
462 bool& rearrange_except,
463 bool& product_replaced,
464 bool& mixed_population,
465 bool& low_quality,
466 bool& rna_editing,
467 bool& transcript_or_proteomic)
468 {
469 if (NStr::FindNoCase(except_text, "unclassified translation discrepancy") != NPOS) {
470 unclassified_except = true;
471 }
472 if (NStr::FindNoCase(except_text, "mismatches in translation") != NPOS) {
473 mismatch_except = true;
474 }
475 if (NStr::FindNoCase(except_text, "artificial frameshift") != NPOS) {
476 frameshift_except = true;
477 }
478 if (NStr::FindNoCase(except_text, "rearrangement required for product") != NPOS) {
479 rearrange_except = true;
480 }
481 if (NStr::FindNoCase(except_text, "translated product replaced") != NPOS) {
482 product_replaced = true;
483 }
484 if (NStr::FindNoCase(except_text, "heterogeneous population sequenced") != NPOS) {
485 mixed_population = true;
486 }
487 if (NStr::FindNoCase(except_text, "low-quality sequence region") != NPOS) {
488 low_quality = true;
489 }
490 if (NStr::FindNoCase (except_text, "RNA editing") != NPOS) {
491 rna_editing = true;
492 }
493 }
494
495
x_CheckCDSFrame(const CSeq_feat & feat,CScope * scope)496 size_t CCDSTranslationProblems::x_CheckCDSFrame(const CSeq_feat& feat, CScope* scope)
497 {
498 size_t rval = 0;
499
500 const CCdregion& cdregion = feat.GetData().GetCdregion();
501 const CSeq_loc& location = feat.GetLocation();
502 unsigned int part_loc = SeqLocPartialCheck(location, scope);
503 string comment_text;
504 if (feat.IsSetComment()) {
505 comment_text = feat.GetComment();
506 }
507
508 // check frame
509 if (cdregion.IsSetFrame()
510 && (cdregion.GetFrame() == CCdregion::eFrame_two || cdregion.GetFrame() == CCdregion::eFrame_three)) {
511 // coding region should be 5' partial
512 if (!(part_loc & eSeqlocPartial_Start)) {
513 rval |= eCDSTranslationProblem_FrameNotPartial;
514 } else if ((part_loc & eSeqlocPartial_Start) && !x_Is5AtEndSpliceSiteOrGap(location, *scope)) {
515 if (s_PartialAtGapOrNs(scope, location, eSeqlocPartial_Nostart, true)
516 || NStr::Find(comment_text, "coding region disrupted by sequencing gap") != string::npos) {
517 // suppress
518 } else {
519 rval |= eCDSTranslationProblem_FrameNotConsensus;
520 }
521 }
522 }
523 return rval;
524 }
525
526
x_JustifiesException() const527 bool CCDSTranslationProblems::x_JustifiesException() const
528 {
529 if (m_ProblemFlags & eCDSTranslationProblem_FrameNotPartial ||
530 m_ProblemFlags & eCDSTranslationProblem_FrameNotConsensus ||
531 m_ProblemFlags & eCDSTranslationProblem_NoStop ||
532 m_ProblemFlags & eCDSTranslationProblem_StopPartial ||
533 m_ProblemFlags & eCDSTranslationProblem_PastStop ||
534 m_ProblemFlags & eCDSTranslationProblem_ShouldStartPartial ||
535 m_ProblemFlags & eCDSTranslationProblem_BadStart ||
536 m_ProblemFlags & eCDSTranslationProblem_NoProtein ||
537 m_ProtLen != m_TransLen ||
538 m_InternalStopCodons > 0 ||
539 m_RaggedLength > 0 || m_HasDashXStart ||
540 m_UnableToTranslate) {
541 return true;
542 } else if (x_JustifiesException(m_TranslExceptProblems)) {
543 return true;
544 } else {
545 return false;
546 }
547 }
548
549
x_CountTerminalXs(const string & transl_prot,bool skip_stop)550 size_t CCDSTranslationProblems::x_CountTerminalXs(const string& transl_prot, bool skip_stop)
551 {
552 // look for discrepancy in number of terminal Xs between product and translation
553 size_t transl_terminal_x = 0;
554 size_t i = transl_prot.length() - 1;
555 if (i > 0 && transl_prot[i] == '*' && skip_stop) {
556 i--;
557 }
558 while (i > 0) {
559 if (transl_prot[i] == 'X') {
560 transl_terminal_x++;
561 } else {
562 break;
563 }
564 i--;
565 }
566 if (i == 0 && transl_prot[0] == 'X') {
567 transl_terminal_x++;
568 }
569 return transl_terminal_x;
570 }
571
x_CountTerminalXs(const CSeqVector & prot_vec)572 size_t CCDSTranslationProblems::x_CountTerminalXs(const CSeqVector& prot_vec)
573 {
574 size_t prod_terminal_x = 0;
575 TSeqPos prod_len = prot_vec.size() - 1;
576 while (prod_len > 0 && prot_vec[prod_len] == 'X') {
577 prod_terminal_x++;
578 prod_len--;
579 }
580 if (prod_len == 0 && prot_vec[prod_len] == 'X') {
581 prod_terminal_x++;
582 }
583 return prod_terminal_x;
584 }
585
586
587 CCDSTranslationProblems::TTranslationMismatches
x_GetTranslationMismatches(const CSeq_feat & feat,const CSeqVector & prot_vec,const string & transl_prot,bool has_accession)588 CCDSTranslationProblems::x_GetTranslationMismatches(const CSeq_feat& feat, const CSeqVector& prot_vec, const string& transl_prot, bool has_accession)
589 {
590 TTranslationMismatches mismatches;
591 size_t prot_len;
592 size_t len;
593
594 CalculateEffectiveTranslationLengths(transl_prot, prot_vec, len, prot_len);
595
596 if (len == prot_len || has_accession) { // could be identical
597 if (len > prot_len) {
598 len = prot_len;
599 }
600 for (TSeqPos i = 0; i < len; ++i) {
601 CSeqVectorTypes::TResidue p_res = prot_vec[i];
602 CSeqVectorTypes::TResidue t_res = transl_prot[i];
603
604 if (t_res != p_res) {
605 if (i == 0) {
606 bool no_beg, no_end;
607 FeatureHasEnds(feat, &(prot_vec.GetScope()), no_beg, no_end);
608 if (feat.IsSetPartial() && feat.GetPartial() && (!no_beg) && (!no_end)) {
609 } else if (t_res == '-') {
610 } else {
611 mismatches.push_back({ p_res, t_res, i });
612 }
613 } else {
614 mismatches.push_back({ p_res, t_res, i });
615 }
616 }
617 }
618 }
619 return mismatches;
620 }
621
622
623
624
625
x_LeuCUGstart(const CSeq_feat & feat)626 static bool x_LeuCUGstart
627 (
628 const CSeq_feat& feat
629 )
630
631 {
632 if ( ! feat.IsSetExcept()) return false;
633 if ( ! feat.IsSetExcept_text()) return false;
634 const string& except_text = feat.GetExcept_text();
635 if (NStr::FindNoCase(except_text, "translation initiation by tRNA-Leu at CUG codon") == NPOS) return false;
636 if (feat.IsSetQual()) {
637 for (auto it = feat.GetQual().begin(); it != feat.GetQual().end(); it++) {
638 const CGb_qual& qual = **it;
639 if (qual.IsSetQual() && NStr::Compare(qual.GetQual(), "experiment") == 0) return true;
640 }
641 }
642 return false;
643 }
644
645
646 CCDSTranslationProblems::TTranslExceptProblems
x_GetTranslExceptProblems(const CSeq_feat & feat,CBioseq_Handle loc_handle,CScope * scope,bool is_refseq)647 CCDSTranslationProblems::x_GetTranslExceptProblems
648 (const CSeq_feat& feat, CBioseq_Handle loc_handle, CScope* scope, bool is_refseq)
649 {
650 TTranslExceptProblems problems;
651
652 if (!feat.IsSetData() || !feat.GetData().IsCdregion() || !feat.GetData().GetCdregion().IsSetCode_break()) {
653 return problems;
654 }
655
656 TSeqPos len = GetLength(feat.GetLocation(), scope);
657 bool alt_start = false;
658
659 // need to translate a version of the coding region without the code breaks,
660 // to see if the code breaks are necessary
661 const CCdregion& cdregion = feat.GetData().GetCdregion();
662 CRef<CSeq_feat> tmp_cds(new CSeq_feat());
663 tmp_cds->SetLocation().Assign(feat.GetLocation());
664 tmp_cds->SetData().SetCdregion();
665 if (cdregion.IsSetFrame()) {
666 tmp_cds->SetData().SetCdregion().SetFrame(cdregion.GetFrame());
667 }
668 if (cdregion.IsSetCode()) {
669 tmp_cds->SetData().SetCdregion().SetCode().Assign(cdregion.GetCode());
670 }
671
672 // now, will use tmp_cds to translate individual code breaks;
673 tmp_cds->SetData().SetCdregion().ResetFrame();
674
675 const CSeq_loc& loc = feat.GetLocation();
676
677 for (auto cbr = cdregion.GetCode_break().begin(); cbr != cdregion.GetCode_break().end(); cbr++) {
678 if (!(*cbr)->IsSetLoc()) {
679 continue;
680 }
681 // if the code break is outside the coding region, skip
682 ECompare comp = Compare((*cbr)->GetLoc(), loc,
683 scope, fCompareOverlapping);
684 if ((comp != eContained) && (comp != eSame)) {
685 continue;
686 }
687
688 TSeqPos codon_length = GetLength((*cbr)->GetLoc(), scope);
689 TSeqPos from = LocationOffset(loc, (*cbr)->GetLoc(),
690 eOffset_FromStart, scope);
691
692 TSeqPos from_end = LocationOffset(loc, (*cbr)->GetLoc(),
693 eOffset_FromEnd, scope);
694
695 TSeqPos to = from + codon_length - 1;
696
697 // check for code break not on a codon
698 if (codon_length == 3 ||
699 ((codon_length == 1 || codon_length == 2) && to == len - 1)) {
700 size_t start_pos;
701 switch (cdregion.GetFrame()) {
702 case CCdregion::eFrame_two:
703 start_pos = 1;
704 break;
705 case CCdregion::eFrame_three:
706 start_pos = 2;
707 break;
708 default:
709 start_pos = 0;
710 break;
711 }
712 if ((from % 3) != start_pos) {
713 problems.push_back({ eTranslExceptPhase, 0, 0 });
714 }
715 }
716 if ((*cbr)->IsSetAa() && (*cbr)->IsSetLoc()) {
717 tmp_cds->SetLocation().Assign((*cbr)->GetLoc());
718 tmp_cds->SetLocation().SetPartialStart(true, eExtreme_Biological);
719 tmp_cds->SetLocation().SetPartialStop(true, eExtreme_Biological);
720 string cb_trans;
721 try {
722 CSeqTranslator::Translate(*tmp_cds, *scope, cb_trans,
723 true, // include stop codons
724 false, // do not remove trailing X/B/Z
725 &alt_start);
726 } catch (CException&) {
727 }
728 size_t prot_pos = from / 3;
729
730 unsigned char ex = 0;
731 vector<char> seqData;
732 string str;
733 bool not_set = false;
734
735 switch ((*cbr)->GetAa().Which()) {
736 case CCode_break::C_Aa::e_Ncbi8aa:
737 str = (*cbr)->GetAa().GetNcbi8aa();
738 CSeqConvert::Convert(str, CSeqUtil::e_Ncbi8aa, (TSeqPos)0, (TSeqPos)(str.size()), seqData, CSeqUtil::e_Ncbieaa);
739 ex = seqData[0];
740 break;
741 case CCode_break::C_Aa::e_Ncbistdaa:
742 str = (*cbr)->GetAa().GetNcbi8aa();
743 CSeqConvert::Convert(str, CSeqUtil::e_Ncbistdaa, (TSeqPos)0, (TSeqPos)(str.size()), seqData, CSeqUtil::e_Ncbieaa);
744 ex = seqData[0];
745 break;
746 case CCode_break::C_Aa::e_Ncbieaa:
747 seqData.push_back((*cbr)->GetAa().GetNcbieaa());
748 ex = seqData[0];
749 break;
750 default:
751 // do nothing, code break wasn't actually set
752 not_set = true;
753 break;
754 }
755
756 if (!not_set) {
757 string except_char;
758 except_char += ex;
759
760 //At the beginning of the CDS
761 if (prot_pos == 0 && ex != 'M') {
762 if (prot_pos == 0 && ex == 'L' && x_LeuCUGstart(feat) && is_refseq) {
763 /* do not warn on explicitly documented unusual translation initiation at CUG without initiator tRNA-Met */
764 } else if ((!feat.IsSetPartial()) || (!feat.GetPartial())) {
765 problems.push_back({ eTranslExceptSuspicious, ex, prot_pos });
766 }
767 }
768
769 // Anywhere in CDS, where exception has no effect
770 if (from_end > 0) {
771 if (from_end < 2 && NStr::Equal(except_char, "*")) {
772 // this is a necessary terminal transl_except
773 } else if (NStr::EqualNocase(cb_trans, except_char)) {
774 if (prot_pos == 0 && ex == 'L' && x_LeuCUGstart(feat) && is_refseq) {
775 /* do not warn on explicitly documented unusual translation initiation at CUG without initiator tRNA-Met */
776 } else {
777 problems.push_back({ eTranslExceptUnnecessary, ex, prot_pos });
778 }
779 }
780 } else if (!NStr::Equal(except_char, "*"))
781 {
782 if (NStr::Equal(cb_trans, except_char) ||
783 !loc.IsPartialStop(eExtreme_Biological))
784 {
785 const CGenetic_code *gcode;
786 CBioseq_Handle bsh;
787 CSeqVector vec;
788 TSeqPos from1;
789 string p;
790 string q;
791 bool altst;
792
793 vec = loc_handle.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
794 altst = false;
795 from1 = (*cbr)->GetLoc().GetStart(eExtreme_Biological);
796 vec.GetSeqData(from1, from1 + 3, q);
797
798 if (cdregion.CanGetCode())
799 gcode = &cdregion.GetCode();
800 else
801 gcode = NULL;
802
803 CSeqTranslator::Translate(q, p,
804 CSeqTranslator::fIs5PrimePartial,
805 gcode, &altst);
806
807 if (!NStr::Equal(except_char, "*")) {
808 problems.push_back({ eTranslExceptUnexpected, ex, prot_pos });
809 }
810 }
811 } else
812 {
813 /*bsv
814 Stop codon here. Needs a check for abutting tRNA feature.
815 bsv*/
816 }
817 }
818 }
819 }
820 return problems;
821 }
822
823
x_JustifiesException(const TTranslExceptProblems & problems)824 bool CCDSTranslationProblems::x_JustifiesException(const TTranslExceptProblems& problems)
825 {
826 for (auto it = problems.begin(); it != problems.end(); it++) {
827 if (it->problem == eTranslExceptPhase) {
828 return true;
829 }
830 }
831 return false;
832 }
833
834
x_Is5AtEndSpliceSiteOrGap(const CSeq_loc & loc,CScope & scope)835 bool CCDSTranslationProblems::x_Is5AtEndSpliceSiteOrGap(const CSeq_loc& loc, CScope& scope)
836 {
837 CSeq_loc_CI loc_it(loc);
838 if (!loc_it) {
839 return false;
840 }
841 CConstRef<CSeq_loc> rng = loc_it.GetRangeAsSeq_loc();
842 if (!rng) {
843 return false;
844 }
845
846 TSeqPos end = rng->GetStart(eExtreme_Biological);
847 const CBioseq_Handle & bsh = scope.GetBioseqHandle(*rng);
848 if (!bsh) {
849 return false;
850 }
851
852 ENa_strand strand = rng->GetStrand();
853 if (strand == eNa_strand_minus) {
854 TSeqPos seq_len = bsh.GetBioseqLength();
855 if (end < seq_len - 1) {
856 CSeqVector vec = bsh.GetSeqVector (CBioseq_Handle::eCoding_Iupac);
857 if (vec.IsInGap(end + 1)) {
858 if (vec.IsInGap (end)) {
859 // not ok - location overlaps gap
860 return false;
861 } else {
862 // ok, location abuts gap
863 }
864 } else if (end < seq_len - 2 && IsResidue (vec[end + 1]) && ConsistentWithC(vec[end + 1])
865 && IsResidue (vec[end + 2]) && ConsistentWithT(vec[end + 2])) {
866 // it's ok, it's abutting the reverse complement of AG
867 } else {
868 return false;
869 }
870 } else {
871 // it's ok, location endpoint is at the 3' end
872 }
873 } else {
874 if (end > 0 && end < bsh.GetBioseqLength()) {
875 CSeqVector vec = bsh.GetSeqVector (CBioseq_Handle::eCoding_Iupac);
876 if (vec.IsInGap(end - 1)) {
877 if (vec.IsInGap (end)) {
878 // not ok - location overlaps gap
879 return false;
880 } else {
881 // ok, location abuts gap
882 }
883 } else {
884 if (end > 1 && IsResidue (vec[end - 1]) && ConsistentWithG(vec[end - 1])
885 && IsResidue(vec[end - 2]) && ConsistentWithA(vec[end - 2])) {
886 //it's ok, it's abutting "AG"
887 } else {
888 return false;
889 }
890 }
891 } else {
892 // it's ok, location endpoint is at the 5' end
893 }
894 }
895 return true;
896 }
897
898
x_CheckForRaggedEnd(const CSeq_feat & feat,CScope * scope)899 int CCDSTranslationProblems::x_CheckForRaggedEnd(const CSeq_feat& feat, CScope* scope)
900 {
901 int ragged = 0;
902 if (!feat.IsSetData() || !feat.GetData().IsCdregion() || !feat.IsSetLocation()) {
903 return 0;
904 }
905 unsigned int part_loc = SeqLocPartialCheck(feat.GetLocation(), scope);
906 if (feat.IsSetProduct()) {
907 unsigned int part_prod = SeqLocPartialCheck(feat.GetProduct(), scope);
908 if ((part_loc & eSeqlocPartial_Stop) ||
909 (part_prod & eSeqlocPartial_Stop)) {
910 // not ragged
911 } else {
912 // complete stop, so check for ragged end
913 ragged = x_CheckForRaggedEnd(feat.GetLocation(), feat.GetData().GetCdregion(), scope);
914 }
915 }
916 return ragged;
917 }
918
919
x_CheckForRaggedEnd(const CSeq_loc & loc,const CCdregion & cdregion,CScope * scope)920 int CCDSTranslationProblems::x_CheckForRaggedEnd
921 (const CSeq_loc& loc,
922 const CCdregion& cdregion,
923 CScope* scope)
924 {
925 size_t len = GetLength(loc, scope);
926 if (cdregion.GetFrame() > CCdregion::eFrame_one) {
927 len -= cdregion.GetFrame() - 1;
928 }
929
930 int ragged = len % 3;
931 if (ragged > 0) {
932 len = GetLength(loc, scope);
933 size_t last_pos = 0;
934
935 CSeq_loc::TRange range = CSeq_loc::TRange::GetEmpty();
936 if (cdregion.IsSetCode_break()) {
937 for (auto cbr = cdregion.GetCode_break().begin(); cbr != cdregion.GetCode_break().end(); cbr++) {
938 SRelLoc rl(loc, (*cbr)->GetLoc(), scope);
939 ITERATE(SRelLoc::TRanges, rit, rl.m_Ranges) {
940 if ((*rit)->GetTo() > last_pos) {
941 last_pos = (*rit)->GetTo();
942 }
943 }
944 }
945 }
946
947 // allowing a partial codon at the end
948 TSeqPos codon_length = range.GetLength();
949 if ((codon_length == 0 || codon_length == 1) &&
950 last_pos == len - 1) {
951 ragged = 0;
952 }
953 }
954 return ragged;
955 }
956
957
958 typedef enum {
959 eMRNAExcept_Biological = 1,
960 eMRNAExcept_RNAEditing = 2,
961 eMRNAExcept_Unclassified = 4,
962 eMRNAExcept_Mismatch = 8,
963 eMRNAExcept_ProductReplaced = 16
964 } EMRNAException;
965 static const char* const sc_BypassMrnaTransCheckText[] = {
966 "RNA editing",
967 "adjusted for low-quality genome",
968 "annotated by transcript or proteomic data",
969 "artificial frameshift",
970 "reasons given in citation",
971 "transcribed product replaced",
972 "unclassified transcription discrepancy",
973 };
974 typedef CStaticArraySet<const char*, PCase_CStr> TBypassMrnaTransCheckSet;
975 DEFINE_STATIC_ARRAY_MAP(TBypassMrnaTransCheckSet, sc_BypassMrnaTransCheck, sc_BypassMrnaTransCheckText);
976
977
InterpretMrnaException(const string & except_text)978 size_t InterpretMrnaException(const string& except_text)
979 {
980 size_t rval = 0;
981
982 ITERATE(TBypassMrnaTransCheckSet, it, sc_BypassMrnaTransCheck) {
983 if (NStr::FindNoCase(except_text, *it) != NPOS) {
984 rval |= eMRNAExcept_Biological;
985 }
986 }
987 if (NStr::FindNoCase(except_text, "RNA editing") != NPOS) {
988 rval |= eMRNAExcept_RNAEditing;
989 }
990 if (NStr::FindNoCase(except_text, "unclassified transcription discrepancy") != NPOS) {
991 rval |= eMRNAExcept_Unclassified;
992 }
993 if (NStr::FindNoCase(except_text, "mismatches in transcription") != NPOS) {
994 rval |= eMRNAExcept_Mismatch;
995 }
996 if (NStr::FindNoCase(except_text, "transcribed product replaced") != NPOS) {
997 rval |= eMRNAExcept_ProductReplaced;
998 }
999 return rval;
1000 }
1001
GetMRNATranslationProblems(const CSeq_feat & feat,size_t & mismatches,bool ignore_exceptions,CBioseq_Handle nuc,CBioseq_Handle rna,bool far_fetch,bool is_gpipe,bool is_genomic,CScope * scope)1002 size_t GetMRNATranslationProblems
1003 (const CSeq_feat& feat,
1004 size_t& mismatches,
1005 bool ignore_exceptions,
1006 CBioseq_Handle nuc,
1007 CBioseq_Handle rna,
1008 bool far_fetch,
1009 bool is_gpipe,
1010 bool is_genomic,
1011 CScope* scope)
1012 {
1013 size_t rval = 0;
1014 mismatches = 0;
1015 if (feat.CanGetPseudo() && feat.GetPseudo()) {
1016 return rval;
1017 }
1018 if (!feat.CanGetProduct()) {
1019 return rval;
1020 }
1021
1022 size_t exception_flags = 0;
1023
1024 if (!ignore_exceptions &&
1025 feat.CanGetExcept() && feat.GetExcept() &&
1026 feat.CanGetExcept_text()) {
1027 exception_flags = InterpretMrnaException(feat.GetExcept_text());
1028 }
1029 bool has_errors = false, other_than_mismatch = false;
1030 bool report_errors = !(exception_flags & eMRNAExcept_Biological) || (exception_flags & eMRNAExcept_Mismatch);
1031
1032 CConstRef<CSeq_id> product_id;
1033 try {
1034 product_id.Reset(&GetId(feat.GetProduct(), scope));
1035 } catch (CException&) {
1036 }
1037 if (!product_id) {
1038 return rval;
1039 }
1040
1041 if (!nuc) {
1042 if (exception_flags & eMRNAExcept_Unclassified) {
1043 rval |= eMRNAProblem_TransFail;
1044 }
1045 return rval;
1046 }
1047
1048 size_t total = 0;
1049
1050 // note - exception will be thrown when creating CSeqVector if wrong type set for bioseq seq-data
1051 try {
1052 if (nuc) {
1053
1054 if (!rna) {
1055 if (far_fetch) {
1056 rval |= eMRNAProblem_UnableToFetch;
1057 }
1058 return rval;
1059 }
1060
1061 _ASSERT(nuc && rna);
1062
1063 CSeqVector nuc_vec(feat.GetLocation(), *scope,
1064 CBioseq_Handle::eCoding_Iupac);
1065 CSeqVector rna_vec(rna, CBioseq_Handle::eCoding_Iupac);
1066
1067 TSeqPos nuc_len = nuc_vec.size();
1068 TSeqPos rna_len = rna_vec.size();
1069
1070 if (nuc_len != rna_len) {
1071 has_errors = true;
1072 other_than_mismatch = true;
1073 if (nuc_len < rna_len) {
1074 size_t count_a = 0, count_no_a = 0;
1075 // count 'A's in the tail
1076 for (CSeqVector_CI iter(rna_vec, nuc_len); iter; ++iter) {
1077 if ((*iter == 'A') || (*iter == 'a')) {
1078 ++count_a;
1079 } else {
1080 ++count_no_a;
1081 }
1082 }
1083 if (count_a < (19 * count_no_a)) { // less then 5%
1084 if (report_errors || (exception_flags & eMRNAExcept_RNAEditing)) {
1085 rval |= eMRNAProblem_TranscriptLenLess;
1086 }
1087 } else if (count_a > 0 && count_no_a == 0) {
1088 has_errors = true;
1089 other_than_mismatch = true;
1090 if (report_errors || (exception_flags & eMRNAExcept_RNAEditing)) {
1091 if (is_gpipe && is_genomic) {
1092 // suppress
1093 } else {
1094 rval |= eMRNAProblem_PolyATail100;
1095 }
1096 }
1097 } else {
1098 if (report_errors) {
1099 rval |= eMRNAProblem_PolyATail95;
1100 }
1101 }
1102 // allow base-by-base comparison on common length
1103 rna_len = nuc_len = min(nuc_len, rna_len);
1104
1105 } else {
1106 if (report_errors) {
1107 rval |= eMRNAProblem_TranscriptLenMore;
1108 }
1109 }
1110 }
1111
1112 if (rna_len == nuc_len && nuc_len > 0) {
1113 CSeqVector_CI nuc_ci(nuc_vec);
1114 CSeqVector_CI rna_ci(rna_vec);
1115
1116 // compare content of common length
1117 while ((nuc_ci && rna_ci) && (nuc_ci.GetPos() < nuc_len)) {
1118 if (*nuc_ci != *rna_ci) {
1119 ++mismatches;
1120 }
1121 ++nuc_ci;
1122 ++rna_ci;
1123 ++total;
1124 }
1125 if (mismatches > 0) {
1126 has_errors = true;
1127 if (report_errors && !(exception_flags & eMRNAExcept_Mismatch)) {
1128 rval |= eMRNAProblem_Mismatch;
1129 }
1130 }
1131 }
1132 }
1133 } catch (CException&) {
1134 rval |= eMRNAProblem_TransFail;
1135 } catch (std::exception) {
1136 }
1137
1138 if (!report_errors) {
1139 if (!has_errors) {
1140 rval |= eMRNAProblem_UnnecessaryException;
1141 } else if ((exception_flags & eMRNAExcept_Unclassified) && !other_than_mismatch) {
1142 if (mismatches * 50 <= total) {
1143 rval |= eMRNAProblem_ErroneousException;
1144 }
1145 } else if (exception_flags & eMRNAExcept_ProductReplaced) {
1146 rval |= eMRNAProblem_ProductReplaced;
1147 }
1148 }
1149 return rval;
1150 }
1151
1152
1153
1154 END_SCOPE(validator)
1155 END_SCOPE(objects)
1156 END_NCBI_SCOPE
1157