1 /*  $Id: translation_problems.cpp 632625 2021-06-03 17:38:33Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Colleen Bollin
27  *
28  * File Description:
29  *   detecting translation problems
30  *   .......
31  *
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbistr.hpp>
36 #include <objtools/validator/translation_problems.hpp>
37 #include <objtools/validator/utilities.hpp>
38 
39 #include <objmgr/seq_vector.hpp>
40 #include <objmgr/util/sequence.hpp>
41 #include <objects/seqfeat/Cdregion.hpp>
42 #include <objects/seqfeat/Code_break.hpp>
43 #include <objects/seqfeat/Gb_qual.hpp>
44 #include <objects/seqfeat/Genetic_code.hpp>
45 #include <util/sequtil/sequtil_convert.hpp>
46 
47 BEGIN_NCBI_SCOPE
48 BEGIN_SCOPE(objects)
49 BEGIN_SCOPE(validator)
50 using namespace sequence;
51 
52 
CCDSTranslationProblems()53 CCDSTranslationProblems::CCDSTranslationProblems()
54 {
55     x_Reset();
56 }
57 
58 
x_Reset()59 void CCDSTranslationProblems::x_Reset()
60 {
61     m_ProblemFlags = 0;
62     m_RaggedLength = 0;
63     m_TransLen = 0;
64     m_ProtLen = 0;
65     m_HasDashXStart = false;
66     m_TranslStart = 0;
67     m_InternalStopCodons = 0;
68     m_TranslTerminalX = 0;
69     m_ProdTerminalX = 0;
70     m_UnableToTranslate = false;
71     m_AltStart = false;
72     m_UnparsedTranslExcept = false;
73     m_NumNonsenseIntrons = 0;
74     m_HasException = false;
75 }
76 
77 
CalculateTranslationProblems(const CSeq_feat & feat,CBioseq_Handle loc_handle,CBioseq_Handle prot_handle,bool ignore_exceptions,bool far_fetch_cds,bool standalone_annot,bool single_seq,bool is_gpipe,bool is_genomic,bool is_refseq,bool is_nt_or_ng_or_nw,bool is_nc,bool has_accession,CScope * scope)78 void CCDSTranslationProblems::CalculateTranslationProblems
79 (const CSeq_feat& feat,
80 CBioseq_Handle loc_handle,
81 CBioseq_Handle prot_handle,
82 bool ignore_exceptions,
83 bool far_fetch_cds,
84 bool standalone_annot,
85 bool single_seq,
86 bool is_gpipe,
87 bool is_genomic,
88 bool is_refseq,
89 bool is_nt_or_ng_or_nw,
90 bool is_nc,
91 bool has_accession,
92 CScope* scope)
93 {
94     x_Reset();
95     // bail if not CDS
96     if (!feat.GetData().IsCdregion()) {
97         return;
98     }
99 
100     // do not validate for pseudo gene
101     if (feat.IsSetQual()) {
102         for (auto it = feat.GetQual().begin(); it != feat.GetQual().end(); it++) {
103              if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "pseudo")) {
104                 return;
105             }
106         }
107     }
108 
109     bool has_errors = false, unclassified_except = false,
110         mismatch_except = false, frameshift_except = false,
111         rearrange_except = false, product_replaced = false,
112         mixed_population = false, low_quality = false,
113         report_errors = true, other_than_mismatch = false,
114         rna_editing = false, transcript_or_proteomic = false;
115     string farstr;
116 
117     if (!ignore_exceptions &&
118         feat.IsSetExcept() && feat.GetExcept() &&
119         feat.IsSetExcept_text()) {
120         const string& except_text = feat.GetExcept_text();
121         report_errors = ReportTranslationErrors(except_text);
122         x_GetExceptionFlags(except_text,
123             unclassified_except,
124             mismatch_except,
125             frameshift_except,
126             rearrange_except,
127             product_replaced,
128             mixed_population,
129             low_quality,
130             rna_editing,
131             transcript_or_proteomic);
132     }
133 
134     m_HasException = !report_errors;
135 
136     // check frame
137     m_ProblemFlags |= x_CheckCDSFrame(feat, scope);
138 
139     // check for unparsed transl_except
140     if (feat.IsSetQual()) {
141         for (auto it = feat.GetQual().begin(); it != feat.GetQual().end(); it++) {
142             if ((*it)->IsSetQual() && NStr::Equal((*it)->GetQual(), "transl_except")) {
143                 m_UnparsedTranslExcept = true;
144             }
145         }
146     }
147 
148     string transl_prot;   // translated protein
149     bool got_stop = false;
150 
151     try {
152         transl_prot = TranslateCodingRegionForValidation(feat, *scope, m_AltStart);
153     } catch (CException&) {
154         m_UnableToTranslate = true;
155     }
156 
157     if (NStr::EndsWith(transl_prot, "*")) {
158         got_stop = true;
159     }
160 
161     if (HasBadStartCodon(feat.GetLocation(), transl_prot)) {
162         m_TranslStart = transl_prot.c_str()[0];
163         m_ProblemFlags |= eCDSTranslationProblem_BadStart;
164     }
165 
166     bool no_product = true;
167 
168     if (!m_UnableToTranslate) {
169 
170         // check for code break not on a codon
171         m_TranslExceptProblems = x_GetTranslExceptProblems(feat, loc_handle, scope, is_refseq);
172 
173         m_NumNonsenseIntrons = x_CountNonsenseIntrons(feat, scope);
174         if (x_ProteinHasTooManyXs(transl_prot)) {
175             m_ProblemFlags |= eCDSTranslationProblem_TooManyX;
176         }
177 
178         m_InternalStopCodons = CountInternalStopCodons(transl_prot);
179         if (m_InternalStopCodons >  5) {
180             // stop checking if too many stop codons
181             return;
182         }
183 
184         // get protein sequence
185 
186         if (!prot_handle) {
187             const CSeq_id* protid = 0;
188             try {
189                 protid = &GetId(feat.GetProduct(), scope);
190             } catch (CException&) {}
191             if (protid && far_fetch_cds && feat.IsSetProduct()) {
192                 m_ProblemFlags |= eCDSTranslationProblem_UnableToFetch;
193             } else if (protid && (!far_fetch_cds || feat.IsSetProduct())) {
194                 // don't report missing protein sequence
195             } else if (!standalone_annot && transl_prot.length() > 6) {
196                 if (!is_nt_or_ng_or_nw && (!is_nc || !single_seq)) {
197                     m_ProblemFlags |= eCDSTranslationProblem_NoProtein;
198                 }
199             }
200         }
201 
202         bool show_stop = true;
203 
204         if (prot_handle && prot_handle.IsAa()) {
205             no_product = false;
206 
207             CSeqVector prot_vec = prot_handle.GetSeqVector();
208             prot_vec.SetCoding(CSeq_data::e_Ncbieaa);
209 
210 
211             CalculateEffectiveTranslationLengths(transl_prot, prot_vec, m_TransLen, m_ProtLen);
212 
213             if (m_TransLen == m_ProtLen || has_accession)  {                // could be identical
214                 if (prot_vec.size() > 0 && transl_prot.length() > 0 &&
215                     prot_vec[0] != transl_prot[0]) {
216                     bool no_beg, no_end;
217                     FeatureHasEnds(feat, scope, no_beg, no_end);
218                     if (feat.IsSetPartial() && feat.GetPartial() && (!no_beg) && (!no_end)) {
219                         m_ProblemFlags |= eCDSTranslationProblem_ShouldStartPartial;
220                     } else if (transl_prot[0] == '-' || transl_prot[0] == 'X') {
221                         m_HasDashXStart = true;
222                     }
223                 }
224                 m_Mismatches = x_GetTranslationMismatches(feat, prot_vec, transl_prot, has_accession);
225             }
226 
227             if (feat.CanGetPartial() && feat.GetPartial() &&
228                 m_Mismatches.size() == 0) {
229                 bool no_beg, no_end;
230                 FeatureHasEnds(feat, scope, no_beg, no_end);
231                 if (!no_beg  && !no_end) {
232                     if (report_errors) {
233                         if (is_gpipe && is_genomic) {
234                             // suppress in gpipe genomic
235                         } else {
236                             if (!got_stop) {
237                                 m_ProblemFlags |= eCDSTranslationProblem_ShouldBePartialButIsnt;
238                             } else {
239                                 m_ProblemFlags |= eCDSTranslationProblem_ShouldNotBePartialButIs;
240                             }
241                         }
242                     }
243                     show_stop = false;
244                 }
245             }
246 
247             if (!transl_prot.empty()) {
248                 // look for discrepancy in number of terminal Xs between product and translation
249                 m_TranslTerminalX = x_CountTerminalXs(transl_prot, (got_stop && (transl_prot.length() == prot_vec.size() + 1)));
250                 m_ProdTerminalX = x_CountTerminalXs(prot_vec);
251 
252             }
253 
254         }
255 
256         x_GetCdTransErrors(feat, prot_handle, show_stop, got_stop, scope);
257 
258     }
259 
260     if (x_JustifiesException()) {
261         has_errors = true;
262         other_than_mismatch = true;
263     } else if (m_Mismatches.size() > 0) {
264         has_errors = true;
265     }
266 
267     if (!report_errors && !no_product) {
268         if (!has_errors) {
269             if (!frameshift_except && !rearrange_except && !mixed_population && !low_quality) {
270                 m_ProblemFlags |= eCDSTranslationProblem_UnnecessaryException;
271             }
272         } else if (unclassified_except && !other_than_mismatch) {
273             if (m_Mismatches.size() * 50 <= m_ProtLen) {
274                 m_ProblemFlags |= eCDSTranslationProblem_ErroneousException;
275             }
276         } else if (product_replaced) {
277             m_ProblemFlags |= eCDSTranslationProblem_UnqualifiedException;
278         }
279     }
280 
281 }
282 
283 
x_GetCdTransErrors(const CSeq_feat & feat,CBioseq_Handle product,bool show_stop,bool got_stop,CScope * scope)284 void CCDSTranslationProblems::x_GetCdTransErrors(const CSeq_feat& feat, CBioseq_Handle product, bool show_stop, bool got_stop, CScope* scope)
285 {
286     bool no_beg, no_end;
287     FeatureHasEnds(feat, product ? &(product.GetScope()) : scope, no_beg, no_end);
288     if (show_stop) {
289         if (!got_stop  && !no_end) {
290             m_ProblemFlags |= eCDSTranslationProblem_NoStop;
291         } else if (got_stop  &&  no_end) {
292             m_ProblemFlags |= eCDSTranslationProblem_StopPartial;
293         } else if (got_stop  &&  !no_end) {
294             m_RaggedLength = x_CheckForRaggedEnd(feat, scope);
295         }
296     }
297 }
298 
x_IsThreeBaseNonsense(const CSeq_feat & feat,const CSeq_id & id,const CCdregion & cdr,TSeqPos start,TSeqPos stop,ENa_strand strand,CScope * scope)299 bool CCDSTranslationProblems::x_IsThreeBaseNonsense(
300     const CSeq_feat& feat,
301     const CSeq_id& id,
302     const CCdregion& cdr,
303     TSeqPos start,
304     TSeqPos stop,
305     ENa_strand strand,
306     CScope *scope
307 )
308 
309 {
310     bool nonsense_intron = false;
311     CRef<CSeq_feat> tmp_cds (new CSeq_feat());
312 
313     tmp_cds->SetLocation().SetInt().SetFrom(start);
314     tmp_cds->SetLocation().SetInt().SetTo(stop);
315     tmp_cds->SetLocation().SetInt().SetStrand(strand);
316     tmp_cds->SetLocation().SetInt().SetId().Assign(id);
317 
318     tmp_cds->SetLocation().SetPartialStart(true, eExtreme_Biological);
319     tmp_cds->SetLocation().SetPartialStop(true, eExtreme_Biological);
320     tmp_cds->SetData().SetCdregion();
321     if ( cdr.IsSetCode()) {
322         tmp_cds->SetData().SetCdregion().SetCode().Assign(cdr.GetCode());
323     }
324 
325     bool alt_start = false;
326     try {
327         string transl_prot = TranslateCodingRegionForValidation(*tmp_cds, *scope, alt_start);
328 
329         if (NStr::Equal(transl_prot, "*")) {
330             nonsense_intron = true;
331         }
332     } catch (CException&) {
333     }
334     return nonsense_intron;
335 }
336 
337 
x_CountNonsenseIntrons(const CSeq_feat & feat,CScope * scope)338 size_t CCDSTranslationProblems::x_CountNonsenseIntrons(const CSeq_feat& feat, CScope* scope)
339 {
340     TSeqPos last_start = 0, last_stop = 0, start, stop;
341 
342     if (!feat.IsSetData() && !feat.GetData().IsCdregion()) {
343         return 0;
344     }
345     if (feat.IsSetExcept() || feat.IsSetExcept_text()) return 0;
346     const CCdregion& cdr = feat.GetData().GetCdregion();
347     if (cdr.IsSetCode_break()) return 0;
348 
349     size_t count = 0;
350     const CSeq_loc& loc = feat.GetLocation();
351 
352     CSeq_loc_CI prev;
353     for (CSeq_loc_CI curr(loc); curr; ++curr) {
354         start = curr.GetRange().GetFrom();
355         stop = curr.GetRange().GetTo();
356         if (prev  &&  curr  && IsSameBioseq(curr.GetSeq_id(), prev.GetSeq_id(), scope)) {
357             ENa_strand strand = curr.GetStrand();
358             if (strand == eNa_strand_minus) {
359                 if (last_start - stop == 4) {
360                     if (x_IsThreeBaseNonsense(feat, curr.GetSeq_id(), cdr, stop + 1, last_start - 1, strand, scope)) {
361                         count++;
362                     }
363                 }
364             } else {
365                 if (start - last_stop == 4) {
366                     if (x_IsThreeBaseNonsense(feat, curr.GetSeq_id(), cdr, last_stop + 1, start - 1, strand, scope)) {
367                         count++;
368                     }
369                 }
370             }
371         }
372         last_start = start;
373         last_stop = stop;
374         prev = curr;
375     }
376 
377     if (count > 0 && sequence::IsPseudo(feat, *scope)) return 0;
378     else return count;
379 }
380 
381 
GetNonsenseIntrons(const CSeq_feat & feat,CScope & scope)382 vector<CRef<CSeq_loc> > CCDSTranslationProblems::GetNonsenseIntrons(const CSeq_feat& feat, CScope& scope)
383 {
384     vector<CRef<CSeq_loc> > intron_locs;
385 
386     TSeqPos last_start = 0, last_stop = 0, start, stop;
387 
388     if (!feat.IsSetData() && !feat.GetData().IsCdregion()) {
389         return intron_locs;
390     }
391     if (feat.IsSetExcept() || feat.IsSetExcept_text()) return intron_locs;
392     const CCdregion& cdr = feat.GetData().GetCdregion();
393     if (cdr.IsSetCode_break()) return intron_locs;
394 
395     const CSeq_loc& loc = feat.GetLocation();
396 
397     CSeq_loc_CI prev;
398     for (CSeq_loc_CI curr(loc); curr; ++curr) {
399         start = curr.GetRange().GetFrom();
400         stop = curr.GetRange().GetTo();
401         if (prev  &&  curr  && IsSameBioseq(curr.GetSeq_id(), prev.GetSeq_id(), &scope)) {
402             ENa_strand strand = curr.GetStrand();
403             if (strand == eNa_strand_minus) {
404                 if (last_start - stop == 4) {
405                     if (x_IsThreeBaseNonsense(feat, curr.GetSeq_id(), cdr, stop + 1, last_start - 1, strand, &scope)) {
406                         CRef<CSeq_id> id(new CSeq_id());
407                         id->Assign(curr.GetSeq_id());
408                         CRef<CSeq_loc> intron_loc(new CSeq_loc(*id, stop + 1, last_start - 1, strand));
409                         intron_locs.push_back(intron_loc);
410                     }
411                 }
412             } else {
413                 if (start - last_stop == 4) {
414                     if (x_IsThreeBaseNonsense(feat, curr.GetSeq_id(), cdr, last_stop + 1, start - 1, strand, &scope)) {
415                         CRef<CSeq_id> id(new CSeq_id());
416                         id->Assign(curr.GetSeq_id());
417                         CRef<CSeq_loc> intron_loc(new CSeq_loc(*id, last_stop + 1, start - 1, strand));
418                         intron_locs.push_back(intron_loc);
419                     }
420                 }
421             }
422         }
423         last_start = start;
424         last_stop = stop;
425         prev = curr;
426     }
427 
428     if (intron_locs.size() > 0 && sequence::IsPseudo(feat, scope)) {
429         intron_locs.clear();
430     }
431     return intron_locs;
432 
433 }
434 
435 
x_ProteinHasTooManyXs(const string & transl_prot)436 bool CCDSTranslationProblems::x_ProteinHasTooManyXs(const string& transl_prot)
437 {
438     size_t num_x = 0, num_nonx = 0;
439 
440     ITERATE(string, it, transl_prot) {
441         if (*it == 'X') {
442             num_x++;
443         } else {
444             num_nonx++;
445         }
446     }
447 
448     // report too many Xs
449     if (num_x > num_nonx) {
450         return true;
451     } else {
452         return false;
453     }
454 }
455 
456 
x_GetExceptionFlags(const string & except_text,bool & unclassified_except,bool & mismatch_except,bool & frameshift_except,bool & rearrange_except,bool & product_replaced,bool & mixed_population,bool & low_quality,bool & rna_editing,bool & transcript_or_proteomic)457 void CCDSTranslationProblems::x_GetExceptionFlags
458 (const string& except_text,
459  bool& unclassified_except,
460  bool& mismatch_except,
461  bool& frameshift_except,
462  bool& rearrange_except,
463  bool& product_replaced,
464  bool& mixed_population,
465  bool& low_quality,
466  bool& rna_editing,
467  bool& transcript_or_proteomic)
468 {
469     if (NStr::FindNoCase(except_text, "unclassified translation discrepancy") != NPOS) {
470         unclassified_except = true;
471     }
472     if (NStr::FindNoCase(except_text, "mismatches in translation") != NPOS) {
473         mismatch_except = true;
474     }
475     if (NStr::FindNoCase(except_text, "artificial frameshift") != NPOS) {
476         frameshift_except = true;
477     }
478     if (NStr::FindNoCase(except_text, "rearrangement required for product") != NPOS) {
479         rearrange_except = true;
480     }
481     if (NStr::FindNoCase(except_text, "translated product replaced") != NPOS) {
482         product_replaced = true;
483     }
484     if (NStr::FindNoCase(except_text, "heterogeneous population sequenced") != NPOS) {
485         mixed_population = true;
486     }
487     if (NStr::FindNoCase(except_text, "low-quality sequence region") != NPOS) {
488         low_quality = true;
489     }
490     if (NStr::FindNoCase (except_text, "RNA editing") != NPOS) {
491         rna_editing = true;
492     }
493 }
494 
495 
x_CheckCDSFrame(const CSeq_feat & feat,CScope * scope)496 size_t CCDSTranslationProblems::x_CheckCDSFrame(const CSeq_feat& feat, CScope* scope)
497 {
498     size_t rval = 0;
499 
500     const CCdregion& cdregion = feat.GetData().GetCdregion();
501     const CSeq_loc& location = feat.GetLocation();
502     unsigned int part_loc = SeqLocPartialCheck(location, scope);
503     string comment_text;
504     if (feat.IsSetComment()) {
505         comment_text = feat.GetComment();
506     }
507 
508     // check frame
509     if (cdregion.IsSetFrame()
510         && (cdregion.GetFrame() == CCdregion::eFrame_two || cdregion.GetFrame() == CCdregion::eFrame_three)) {
511         // coding region should be 5' partial
512         if (!(part_loc & eSeqlocPartial_Start)) {
513             rval |= eCDSTranslationProblem_FrameNotPartial;
514         } else if ((part_loc & eSeqlocPartial_Start) && !x_Is5AtEndSpliceSiteOrGap(location, *scope)) {
515             if (s_PartialAtGapOrNs(scope, location, eSeqlocPartial_Nostart, true)
516                 || NStr::Find(comment_text, "coding region disrupted by sequencing gap") != string::npos) {
517                 // suppress
518             } else {
519                 rval |= eCDSTranslationProblem_FrameNotConsensus;
520             }
521         }
522     }
523     return rval;
524 }
525 
526 
x_JustifiesException() const527 bool CCDSTranslationProblems::x_JustifiesException() const
528 {
529     if (m_ProblemFlags & eCDSTranslationProblem_FrameNotPartial ||
530         m_ProblemFlags & eCDSTranslationProblem_FrameNotConsensus ||
531         m_ProblemFlags & eCDSTranslationProblem_NoStop ||
532         m_ProblemFlags & eCDSTranslationProblem_StopPartial ||
533         m_ProblemFlags & eCDSTranslationProblem_PastStop ||
534         m_ProblemFlags & eCDSTranslationProblem_ShouldStartPartial ||
535         m_ProblemFlags & eCDSTranslationProblem_BadStart ||
536         m_ProblemFlags & eCDSTranslationProblem_NoProtein ||
537         m_ProtLen != m_TransLen ||
538         m_InternalStopCodons > 0 ||
539         m_RaggedLength > 0 || m_HasDashXStart ||
540         m_UnableToTranslate) {
541         return true;
542     } else if (x_JustifiesException(m_TranslExceptProblems)) {
543         return true;
544     } else {
545         return false;
546     }
547 }
548 
549 
x_CountTerminalXs(const string & transl_prot,bool skip_stop)550 size_t CCDSTranslationProblems::x_CountTerminalXs(const string& transl_prot, bool skip_stop)
551 {
552     // look for discrepancy in number of terminal Xs between product and translation
553     size_t transl_terminal_x = 0;
554     size_t i = transl_prot.length() - 1;
555     if (i > 0 && transl_prot[i] == '*' && skip_stop) {
556         i--;
557     }
558     while (i > 0) {
559         if (transl_prot[i] == 'X') {
560             transl_terminal_x++;
561         } else {
562             break;
563         }
564         i--;
565     }
566     if (i == 0 && transl_prot[0] == 'X') {
567         transl_terminal_x++;
568     }
569     return transl_terminal_x;
570 }
571 
x_CountTerminalXs(const CSeqVector & prot_vec)572 size_t CCDSTranslationProblems::x_CountTerminalXs(const CSeqVector& prot_vec)
573 {
574     size_t prod_terminal_x = 0;
575     TSeqPos prod_len = prot_vec.size() - 1;
576     while (prod_len > 0 && prot_vec[prod_len] == 'X') {
577         prod_terminal_x++;
578         prod_len--;
579     }
580     if (prod_len == 0 && prot_vec[prod_len] == 'X') {
581         prod_terminal_x++;
582     }
583     return prod_terminal_x;
584 }
585 
586 
587 CCDSTranslationProblems::TTranslationMismatches
x_GetTranslationMismatches(const CSeq_feat & feat,const CSeqVector & prot_vec,const string & transl_prot,bool has_accession)588 CCDSTranslationProblems::x_GetTranslationMismatches(const CSeq_feat& feat, const CSeqVector& prot_vec, const string& transl_prot, bool has_accession)
589 {
590     TTranslationMismatches mismatches;
591     size_t prot_len;
592     size_t len;
593 
594     CalculateEffectiveTranslationLengths(transl_prot, prot_vec, len, prot_len);
595 
596     if (len == prot_len || has_accession)  {                // could be identical
597         if (len > prot_len) {
598             len = prot_len;
599         }
600         for (TSeqPos i = 0; i < len; ++i) {
601             CSeqVectorTypes::TResidue p_res = prot_vec[i];
602             CSeqVectorTypes::TResidue t_res = transl_prot[i];
603 
604             if (t_res != p_res) {
605                 if (i == 0) {
606                     bool no_beg, no_end;
607                     FeatureHasEnds(feat, &(prot_vec.GetScope()), no_beg, no_end);
608                     if (feat.IsSetPartial() && feat.GetPartial() && (!no_beg) && (!no_end)) {
609                     } else if (t_res == '-') {
610                     } else {
611                         mismatches.push_back({ p_res, t_res, i });
612                     }
613                 } else {
614                     mismatches.push_back({ p_res, t_res, i });
615                 }
616             }
617         }
618     }
619     return mismatches;
620 }
621 
622 
623 
624 
625 
x_LeuCUGstart(const CSeq_feat & feat)626 static bool x_LeuCUGstart
627 (
628     const CSeq_feat& feat
629 )
630 
631 {
632     if ( ! feat.IsSetExcept()) return false;
633     if ( ! feat.IsSetExcept_text()) return false;
634     const string& except_text = feat.GetExcept_text();
635     if (NStr::FindNoCase(except_text, "translation initiation by tRNA-Leu at CUG codon") == NPOS) return false;
636     if (feat.IsSetQual()) {
637         for (auto it = feat.GetQual().begin(); it != feat.GetQual().end(); it++) {
638             const CGb_qual& qual = **it;
639             if (qual.IsSetQual() && NStr::Compare(qual.GetQual(), "experiment") == 0) return true;
640         }
641     }
642     return false;
643 }
644 
645 
646 CCDSTranslationProblems::TTranslExceptProblems
x_GetTranslExceptProblems(const CSeq_feat & feat,CBioseq_Handle loc_handle,CScope * scope,bool is_refseq)647 CCDSTranslationProblems::x_GetTranslExceptProblems
648 (const CSeq_feat& feat, CBioseq_Handle loc_handle, CScope* scope, bool is_refseq)
649 {
650     TTranslExceptProblems problems;
651 
652     if (!feat.IsSetData() || !feat.GetData().IsCdregion() || !feat.GetData().GetCdregion().IsSetCode_break()) {
653         return problems;
654     }
655 
656     TSeqPos len = GetLength(feat.GetLocation(), scope);
657     bool alt_start = false;
658 
659     // need to translate a version of the coding region without the code breaks,
660     // to see if the code breaks are necessary
661     const CCdregion& cdregion = feat.GetData().GetCdregion();
662     CRef<CSeq_feat> tmp_cds(new CSeq_feat());
663     tmp_cds->SetLocation().Assign(feat.GetLocation());
664     tmp_cds->SetData().SetCdregion();
665     if (cdregion.IsSetFrame()) {
666         tmp_cds->SetData().SetCdregion().SetFrame(cdregion.GetFrame());
667     }
668     if (cdregion.IsSetCode()) {
669         tmp_cds->SetData().SetCdregion().SetCode().Assign(cdregion.GetCode());
670     }
671 
672     // now, will use tmp_cds to translate individual code breaks;
673     tmp_cds->SetData().SetCdregion().ResetFrame();
674 
675     const CSeq_loc& loc = feat.GetLocation();
676 
677     for (auto cbr = cdregion.GetCode_break().begin(); cbr != cdregion.GetCode_break().end(); cbr++) {
678         if (!(*cbr)->IsSetLoc()) {
679             continue;
680         }
681         // if the code break is outside the coding region, skip
682         ECompare comp = Compare((*cbr)->GetLoc(), loc,
683             scope, fCompareOverlapping);
684         if ((comp != eContained) && (comp != eSame)) {
685             continue;
686         }
687 
688         TSeqPos codon_length = GetLength((*cbr)->GetLoc(), scope);
689         TSeqPos from = LocationOffset(loc, (*cbr)->GetLoc(),
690             eOffset_FromStart, scope);
691 
692         TSeqPos from_end = LocationOffset(loc, (*cbr)->GetLoc(),
693             eOffset_FromEnd, scope);
694 
695         TSeqPos to = from + codon_length - 1;
696 
697         // check for code break not on a codon
698         if (codon_length == 3 ||
699             ((codon_length == 1 || codon_length == 2) && to == len - 1)) {
700             size_t start_pos;
701             switch (cdregion.GetFrame()) {
702             case CCdregion::eFrame_two:
703                 start_pos = 1;
704                 break;
705             case CCdregion::eFrame_three:
706                 start_pos = 2;
707                 break;
708             default:
709                 start_pos = 0;
710                 break;
711             }
712             if ((from % 3) != start_pos) {
713                 problems.push_back({ eTranslExceptPhase, 0, 0 });
714             }
715         }
716         if ((*cbr)->IsSetAa() && (*cbr)->IsSetLoc()) {
717             tmp_cds->SetLocation().Assign((*cbr)->GetLoc());
718             tmp_cds->SetLocation().SetPartialStart(true, eExtreme_Biological);
719             tmp_cds->SetLocation().SetPartialStop(true, eExtreme_Biological);
720             string cb_trans;
721             try {
722                 CSeqTranslator::Translate(*tmp_cds, *scope, cb_trans,
723                     true,   // include stop codons
724                     false,  // do not remove trailing X/B/Z
725                     &alt_start);
726             } catch (CException&) {
727             }
728             size_t prot_pos = from / 3;
729 
730             unsigned char ex = 0;
731             vector<char> seqData;
732             string str;
733             bool not_set = false;
734 
735             switch ((*cbr)->GetAa().Which()) {
736             case CCode_break::C_Aa::e_Ncbi8aa:
737                 str = (*cbr)->GetAa().GetNcbi8aa();
738                 CSeqConvert::Convert(str, CSeqUtil::e_Ncbi8aa, (TSeqPos)0, (TSeqPos)(str.size()), seqData, CSeqUtil::e_Ncbieaa);
739                 ex = seqData[0];
740                 break;
741             case CCode_break::C_Aa::e_Ncbistdaa:
742                 str = (*cbr)->GetAa().GetNcbi8aa();
743                 CSeqConvert::Convert(str, CSeqUtil::e_Ncbistdaa, (TSeqPos)0, (TSeqPos)(str.size()), seqData, CSeqUtil::e_Ncbieaa);
744                 ex = seqData[0];
745                 break;
746             case CCode_break::C_Aa::e_Ncbieaa:
747                 seqData.push_back((*cbr)->GetAa().GetNcbieaa());
748                 ex = seqData[0];
749                 break;
750             default:
751                 // do nothing, code break wasn't actually set
752                 not_set = true;
753                 break;
754             }
755 
756             if (!not_set) {
757                 string except_char;
758                 except_char += ex;
759 
760                 //At the beginning of the CDS
761                 if (prot_pos == 0 && ex != 'M') {
762                     if (prot_pos == 0 && ex == 'L' && x_LeuCUGstart(feat) && is_refseq) {
763                         /* do not warn on explicitly documented unusual translation initiation at CUG without initiator tRNA-Met */
764                     } else if ((!feat.IsSetPartial()) || (!feat.GetPartial())) {
765                         problems.push_back({ eTranslExceptSuspicious, ex, prot_pos });
766                     }
767                 }
768 
769                 // Anywhere in CDS, where exception has no effect
770                 if (from_end > 0) {
771                     if (from_end < 2 && NStr::Equal(except_char, "*")) {
772                         // this is a necessary terminal transl_except
773                     } else if (NStr::EqualNocase(cb_trans, except_char)) {
774                         if (prot_pos == 0 && ex == 'L' && x_LeuCUGstart(feat) && is_refseq) {
775                             /* do not warn on explicitly documented unusual translation initiation at CUG without initiator tRNA-Met */
776                         } else {
777                             problems.push_back({ eTranslExceptUnnecessary, ex, prot_pos });
778                         }
779                     }
780                 } else if (!NStr::Equal(except_char, "*"))
781                 {
782                     if (NStr::Equal(cb_trans, except_char) ||
783                         !loc.IsPartialStop(eExtreme_Biological))
784                     {
785                         const CGenetic_code  *gcode;
786                         CBioseq_Handle       bsh;
787                         CSeqVector           vec;
788                         TSeqPos              from1;
789                         string               p;
790                         string               q;
791                         bool                 altst;
792 
793                         vec = loc_handle.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
794                         altst = false;
795                         from1 = (*cbr)->GetLoc().GetStart(eExtreme_Biological);
796                         vec.GetSeqData(from1, from1 + 3, q);
797 
798                         if (cdregion.CanGetCode())
799                             gcode = &cdregion.GetCode();
800                         else
801                             gcode = NULL;
802 
803                         CSeqTranslator::Translate(q, p,
804                             CSeqTranslator::fIs5PrimePartial,
805                             gcode, &altst);
806 
807                         if (!NStr::Equal(except_char, "*")) {
808                             problems.push_back({ eTranslExceptUnexpected, ex, prot_pos });
809                         }
810                     }
811                 } else
812                 {
813                     /*bsv
814                     Stop codon here. Needs a check for abutting tRNA feature.
815                     bsv*/
816                 }
817             }
818         }
819     }
820     return problems;
821 }
822 
823 
x_JustifiesException(const TTranslExceptProblems & problems)824 bool CCDSTranslationProblems::x_JustifiesException(const TTranslExceptProblems& problems)
825 {
826     for (auto it = problems.begin(); it != problems.end(); it++) {
827         if (it->problem == eTranslExceptPhase) {
828             return true;
829         }
830     }
831     return false;
832 }
833 
834 
x_Is5AtEndSpliceSiteOrGap(const CSeq_loc & loc,CScope & scope)835 bool CCDSTranslationProblems::x_Is5AtEndSpliceSiteOrGap(const CSeq_loc& loc, CScope& scope)
836 {
837     CSeq_loc_CI loc_it(loc);
838     if (!loc_it) {
839         return false;
840     }
841     CConstRef<CSeq_loc> rng = loc_it.GetRangeAsSeq_loc();
842     if (!rng) {
843         return false;
844     }
845 
846     TSeqPos end = rng->GetStart(eExtreme_Biological);
847     const CBioseq_Handle & bsh = scope.GetBioseqHandle(*rng);
848     if (!bsh) {
849         return false;
850     }
851 
852     ENa_strand strand = rng->GetStrand();
853     if (strand == eNa_strand_minus) {
854           TSeqPos seq_len = bsh.GetBioseqLength();
855           if (end < seq_len - 1) {
856               CSeqVector vec = bsh.GetSeqVector (CBioseq_Handle::eCoding_Iupac);
857               if (vec.IsInGap(end + 1)) {
858                   if (vec.IsInGap (end)) {
859                       // not ok - location overlaps gap
860                       return false;
861                   } else {
862                       // ok, location abuts gap
863                   }
864               } else if (end < seq_len - 2 && IsResidue (vec[end + 1]) && ConsistentWithC(vec[end + 1])
865                          && IsResidue (vec[end + 2]) && ConsistentWithT(vec[end + 2])) {
866                   // it's ok, it's abutting the reverse complement of AG
867               } else {
868                   return false;
869               }
870           } else {
871               // it's ok, location endpoint is at the 3' end
872           }
873     } else {
874           if (end > 0 && end < bsh.GetBioseqLength()) {
875               CSeqVector vec = bsh.GetSeqVector (CBioseq_Handle::eCoding_Iupac);
876               if (vec.IsInGap(end - 1)) {
877                   if (vec.IsInGap (end)) {
878                       // not ok - location overlaps gap
879                       return false;
880                   } else {
881                       // ok, location abuts gap
882                   }
883               } else {
884                   if (end > 1 && IsResidue (vec[end - 1]) && ConsistentWithG(vec[end - 1])
885                       && IsResidue(vec[end - 2]) && ConsistentWithA(vec[end - 2])) {
886                       //it's ok, it's abutting "AG"
887                   } else {
888                       return false;
889                   }
890               }
891           } else {
892               // it's ok, location endpoint is at the 5' end
893           }
894     }
895     return true;
896 }
897 
898 
x_CheckForRaggedEnd(const CSeq_feat & feat,CScope * scope)899 int CCDSTranslationProblems::x_CheckForRaggedEnd(const CSeq_feat& feat, CScope* scope)
900 {
901     int ragged = 0;
902     if (!feat.IsSetData() || !feat.GetData().IsCdregion() || !feat.IsSetLocation()) {
903         return 0;
904     }
905     unsigned int part_loc = SeqLocPartialCheck(feat.GetLocation(), scope);
906     if (feat.IsSetProduct()) {
907         unsigned int part_prod = SeqLocPartialCheck(feat.GetProduct(), scope);
908         if ((part_loc & eSeqlocPartial_Stop) ||
909             (part_prod & eSeqlocPartial_Stop)) {
910             // not ragged
911         } else {
912             // complete stop, so check for ragged end
913             ragged = x_CheckForRaggedEnd(feat.GetLocation(), feat.GetData().GetCdregion(), scope);
914         }
915     }
916     return ragged;
917 }
918 
919 
x_CheckForRaggedEnd(const CSeq_loc & loc,const CCdregion & cdregion,CScope * scope)920 int CCDSTranslationProblems::x_CheckForRaggedEnd
921 (const CSeq_loc& loc,
922 const CCdregion& cdregion,
923 CScope* scope)
924 {
925     size_t len = GetLength(loc, scope);
926     if (cdregion.GetFrame() > CCdregion::eFrame_one) {
927         len -= cdregion.GetFrame() - 1;
928     }
929 
930     int ragged = len % 3;
931     if (ragged > 0) {
932         len = GetLength(loc, scope);
933         size_t last_pos = 0;
934 
935         CSeq_loc::TRange range = CSeq_loc::TRange::GetEmpty();
936         if (cdregion.IsSetCode_break()) {
937             for (auto cbr = cdregion.GetCode_break().begin(); cbr != cdregion.GetCode_break().end(); cbr++) {
938                 SRelLoc rl(loc, (*cbr)->GetLoc(), scope);
939                 ITERATE(SRelLoc::TRanges, rit, rl.m_Ranges) {
940                     if ((*rit)->GetTo() > last_pos) {
941                         last_pos = (*rit)->GetTo();
942                     }
943                 }
944             }
945         }
946 
947         // allowing a partial codon at the end
948         TSeqPos codon_length = range.GetLength();
949         if ((codon_length == 0 || codon_length == 1) &&
950             last_pos == len - 1) {
951             ragged = 0;
952         }
953     }
954     return ragged;
955 }
956 
957 
958 typedef enum {
959     eMRNAExcept_Biological = 1,
960     eMRNAExcept_RNAEditing = 2,
961     eMRNAExcept_Unclassified = 4,
962     eMRNAExcept_Mismatch = 8,
963     eMRNAExcept_ProductReplaced = 16
964 } EMRNAException;
965 static const char* const sc_BypassMrnaTransCheckText[] = {
966     "RNA editing",
967     "adjusted for low-quality genome",
968     "annotated by transcript or proteomic data",
969     "artificial frameshift",
970     "reasons given in citation",
971     "transcribed product replaced",
972     "unclassified transcription discrepancy",
973 };
974 typedef CStaticArraySet<const char*, PCase_CStr> TBypassMrnaTransCheckSet;
975 DEFINE_STATIC_ARRAY_MAP(TBypassMrnaTransCheckSet, sc_BypassMrnaTransCheck, sc_BypassMrnaTransCheckText);
976 
977 
InterpretMrnaException(const string & except_text)978 size_t InterpretMrnaException(const string& except_text)
979 {
980     size_t rval = 0;
981 
982     ITERATE(TBypassMrnaTransCheckSet, it, sc_BypassMrnaTransCheck) {
983         if (NStr::FindNoCase(except_text, *it) != NPOS) {
984             rval |= eMRNAExcept_Biological;
985         }
986     }
987     if (NStr::FindNoCase(except_text, "RNA editing") != NPOS) {
988         rval |= eMRNAExcept_RNAEditing;
989     }
990     if (NStr::FindNoCase(except_text, "unclassified transcription discrepancy") != NPOS) {
991         rval |= eMRNAExcept_Unclassified;
992     }
993     if (NStr::FindNoCase(except_text, "mismatches in transcription") != NPOS) {
994         rval |= eMRNAExcept_Mismatch;
995     }
996     if (NStr::FindNoCase(except_text, "transcribed product replaced") != NPOS) {
997         rval |= eMRNAExcept_ProductReplaced;
998     }
999     return rval;
1000 }
1001 
GetMRNATranslationProblems(const CSeq_feat & feat,size_t & mismatches,bool ignore_exceptions,CBioseq_Handle nuc,CBioseq_Handle rna,bool far_fetch,bool is_gpipe,bool is_genomic,CScope * scope)1002 size_t GetMRNATranslationProblems
1003 (const CSeq_feat& feat,
1004  size_t& mismatches,
1005  bool ignore_exceptions,
1006  CBioseq_Handle nuc,
1007  CBioseq_Handle rna,
1008  bool far_fetch,
1009  bool is_gpipe,
1010  bool is_genomic,
1011  CScope* scope)
1012 {
1013     size_t rval = 0;
1014     mismatches = 0;
1015     if (feat.CanGetPseudo() && feat.GetPseudo()) {
1016         return rval;
1017     }
1018     if (!feat.CanGetProduct()) {
1019         return rval;
1020     }
1021 
1022     size_t exception_flags = 0;
1023 
1024     if (!ignore_exceptions &&
1025         feat.CanGetExcept() && feat.GetExcept() &&
1026         feat.CanGetExcept_text()) {
1027         exception_flags = InterpretMrnaException(feat.GetExcept_text());
1028     }
1029     bool has_errors = false, other_than_mismatch = false;
1030     bool report_errors = !(exception_flags & eMRNAExcept_Biological) || (exception_flags & eMRNAExcept_Mismatch);
1031 
1032     CConstRef<CSeq_id> product_id;
1033     try {
1034         product_id.Reset(&GetId(feat.GetProduct(), scope));
1035     } catch (CException&) {
1036     }
1037     if (!product_id) {
1038         return rval;
1039     }
1040 
1041     if (!nuc) {
1042         if (exception_flags & eMRNAExcept_Unclassified) {
1043             rval |= eMRNAProblem_TransFail;
1044         }
1045         return rval;
1046     }
1047 
1048     size_t total = 0;
1049 
1050     // note - exception will be thrown when creating CSeqVector if wrong type set for bioseq seq-data
1051     try {
1052         if (nuc) {
1053 
1054             if (!rna) {
1055                 if (far_fetch) {
1056                     rval |= eMRNAProblem_UnableToFetch;
1057                 }
1058                 return rval;
1059             }
1060 
1061             _ASSERT(nuc  &&  rna);
1062 
1063             CSeqVector nuc_vec(feat.GetLocation(), *scope,
1064                 CBioseq_Handle::eCoding_Iupac);
1065             CSeqVector rna_vec(rna, CBioseq_Handle::eCoding_Iupac);
1066 
1067             TSeqPos nuc_len = nuc_vec.size();
1068             TSeqPos rna_len = rna_vec.size();
1069 
1070             if (nuc_len != rna_len) {
1071                 has_errors = true;
1072                 other_than_mismatch = true;
1073                 if (nuc_len < rna_len) {
1074                     size_t count_a = 0, count_no_a = 0;
1075                     // count 'A's in the tail
1076                     for (CSeqVector_CI iter(rna_vec, nuc_len); iter; ++iter) {
1077                         if ((*iter == 'A') || (*iter == 'a')) {
1078                             ++count_a;
1079                         } else {
1080                             ++count_no_a;
1081                         }
1082                     }
1083                     if (count_a < (19 * count_no_a)) { // less then 5%
1084                         if (report_errors || (exception_flags & eMRNAExcept_RNAEditing)) {
1085                             rval |= eMRNAProblem_TranscriptLenLess;
1086                         }
1087                     } else if (count_a > 0 && count_no_a == 0) {
1088                         has_errors = true;
1089                         other_than_mismatch = true;
1090                         if (report_errors || (exception_flags & eMRNAExcept_RNAEditing)) {
1091                             if (is_gpipe && is_genomic) {
1092                                 // suppress
1093                             } else {
1094                                 rval |= eMRNAProblem_PolyATail100;
1095                             }
1096                         }
1097                     } else {
1098                         if (report_errors) {
1099                             rval |= eMRNAProblem_PolyATail95;
1100                         }
1101                     }
1102                     // allow base-by-base comparison on common length
1103                     rna_len = nuc_len = min(nuc_len, rna_len);
1104 
1105                 } else {
1106                     if (report_errors) {
1107                         rval |= eMRNAProblem_TranscriptLenMore;
1108                     }
1109                 }
1110             }
1111 
1112             if (rna_len == nuc_len && nuc_len > 0) {
1113                 CSeqVector_CI nuc_ci(nuc_vec);
1114                 CSeqVector_CI rna_ci(rna_vec);
1115 
1116                 // compare content of common length
1117                 while ((nuc_ci  &&  rna_ci) && (nuc_ci.GetPos() < nuc_len)) {
1118                     if (*nuc_ci != *rna_ci) {
1119                         ++mismatches;
1120                     }
1121                     ++nuc_ci;
1122                     ++rna_ci;
1123                     ++total;
1124                 }
1125                 if (mismatches > 0) {
1126                     has_errors = true;
1127                     if (report_errors  &&  !(exception_flags & eMRNAExcept_Mismatch)) {
1128                         rval |= eMRNAProblem_Mismatch;
1129                     }
1130                 }
1131             }
1132         }
1133     } catch (CException&) {
1134         rval |= eMRNAProblem_TransFail;
1135     } catch (std::exception) {
1136     }
1137 
1138     if (!report_errors) {
1139         if (!has_errors) {
1140             rval |= eMRNAProblem_UnnecessaryException;
1141         } else if ((exception_flags & eMRNAExcept_Unclassified) && !other_than_mismatch) {
1142             if (mismatches * 50 <= total) {
1143                 rval |= eMRNAProblem_ErroneousException;
1144             }
1145         } else if (exception_flags & eMRNAExcept_ProductReplaced) {
1146             rval |= eMRNAProblem_ProductReplaced;
1147         }
1148     }
1149     return rval;
1150 }
1151 
1152 
1153 
1154 END_SCOPE(validator)
1155 END_SCOPE(objects)
1156 END_NCBI_SCOPE
1157