1 /*  $Id: cdregion_validator.cpp 636457 2021-08-24 17:53:45Z fukanchi $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Jonathan Kans, Clifford Clausen, Aaron Ucko......
27  *
28  * File Description:
29  *   validation of Seq_feat
30  *   .......
31  *
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbistr.hpp>
36 #include <objtools/validator/validatorp.hpp>
37 #include <objtools/validator/single_feat_validator.hpp>
38 #include <objtools/validator/utilities.hpp>
39 #include <objtools/format/items/gene_finder.hpp>
40 
41 #include <serial/serialbase.hpp>
42 
43 #include <objmgr/bioseq_handle.hpp>
44 #include <objmgr/seq_entry_handle.hpp>
45 #include <objmgr/seqdesc_ci.hpp>
46 #include <objmgr/seq_vector.hpp>
47 #include <objmgr/scope.hpp>
48 #include <objmgr/util/sequence.hpp>
49 #include <objmgr/util/feature.hpp>
50 
51 #include <objects/seqfeat/Seq_feat.hpp>
52 #include <objects/seqfeat/BioSource.hpp>
53 #include <objects/seqfeat/Cdregion.hpp>
54 #include <objects/seqfeat/Code_break.hpp>
55 #include <objects/seqfeat/Gb_qual.hpp>
56 #include <objects/seqfeat/Genetic_code.hpp>
57 #include <objects/seqfeat/Genetic_code_table.hpp>
58 #include <objects/seqfeat/Prot_ref.hpp>
59 
60 #include <objects/seq/MolInfo.hpp>
61 #include <objects/seq/Bioseq.hpp>
62 #include <objects/seq/seqport_util.hpp>
63 
64 #include <objtools/edit/cds_fix.hpp>
65 
66 #include <string>
67 
68 
69 BEGIN_NCBI_SCOPE
70 BEGIN_SCOPE(objects)
71 BEGIN_SCOPE(validator)
72 using namespace sequence;
73 
74 
CCdregionValidator(const CSeq_feat & feat,CScope & scope,CValidError_imp & imp)75 CCdregionValidator::CCdregionValidator(const CSeq_feat& feat, CScope& scope, CValidError_imp& imp) :
76 CSingleFeatValidator(feat, scope, imp) {
77     m_Gene = m_Imp.GetGeneCache().GetGeneFromCache(&feat, m_Scope);
78     if (m_Gene) {
79         m_GeneIsPseudo = s_IsPseudo(*m_Gene);
80     } else {
81         m_GeneIsPseudo = false;
82     }
83 }
84 
85 
x_ValidateFeatComment()86 void CCdregionValidator::x_ValidateFeatComment()
87 {
88     if (!m_Feat.IsSetComment()) {
89         return;
90     }
91     CSingleFeatValidator::x_ValidateFeatComment();
92     const string& comment = m_Feat.GetComment();
93     if (NStr::Find(comment, "ambiguity in stop codon") != NPOS
94         && !edit::DoesCodingRegionHaveTerminalCodeBreak(m_Feat.GetData().GetCdregion())) {
95         CRef<CSeq_loc> stop_codon_loc = edit::GetLastCodonLoc(m_Feat, m_Scope);
96         if (stop_codon_loc) {
97             TSeqPos len = sequence::GetLength(*stop_codon_loc, &m_Scope);
98             CSeqVector vec(*stop_codon_loc, m_Scope, CBioseq_Handle::eCoding_Iupac);
99             string seq_string;
100             vec.GetSeqData(0, len - 1, seq_string);
101             bool found_ambig = false;
102             string::iterator it = seq_string.begin();
103             while (it != seq_string.end() && !found_ambig) {
104                 if (*it != 'A' && *it != 'T' && *it != 'C' && *it != 'G' && *it != 'U') {
105                     found_ambig = true;
106                 }
107                 ++it;
108             }
109             if (!found_ambig) {
110                 m_Imp.PostErr(eDiag_Error, eErr_SEQ_FEAT_BadCDScomment,
111                     "Feature comment indicates ambiguity in stop codon "
112                     "but no ambiguities are present in stop codon.", m_Feat);
113             }
114         }
115     }
116 
117     // look for EC number in comment
118     if (HasECnumberPattern(m_Feat.GetComment())) {
119         // suppress if protein has EC numbers
120         bool suppress = false;
121         if (m_ProductBioseq) {
122             CFeat_CI prot_feat(m_ProductBioseq, SAnnotSelector(CSeqFeatData::eSubtype_prot));
123             if (prot_feat && prot_feat->GetData().GetProt().IsSetEc()) {
124                 suppress = true;
125             }
126         }
127         if (!suppress) {
128             PostErr(eDiag_Info, eErr_SEQ_FEAT_EcNumberInCDSComment,
129                 "Apparent EC number in CDS comment");
130         }
131     }
132 
133 }
134 
135 
x_ValidateExceptText(const string & text)136 void CCdregionValidator::x_ValidateExceptText(const string& text)
137 {
138     CSingleFeatValidator::x_ValidateExceptText(text);
139     if (m_Feat.GetData().GetCdregion().IsSetCode_break() &&
140         NStr::FindNoCase(text, "RNA editing") != NPOS) {
141         PostErr(eDiag_Warning, eErr_SEQ_FEAT_TranslExceptAndRnaEditing,
142             "CDS has both RNA editing /exception and /transl_except qualifiers");
143     }
144 }
145 
146 
147 #define FOR_EACH_SEQID_ON_BIOSEQ_HANDLE(Itr, Var) \
148 ITERATE (CBioseq_Handle::TId, Itr, Var.GetId())
149 
s_LocIdType(CBioseq_Handle bsh,bool & is_nt,bool & is_ng,bool & is_nw,bool & is_nc)150 void s_LocIdType(CBioseq_Handle bsh,
151     bool& is_nt, bool& is_ng, bool& is_nw, bool& is_nc)
152 {
153     is_nt = is_ng = is_nw = is_nc = false;
154     if (bsh) {
155         FOR_EACH_SEQID_ON_BIOSEQ_HANDLE(it, bsh) {
156             CSeq_id_Handle sid = *it;
157             switch (sid.Which()) {
158             case NCBI_SEQID(Embl):
159             case NCBI_SEQID(Ddbj):
160             case NCBI_SEQID(Other):
161             case NCBI_SEQID(Genbank):
162             {
163                 CSeq_id::EAccessionInfo info = sid.IdentifyAccession();
164                 is_nt |= (info == CSeq_id::eAcc_refseq_contig);
165                 is_ng |= (info == CSeq_id::eAcc_refseq_genomic);
166                 is_nw |= (info == CSeq_id::eAcc_refseq_wgs_intermed);
167                 is_nc |= (info == CSeq_id::eAcc_refseq_chromosome);
168                 break;
169             }
170             default:
171                 break;
172             }
173         }
174     }
175 }
176 
177 
s_LocIdType(const CSeq_loc & loc,CScope & scope,const CSeq_entry & tse,bool & is_nt,bool & is_ng,bool & is_nw,bool & is_nc)178 void s_LocIdType(const CSeq_loc& loc, CScope& scope, const CSeq_entry& tse,
179     bool& is_nt, bool& is_ng, bool& is_nw, bool& is_nc)
180 {
181     is_nt = is_ng = is_nw = is_nc = false;
182     if (!IsOneBioseq(loc, &scope)) {
183         return;
184     }
185     const CSeq_id& id = GetId(loc, &scope);
186     try {
187         CBioseq_Handle bsh = scope.GetBioseqHandleFromTSE(id, tse);
188         if (bsh) {
189             s_LocIdType(bsh, is_nt, is_ng, is_nw, is_nc);
190         }
191     } catch (CException&) {
192     }
193 }
194 
195 
x_ValidateTrans()196 void CCdregionValidator::x_ValidateTrans()
197 {
198     CCDSTranslationProblems problems;
199     bool is_nt, is_ng, is_nw, is_nc;
200     s_LocIdType(m_LocationBioseq, is_nt, is_ng, is_nw, is_nc);
201 
202     problems.CalculateTranslationProblems(m_Feat,
203                                     m_LocationBioseq,
204                                     m_ProductBioseq,
205                                     m_Imp.IgnoreExceptions(),
206                                     m_Imp.IsFarFetchCDSproducts(),
207                                     m_Imp.IsStandaloneAnnot(),
208                                     m_Imp.IsStandaloneAnnot() ? false : m_Imp.GetTSE().IsSeq(),
209                                     m_Imp.IsGpipe(),
210                                     m_Imp.IsGenomic(),
211                                     m_Imp.IsRefSeq(),
212                                     (is_nt||is_ng||is_nw),
213                                     is_nc,
214                                     (m_Imp.IsRefSeq() || m_Imp.IsGED() || m_Imp.IsTPE()),
215                                     &m_Scope);
216     if (!problems.UnableToTranslate() && !problems.HasException()) {
217         x_ValidateCodebreak();
218     }
219 
220     if (problems.GetTranslationProblemFlags() & CCDSTranslationProblems::eCDSTranslationProblem_UnableToFetch) {
221         if (m_Imp.x_IsFarFetchFailure(m_Feat.GetProduct())) {
222             m_Imp.SetFarFetchFailure();
223         }
224     }
225 
226     x_ReportTranslationProblems(problems);
227 }
228 
229 
GetGcodeForName(const string & code_name)230 int GetGcodeForName(const string& code_name)
231 {
232     const CGenetic_code_table& tbl = CGen_code_table::GetCodeTable();
233     ITERATE(CGenetic_code_table::Tdata, it, tbl.Get()) {
234         if (NStr::EqualNocase((*it)->GetName(), code_name)) {
235             return (*it)->GetId();
236         }
237     }
238     return 255;
239 }
240 
241 
GetGcodeForInternalStopErrors(const CCdregion & cdr)242 int GetGcodeForInternalStopErrors(const CCdregion& cdr)
243 {
244     int gc = 0;
245     if (cdr.IsSetCode()) {
246         ITERATE(CCdregion::TCode::Tdata, it, cdr.GetCode().Get()) {
247             if ((*it)->IsId()) {
248                 gc = (*it)->GetId();
249             } else if ((*it)->IsName()) {
250                 gc = GetGcodeForName((*it)->GetName());
251             }
252             if (gc != 0) break;
253         }
254     }
255     return gc;
256 }
257 
258 
GetInternalStopErrorMessage(const CSeq_feat & feat,size_t internal_stop_count,bool bad_start,char transl_start)259 string GetInternalStopErrorMessage(const CSeq_feat& feat, size_t internal_stop_count, bool bad_start, char transl_start)
260 {
261     int gc = GetGcodeForInternalStopErrors(feat.GetData().GetCdregion());
262     string gccode = NStr::IntToString(gc);
263 
264     string error_message;
265     if (bad_start) {
266         bool got_dash = transl_start == '-';
267         string codon_desc = got_dash ? "illegal" : "ambiguous";
268         error_message = NStr::SizetToString(internal_stop_count) +
269             " internal stops (and " + codon_desc + " start codon). Genetic code [" + gccode + "]";
270     } else {
271         error_message = NStr::SizetToString(internal_stop_count) +
272             " internal stops. Genetic code [" + gccode + "]";
273     }
274     return error_message;
275 }
276 
277 
GetInternalStopErrorMessage(const CSeq_feat & feat,const string & transl_prot)278 string GetInternalStopErrorMessage(const CSeq_feat& feat, const string& transl_prot)
279 {
280     size_t internal_stop_count = CountInternalStopCodons(transl_prot);
281 
282     int gc = GetGcodeForInternalStopErrors(feat.GetData().GetCdregion());
283     string gccode = NStr::IntToString(gc);
284 
285     string error_message;
286     if (HasBadStartCodon(feat.GetLocation(), transl_prot)) {
287         bool got_dash = transl_prot[0] == '-';
288         string codon_desc = got_dash ? "illegal" : "ambiguous";
289         error_message = NStr::SizetToString(internal_stop_count) +
290             " internal stops (and " + codon_desc + " start codon). Genetic code [" + gccode + "]";
291     } else {
292         error_message = NStr::SizetToString(internal_stop_count) +
293             " internal stops. Genetic code [" + gccode + "]";
294     }
295     return error_message;
296 }
297 
298 
GetStartCodonErrorMessage(const CSeq_feat & feat,const char first_char,size_t internal_stop_count)299 string GetStartCodonErrorMessage(const CSeq_feat& feat, const char first_char, size_t internal_stop_count)
300 {
301     bool got_dash = first_char == '-';
302     string codon_desc = got_dash ? "Illegal" : "Ambiguous";
303     string p_word = got_dash ? "Probably" : "Possibly";
304 
305     int gc = GetGcodeForInternalStopErrors(feat.GetData().GetCdregion());
306     string gccode = NStr::IntToString(gc);
307 
308     string error_message;
309 
310     if (internal_stop_count > 0) {
311         error_message = codon_desc + " start codon (and " +
312             NStr::SizetToString(internal_stop_count) +
313             " internal stops). " + p_word + " wrong genetic code [" +
314             gccode + "]";
315     } else {
316         error_message = codon_desc + " start codon used. Wrong genetic code [" +
317             gccode + "] or protein should be partial";
318     }
319     return error_message;
320 }
321 
322 
GetStartCodonErrorMessage(const CSeq_feat & feat,const string & transl_prot)323 string GetStartCodonErrorMessage(const CSeq_feat& feat, const string& transl_prot)
324 {
325     size_t internal_stop_count = CountInternalStopCodons(transl_prot);
326 
327     return GetStartCodonErrorMessage(feat, transl_prot[0], internal_stop_count);
328 }
329 
330 
x_ReportTranslationProblems(const CCDSTranslationProblems & problems)331 void CCdregionValidator::x_ReportTranslationProblems(const CCDSTranslationProblems& problems)
332 {
333     size_t problem_flags = problems.GetTranslationProblemFlags();
334     if (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_UnableToFetch) {
335         string label;
336         const CSeq_id* protid = &GetId(m_Feat.GetProduct(), &m_Scope);
337         protid->GetLabel(&label);
338         EDiagSev sev = eDiag_Error;
339         if (protid->IsGeneral() && protid->GetGeneral().IsSetDb() &&
340             (NStr::EqualNocase(protid->GetGeneral().GetDb(), "ti") ||
341             NStr::EqualNocase(protid->GetGeneral().GetDb(), "SRA"))) {
342             sev = eDiag_Warning;
343         }
344         PostErr(sev, eErr_SEQ_FEAT_ProductFetchFailure,
345             "Unable to fetch CDS product '" + label + "'");
346     }
347 
348     if (!problems.HasException() && (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_NoProtein)) {
349         bool is_nt, is_ng, is_nw, is_nc;
350         s_LocIdType(m_Feat.GetLocation(), m_Scope, m_Imp.GetTSE(), is_nt, is_ng, is_nw, is_nc);
351         EDiagSev sev = eDiag_Error;
352         if (IsDeltaOrFarSeg(m_Feat.GetLocation(), &m_Scope)) {
353             sev = eDiag_Warning;
354         }
355         if (is_nc) {
356             sev = eDiag_Warning;
357         }
358         PostErr(sev, eErr_SEQ_FEAT_NoProtein,
359             "No protein Bioseq given");
360     }
361 
362     bool unclassified_except = false;
363     if (m_Feat.IsSetExcept_text() && NStr::FindNoCase(m_Feat.GetExcept_text(), "unclassified translation discrepancy") != NPOS) {
364         unclassified_except = true;
365     }
366 
367     x_ReportTranslExceptProblems(problems.GetTranslExceptProblems(), problems.HasException());
368 
369     if (!problems.HasException() && problems.HasUnparsedTranslExcept()) {
370         if (problems.GetInternalStopCodons() == 0 && problems.GetTranslationMismatches().size() == 0) {
371             PostErr(eDiag_Warning, eErr_SEQ_FEAT_TranslExcept,
372                 "Unparsed transl_except qual (but protein is okay). Skipped");
373         } else {
374             PostErr(eDiag_Warning, eErr_SEQ_FEAT_TranslExcept,
375                 "Unparsed transl_except qual. Skipped");
376         }
377     }
378 
379 
380     for (size_t i = 0; i < problems.GetNumNonsenseIntrons(); i++) {
381         EDiagSev sev = eDiag_Critical;
382         if (m_Imp.IsEmbl() || m_Imp.IsDdbj()) {
383             sev = eDiag_Error;
384         }
385         PostErr(sev, eErr_SEQ_FEAT_IntronIsStopCodon, "Triplet intron encodes stop codon");
386     }
387 
388     if (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_TooManyX) {
389         PostErr(eDiag_Info, eErr_SEQ_FEAT_CDShasTooManyXs, "CDS translation consists of more than 50% X residues");
390     }
391 
392     if (problems.UnableToTranslate()) {
393         if (!problems.HasException()) {
394             PostErr(eDiag_Error, eErr_SEQ_FEAT_CdTransFail,
395                 "Unable to translate");
396         }
397     }
398 
399     if (!problems.UnableToTranslate() && !problems.AltStart() &&
400         m_Feat.IsSetExcept() && m_Feat.IsSetExcept_text() &&
401         NStr::Find(m_Feat.GetExcept_text(), "alternative start codon") != string::npos &&
402         x_BioseqHasNmAccession(m_LocationBioseq)) {
403 
404         PostErr(eDiag_Warning, eErr_SEQ_FEAT_AltStartCodonException,
405                 "Unnecessary alternative start codon exception");
406     }
407 
408     if ((!problems.HasException() || unclassified_except) && problems.GetInternalStopCodons() > 0) {
409         if (unclassified_except && m_Imp.IsGpipe()) {
410             // suppress if gpipe genomic
411         } else {
412             EDiagSev stop_sev = unclassified_except ? eDiag_Warning : eDiag_Error;
413             if (!m_Imp.IsRefSeq() && m_Imp.IsGI() && m_Imp.IsGED()) {
414                 stop_sev = eDiag_Critical;
415             }
416 
417             PostErr(stop_sev, eErr_SEQ_FEAT_InternalStop,
418                 GetInternalStopErrorMessage(m_Feat, problems.GetInternalStopCodons(),
419                                             (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_BadStart),
420                                             problems.GetTranslStartCharacter()));
421         }
422     }
423 
424     if (!problems.HasException()) {
425 
426         if (!unclassified_except && (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_BadStart)) {
427             string start_err_msg = GetStartCodonErrorMessage(m_Feat, problems.GetTranslStartCharacter(), problems.GetInternalStopCodons());
428             PostErr(eDiag_Error, eErr_SEQ_FEAT_StartCodon,
429                 start_err_msg);
430         }
431 
432         if (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_FrameNotPartial) {
433             PostErr(eDiag_Error, eErr_SEQ_FEAT_SuspiciousFrame,
434                 "Suspicious CDS location - reading frame > 1 but not 5' partial");
435         }
436 
437         if (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_FrameNotConsensus) {
438             EDiagSev sev = eDiag_Warning;
439             if (x_BioseqHasNmAccession(m_LocationBioseq))
440             {
441                 sev = eDiag_Error;
442             }
443             PostErr(sev, eErr_SEQ_FEAT_SuspiciousFrame,
444                 "Suspicious CDS location - reading frame > 1 and not at consensus splice site");
445         }
446 
447         if (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_NoStop) {
448             PostErr(eDiag_Error, eErr_SEQ_FEAT_NoStop,
449                 "Missing stop codon");
450         }
451         if (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_StopPartial) {
452             PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblemHasStop,
453                 "Got stop codon, but 3'end is labeled partial");
454         }
455         if (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_ShouldStartPartial) {
456             PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
457                 "Start of location should probably be partial");
458         }
459         if (problems.GetRaggedLength() > 0)  {
460             PostErr(eDiag_Error, eErr_SEQ_FEAT_TransLen,
461                 "Coding region extends " + NStr::IntToString(problems.GetRaggedLength()) +
462                 " base(s) past stop codon");
463         }
464     }
465 
466     if (!problems.UnableToTranslate() && problems.GetProtLen() > 1.2 * problems.GetTransLen()) {
467         if ((!m_Feat.IsSetExcept_text()) || NStr::Find(m_Feat.GetExcept_text(), "annotated by transcript or proteomic data") == string::npos) {
468             string msg = "Protein product length [" + NStr::SizetToString(problems.GetProtLen()) +
469                 "] is more than 120% of the ";
470             if (m_ProductIsFar) {
471                 msg += "(far) ";
472             }
473             msg += "translation length [" + NStr::SizetToString(problems.GetTransLen()) + "]";
474             PostErr(eDiag_Warning, eErr_SEQ_FEAT_ProductLength, msg);
475         }
476     }
477 
478 
479     bool rna_editing = false;
480     if (m_Feat.IsSetExcept_text() && NStr::FindNoCase(m_Feat.GetExcept_text(), "RNA editing") != NPOS) {
481         rna_editing = true;
482     }
483     if (problems.GetProtLen() != problems.GetTransLen() &&
484         (!problems.HasException() ||
485          (rna_editing &&
486          (problems.GetProtLen() < problems.GetTransLen() - 1 || problems.GetProtLen() > problems.GetTransLen())))) {
487         string msg = "Given protein length [" + NStr::SizetToString(problems.GetProtLen()) +
488             "] does not match ";
489         if (m_ProductIsFar) {
490             msg += "(far) ";
491         }
492         msg += "translation length [" +
493             NStr::SizetToString(problems.GetTransLen()) + "]";
494 
495         if (rna_editing) {
496             msg += " (RNA editing present)";
497         }
498         PostErr(rna_editing ? eDiag_Warning : eDiag_Error,
499             eErr_SEQ_FEAT_TransLen, msg);
500     }
501 
502     bool mismatch_except = false;
503     if (m_Feat.IsSetExcept_text() && NStr::FindNoCase(m_Feat.GetExcept_text(), "mismatches in translation") != NPOS) {
504         mismatch_except = true;
505     }
506 
507     if (!problems.HasException() && !mismatch_except) {
508         x_ReportTranslationMismatches(problems.GetTranslationMismatches());
509     }
510 
511     if (problems.GetTranslTerminalX() != problems.GetProdTerminalX()) {
512         PostErr(eDiag_Warning, eErr_SEQ_FEAT_TerminalXDiscrepancy,
513             "Terminal X count for CDS translation (" + NStr::SizetToString(problems.GetTranslTerminalX())
514             + ") and protein product sequence (" + NStr::SizetToString(problems.GetProdTerminalX())
515             + ") are not equal");
516     }
517 
518     if (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_ShouldBePartialButIsnt) {
519         PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
520             "End of location should probably be partial");
521     }
522     if (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_ShouldNotBePartialButIs) {
523         PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
524             "This SeqFeat should not be partial");
525     }
526 
527     if (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_UnnecessaryException) {
528         PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnnecessaryException,
529             "CDS has exception but passes translation test");
530     }
531 
532     if (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_ErroneousException) {
533         PostErr(eDiag_Warning, eErr_SEQ_FEAT_ErroneousException,
534             "CDS has unclassified exception but only difference is "
535             + NStr::SizetToString(problems.GetTranslationMismatches().size()) + " mismatches out of "
536             + NStr::SizetToString(problems.GetProtLen()) + " residues");
537     }
538 
539     if (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_UnqualifiedException) {
540         PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnnecessaryException,
541             "CDS has unnecessary translated product replaced exception");
542     }
543 
544 }
545 
546 
MapToNTCoords(TSeqPos pos)547 string CCdregionValidator::MapToNTCoords(TSeqPos pos)
548 {
549     string result;
550 
551     CSeq_point pnt;
552     pnt.SetPoint(pos);
553     pnt.SetStrand( GetStrand(m_Feat.GetProduct(), &m_Scope) );
554 
555     try {
556         pnt.SetId().Assign(GetId(m_Feat.GetProduct(), &m_Scope));
557     } catch (CObjmgrUtilException) {}
558 
559     CSeq_loc tmp;
560     tmp.SetPnt(pnt);
561     CRef<CSeq_loc> loc = ProductToSource(m_Feat, tmp, 0, &m_Scope);
562 
563     result = GetValidatorLocationLabel(*loc, m_Scope);
564 
565     return result;
566 }
567 
568 
x_ReportTranslationMismatches(const CCDSTranslationProblems::TTranslationMismatches & mismatches)569 void CCdregionValidator::x_ReportTranslationMismatches(const CCDSTranslationProblems::TTranslationMismatches& mismatches)
570 {
571     string nuclocstr;
572 
573     size_t num_mismatches = mismatches.size();
574 
575     if (num_mismatches > 10) {
576         // report total number of mismatches and the details of the
577         // first and last.
578         nuclocstr = MapToNTCoords(mismatches.front().pos);
579         string msg =
580             NStr::SizetToString(mismatches.size()) + " mismatches found.  " +
581             "First mismatch at " + NStr::IntToString(mismatches.front().pos + 1) +
582             ", residue in protein [";
583         msg += mismatches.front().prot_res;
584         msg += "] != translation [";
585         msg += mismatches.front().transl_res;
586         msg += "]";
587         if (!nuclocstr.empty()) {
588             msg += " at " + nuclocstr;
589         }
590         nuclocstr = MapToNTCoords(mismatches.back().pos);
591         msg +=
592             ".  Last mismatch at " + NStr::IntToString(mismatches.back().pos + 1) +
593             ", residue in protein [";
594         msg += mismatches.back().prot_res;
595         msg += "] != translation [";
596         msg += mismatches.back().transl_res;
597         msg += "]";
598         if (!nuclocstr.empty()) {
599             msg += " at " + nuclocstr;
600         }
601         int gc = 0;
602         if (m_Feat.GetData().IsCdregion() && m_Feat.GetData().GetCdregion().IsSetCode()) {
603             // We assume that the id is set for all Genetic_code
604             gc = m_Feat.GetData().GetCdregion().GetCode().GetId();
605         }
606         string gccode = NStr::IntToString(gc);
607 
608         msg += ".  Genetic code [" + gccode + "]";
609         PostErr(eDiag_Error, eErr_SEQ_FEAT_MisMatchAA, msg);
610     } else {
611         // report individual mismatches
612         for (size_t i = 0; i < mismatches.size(); ++i) {
613             nuclocstr = MapToNTCoords(mismatches[i].pos);
614             if (mismatches[i].pos == 0 && mismatches[i].transl_res == '-') {
615                 // skip - dash is expected to differ
616                 num_mismatches--;
617             } else {
618                 EDiagSev sev = eDiag_Error;
619                 if (mismatches[i].prot_res == 'X' &&
620                     (mismatches[i].transl_res == 'B' || mismatches[i].transl_res == 'Z' || mismatches[i].transl_res == 'J')) {
621                     sev = eDiag_Warning;
622                 }
623                 string msg;
624                 if (m_ProductIsFar) {
625                     msg += "(far) ";
626                 }
627                 msg += "Residue " + NStr::IntToString(mismatches[i].pos + 1) +
628                     " in protein [";
629                 msg += mismatches[i].prot_res;
630                 msg += "] != translation [";
631                 msg += mismatches[i].transl_res;
632                 msg += "]";
633                 if (!nuclocstr.empty()) {
634                     msg += " at " + nuclocstr;
635                 }
636                 PostErr(sev, eErr_SEQ_FEAT_MisMatchAA, msg);
637             }
638         }
639     }
640 }
641 
642 
x_ReportTranslExceptProblems(const CCDSTranslationProblems::TTranslExceptProblems & problems,bool has_exception)643 void CCdregionValidator::x_ReportTranslExceptProblems(const CCDSTranslationProblems::TTranslExceptProblems& problems, bool has_exception)
644 {
645     for (auto it = problems.begin(); it != problems.end(); it++) {
646         string msg;
647         switch (it->problem) {
648         case CCDSTranslationProblems::eTranslExceptPhase:
649             if (!has_exception) {
650                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_TranslExceptPhase,
651                     "transl_except qual out of frame.");
652             }
653             break;
654         case CCDSTranslationProblems::eTranslExceptSuspicious:
655             msg = "Suspicious transl_except ";
656             msg += it->ex;
657             msg += " at first codon of complete CDS";
658             PostErr(eDiag_Warning, eErr_SEQ_FEAT_TranslExcept, msg);
659             break;
660         case CCDSTranslationProblems::eTranslExceptUnnecessary:
661             msg = "Unnecessary transl_except ";
662             msg += it->ex;
663             msg += " at position ";
664             msg += NStr::SizetToString(it->prot_pos + 1);
665             PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnnecessaryTranslExcept,
666                     msg);
667             break;
668         case CCDSTranslationProblems::eTranslExceptUnexpected:
669             msg = "Unexpected transl_except ";
670             msg += it->ex;
671             msg += +" at position " + NStr::SizetToString(it->prot_pos + 1)
672                 + " just past end of protein";
673 
674             PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnnecessaryTranslExcept,
675                     msg);
676             break;
677         }
678     }
679 }
680 
681 
x_ValidateCodebreak()682 void CCdregionValidator::x_ValidateCodebreak()
683 {
684     const CCdregion& cds = m_Feat.GetData().GetCdregion();
685     const CSeq_loc& feat_loc = m_Feat.GetLocation();
686     const CCode_break* prev_cbr = 0;
687 
688     FOR_EACH_CODEBREAK_ON_CDREGION (it, cds) {
689         const CCode_break& cbr = **it;
690         const CSeq_loc& cbr_loc = cbr.GetLoc();
691         ECompare comp = Compare(cbr_loc, feat_loc, &m_Scope, fCompareOverlapping);
692         if ( ((comp != eContained) && (comp != eSame)) || cbr_loc.IsNull() || cbr_loc.IsEmpty()) {
693             PostErr (eDiag_Critical, eErr_SEQ_FEAT_CDSrange,
694                 "Code-break location not in coding region");
695         } else if (m_Feat.IsSetProduct()) {
696             if (cbr_loc.GetStop(eExtreme_Biological) == feat_loc.GetStop(eExtreme_Biological)) {
697                 // terminal exception - don't bother checking, can't be mapped
698             } else {
699                 if (SeqLocCheck(cbr_loc, &m_Scope) == eSeqLocCheck_error) {
700                     string lbl = GetValidatorLocationLabel(cbr_loc, m_Scope);
701                     PostErr(eDiag_Critical, eErr_SEQ_FEAT_CDSrange,
702                         "Code-break: SeqLoc [" + lbl + "] out of range");
703                 } else {
704                     int frame = 0;
705                     CRef<CSeq_loc> p_loc = SourceToProduct(m_Feat, cbr_loc, fS2P_AllowTer, &m_Scope, &frame);
706                     if (!p_loc || p_loc->IsNull() || frame != 1) {
707                         PostErr(eDiag_Critical, eErr_SEQ_FEAT_CDSrange,
708                             "Code-break location not in coding region - may be frame problem");
709                     }
710                 }
711             }
712         }
713         if (cbr_loc.IsPartialStart(eExtreme_Biological) ||
714             cbr_loc.IsPartialStop(eExtreme_Biological)) {
715             PostErr(eDiag_Error, eErr_SEQ_FEAT_TranslExceptIsPartial,
716                    "Translation exception locations should not be partial");
717         }
718         if ( prev_cbr != 0 ) {
719             if ( Compare(cbr_loc, prev_cbr->GetLoc(), &m_Scope, fCompareOverlapping) == eSame ) {
720                 string msg = "Multiple code-breaks at same location ";
721                 string str = GetValidatorLocationLabel (cbr_loc, m_Scope);
722                 if ( !str.empty() ) {
723                     msg += "[" + str + "]";
724                 }
725                 PostErr(eDiag_Error, eErr_SEQ_FEAT_DuplicateTranslExcept,
726                    msg);
727             }
728         }
729         prev_cbr = &cbr;
730     }
731 }
732 
733 
Validate()734 void CCdregionValidator::Validate()
735 {
736     CSingleFeatValidator::Validate();
737 
738     bool feat_is_pseudo = s_IsPseudo(m_Feat);
739     bool pseudo = feat_is_pseudo || m_GeneIsPseudo;
740 
741     x_ValidateQuals();
742     x_ValidateGeneticCode();
743 
744     const CCdregion& cdregion = m_Feat.GetData().GetCdregion();
745     if (cdregion.IsSetOrf() && cdregion.GetOrf() &&
746         m_Feat.IsSetProduct()) {
747         PostErr(eDiag_Warning, eErr_SEQ_FEAT_OrfCdsHasProduct,
748             "An ORF coding region should not have a product");
749     }
750 
751     if (pseudo) {
752         if (m_Feat.IsSetProduct()) {
753             if (feat_is_pseudo) {
754                 PostErr(eDiag_Error, eErr_SEQ_FEAT_PseudoCdsHasProduct,
755                     "A pseudo coding region should not have a product");
756             } else if (m_GeneIsPseudo) {
757                 PostErr(eDiag_Error, eErr_SEQ_FEAT_PseudoCdsViaGeneHasProduct,
758                     "A coding region overlapped by a pseudogene should not have a product");
759             } else {
760                 PostErr(eDiag_Error, eErr_SEQ_FEAT_PseudoCdsHasProduct,
761                     "A pseudo coding region should not have a product");
762             }
763         }
764     } else {
765         ReportShortIntrons();
766         x_ValidateProductId();
767         x_ValidateCommonProduct();
768     }
769 
770     x_ValidateBadMRNAOverlap();
771     x_ValidateFarProducts();
772     x_ValidateCDSPeptides();
773     x_ValidateCDSPartial();
774 
775     if (x_IsProductMisplaced()) {
776         if (m_Imp.IsSmallGenomeSet()) {
777             PostErr(eDiag_Warning, eErr_SEQ_FEAT_CDSproductPackagingProblem,
778                 "Protein product not packaged in nuc-prot set with nucleotide in small genome set");
779         } else {
780             PostErr(eDiag_Error, eErr_SEQ_FEAT_CDSproductPackagingProblem,
781                 "Protein product not packaged in nuc-prot set with nucleotide");
782         }
783     }
784 
785     bool conflict = cdregion.IsSetConflict()  &&  cdregion.GetConflict();
786     if ( !pseudo  &&  !conflict ) {
787         x_ValidateTrans();
788         ValidateSplice(false, false);
789     }
790 
791     if (conflict) {
792         x_ValidateConflict();
793     }
794 
795     x_ReportPseudogeneConflict(m_Gene);
796     x_ValidateLocusTagGeneralMatch(m_Gene);
797 
798     x_ValidateProductPartials();
799     x_ValidateParentPartialness();
800 }
801 
802 
x_ValidateQuals()803 void CCdregionValidator::x_ValidateQuals()
804 {
805 
806     FOR_EACH_GBQUAL_ON_FEATURE(it, m_Feat) {
807         const CGb_qual& qual = **it;
808         if (qual.CanGetQual()) {
809             const string& key = qual.GetQual();
810             if (NStr::EqualNocase(key, "exception")) {
811                 if (!m_Feat.IsSetExcept()) {
812                     PostErr(eDiag_Warning, eErr_SEQ_FEAT_MissingExceptionFlag,
813                         "Exception flag should be set in coding region");
814                 }
815             } else if (NStr::EqualNocase(key, "codon")) {
816                 PostErr(eDiag_Error, eErr_SEQ_FEAT_CodonQualifierUsed,
817                     "Use the proper genetic code, if available, "
818                     "or set transl_excepts on specific codons");
819             } else if (NStr::EqualNocase(key, "protein_id")) {
820                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_WrongQualOnFeature,
821                     "protein_id should not be a gbqual on a CDS feature");
822             } else if (NStr::EqualNocase(key, "gene_synonym")) {
823                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_WrongQualOnCDS,
824                     "gene_synonym should not be a gbqual on a CDS feature");
825             } else if (NStr::EqualNocase(key, "transcript_id")) {
826                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_WrongQualOnFeature,
827                     "transcript_id should not be a gbqual on a CDS feature");
828             } else if (NStr::EqualNocase(key, "codon_start")) {
829                 const CCdregion& cdregion = m_Feat.GetData().GetCdregion();
830                 if (cdregion.IsSetFrame() && cdregion.GetFrame() != CCdregion::eFrame_not_set) {
831                     PostErr(eDiag_Warning, eErr_SEQ_FEAT_WrongQualOnFeature,
832                         "conflicting codon_start values");
833                 } else {
834                     PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidCodonStart,
835                         "codon_start value should be 1, 2, or 3");
836                 }
837             }
838         }
839     }
840 }
841 
842 
x_ReportOrigProteinId()843 bool CCdregionValidator::x_ReportOrigProteinId()
844 {
845     if (!m_GeneIsPseudo && !s_IsPseudo(m_Feat)) {
846         return true;
847     } else {
848         return false;
849     }
850 }
851 
852 
853 const string s_PlastidTxt[] = {
854   "",
855   "",
856   "chloroplast",
857   "chromoplast",
858   "",
859   "",
860   "plastid",
861   "",
862   "",
863   "",
864   "",
865   "",
866   "cyanelle",
867   "",
868   "",
869   "",
870   "apicoplast",
871   "leucoplast",
872   "proplastid",
873   ""
874 };
875 
876 
IsPlastid(int genome)877 bool CCdregionValidator::IsPlastid(int genome)
878 {
879     if ( genome == CBioSource::eGenome_chloroplast  ||
880          genome == CBioSource::eGenome_chromoplast  ||
881          genome == CBioSource::eGenome_plastid      ||
882          genome == CBioSource::eGenome_cyanelle     ||
883          genome == CBioSource::eGenome_apicoplast   ||
884          genome == CBioSource::eGenome_leucoplast   ||
885          genome == CBioSource::eGenome_proplastid   ||
886          genome == CBioSource::eGenome_chromatophore ) {
887         return true;
888     }
889 
890     return false;
891 }
892 
893 
IsGeneticCodeValid(int gcode)894 static bool IsGeneticCodeValid(int gcode)
895 {
896     bool ret = false;
897     if (gcode > 0) {
898 
899         try {
900             const CTrans_table& tbl = CGen_code_table::GetTransTable(gcode);
901             (void)tbl;  // suppress unused-variable warning
902             ret = true;
903         }
904         catch (CException&) {
905         }
906     }
907 
908     return ret;
909 }
910 
911 
s_GetStrictGenCode(const CBioSource & src)912 static int s_GetStrictGenCode(const CBioSource& src)
913 {
914     int gencode = 0;
915 
916     try {
917       CBioSource::TGenome genome = src.IsSetGenome() ? src.GetGenome() : CBioSource::eGenome_unknown;
918 
919         if ( src.IsSetOrg()  &&  src.GetOrg().IsSetOrgname() ) {
920             const COrgName& orn = src.GetOrg().GetOrgname();
921 
922             switch ( genome ) {
923                 case CBioSource::eGenome_kinetoplast:
924                 case CBioSource::eGenome_mitochondrion:
925                 case CBioSource::eGenome_hydrogenosome:
926                     // bacteria and plant organelle code
927                     if (orn.IsSetMgcode()) {
928                         gencode = orn.GetMgcode();
929                     }
930                     break;
931                 case CBioSource::eGenome_chloroplast:
932                 case CBioSource::eGenome_chromoplast:
933                 case CBioSource::eGenome_plastid:
934                 case CBioSource::eGenome_cyanelle:
935                 case CBioSource::eGenome_apicoplast:
936                 case CBioSource::eGenome_leucoplast:
937                 case CBioSource::eGenome_proplastid:
938                     if (orn.IsSetPgcode() && orn.GetPgcode() != 0) {
939                         gencode = orn.GetPgcode();
940                     } else {
941                         // bacteria and plant plastids are code 11.
942                         gencode = 11;
943                     }
944                     break;
945                 default:
946                     if (orn.IsSetGcode()) {
947                         gencode = orn.GetGcode();
948                     }
949                     break;
950             }
951         }
952     } catch (CException ) {
953     } catch (std::exception ) {
954     }
955     return gencode;
956 }
957 
958 
x_ValidateGeneticCode()959 void CCdregionValidator::x_ValidateGeneticCode()
960 {
961     if (!m_LocationBioseq) {
962         return;
963     }
964     int cdsgencode = 0;
965 
966     const CCdregion& cdregion = m_Feat.GetData().GetCdregion();
967 
968     if (cdregion.CanGetCode()) {
969         cdsgencode = cdregion.GetCode().GetId();
970 
971         if (!IsGeneticCodeValid(cdsgencode)) {
972             PostErr(eDiag_Error, eErr_SEQ_FEAT_GenCodeInvalid,
973                 "A coding region contains invalid genetic code [" + NStr::IntToString(cdsgencode) + "]");
974         }
975     }
976 
977     CSeqdesc_CI diter(m_LocationBioseq, CSeqdesc::e_Source);
978     if (diter) {
979         const CBioSource& src = diter->GetSource();
980         int biopgencode = s_GetStrictGenCode(src);
981 
982         if (biopgencode != cdsgencode
983             && (!m_Feat.IsSetExcept()
984             || !m_Feat.IsSetExcept_text()
985             || NStr::Find(m_Feat.GetExcept_text(), "genetic code exception") == string::npos)) {
986             int genome = 0;
987 
988             if (src.CanGetGenome()) {
989                 genome = src.GetGenome();
990             }
991 
992             if (IsPlastid(genome)) {
993                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_GenCodeMismatch,
994                     "Genetic code conflict between CDS (code " +
995                     NStr::IntToString(cdsgencode) +
996                     ") and BioSource.genome biological context (" +
997                     s_PlastidTxt[genome] + ") (uses code 11)");
998             } else {
999                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_GenCodeMismatch,
1000                     "Genetic code conflict between CDS (code " +
1001                     NStr::IntToString(cdsgencode) +
1002                     ") and BioSource (code " +
1003                     NStr::IntToString(biopgencode) + ")");
1004             }
1005         }
1006     }
1007 }
1008 
1009 
x_ValidateSeqFeatLoc()1010 void CCdregionValidator::x_ValidateSeqFeatLoc()
1011 {
1012     CSingleFeatValidator::x_ValidateSeqFeatLoc();
1013 
1014     // for coding regions, internal exons should not be 15 or less bp long
1015     int num_short_exons = 0;
1016     string message;
1017     CSeq_loc_CI it(m_Feat.GetLocation());
1018     if (it) {
1019         // note - do not want to warn for first or last exon
1020         ++it;
1021         size_t prev_len = 16;
1022         size_t prev_start = 0;
1023         size_t prev_stop = 0;
1024         while (it) {
1025             if (prev_len <= 15) {
1026                 num_short_exons++;
1027                 if (!message.empty()) {
1028                     message += ", ";
1029                 }
1030                 message += NStr::NumericToString(prev_start + 1)
1031                     + "-" + NStr::NumericToString(prev_stop + 1);
1032             }
1033             prev_len = it.GetRange().GetLength();
1034             prev_start = it.GetRange().GetFrom();
1035             prev_stop = it.GetRange().GetTo();
1036             ++it;
1037         }
1038     }
1039     if (num_short_exons > 1) {
1040         PostErr(eDiag_Warning, eErr_SEQ_FEAT_ShortExon,
1041             "Coding region has multiple internal exons that are too short at positions " + message);
1042     } else if (num_short_exons == 1) {
1043         PostErr(eDiag_Warning, eErr_SEQ_FEAT_ShortExon,
1044             "Internal coding region exon is too short at position " + message);
1045     }
1046 }
1047 
1048 
x_ValidateBadMRNAOverlap()1049 void CCdregionValidator::x_ValidateBadMRNAOverlap()
1050 {
1051     if (x_HasGoodParent()) {
1052         return;
1053     }
1054 
1055     const CSeq_loc& loc = m_Feat.GetLocation();
1056 
1057     CConstRef<CSeq_feat> mrna = GetBestOverlappingFeat(
1058         loc,
1059         CSeqFeatData::eSubtype_mRNA,
1060         eOverlap_Simple,
1061         m_Scope);
1062     if (!mrna) {
1063         return;
1064     }
1065 
1066     mrna = GetBestOverlappingFeat(
1067         loc,
1068         CSeqFeatData::eSubtype_mRNA,
1069         eOverlap_CheckIntRev,
1070         m_Scope);
1071     if (mrna) {
1072         return;
1073     }
1074 
1075     mrna = GetBestOverlappingFeat(
1076         loc,
1077         CSeqFeatData::eSubtype_mRNA,
1078         eOverlap_Interval,
1079         m_Scope);
1080     if (!mrna) {
1081         return;
1082     }
1083 
1084     bool pseudo = s_IsPseudo(m_Feat) || m_GeneIsPseudo;
1085 
1086     EErrType err_type = eErr_SEQ_FEAT_CDSmRNArange;
1087     if (pseudo) {
1088         err_type = eErr_SEQ_FEAT_PseudoCDSmRNArange;
1089     }
1090 
1091     mrna = GetBestOverlappingFeat(
1092         loc,
1093         CSeqFeatData::eSubtype_mRNA,
1094         eOverlap_SubsetRev,
1095         m_Scope);
1096 
1097     EDiagSev sev = eDiag_Warning;
1098     if (pseudo) {
1099         sev = eDiag_Info;
1100     }
1101     if (mrna) {
1102         // ribosomal slippage exception suppresses CDSmRNArange warning
1103         bool supress = false;
1104 
1105         if (m_Feat.CanGetExcept_text()) {
1106             const CSeq_feat::TExcept_text& text = m_Feat.GetExcept_text();
1107             if (NStr::FindNoCase(text, "ribosomal slippage") != NPOS
1108                 || NStr::FindNoCase(text, "trans-splicing") != NPOS) {
1109                 supress = true;
1110             }
1111         }
1112         if (!supress) {
1113             PostErr(sev, err_type,
1114                 "mRNA contains CDS but internal intron-exon boundaries "
1115                 "do not match");
1116         }
1117     } else {
1118         PostErr(sev, err_type,
1119             "mRNA overlaps or contains CDS but does not completely "
1120             "contain intervals");
1121     }
1122 }
1123 
1124 
x_HasGoodParent()1125 bool CCdregionValidator::x_HasGoodParent()
1126 {
1127     static const CSeqFeatData::ESubtype parent_types[] = {
1128         CSeqFeatData::eSubtype_C_region,
1129         CSeqFeatData::eSubtype_D_segment,
1130         CSeqFeatData::eSubtype_J_segment,
1131         CSeqFeatData::eSubtype_V_segment
1132     };
1133     size_t num_parent_types = sizeof(parent_types) / sizeof(CSeqFeatData::ESubtype);
1134     CRef<feature::CFeatTree> feat_tree = m_Imp.GetGeneCache().GetFeatTreeFromCache(m_Feat, m_Scope);
1135     if (!feat_tree) {
1136         return false;
1137     }
1138     CSeq_feat_Handle fh;
1139     try {
1140         // will fail if location is bad
1141         fh = m_Scope.GetSeq_featHandle(m_Feat);
1142     } catch (CException&) {
1143         return false;
1144     }
1145 
1146     for (size_t i = 0; i < num_parent_types; i++) {
1147         CMappedFeat parent = feat_tree->GetParent(fh, parent_types[i]);
1148         if (parent) {
1149             sequence::ECompare cmp = sequence::Compare(m_Feat.GetLocation(),
1150                                                        parent.GetLocation(),
1151                                                        &m_Scope,
1152                                                        sequence::fCompareOverlapping);
1153             if (cmp == sequence::eContained || cmp == sequence::eSame) {
1154                 return true;
1155             }
1156         }
1157     }
1158     return false;
1159 }
1160 
1161 
1162 // VR-619
1163 // for an mRNA / CDS pair where both have far products
1164 // (which is only true for genomic RefSeqs with instantiated mRNA products),
1165 // please check that the pair found by CFeatTree corresponds to the nuc-prot pair in ID
1166 // (i.e.the CDS product is annotated on the mRNA product).
x_ValidateFarProducts()1167 void CCdregionValidator::x_ValidateFarProducts()
1168 {
1169     // if coding region doesn't have a far product, nothing to check
1170     if (!m_ProductIsFar) {
1171         return;
1172     }
1173     // no point if not far-fetching
1174     if (!m_Imp.IsRemoteFetch()) {
1175         return;
1176     }
1177     if (!m_Feat.GetData().IsCdregion() || !m_Feat.IsSetProduct()) {
1178         return;
1179     }
1180     if (!m_Imp.IsRefSeq()) {
1181         return;
1182     }
1183     const CSeq_id * cds_sid = m_Feat.GetProduct().GetId();
1184     if (!cds_sid) {
1185         return;
1186     }
1187     CRef<feature::CFeatTree> feat_tree = m_Imp.GetGeneCache().GetFeatTreeFromCache(m_Feat, m_Scope);
1188     if (!feat_tree) {
1189         return;
1190     }
1191     CSeq_feat_Handle fh = m_Scope.GetSeq_featHandle(m_Feat);
1192     if (!fh) {
1193         return;
1194     }
1195     CMappedFeat mrna = feat_tree->GetParent(fh, CSeqFeatData::eSubtype_mRNA);
1196     if (!mrna || !mrna.IsSetProduct()) {
1197         // no mRNA or no mRNA product
1198         return;
1199     }
1200     const CSeq_id * mrna_sid = mrna.GetProduct().GetId();
1201     if (!mrna_sid) {
1202         return;
1203     }
1204     CBioseq_Handle mrna_prod = m_Scope.GetBioseqHandleFromTSE(*mrna_sid, m_LocationBioseq.GetTSE_Handle());
1205     if (mrna_prod) {
1206         // mRNA product is not far
1207         return;
1208     }
1209     mrna_prod = m_Scope.GetBioseqHandle(*mrna_sid);
1210     if (!mrna_prod) {
1211         // can't be fetched, will be reported elsewhere
1212         return;
1213     }
1214     CSeq_entry_Handle far_mrna_nps =
1215         mrna_prod.GetExactComplexityLevel(CBioseq_set::eClass_nuc_prot);
1216     if (!far_mrna_nps) {
1217         PostErr(eDiag_Error, eErr_SEQ_FEAT_CDSmRNAmismatch, "no Far mRNA nuc-prot-set");
1218     } else {
1219         CBioseq_Handle cds_prod = m_Scope.GetBioseqHandleFromTSE(*cds_sid, far_mrna_nps);
1220         if (!cds_prod) {
1221             PostErr(eDiag_Error, eErr_SEQ_FEAT_CDSmRNAmismatch, "Far CDS product and far mRNA product are not packaged together");
1222             m_Imp.PostErr(eDiag_Error, eErr_SEQ_FEAT_CDSmRNAmismatch, "Far CDS product and far mRNA product are not packaged together", *(mrna.GetSeq_feat()));
1223         }
1224     }
1225 }
1226 
1227 
x_ValidateCDSPeptides()1228 void CCdregionValidator::x_ValidateCDSPeptides()
1229 {
1230     try {
1231     if (!m_Feat.GetData().IsCdregion()  ||  !m_Feat.CanGetProduct()) {
1232         return;
1233     }
1234 
1235     CBioseq_Handle prot = m_Scope.GetBioseqHandle(m_Feat.GetProduct());
1236     if (!prot) {
1237         return;
1238     }
1239     CBioseq_Handle nuc = m_Scope.GetBioseqHandle(m_Feat.GetLocation());
1240     if (!nuc) {
1241         return;
1242     }
1243     // check for self-referential CDS feature
1244     if (nuc == prot) {
1245         return;
1246     }
1247 
1248     const CGene_ref* cds_ref = 0;
1249 
1250     // map from cds product to nucleotide
1251     const string prev = GetDiagFilter(eDiagFilter_Post);
1252     SetDiagFilter(eDiagFilter_All, "!(1305.28,31)");
1253     CSeq_loc_Mapper prot_to_cds(m_Feat, CSeq_loc_Mapper::eProductToLocation, &m_Scope);
1254     SetDiagFilter(eDiagFilter_All, prev.c_str());
1255 
1256     for (CFeat_CI it(prot, CSeqFeatData::e_Prot); it; ++it) {
1257         CSeq_feat_Handle curr = it->GetSeq_feat_Handle();
1258         CSeqFeatData::ESubtype subtype = curr.GetFeatSubtype();
1259 
1260         if (subtype != CSeqFeatData::eSubtype_preprotein &&
1261             subtype != CSeqFeatData::eSubtype_mat_peptide_aa &&
1262             subtype != CSeqFeatData::eSubtype_sig_peptide_aa &&
1263             subtype != CSeqFeatData::eSubtype_transit_peptide_aa &&
1264             subtype != CSeqFeatData::eSubtype_propeptide_aa) {
1265             continue;
1266         }
1267 
1268         // see if already has gene xref
1269         if (curr.GetGeneXref()) {
1270             continue;
1271         }
1272 
1273         if (! cds_ref) {
1274             // wait until first mat_peptide found to avoid expensive computation on CDS /gene qualifier
1275             CConstRef<CSeq_feat> cgene = sequence::GetBestOverlappingFeat (m_Feat.GetLocation(), CSeqFeatData::eSubtype_gene, sequence::eOverlap_SubsetRev, m_Scope);
1276             if (cgene && cgene->CanGetData() && cgene->GetData().IsGene()) {
1277                 const CGene_ref& cgref = cgene->GetData().GetGene();
1278                 cds_ref = &cgref;
1279             } else {
1280                 // if CDS does not have overlapping gene, bail out of function
1281                 return;
1282             }
1283         }
1284 
1285         const CSeq_loc& loc = curr.GetLocation();
1286         // map prot location to nuc location
1287         CRef<CSeq_loc> nloc(prot_to_cds.Map(loc));
1288         if (! nloc) {
1289             continue;
1290         }
1291 
1292         const CGene_ref* pep_ref = 0;
1293         CConstRef<CSeq_feat> pgene = sequence::GetBestOverlappingFeat (*nloc, CSeqFeatData::eSubtype_gene, sequence::eOverlap_SubsetRev, m_Scope);
1294         if (pgene && pgene->CanGetData() && pgene->GetData().IsGene()) {
1295             const CGene_ref& pgref = pgene->GetData().GetGene();
1296             pep_ref = &pgref;
1297         }
1298 
1299         if (! cds_ref || ! pep_ref) {
1300             continue;
1301         }
1302         if (cds_ref->IsSetLocus_tag() && pep_ref->IsSetLocus_tag()) {
1303             if (cds_ref->GetLocus_tag() == pep_ref->GetLocus_tag()) {
1304                continue;
1305             }
1306         } else if (cds_ref->IsSetLocus() && pep_ref->IsSetLocus()) {
1307             if (cds_ref->GetLocus() == pep_ref->GetLocus()) {
1308                 continue;
1309             }
1310         }
1311 
1312         if (pgene) {
1313 
1314             const CSeq_loc& gloc = pgene->GetLocation();
1315 
1316             if (sequence::Compare(*nloc, gloc, nullptr /* scope */, sequence::fCompareOverlapping) == sequence::eSame) {
1317 
1318                 PostErr(eDiag_Warning, eErr_SEQ_FEAT_GeneOnNucPositionOfPeptide,
1319                         "Peptide under CDS matches small Gene");
1320             }
1321         }
1322     }
1323     } catch (CException) {
1324     }
1325 }
1326 
1327 
x_ValidateCDSPartial()1328 void CCdregionValidator::x_ValidateCDSPartial()
1329 {
1330     if (!m_ProductBioseq || x_BypassCDSPartialTest()) {
1331         return;
1332     }
1333 
1334     CSeqdesc_CI sd(m_ProductBioseq, CSeqdesc::e_Molinfo);
1335     if (!sd) {
1336         return;
1337     }
1338     const CMolInfo& molinfo = sd->GetMolinfo();
1339 
1340     const CSeq_loc& loc = m_Feat.GetLocation();
1341     bool partial5 = loc.IsPartialStart(eExtreme_Biological);
1342     bool partial3 = loc.IsPartialStop(eExtreme_Biological);
1343 
1344     if (molinfo.CanGetCompleteness()) {
1345         switch (molinfo.GetCompleteness()) {
1346         case CMolInfo::eCompleteness_unknown:
1347             break;
1348 
1349         case CMolInfo::eCompleteness_complete:
1350             if (partial5 || partial3) {
1351                 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
1352                     "CDS is partial but protein is complete");
1353             }
1354             break;
1355 
1356         case CMolInfo::eCompleteness_partial:
1357             break;
1358 
1359         case CMolInfo::eCompleteness_no_left:
1360             if (!partial5) {
1361                 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
1362                     "CDS is 5' complete but protein is NH2 partial");
1363             }
1364             if (partial3) {
1365                 EDiagSev sev = eDiag_Error;
1366                 if (x_CDS3primePartialTest())
1367                 {
1368                     sev = eDiag_Warning;
1369                 }
1370                 PostErr(sev, eErr_SEQ_FEAT_PartialProblem,
1371                     "CDS is 3' partial but protein is NH2 partial");
1372             }
1373             break;
1374 
1375         case CMolInfo::eCompleteness_no_right:
1376             if (!partial3) {
1377                 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
1378                     "CDS is 3' complete but protein is CO2 partial");
1379             }
1380             if (partial5) {
1381                 EDiagSev sev = eDiag_Error;
1382                 if (x_CDS5primePartialTest())
1383                 {
1384                     sev = eDiag_Warning;
1385                 }
1386                 PostErr(sev, eErr_SEQ_FEAT_PartialProblem,
1387                     "CDS is 5' partial but protein is CO2 partial");
1388             }
1389             break;
1390 
1391         case CMolInfo::eCompleteness_no_ends:
1392             if (partial5 && partial3) {
1393             } else if (partial5) {
1394                 EDiagSev sev = eDiag_Error;
1395                 if (x_CDS5primePartialTest())
1396                 {
1397                     sev = eDiag_Warning;
1398                 }
1399                 PostErr(sev, eErr_SEQ_FEAT_PartialProblem,
1400                     "CDS is 5' partial but protein has neither end");
1401             } else if (partial3) {
1402                 EDiagSev sev = eDiag_Error;
1403                 if (x_CDS3primePartialTest()) {
1404                     sev = eDiag_Warning;
1405                 }
1406 
1407                 PostErr(sev, eErr_SEQ_FEAT_PartialProblem,
1408                     "CDS is 3' partial but protein has neither end");
1409             } else {
1410                 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
1411                     "CDS is complete but protein has neither end");
1412             }
1413             break;
1414 
1415         case CMolInfo::eCompleteness_has_left:
1416             break;
1417 
1418         case CMolInfo::eCompleteness_has_right:
1419             break;
1420 
1421         case CMolInfo::eCompleteness_other:
1422             break;
1423 
1424         default:
1425             break;
1426         }
1427     }
1428 }
1429 
1430 
1431 static const char* const sc_BypassCdsPartialCheckText[] = {
1432     "RNA editing",
1433     "annotated by transcript or proteomic data",
1434     "artificial frameshift",
1435     "mismatches in translation",
1436     "rearrangement required for product",
1437     "reasons given in citation",
1438     "translated product replaced",
1439     "unclassified translation discrepancy"
1440 };
1441 typedef CStaticArraySet<const char*, PCase_CStr> TBypassCdsPartialCheckSet;
1442 DEFINE_STATIC_ARRAY_MAP(TBypassCdsPartialCheckSet, sc_BypassCdsPartialCheck, sc_BypassCdsPartialCheckText);
1443 
x_BypassCDSPartialTest() const1444 bool CCdregionValidator::x_BypassCDSPartialTest() const
1445 {
1446     if (m_Feat.CanGetExcept() && m_Feat.GetExcept() && m_Feat.CanGetExcept_text()) {
1447         const string& except_text = m_Feat.GetExcept_text();
1448         ITERATE(TBypassCdsPartialCheckSet, it, sc_BypassCdsPartialCheck) {
1449             if (NStr::FindNoCase(except_text, *it) != NPOS) {
1450                 return true;  // biological exception
1451             }
1452         }
1453     }
1454     return false;
1455 }
1456 
1457 
x_CDS3primePartialTest() const1458 bool CCdregionValidator::x_CDS3primePartialTest() const
1459 {
1460     CSeq_loc_CI last;
1461     for (CSeq_loc_CI sl_iter(m_Feat.GetLocation()); sl_iter; ++sl_iter) {
1462         last = sl_iter;
1463     }
1464 
1465     if (last) {
1466         if (last.GetStrand() == eNa_strand_minus) {
1467             if (last.GetRange().GetFrom() == 0) {
1468                 return true;
1469             }
1470         } else {
1471             if (!m_LocationBioseq) {
1472                 return false;
1473             }
1474             if (last.GetRange().GetTo() == m_LocationBioseq.GetInst_Length() - 1) {
1475                 return true;
1476             }
1477         }
1478     }
1479     return false;
1480 }
1481 
1482 
x_CDS5primePartialTest() const1483 bool CCdregionValidator::x_CDS5primePartialTest() const
1484 {
1485     CSeq_loc_CI first(m_Feat.GetLocation());
1486 
1487     if (first) {
1488         if (first.GetStrand() == eNa_strand_minus) {
1489             if (!m_LocationBioseq) {
1490                 return false;
1491             }
1492             if (first.GetRange().GetTo() == m_LocationBioseq.GetInst_Length() - 1) {
1493                 return true;
1494             }
1495         } else {
1496             if (first.GetRange().GetFrom() == 0) {
1497                 return true;
1498             }
1499         }
1500     }
1501     return false;
1502 }
1503 
1504 
x_IsProductMisplaced() const1505 bool CCdregionValidator::x_IsProductMisplaced() const
1506 {
1507     // don't calculate if no product or if ORF flag is set
1508     if (!m_Feat.IsSetProduct() ||
1509         m_Feat.GetData().GetCdregion().IsSetOrf()) {
1510         return false;
1511     }
1512     // don't calculate if feature is pseudo
1513     if (s_IsPseudo(m_Feat) || m_GeneIsPseudo) {
1514         return false;
1515     }
1516     if (!m_ProductBioseq) {
1517         return false;
1518     } else if (m_ProductIsFar) {
1519         if (m_Imp.RequireLocalProduct(m_Feat.GetProduct().GetId())) {
1520             return true;
1521         } else {
1522             return false;
1523         }
1524     }
1525 
1526     bool found_match = false;
1527 
1528     CSeq_entry_Handle prod_nps =
1529         m_ProductBioseq.GetExactComplexityLevel(CBioseq_set::eClass_nuc_prot);
1530     if (!prod_nps) {
1531         return true;
1532     }
1533 
1534     for (CSeq_loc_CI loc_i(m_Feat.GetLocation()); loc_i; ++loc_i) {
1535         const CSeq_id& sid = loc_i.GetSeq_id();
1536         if (sid.IsOther() && sid.GetOther().IsSetAccession() && NStr::StartsWith(sid.GetOther().GetAccession(), "NT_")) {
1537             return false;
1538         }
1539         CBioseq_Handle nuc = m_Scope.GetBioseqHandle(loc_i.GetSeq_id());
1540         if (nuc) {
1541             if (s_BioseqHasRefSeqThatStartsWithPrefix(nuc, "NT_")) {
1542                 // we don't report this for NT records
1543                 return false;
1544             }
1545             CSeq_entry_Handle wgs = nuc.GetExactComplexityLevel(CBioseq_set::eClass_gen_prod_set);
1546             if (wgs) {
1547                 // we don't report this for gen-prod-sets
1548                 return false;
1549             }
1550 
1551             CSeq_entry_Handle nuc_nps =
1552                 nuc.GetExactComplexityLevel(CBioseq_set::eClass_nuc_prot);
1553 
1554             if (prod_nps == nuc_nps) {
1555                 found_match = true;
1556                 break;
1557             }
1558         }
1559     }
1560     return !found_match;
1561 }
1562 
1563 
x_AddToIntronList(vector<CCdregionValidator::TShortIntron> & shortlist,TSeqPos last_start,TSeqPos last_stop,TSeqPos this_start,TSeqPos this_stop)1564 void CCdregionValidator::x_AddToIntronList(vector<CCdregionValidator::TShortIntron>& shortlist, TSeqPos last_start, TSeqPos last_stop, TSeqPos this_start, TSeqPos this_stop)
1565 {
1566     if (abs ((int)this_start - (int)last_stop) < 11) {
1567         shortlist.push_back(TShortIntron(last_stop, this_start));
1568     } else if (abs ((int)this_stop - (int)last_start) < 11) {
1569         shortlist.push_back(TShortIntron(last_start, this_stop));
1570     }
1571 }
1572 
1573 
x_GetShortIntrons(const CSeq_loc & loc,CScope * scope)1574 vector<CCdregionValidator::TShortIntron> CCdregionValidator::x_GetShortIntrons(const CSeq_loc& loc, CScope* scope)
1575 {
1576     vector<CCdregionValidator::TShortIntron> shortlist;
1577 
1578     CSeq_loc_CI li(loc);
1579 
1580     TSeqPos last_start = li.GetRange().GetFrom();
1581     TSeqPos last_stop = li.GetRange().GetTo();
1582     CRef<CSeq_id> last_id(new CSeq_id());
1583     last_id->Assign(li.GetSeq_id());
1584 
1585     ++li;
1586     while (li) {
1587         TSeqPos this_start = li.GetRange().GetFrom();
1588         TSeqPos this_stop = li.GetRange().GetTo();
1589         if (abs ((int)this_start - (int)last_stop) < 11 || abs ((int)this_stop - (int)last_start) < 11) {
1590             if (li.GetSeq_id().Equals(*last_id)) {
1591                 // definitely same bioseq, definitely report
1592                 x_AddToIntronList(shortlist, last_start, last_stop, this_start, this_stop);
1593             } else if (scope) {
1594                 // only report if definitely on same bioseq
1595                 CBioseq_Handle last_bsh = scope->GetBioseqHandle(*last_id);
1596                 if (last_bsh) {
1597                     for (auto id_it : last_bsh.GetId()) {
1598                         if (id_it.GetSeqId()->Equals(li.GetSeq_id())) {
1599                              x_AddToIntronList(shortlist, last_start, last_stop, this_start, this_stop);
1600                              break;
1601                         }
1602                     }
1603                 }
1604             }
1605         }
1606         last_start = this_start;
1607         last_stop = this_stop;
1608         last_id->Assign(li.GetSeq_id());
1609         ++li;
1610     }
1611     return shortlist;
1612 }
1613 
1614 
x_FormatIntronInterval(const TShortIntron & interval)1615 string CCdregionValidator::x_FormatIntronInterval(const TShortIntron& interval)
1616 {
1617     return NStr::NumericToString(interval.first + 1) + "-"
1618         + NStr::NumericToString(interval.second + 1);
1619 }
1620 
1621 
ReportShortIntrons()1622 void CCdregionValidator::ReportShortIntrons()
1623 {
1624     if (m_Feat.IsSetExcept()) {
1625         return;
1626     }
1627 
1628     string message;
1629 
1630     vector<TShortIntron> shortlist = x_GetShortIntrons(m_Feat.GetLocation(), &m_Scope);
1631     if (shortlist.size() == 0) {
1632         return;
1633     }
1634 
1635     // only report if no nonsense introns
1636     vector<CRef<CSeq_loc> > nonsense_introns = CCDSTranslationProblems::GetNonsenseIntrons(m_Feat, m_Scope);
1637     if (nonsense_introns.size() > 0) {
1638         return;
1639     }
1640 
1641     if (shortlist.size() == 1) {
1642         message = x_FormatIntronInterval(shortlist.front());
1643     } else if (shortlist.size() == 2) {
1644         message = x_FormatIntronInterval(shortlist.front())
1645             + " and " +
1646             x_FormatIntronInterval(shortlist.back());
1647     } else {
1648         for (size_t i = 0; i < shortlist.size() - 2; i++) {
1649             message += x_FormatIntronInterval(shortlist[i]) + ", ";
1650         }
1651         message += " and " + x_FormatIntronInterval(shortlist.back());
1652     }
1653     PostErr(eDiag_Warning, eErr_SEQ_FEAT_ShortIntron,
1654                 "Introns at positions " + message + " should be at least 10 nt long");
1655 }
1656 
1657 
1658 // non-pseudo CDS must have product
x_ValidateProductId()1659 void CCdregionValidator::x_ValidateProductId()
1660 {
1661     // bail if product exists
1662     if ( m_Feat.IsSetProduct() ) {
1663         return;
1664     }
1665     // bail if location has just stop
1666     if ( m_Feat.IsSetLocation() ) {
1667         const CSeq_loc& loc = m_Feat.GetLocation();
1668         if ( loc.IsPartialStart(eExtreme_Biological)  &&  !loc.IsPartialStop(eExtreme_Biological) ) {
1669             if ( GetLength(loc, &m_Scope) <= 5 ) {
1670                 return;
1671             }
1672         }
1673     }
1674     // supress in case of the appropriate exception
1675     if ( m_Feat.IsSetExcept()  &&  m_Feat.IsSetExcept_text()  &&
1676          !NStr::IsBlank(m_Feat.GetExcept_text()) ) {
1677         if ( NStr::Find(m_Feat.GetExcept_text(),
1678             "rearrangement required for product") != NPOS ) {
1679            return;
1680         }
1681     }
1682 
1683     // non-pseudo CDS must have /product
1684     PostErr(eDiag_Error, eErr_SEQ_FEAT_MissingCDSproduct,
1685         "Expected CDS product absent");
1686 }
1687 
1688 
x_ValidateConflict()1689 void CCdregionValidator::x_ValidateConflict()
1690 {
1691     if (!m_ProductBioseq) {
1692         return;
1693     }
1694     // translate the coding region
1695     string transl_prot;
1696     try {
1697         CSeqTranslator::Translate(m_Feat, m_Scope, transl_prot,
1698                                   false,   // do not include stop codons
1699                                   false);  // do not remove trailing X/B/Z
1700 
1701     } catch ( const runtime_error& ) {
1702     }
1703 
1704     CSeqVector prot_vec = m_ProductBioseq.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
1705     prot_vec.SetCoding(CSeq_data::e_Ncbieaa);
1706 
1707     string prot_seq;
1708     prot_vec.GetSeqData(0, prot_vec.size(), prot_seq);
1709 
1710     if ( transl_prot.empty()  ||  prot_seq.empty()  ||  NStr::Equal(transl_prot, prot_seq) ) {
1711         PostErr(eDiag_Error, eErr_SEQ_FEAT_BadConflictFlag,
1712                 "Coding region conflict flag should not be set");
1713     } else {
1714         PostErr(eDiag_Warning, eErr_SEQ_FEAT_ConflictFlagSet,
1715                 "Coding region conflict flag is set");
1716     }
1717 }
1718 
1719 
x_ValidateCommonProduct()1720 void CCdregionValidator::x_ValidateCommonProduct()
1721 {
1722     if ( !m_Feat.IsSetProduct() ) {
1723         return;
1724     }
1725 
1726     const CCdregion& cdr = m_Feat.GetData().GetCdregion();
1727     if ( cdr.CanGetOrf() ) {
1728         return;
1729     }
1730 
1731     if ( !m_ProductBioseq  || m_ProductIsFar) {
1732         const CSeq_id* sid = 0;
1733         try {
1734             sid = &(GetId(m_Feat.GetProduct(), &m_Scope));
1735         } catch (const CObjmgrUtilException&) {}
1736         if (m_Imp.RequireLocalProduct(sid)) {
1737             PostErr(eDiag_Warning, eErr_SEQ_FEAT_MissingCDSproduct,
1738                 "Unable to find product Bioseq from CDS feature");
1739         }
1740         return;
1741     }
1742 
1743     const CSeq_feat* sfp = GetCDSForProduct(m_ProductBioseq);
1744     if ( sfp == 0 ) {
1745         return;
1746     }
1747 
1748     if ( &m_Feat != sfp ) {
1749         // if genomic product set, with one cds on contig and one on cdna,
1750         // do not report.
1751         if ( m_Imp.IsGPS() ) {
1752             // feature packaging test will do final contig vs. cdna check
1753             CBioseq_Handle sfh = m_Scope.GetBioseqHandle(sfp->GetLocation());
1754             if ( m_LocationBioseq != sfh ) {
1755                 return;
1756             }
1757         }
1758         PostErr(eDiag_Critical, eErr_SEQ_FEAT_MultipleCDSproducts,
1759             "Same product Bioseq from multiple CDS features");
1760     }
1761 }
1762 
1763 
x_ValidateProductPartials()1764 void CCdregionValidator::x_ValidateProductPartials()
1765 {
1766     if (!m_ProductBioseq || !m_LocationBioseq) {
1767         return;
1768     }
1769 
1770     if (m_LocationBioseq.GetTopLevelEntry() != m_ProductBioseq.GetTopLevelEntry()) {
1771         return;
1772     }
1773     CFeat_CI prot(m_ProductBioseq, CSeqFeatData::eSubtype_prot);
1774     if (!prot) {
1775         return;
1776     }
1777     if (!PartialsSame(m_Feat.GetLocation(), prot->GetLocation())) {
1778         PostErr(eDiag_Warning, eErr_SEQ_FEAT_PartialsInconsistentCDSProtein,
1779             "Coding region and protein feature partials conflict");
1780     }
1781 }
1782 
1783 
x_CheckPosNOrGap(TSeqPos pos,const CSeqVector & vec)1784 bool CCdregionValidator::x_CheckPosNOrGap(TSeqPos pos, const CSeqVector& vec)
1785 {
1786     if (vec.IsInGap(pos) || vec[pos] == 'N') {
1787         return true;
1788     } else {
1789         return false;
1790     }
1791 }
1792 
1793 
x_ValidateParentPartialness(const CSeq_loc & parent_loc,const string & parent_name)1794 void CCdregionValidator::x_ValidateParentPartialness(const CSeq_loc& parent_loc, const string& parent_name)
1795 {
1796     if (!m_LocationBioseq) {
1797         return;
1798     }
1799 
1800     bool check_gaps = false;
1801     if (m_LocationBioseq.IsSetInst() && m_LocationBioseq.GetInst().IsSetRepr() &&
1802         m_LocationBioseq.GetInst().GetRepr() == CSeq_inst::eRepr_delta) {
1803         check_gaps = true;
1804     }
1805 
1806     bool has_abutting_gap = false;
1807     bool is_minus_strand = m_Feat.GetLocation().IsSetStrand() && m_Feat.GetLocation().GetStrand() == eNa_strand_minus;
1808 
1809     if (m_Feat.GetLocation().IsPartialStart(eExtreme_Biological) && !parent_loc.IsPartialStart(eExtreme_Biological)) {
1810 
1811         if (check_gaps) {
1812             CSeqVector seq_vec(m_LocationBioseq, CBioseq_Handle::eCoding_Iupac);
1813             TSeqPos start = m_Feat.GetLocation().GetStart(eExtreme_Biological),
1814                     pos = is_minus_strand ? start + 1 : start - 1;
1815 
1816             if (pos < m_LocationBioseq.GetBioseqLength()) {
1817                 has_abutting_gap = x_CheckPosNOrGap(pos, seq_vec);
1818             }
1819         }
1820 
1821         if (!has_abutting_gap) {
1822             EDiagSev sev = eDiag_Warning;
1823             CConstRef <CSeq_feat> gene = m_Gene;
1824             if (gene && gene->GetData().GetGene().IsSetLocus()) {
1825                 string locus = gene->GetData().GetGene().GetLocus();
1826                 if ( NStr::EqualNocase (locus, "orf1ab") ) {
1827                     sev = eDiag_Info;
1828                 }
1829             }
1830             PostErr(sev, eErr_SEQ_FEAT_PartialProblemMismatch5Prime, parent_name + " should not be 5' complete if coding region is 5' partial");
1831         }
1832     }
1833     if (m_Feat.GetLocation().IsPartialStop(eExtreme_Biological) && !parent_loc.IsPartialStop(eExtreme_Biological)) {
1834 
1835         if (check_gaps) {
1836 
1837             CSeqVector seq_vec(m_LocationBioseq, CBioseq_Handle::eCoding_Iupac);
1838             TSeqPos stop = m_Feat.GetLocation().GetStop(eExtreme_Biological),
1839                     pos = is_minus_strand ? stop - 1 : stop + 1;
1840 
1841             if (pos < m_LocationBioseq.GetBioseqLength()) {
1842                 has_abutting_gap = x_CheckPosNOrGap(pos, seq_vec);
1843             }
1844         }
1845 
1846         if (!has_abutting_gap) {
1847             EDiagSev sev = eDiag_Warning;
1848             CConstRef <CSeq_feat> gene = m_Gene;
1849             if (gene && gene->GetData().GetGene().IsSetLocus()) {
1850                 string locus = gene->GetData().GetGene().GetLocus();
1851                 if ( NStr::EqualNocase (locus, "orf1ab") ) {
1852                     sev = eDiag_Info;
1853                 }
1854             }
1855             PostErr(sev, eErr_SEQ_FEAT_PartialProblemMismatch3Prime, parent_name + " should not be 3' complete if coding region is 3' partial");
1856         }
1857     }
1858 }
1859 
1860 
x_ValidateParentPartialness()1861 void CCdregionValidator::x_ValidateParentPartialness()
1862 {
1863     if (!m_Gene) {
1864         return;
1865     }
1866     x_ValidateParentPartialness(m_Gene->GetLocation(), "gene");
1867 
1868     CConstRef<CSeq_feat> mrna = GetmRNAforCDS(m_Feat, m_Scope);
1869     if (mrna) {
1870         TFeatScores contained_mrna;
1871         GetOverlappingFeatures(m_Gene->GetLocation(), CSeqFeatData::e_Rna,
1872             CSeqFeatData::eSubtype_mRNA, eOverlap_Contains, contained_mrna, m_Scope);
1873         if (contained_mrna.size() == 1) {
1874             // messy for alternate splicing, so only check if there is only one
1875             x_ValidateParentPartialness(mrna->GetLocation(), "mRNA");
1876         }
1877     }
1878 }
1879 
1880 
1881 END_SCOPE(validator)
1882 END_SCOPE(objects)
1883 END_NCBI_SCOPE
1884