1 /* $Id: cdregion_validator.cpp 636457 2021-08-24 17:53:45Z fukanchi $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Jonathan Kans, Clifford Clausen, Aaron Ucko......
27 *
28 * File Description:
29 * validation of Seq_feat
30 * .......
31 *
32 */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbistr.hpp>
36 #include <objtools/validator/validatorp.hpp>
37 #include <objtools/validator/single_feat_validator.hpp>
38 #include <objtools/validator/utilities.hpp>
39 #include <objtools/format/items/gene_finder.hpp>
40
41 #include <serial/serialbase.hpp>
42
43 #include <objmgr/bioseq_handle.hpp>
44 #include <objmgr/seq_entry_handle.hpp>
45 #include <objmgr/seqdesc_ci.hpp>
46 #include <objmgr/seq_vector.hpp>
47 #include <objmgr/scope.hpp>
48 #include <objmgr/util/sequence.hpp>
49 #include <objmgr/util/feature.hpp>
50
51 #include <objects/seqfeat/Seq_feat.hpp>
52 #include <objects/seqfeat/BioSource.hpp>
53 #include <objects/seqfeat/Cdregion.hpp>
54 #include <objects/seqfeat/Code_break.hpp>
55 #include <objects/seqfeat/Gb_qual.hpp>
56 #include <objects/seqfeat/Genetic_code.hpp>
57 #include <objects/seqfeat/Genetic_code_table.hpp>
58 #include <objects/seqfeat/Prot_ref.hpp>
59
60 #include <objects/seq/MolInfo.hpp>
61 #include <objects/seq/Bioseq.hpp>
62 #include <objects/seq/seqport_util.hpp>
63
64 #include <objtools/edit/cds_fix.hpp>
65
66 #include <string>
67
68
69 BEGIN_NCBI_SCOPE
70 BEGIN_SCOPE(objects)
71 BEGIN_SCOPE(validator)
72 using namespace sequence;
73
74
CCdregionValidator(const CSeq_feat & feat,CScope & scope,CValidError_imp & imp)75 CCdregionValidator::CCdregionValidator(const CSeq_feat& feat, CScope& scope, CValidError_imp& imp) :
76 CSingleFeatValidator(feat, scope, imp) {
77 m_Gene = m_Imp.GetGeneCache().GetGeneFromCache(&feat, m_Scope);
78 if (m_Gene) {
79 m_GeneIsPseudo = s_IsPseudo(*m_Gene);
80 } else {
81 m_GeneIsPseudo = false;
82 }
83 }
84
85
x_ValidateFeatComment()86 void CCdregionValidator::x_ValidateFeatComment()
87 {
88 if (!m_Feat.IsSetComment()) {
89 return;
90 }
91 CSingleFeatValidator::x_ValidateFeatComment();
92 const string& comment = m_Feat.GetComment();
93 if (NStr::Find(comment, "ambiguity in stop codon") != NPOS
94 && !edit::DoesCodingRegionHaveTerminalCodeBreak(m_Feat.GetData().GetCdregion())) {
95 CRef<CSeq_loc> stop_codon_loc = edit::GetLastCodonLoc(m_Feat, m_Scope);
96 if (stop_codon_loc) {
97 TSeqPos len = sequence::GetLength(*stop_codon_loc, &m_Scope);
98 CSeqVector vec(*stop_codon_loc, m_Scope, CBioseq_Handle::eCoding_Iupac);
99 string seq_string;
100 vec.GetSeqData(0, len - 1, seq_string);
101 bool found_ambig = false;
102 string::iterator it = seq_string.begin();
103 while (it != seq_string.end() && !found_ambig) {
104 if (*it != 'A' && *it != 'T' && *it != 'C' && *it != 'G' && *it != 'U') {
105 found_ambig = true;
106 }
107 ++it;
108 }
109 if (!found_ambig) {
110 m_Imp.PostErr(eDiag_Error, eErr_SEQ_FEAT_BadCDScomment,
111 "Feature comment indicates ambiguity in stop codon "
112 "but no ambiguities are present in stop codon.", m_Feat);
113 }
114 }
115 }
116
117 // look for EC number in comment
118 if (HasECnumberPattern(m_Feat.GetComment())) {
119 // suppress if protein has EC numbers
120 bool suppress = false;
121 if (m_ProductBioseq) {
122 CFeat_CI prot_feat(m_ProductBioseq, SAnnotSelector(CSeqFeatData::eSubtype_prot));
123 if (prot_feat && prot_feat->GetData().GetProt().IsSetEc()) {
124 suppress = true;
125 }
126 }
127 if (!suppress) {
128 PostErr(eDiag_Info, eErr_SEQ_FEAT_EcNumberInCDSComment,
129 "Apparent EC number in CDS comment");
130 }
131 }
132
133 }
134
135
x_ValidateExceptText(const string & text)136 void CCdregionValidator::x_ValidateExceptText(const string& text)
137 {
138 CSingleFeatValidator::x_ValidateExceptText(text);
139 if (m_Feat.GetData().GetCdregion().IsSetCode_break() &&
140 NStr::FindNoCase(text, "RNA editing") != NPOS) {
141 PostErr(eDiag_Warning, eErr_SEQ_FEAT_TranslExceptAndRnaEditing,
142 "CDS has both RNA editing /exception and /transl_except qualifiers");
143 }
144 }
145
146
147 #define FOR_EACH_SEQID_ON_BIOSEQ_HANDLE(Itr, Var) \
148 ITERATE (CBioseq_Handle::TId, Itr, Var.GetId())
149
s_LocIdType(CBioseq_Handle bsh,bool & is_nt,bool & is_ng,bool & is_nw,bool & is_nc)150 void s_LocIdType(CBioseq_Handle bsh,
151 bool& is_nt, bool& is_ng, bool& is_nw, bool& is_nc)
152 {
153 is_nt = is_ng = is_nw = is_nc = false;
154 if (bsh) {
155 FOR_EACH_SEQID_ON_BIOSEQ_HANDLE(it, bsh) {
156 CSeq_id_Handle sid = *it;
157 switch (sid.Which()) {
158 case NCBI_SEQID(Embl):
159 case NCBI_SEQID(Ddbj):
160 case NCBI_SEQID(Other):
161 case NCBI_SEQID(Genbank):
162 {
163 CSeq_id::EAccessionInfo info = sid.IdentifyAccession();
164 is_nt |= (info == CSeq_id::eAcc_refseq_contig);
165 is_ng |= (info == CSeq_id::eAcc_refseq_genomic);
166 is_nw |= (info == CSeq_id::eAcc_refseq_wgs_intermed);
167 is_nc |= (info == CSeq_id::eAcc_refseq_chromosome);
168 break;
169 }
170 default:
171 break;
172 }
173 }
174 }
175 }
176
177
s_LocIdType(const CSeq_loc & loc,CScope & scope,const CSeq_entry & tse,bool & is_nt,bool & is_ng,bool & is_nw,bool & is_nc)178 void s_LocIdType(const CSeq_loc& loc, CScope& scope, const CSeq_entry& tse,
179 bool& is_nt, bool& is_ng, bool& is_nw, bool& is_nc)
180 {
181 is_nt = is_ng = is_nw = is_nc = false;
182 if (!IsOneBioseq(loc, &scope)) {
183 return;
184 }
185 const CSeq_id& id = GetId(loc, &scope);
186 try {
187 CBioseq_Handle bsh = scope.GetBioseqHandleFromTSE(id, tse);
188 if (bsh) {
189 s_LocIdType(bsh, is_nt, is_ng, is_nw, is_nc);
190 }
191 } catch (CException&) {
192 }
193 }
194
195
x_ValidateTrans()196 void CCdregionValidator::x_ValidateTrans()
197 {
198 CCDSTranslationProblems problems;
199 bool is_nt, is_ng, is_nw, is_nc;
200 s_LocIdType(m_LocationBioseq, is_nt, is_ng, is_nw, is_nc);
201
202 problems.CalculateTranslationProblems(m_Feat,
203 m_LocationBioseq,
204 m_ProductBioseq,
205 m_Imp.IgnoreExceptions(),
206 m_Imp.IsFarFetchCDSproducts(),
207 m_Imp.IsStandaloneAnnot(),
208 m_Imp.IsStandaloneAnnot() ? false : m_Imp.GetTSE().IsSeq(),
209 m_Imp.IsGpipe(),
210 m_Imp.IsGenomic(),
211 m_Imp.IsRefSeq(),
212 (is_nt||is_ng||is_nw),
213 is_nc,
214 (m_Imp.IsRefSeq() || m_Imp.IsGED() || m_Imp.IsTPE()),
215 &m_Scope);
216 if (!problems.UnableToTranslate() && !problems.HasException()) {
217 x_ValidateCodebreak();
218 }
219
220 if (problems.GetTranslationProblemFlags() & CCDSTranslationProblems::eCDSTranslationProblem_UnableToFetch) {
221 if (m_Imp.x_IsFarFetchFailure(m_Feat.GetProduct())) {
222 m_Imp.SetFarFetchFailure();
223 }
224 }
225
226 x_ReportTranslationProblems(problems);
227 }
228
229
GetGcodeForName(const string & code_name)230 int GetGcodeForName(const string& code_name)
231 {
232 const CGenetic_code_table& tbl = CGen_code_table::GetCodeTable();
233 ITERATE(CGenetic_code_table::Tdata, it, tbl.Get()) {
234 if (NStr::EqualNocase((*it)->GetName(), code_name)) {
235 return (*it)->GetId();
236 }
237 }
238 return 255;
239 }
240
241
GetGcodeForInternalStopErrors(const CCdregion & cdr)242 int GetGcodeForInternalStopErrors(const CCdregion& cdr)
243 {
244 int gc = 0;
245 if (cdr.IsSetCode()) {
246 ITERATE(CCdregion::TCode::Tdata, it, cdr.GetCode().Get()) {
247 if ((*it)->IsId()) {
248 gc = (*it)->GetId();
249 } else if ((*it)->IsName()) {
250 gc = GetGcodeForName((*it)->GetName());
251 }
252 if (gc != 0) break;
253 }
254 }
255 return gc;
256 }
257
258
GetInternalStopErrorMessage(const CSeq_feat & feat,size_t internal_stop_count,bool bad_start,char transl_start)259 string GetInternalStopErrorMessage(const CSeq_feat& feat, size_t internal_stop_count, bool bad_start, char transl_start)
260 {
261 int gc = GetGcodeForInternalStopErrors(feat.GetData().GetCdregion());
262 string gccode = NStr::IntToString(gc);
263
264 string error_message;
265 if (bad_start) {
266 bool got_dash = transl_start == '-';
267 string codon_desc = got_dash ? "illegal" : "ambiguous";
268 error_message = NStr::SizetToString(internal_stop_count) +
269 " internal stops (and " + codon_desc + " start codon). Genetic code [" + gccode + "]";
270 } else {
271 error_message = NStr::SizetToString(internal_stop_count) +
272 " internal stops. Genetic code [" + gccode + "]";
273 }
274 return error_message;
275 }
276
277
GetInternalStopErrorMessage(const CSeq_feat & feat,const string & transl_prot)278 string GetInternalStopErrorMessage(const CSeq_feat& feat, const string& transl_prot)
279 {
280 size_t internal_stop_count = CountInternalStopCodons(transl_prot);
281
282 int gc = GetGcodeForInternalStopErrors(feat.GetData().GetCdregion());
283 string gccode = NStr::IntToString(gc);
284
285 string error_message;
286 if (HasBadStartCodon(feat.GetLocation(), transl_prot)) {
287 bool got_dash = transl_prot[0] == '-';
288 string codon_desc = got_dash ? "illegal" : "ambiguous";
289 error_message = NStr::SizetToString(internal_stop_count) +
290 " internal stops (and " + codon_desc + " start codon). Genetic code [" + gccode + "]";
291 } else {
292 error_message = NStr::SizetToString(internal_stop_count) +
293 " internal stops. Genetic code [" + gccode + "]";
294 }
295 return error_message;
296 }
297
298
GetStartCodonErrorMessage(const CSeq_feat & feat,const char first_char,size_t internal_stop_count)299 string GetStartCodonErrorMessage(const CSeq_feat& feat, const char first_char, size_t internal_stop_count)
300 {
301 bool got_dash = first_char == '-';
302 string codon_desc = got_dash ? "Illegal" : "Ambiguous";
303 string p_word = got_dash ? "Probably" : "Possibly";
304
305 int gc = GetGcodeForInternalStopErrors(feat.GetData().GetCdregion());
306 string gccode = NStr::IntToString(gc);
307
308 string error_message;
309
310 if (internal_stop_count > 0) {
311 error_message = codon_desc + " start codon (and " +
312 NStr::SizetToString(internal_stop_count) +
313 " internal stops). " + p_word + " wrong genetic code [" +
314 gccode + "]";
315 } else {
316 error_message = codon_desc + " start codon used. Wrong genetic code [" +
317 gccode + "] or protein should be partial";
318 }
319 return error_message;
320 }
321
322
GetStartCodonErrorMessage(const CSeq_feat & feat,const string & transl_prot)323 string GetStartCodonErrorMessage(const CSeq_feat& feat, const string& transl_prot)
324 {
325 size_t internal_stop_count = CountInternalStopCodons(transl_prot);
326
327 return GetStartCodonErrorMessage(feat, transl_prot[0], internal_stop_count);
328 }
329
330
x_ReportTranslationProblems(const CCDSTranslationProblems & problems)331 void CCdregionValidator::x_ReportTranslationProblems(const CCDSTranslationProblems& problems)
332 {
333 size_t problem_flags = problems.GetTranslationProblemFlags();
334 if (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_UnableToFetch) {
335 string label;
336 const CSeq_id* protid = &GetId(m_Feat.GetProduct(), &m_Scope);
337 protid->GetLabel(&label);
338 EDiagSev sev = eDiag_Error;
339 if (protid->IsGeneral() && protid->GetGeneral().IsSetDb() &&
340 (NStr::EqualNocase(protid->GetGeneral().GetDb(), "ti") ||
341 NStr::EqualNocase(protid->GetGeneral().GetDb(), "SRA"))) {
342 sev = eDiag_Warning;
343 }
344 PostErr(sev, eErr_SEQ_FEAT_ProductFetchFailure,
345 "Unable to fetch CDS product '" + label + "'");
346 }
347
348 if (!problems.HasException() && (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_NoProtein)) {
349 bool is_nt, is_ng, is_nw, is_nc;
350 s_LocIdType(m_Feat.GetLocation(), m_Scope, m_Imp.GetTSE(), is_nt, is_ng, is_nw, is_nc);
351 EDiagSev sev = eDiag_Error;
352 if (IsDeltaOrFarSeg(m_Feat.GetLocation(), &m_Scope)) {
353 sev = eDiag_Warning;
354 }
355 if (is_nc) {
356 sev = eDiag_Warning;
357 }
358 PostErr(sev, eErr_SEQ_FEAT_NoProtein,
359 "No protein Bioseq given");
360 }
361
362 bool unclassified_except = false;
363 if (m_Feat.IsSetExcept_text() && NStr::FindNoCase(m_Feat.GetExcept_text(), "unclassified translation discrepancy") != NPOS) {
364 unclassified_except = true;
365 }
366
367 x_ReportTranslExceptProblems(problems.GetTranslExceptProblems(), problems.HasException());
368
369 if (!problems.HasException() && problems.HasUnparsedTranslExcept()) {
370 if (problems.GetInternalStopCodons() == 0 && problems.GetTranslationMismatches().size() == 0) {
371 PostErr(eDiag_Warning, eErr_SEQ_FEAT_TranslExcept,
372 "Unparsed transl_except qual (but protein is okay). Skipped");
373 } else {
374 PostErr(eDiag_Warning, eErr_SEQ_FEAT_TranslExcept,
375 "Unparsed transl_except qual. Skipped");
376 }
377 }
378
379
380 for (size_t i = 0; i < problems.GetNumNonsenseIntrons(); i++) {
381 EDiagSev sev = eDiag_Critical;
382 if (m_Imp.IsEmbl() || m_Imp.IsDdbj()) {
383 sev = eDiag_Error;
384 }
385 PostErr(sev, eErr_SEQ_FEAT_IntronIsStopCodon, "Triplet intron encodes stop codon");
386 }
387
388 if (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_TooManyX) {
389 PostErr(eDiag_Info, eErr_SEQ_FEAT_CDShasTooManyXs, "CDS translation consists of more than 50% X residues");
390 }
391
392 if (problems.UnableToTranslate()) {
393 if (!problems.HasException()) {
394 PostErr(eDiag_Error, eErr_SEQ_FEAT_CdTransFail,
395 "Unable to translate");
396 }
397 }
398
399 if (!problems.UnableToTranslate() && !problems.AltStart() &&
400 m_Feat.IsSetExcept() && m_Feat.IsSetExcept_text() &&
401 NStr::Find(m_Feat.GetExcept_text(), "alternative start codon") != string::npos &&
402 x_BioseqHasNmAccession(m_LocationBioseq)) {
403
404 PostErr(eDiag_Warning, eErr_SEQ_FEAT_AltStartCodonException,
405 "Unnecessary alternative start codon exception");
406 }
407
408 if ((!problems.HasException() || unclassified_except) && problems.GetInternalStopCodons() > 0) {
409 if (unclassified_except && m_Imp.IsGpipe()) {
410 // suppress if gpipe genomic
411 } else {
412 EDiagSev stop_sev = unclassified_except ? eDiag_Warning : eDiag_Error;
413 if (!m_Imp.IsRefSeq() && m_Imp.IsGI() && m_Imp.IsGED()) {
414 stop_sev = eDiag_Critical;
415 }
416
417 PostErr(stop_sev, eErr_SEQ_FEAT_InternalStop,
418 GetInternalStopErrorMessage(m_Feat, problems.GetInternalStopCodons(),
419 (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_BadStart),
420 problems.GetTranslStartCharacter()));
421 }
422 }
423
424 if (!problems.HasException()) {
425
426 if (!unclassified_except && (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_BadStart)) {
427 string start_err_msg = GetStartCodonErrorMessage(m_Feat, problems.GetTranslStartCharacter(), problems.GetInternalStopCodons());
428 PostErr(eDiag_Error, eErr_SEQ_FEAT_StartCodon,
429 start_err_msg);
430 }
431
432 if (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_FrameNotPartial) {
433 PostErr(eDiag_Error, eErr_SEQ_FEAT_SuspiciousFrame,
434 "Suspicious CDS location - reading frame > 1 but not 5' partial");
435 }
436
437 if (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_FrameNotConsensus) {
438 EDiagSev sev = eDiag_Warning;
439 if (x_BioseqHasNmAccession(m_LocationBioseq))
440 {
441 sev = eDiag_Error;
442 }
443 PostErr(sev, eErr_SEQ_FEAT_SuspiciousFrame,
444 "Suspicious CDS location - reading frame > 1 and not at consensus splice site");
445 }
446
447 if (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_NoStop) {
448 PostErr(eDiag_Error, eErr_SEQ_FEAT_NoStop,
449 "Missing stop codon");
450 }
451 if (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_StopPartial) {
452 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblemHasStop,
453 "Got stop codon, but 3'end is labeled partial");
454 }
455 if (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_ShouldStartPartial) {
456 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
457 "Start of location should probably be partial");
458 }
459 if (problems.GetRaggedLength() > 0) {
460 PostErr(eDiag_Error, eErr_SEQ_FEAT_TransLen,
461 "Coding region extends " + NStr::IntToString(problems.GetRaggedLength()) +
462 " base(s) past stop codon");
463 }
464 }
465
466 if (!problems.UnableToTranslate() && problems.GetProtLen() > 1.2 * problems.GetTransLen()) {
467 if ((!m_Feat.IsSetExcept_text()) || NStr::Find(m_Feat.GetExcept_text(), "annotated by transcript or proteomic data") == string::npos) {
468 string msg = "Protein product length [" + NStr::SizetToString(problems.GetProtLen()) +
469 "] is more than 120% of the ";
470 if (m_ProductIsFar) {
471 msg += "(far) ";
472 }
473 msg += "translation length [" + NStr::SizetToString(problems.GetTransLen()) + "]";
474 PostErr(eDiag_Warning, eErr_SEQ_FEAT_ProductLength, msg);
475 }
476 }
477
478
479 bool rna_editing = false;
480 if (m_Feat.IsSetExcept_text() && NStr::FindNoCase(m_Feat.GetExcept_text(), "RNA editing") != NPOS) {
481 rna_editing = true;
482 }
483 if (problems.GetProtLen() != problems.GetTransLen() &&
484 (!problems.HasException() ||
485 (rna_editing &&
486 (problems.GetProtLen() < problems.GetTransLen() - 1 || problems.GetProtLen() > problems.GetTransLen())))) {
487 string msg = "Given protein length [" + NStr::SizetToString(problems.GetProtLen()) +
488 "] does not match ";
489 if (m_ProductIsFar) {
490 msg += "(far) ";
491 }
492 msg += "translation length [" +
493 NStr::SizetToString(problems.GetTransLen()) + "]";
494
495 if (rna_editing) {
496 msg += " (RNA editing present)";
497 }
498 PostErr(rna_editing ? eDiag_Warning : eDiag_Error,
499 eErr_SEQ_FEAT_TransLen, msg);
500 }
501
502 bool mismatch_except = false;
503 if (m_Feat.IsSetExcept_text() && NStr::FindNoCase(m_Feat.GetExcept_text(), "mismatches in translation") != NPOS) {
504 mismatch_except = true;
505 }
506
507 if (!problems.HasException() && !mismatch_except) {
508 x_ReportTranslationMismatches(problems.GetTranslationMismatches());
509 }
510
511 if (problems.GetTranslTerminalX() != problems.GetProdTerminalX()) {
512 PostErr(eDiag_Warning, eErr_SEQ_FEAT_TerminalXDiscrepancy,
513 "Terminal X count for CDS translation (" + NStr::SizetToString(problems.GetTranslTerminalX())
514 + ") and protein product sequence (" + NStr::SizetToString(problems.GetProdTerminalX())
515 + ") are not equal");
516 }
517
518 if (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_ShouldBePartialButIsnt) {
519 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
520 "End of location should probably be partial");
521 }
522 if (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_ShouldNotBePartialButIs) {
523 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
524 "This SeqFeat should not be partial");
525 }
526
527 if (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_UnnecessaryException) {
528 PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnnecessaryException,
529 "CDS has exception but passes translation test");
530 }
531
532 if (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_ErroneousException) {
533 PostErr(eDiag_Warning, eErr_SEQ_FEAT_ErroneousException,
534 "CDS has unclassified exception but only difference is "
535 + NStr::SizetToString(problems.GetTranslationMismatches().size()) + " mismatches out of "
536 + NStr::SizetToString(problems.GetProtLen()) + " residues");
537 }
538
539 if (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_UnqualifiedException) {
540 PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnnecessaryException,
541 "CDS has unnecessary translated product replaced exception");
542 }
543
544 }
545
546
MapToNTCoords(TSeqPos pos)547 string CCdregionValidator::MapToNTCoords(TSeqPos pos)
548 {
549 string result;
550
551 CSeq_point pnt;
552 pnt.SetPoint(pos);
553 pnt.SetStrand( GetStrand(m_Feat.GetProduct(), &m_Scope) );
554
555 try {
556 pnt.SetId().Assign(GetId(m_Feat.GetProduct(), &m_Scope));
557 } catch (CObjmgrUtilException) {}
558
559 CSeq_loc tmp;
560 tmp.SetPnt(pnt);
561 CRef<CSeq_loc> loc = ProductToSource(m_Feat, tmp, 0, &m_Scope);
562
563 result = GetValidatorLocationLabel(*loc, m_Scope);
564
565 return result;
566 }
567
568
x_ReportTranslationMismatches(const CCDSTranslationProblems::TTranslationMismatches & mismatches)569 void CCdregionValidator::x_ReportTranslationMismatches(const CCDSTranslationProblems::TTranslationMismatches& mismatches)
570 {
571 string nuclocstr;
572
573 size_t num_mismatches = mismatches.size();
574
575 if (num_mismatches > 10) {
576 // report total number of mismatches and the details of the
577 // first and last.
578 nuclocstr = MapToNTCoords(mismatches.front().pos);
579 string msg =
580 NStr::SizetToString(mismatches.size()) + " mismatches found. " +
581 "First mismatch at " + NStr::IntToString(mismatches.front().pos + 1) +
582 ", residue in protein [";
583 msg += mismatches.front().prot_res;
584 msg += "] != translation [";
585 msg += mismatches.front().transl_res;
586 msg += "]";
587 if (!nuclocstr.empty()) {
588 msg += " at " + nuclocstr;
589 }
590 nuclocstr = MapToNTCoords(mismatches.back().pos);
591 msg +=
592 ". Last mismatch at " + NStr::IntToString(mismatches.back().pos + 1) +
593 ", residue in protein [";
594 msg += mismatches.back().prot_res;
595 msg += "] != translation [";
596 msg += mismatches.back().transl_res;
597 msg += "]";
598 if (!nuclocstr.empty()) {
599 msg += " at " + nuclocstr;
600 }
601 int gc = 0;
602 if (m_Feat.GetData().IsCdregion() && m_Feat.GetData().GetCdregion().IsSetCode()) {
603 // We assume that the id is set for all Genetic_code
604 gc = m_Feat.GetData().GetCdregion().GetCode().GetId();
605 }
606 string gccode = NStr::IntToString(gc);
607
608 msg += ". Genetic code [" + gccode + "]";
609 PostErr(eDiag_Error, eErr_SEQ_FEAT_MisMatchAA, msg);
610 } else {
611 // report individual mismatches
612 for (size_t i = 0; i < mismatches.size(); ++i) {
613 nuclocstr = MapToNTCoords(mismatches[i].pos);
614 if (mismatches[i].pos == 0 && mismatches[i].transl_res == '-') {
615 // skip - dash is expected to differ
616 num_mismatches--;
617 } else {
618 EDiagSev sev = eDiag_Error;
619 if (mismatches[i].prot_res == 'X' &&
620 (mismatches[i].transl_res == 'B' || mismatches[i].transl_res == 'Z' || mismatches[i].transl_res == 'J')) {
621 sev = eDiag_Warning;
622 }
623 string msg;
624 if (m_ProductIsFar) {
625 msg += "(far) ";
626 }
627 msg += "Residue " + NStr::IntToString(mismatches[i].pos + 1) +
628 " in protein [";
629 msg += mismatches[i].prot_res;
630 msg += "] != translation [";
631 msg += mismatches[i].transl_res;
632 msg += "]";
633 if (!nuclocstr.empty()) {
634 msg += " at " + nuclocstr;
635 }
636 PostErr(sev, eErr_SEQ_FEAT_MisMatchAA, msg);
637 }
638 }
639 }
640 }
641
642
x_ReportTranslExceptProblems(const CCDSTranslationProblems::TTranslExceptProblems & problems,bool has_exception)643 void CCdregionValidator::x_ReportTranslExceptProblems(const CCDSTranslationProblems::TTranslExceptProblems& problems, bool has_exception)
644 {
645 for (auto it = problems.begin(); it != problems.end(); it++) {
646 string msg;
647 switch (it->problem) {
648 case CCDSTranslationProblems::eTranslExceptPhase:
649 if (!has_exception) {
650 PostErr(eDiag_Warning, eErr_SEQ_FEAT_TranslExceptPhase,
651 "transl_except qual out of frame.");
652 }
653 break;
654 case CCDSTranslationProblems::eTranslExceptSuspicious:
655 msg = "Suspicious transl_except ";
656 msg += it->ex;
657 msg += " at first codon of complete CDS";
658 PostErr(eDiag_Warning, eErr_SEQ_FEAT_TranslExcept, msg);
659 break;
660 case CCDSTranslationProblems::eTranslExceptUnnecessary:
661 msg = "Unnecessary transl_except ";
662 msg += it->ex;
663 msg += " at position ";
664 msg += NStr::SizetToString(it->prot_pos + 1);
665 PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnnecessaryTranslExcept,
666 msg);
667 break;
668 case CCDSTranslationProblems::eTranslExceptUnexpected:
669 msg = "Unexpected transl_except ";
670 msg += it->ex;
671 msg += +" at position " + NStr::SizetToString(it->prot_pos + 1)
672 + " just past end of protein";
673
674 PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnnecessaryTranslExcept,
675 msg);
676 break;
677 }
678 }
679 }
680
681
x_ValidateCodebreak()682 void CCdregionValidator::x_ValidateCodebreak()
683 {
684 const CCdregion& cds = m_Feat.GetData().GetCdregion();
685 const CSeq_loc& feat_loc = m_Feat.GetLocation();
686 const CCode_break* prev_cbr = 0;
687
688 FOR_EACH_CODEBREAK_ON_CDREGION (it, cds) {
689 const CCode_break& cbr = **it;
690 const CSeq_loc& cbr_loc = cbr.GetLoc();
691 ECompare comp = Compare(cbr_loc, feat_loc, &m_Scope, fCompareOverlapping);
692 if ( ((comp != eContained) && (comp != eSame)) || cbr_loc.IsNull() || cbr_loc.IsEmpty()) {
693 PostErr (eDiag_Critical, eErr_SEQ_FEAT_CDSrange,
694 "Code-break location not in coding region");
695 } else if (m_Feat.IsSetProduct()) {
696 if (cbr_loc.GetStop(eExtreme_Biological) == feat_loc.GetStop(eExtreme_Biological)) {
697 // terminal exception - don't bother checking, can't be mapped
698 } else {
699 if (SeqLocCheck(cbr_loc, &m_Scope) == eSeqLocCheck_error) {
700 string lbl = GetValidatorLocationLabel(cbr_loc, m_Scope);
701 PostErr(eDiag_Critical, eErr_SEQ_FEAT_CDSrange,
702 "Code-break: SeqLoc [" + lbl + "] out of range");
703 } else {
704 int frame = 0;
705 CRef<CSeq_loc> p_loc = SourceToProduct(m_Feat, cbr_loc, fS2P_AllowTer, &m_Scope, &frame);
706 if (!p_loc || p_loc->IsNull() || frame != 1) {
707 PostErr(eDiag_Critical, eErr_SEQ_FEAT_CDSrange,
708 "Code-break location not in coding region - may be frame problem");
709 }
710 }
711 }
712 }
713 if (cbr_loc.IsPartialStart(eExtreme_Biological) ||
714 cbr_loc.IsPartialStop(eExtreme_Biological)) {
715 PostErr(eDiag_Error, eErr_SEQ_FEAT_TranslExceptIsPartial,
716 "Translation exception locations should not be partial");
717 }
718 if ( prev_cbr != 0 ) {
719 if ( Compare(cbr_loc, prev_cbr->GetLoc(), &m_Scope, fCompareOverlapping) == eSame ) {
720 string msg = "Multiple code-breaks at same location ";
721 string str = GetValidatorLocationLabel (cbr_loc, m_Scope);
722 if ( !str.empty() ) {
723 msg += "[" + str + "]";
724 }
725 PostErr(eDiag_Error, eErr_SEQ_FEAT_DuplicateTranslExcept,
726 msg);
727 }
728 }
729 prev_cbr = &cbr;
730 }
731 }
732
733
Validate()734 void CCdregionValidator::Validate()
735 {
736 CSingleFeatValidator::Validate();
737
738 bool feat_is_pseudo = s_IsPseudo(m_Feat);
739 bool pseudo = feat_is_pseudo || m_GeneIsPseudo;
740
741 x_ValidateQuals();
742 x_ValidateGeneticCode();
743
744 const CCdregion& cdregion = m_Feat.GetData().GetCdregion();
745 if (cdregion.IsSetOrf() && cdregion.GetOrf() &&
746 m_Feat.IsSetProduct()) {
747 PostErr(eDiag_Warning, eErr_SEQ_FEAT_OrfCdsHasProduct,
748 "An ORF coding region should not have a product");
749 }
750
751 if (pseudo) {
752 if (m_Feat.IsSetProduct()) {
753 if (feat_is_pseudo) {
754 PostErr(eDiag_Error, eErr_SEQ_FEAT_PseudoCdsHasProduct,
755 "A pseudo coding region should not have a product");
756 } else if (m_GeneIsPseudo) {
757 PostErr(eDiag_Error, eErr_SEQ_FEAT_PseudoCdsViaGeneHasProduct,
758 "A coding region overlapped by a pseudogene should not have a product");
759 } else {
760 PostErr(eDiag_Error, eErr_SEQ_FEAT_PseudoCdsHasProduct,
761 "A pseudo coding region should not have a product");
762 }
763 }
764 } else {
765 ReportShortIntrons();
766 x_ValidateProductId();
767 x_ValidateCommonProduct();
768 }
769
770 x_ValidateBadMRNAOverlap();
771 x_ValidateFarProducts();
772 x_ValidateCDSPeptides();
773 x_ValidateCDSPartial();
774
775 if (x_IsProductMisplaced()) {
776 if (m_Imp.IsSmallGenomeSet()) {
777 PostErr(eDiag_Warning, eErr_SEQ_FEAT_CDSproductPackagingProblem,
778 "Protein product not packaged in nuc-prot set with nucleotide in small genome set");
779 } else {
780 PostErr(eDiag_Error, eErr_SEQ_FEAT_CDSproductPackagingProblem,
781 "Protein product not packaged in nuc-prot set with nucleotide");
782 }
783 }
784
785 bool conflict = cdregion.IsSetConflict() && cdregion.GetConflict();
786 if ( !pseudo && !conflict ) {
787 x_ValidateTrans();
788 ValidateSplice(false, false);
789 }
790
791 if (conflict) {
792 x_ValidateConflict();
793 }
794
795 x_ReportPseudogeneConflict(m_Gene);
796 x_ValidateLocusTagGeneralMatch(m_Gene);
797
798 x_ValidateProductPartials();
799 x_ValidateParentPartialness();
800 }
801
802
x_ValidateQuals()803 void CCdregionValidator::x_ValidateQuals()
804 {
805
806 FOR_EACH_GBQUAL_ON_FEATURE(it, m_Feat) {
807 const CGb_qual& qual = **it;
808 if (qual.CanGetQual()) {
809 const string& key = qual.GetQual();
810 if (NStr::EqualNocase(key, "exception")) {
811 if (!m_Feat.IsSetExcept()) {
812 PostErr(eDiag_Warning, eErr_SEQ_FEAT_MissingExceptionFlag,
813 "Exception flag should be set in coding region");
814 }
815 } else if (NStr::EqualNocase(key, "codon")) {
816 PostErr(eDiag_Error, eErr_SEQ_FEAT_CodonQualifierUsed,
817 "Use the proper genetic code, if available, "
818 "or set transl_excepts on specific codons");
819 } else if (NStr::EqualNocase(key, "protein_id")) {
820 PostErr(eDiag_Warning, eErr_SEQ_FEAT_WrongQualOnFeature,
821 "protein_id should not be a gbqual on a CDS feature");
822 } else if (NStr::EqualNocase(key, "gene_synonym")) {
823 PostErr(eDiag_Warning, eErr_SEQ_FEAT_WrongQualOnCDS,
824 "gene_synonym should not be a gbqual on a CDS feature");
825 } else if (NStr::EqualNocase(key, "transcript_id")) {
826 PostErr(eDiag_Warning, eErr_SEQ_FEAT_WrongQualOnFeature,
827 "transcript_id should not be a gbqual on a CDS feature");
828 } else if (NStr::EqualNocase(key, "codon_start")) {
829 const CCdregion& cdregion = m_Feat.GetData().GetCdregion();
830 if (cdregion.IsSetFrame() && cdregion.GetFrame() != CCdregion::eFrame_not_set) {
831 PostErr(eDiag_Warning, eErr_SEQ_FEAT_WrongQualOnFeature,
832 "conflicting codon_start values");
833 } else {
834 PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidCodonStart,
835 "codon_start value should be 1, 2, or 3");
836 }
837 }
838 }
839 }
840 }
841
842
x_ReportOrigProteinId()843 bool CCdregionValidator::x_ReportOrigProteinId()
844 {
845 if (!m_GeneIsPseudo && !s_IsPseudo(m_Feat)) {
846 return true;
847 } else {
848 return false;
849 }
850 }
851
852
853 const string s_PlastidTxt[] = {
854 "",
855 "",
856 "chloroplast",
857 "chromoplast",
858 "",
859 "",
860 "plastid",
861 "",
862 "",
863 "",
864 "",
865 "",
866 "cyanelle",
867 "",
868 "",
869 "",
870 "apicoplast",
871 "leucoplast",
872 "proplastid",
873 ""
874 };
875
876
IsPlastid(int genome)877 bool CCdregionValidator::IsPlastid(int genome)
878 {
879 if ( genome == CBioSource::eGenome_chloroplast ||
880 genome == CBioSource::eGenome_chromoplast ||
881 genome == CBioSource::eGenome_plastid ||
882 genome == CBioSource::eGenome_cyanelle ||
883 genome == CBioSource::eGenome_apicoplast ||
884 genome == CBioSource::eGenome_leucoplast ||
885 genome == CBioSource::eGenome_proplastid ||
886 genome == CBioSource::eGenome_chromatophore ) {
887 return true;
888 }
889
890 return false;
891 }
892
893
IsGeneticCodeValid(int gcode)894 static bool IsGeneticCodeValid(int gcode)
895 {
896 bool ret = false;
897 if (gcode > 0) {
898
899 try {
900 const CTrans_table& tbl = CGen_code_table::GetTransTable(gcode);
901 (void)tbl; // suppress unused-variable warning
902 ret = true;
903 }
904 catch (CException&) {
905 }
906 }
907
908 return ret;
909 }
910
911
s_GetStrictGenCode(const CBioSource & src)912 static int s_GetStrictGenCode(const CBioSource& src)
913 {
914 int gencode = 0;
915
916 try {
917 CBioSource::TGenome genome = src.IsSetGenome() ? src.GetGenome() : CBioSource::eGenome_unknown;
918
919 if ( src.IsSetOrg() && src.GetOrg().IsSetOrgname() ) {
920 const COrgName& orn = src.GetOrg().GetOrgname();
921
922 switch ( genome ) {
923 case CBioSource::eGenome_kinetoplast:
924 case CBioSource::eGenome_mitochondrion:
925 case CBioSource::eGenome_hydrogenosome:
926 // bacteria and plant organelle code
927 if (orn.IsSetMgcode()) {
928 gencode = orn.GetMgcode();
929 }
930 break;
931 case CBioSource::eGenome_chloroplast:
932 case CBioSource::eGenome_chromoplast:
933 case CBioSource::eGenome_plastid:
934 case CBioSource::eGenome_cyanelle:
935 case CBioSource::eGenome_apicoplast:
936 case CBioSource::eGenome_leucoplast:
937 case CBioSource::eGenome_proplastid:
938 if (orn.IsSetPgcode() && orn.GetPgcode() != 0) {
939 gencode = orn.GetPgcode();
940 } else {
941 // bacteria and plant plastids are code 11.
942 gencode = 11;
943 }
944 break;
945 default:
946 if (orn.IsSetGcode()) {
947 gencode = orn.GetGcode();
948 }
949 break;
950 }
951 }
952 } catch (CException ) {
953 } catch (std::exception ) {
954 }
955 return gencode;
956 }
957
958
x_ValidateGeneticCode()959 void CCdregionValidator::x_ValidateGeneticCode()
960 {
961 if (!m_LocationBioseq) {
962 return;
963 }
964 int cdsgencode = 0;
965
966 const CCdregion& cdregion = m_Feat.GetData().GetCdregion();
967
968 if (cdregion.CanGetCode()) {
969 cdsgencode = cdregion.GetCode().GetId();
970
971 if (!IsGeneticCodeValid(cdsgencode)) {
972 PostErr(eDiag_Error, eErr_SEQ_FEAT_GenCodeInvalid,
973 "A coding region contains invalid genetic code [" + NStr::IntToString(cdsgencode) + "]");
974 }
975 }
976
977 CSeqdesc_CI diter(m_LocationBioseq, CSeqdesc::e_Source);
978 if (diter) {
979 const CBioSource& src = diter->GetSource();
980 int biopgencode = s_GetStrictGenCode(src);
981
982 if (biopgencode != cdsgencode
983 && (!m_Feat.IsSetExcept()
984 || !m_Feat.IsSetExcept_text()
985 || NStr::Find(m_Feat.GetExcept_text(), "genetic code exception") == string::npos)) {
986 int genome = 0;
987
988 if (src.CanGetGenome()) {
989 genome = src.GetGenome();
990 }
991
992 if (IsPlastid(genome)) {
993 PostErr(eDiag_Warning, eErr_SEQ_FEAT_GenCodeMismatch,
994 "Genetic code conflict between CDS (code " +
995 NStr::IntToString(cdsgencode) +
996 ") and BioSource.genome biological context (" +
997 s_PlastidTxt[genome] + ") (uses code 11)");
998 } else {
999 PostErr(eDiag_Warning, eErr_SEQ_FEAT_GenCodeMismatch,
1000 "Genetic code conflict between CDS (code " +
1001 NStr::IntToString(cdsgencode) +
1002 ") and BioSource (code " +
1003 NStr::IntToString(biopgencode) + ")");
1004 }
1005 }
1006 }
1007 }
1008
1009
x_ValidateSeqFeatLoc()1010 void CCdregionValidator::x_ValidateSeqFeatLoc()
1011 {
1012 CSingleFeatValidator::x_ValidateSeqFeatLoc();
1013
1014 // for coding regions, internal exons should not be 15 or less bp long
1015 int num_short_exons = 0;
1016 string message;
1017 CSeq_loc_CI it(m_Feat.GetLocation());
1018 if (it) {
1019 // note - do not want to warn for first or last exon
1020 ++it;
1021 size_t prev_len = 16;
1022 size_t prev_start = 0;
1023 size_t prev_stop = 0;
1024 while (it) {
1025 if (prev_len <= 15) {
1026 num_short_exons++;
1027 if (!message.empty()) {
1028 message += ", ";
1029 }
1030 message += NStr::NumericToString(prev_start + 1)
1031 + "-" + NStr::NumericToString(prev_stop + 1);
1032 }
1033 prev_len = it.GetRange().GetLength();
1034 prev_start = it.GetRange().GetFrom();
1035 prev_stop = it.GetRange().GetTo();
1036 ++it;
1037 }
1038 }
1039 if (num_short_exons > 1) {
1040 PostErr(eDiag_Warning, eErr_SEQ_FEAT_ShortExon,
1041 "Coding region has multiple internal exons that are too short at positions " + message);
1042 } else if (num_short_exons == 1) {
1043 PostErr(eDiag_Warning, eErr_SEQ_FEAT_ShortExon,
1044 "Internal coding region exon is too short at position " + message);
1045 }
1046 }
1047
1048
x_ValidateBadMRNAOverlap()1049 void CCdregionValidator::x_ValidateBadMRNAOverlap()
1050 {
1051 if (x_HasGoodParent()) {
1052 return;
1053 }
1054
1055 const CSeq_loc& loc = m_Feat.GetLocation();
1056
1057 CConstRef<CSeq_feat> mrna = GetBestOverlappingFeat(
1058 loc,
1059 CSeqFeatData::eSubtype_mRNA,
1060 eOverlap_Simple,
1061 m_Scope);
1062 if (!mrna) {
1063 return;
1064 }
1065
1066 mrna = GetBestOverlappingFeat(
1067 loc,
1068 CSeqFeatData::eSubtype_mRNA,
1069 eOverlap_CheckIntRev,
1070 m_Scope);
1071 if (mrna) {
1072 return;
1073 }
1074
1075 mrna = GetBestOverlappingFeat(
1076 loc,
1077 CSeqFeatData::eSubtype_mRNA,
1078 eOverlap_Interval,
1079 m_Scope);
1080 if (!mrna) {
1081 return;
1082 }
1083
1084 bool pseudo = s_IsPseudo(m_Feat) || m_GeneIsPseudo;
1085
1086 EErrType err_type = eErr_SEQ_FEAT_CDSmRNArange;
1087 if (pseudo) {
1088 err_type = eErr_SEQ_FEAT_PseudoCDSmRNArange;
1089 }
1090
1091 mrna = GetBestOverlappingFeat(
1092 loc,
1093 CSeqFeatData::eSubtype_mRNA,
1094 eOverlap_SubsetRev,
1095 m_Scope);
1096
1097 EDiagSev sev = eDiag_Warning;
1098 if (pseudo) {
1099 sev = eDiag_Info;
1100 }
1101 if (mrna) {
1102 // ribosomal slippage exception suppresses CDSmRNArange warning
1103 bool supress = false;
1104
1105 if (m_Feat.CanGetExcept_text()) {
1106 const CSeq_feat::TExcept_text& text = m_Feat.GetExcept_text();
1107 if (NStr::FindNoCase(text, "ribosomal slippage") != NPOS
1108 || NStr::FindNoCase(text, "trans-splicing") != NPOS) {
1109 supress = true;
1110 }
1111 }
1112 if (!supress) {
1113 PostErr(sev, err_type,
1114 "mRNA contains CDS but internal intron-exon boundaries "
1115 "do not match");
1116 }
1117 } else {
1118 PostErr(sev, err_type,
1119 "mRNA overlaps or contains CDS but does not completely "
1120 "contain intervals");
1121 }
1122 }
1123
1124
x_HasGoodParent()1125 bool CCdregionValidator::x_HasGoodParent()
1126 {
1127 static const CSeqFeatData::ESubtype parent_types[] = {
1128 CSeqFeatData::eSubtype_C_region,
1129 CSeqFeatData::eSubtype_D_segment,
1130 CSeqFeatData::eSubtype_J_segment,
1131 CSeqFeatData::eSubtype_V_segment
1132 };
1133 size_t num_parent_types = sizeof(parent_types) / sizeof(CSeqFeatData::ESubtype);
1134 CRef<feature::CFeatTree> feat_tree = m_Imp.GetGeneCache().GetFeatTreeFromCache(m_Feat, m_Scope);
1135 if (!feat_tree) {
1136 return false;
1137 }
1138 CSeq_feat_Handle fh;
1139 try {
1140 // will fail if location is bad
1141 fh = m_Scope.GetSeq_featHandle(m_Feat);
1142 } catch (CException&) {
1143 return false;
1144 }
1145
1146 for (size_t i = 0; i < num_parent_types; i++) {
1147 CMappedFeat parent = feat_tree->GetParent(fh, parent_types[i]);
1148 if (parent) {
1149 sequence::ECompare cmp = sequence::Compare(m_Feat.GetLocation(),
1150 parent.GetLocation(),
1151 &m_Scope,
1152 sequence::fCompareOverlapping);
1153 if (cmp == sequence::eContained || cmp == sequence::eSame) {
1154 return true;
1155 }
1156 }
1157 }
1158 return false;
1159 }
1160
1161
1162 // VR-619
1163 // for an mRNA / CDS pair where both have far products
1164 // (which is only true for genomic RefSeqs with instantiated mRNA products),
1165 // please check that the pair found by CFeatTree corresponds to the nuc-prot pair in ID
1166 // (i.e.the CDS product is annotated on the mRNA product).
x_ValidateFarProducts()1167 void CCdregionValidator::x_ValidateFarProducts()
1168 {
1169 // if coding region doesn't have a far product, nothing to check
1170 if (!m_ProductIsFar) {
1171 return;
1172 }
1173 // no point if not far-fetching
1174 if (!m_Imp.IsRemoteFetch()) {
1175 return;
1176 }
1177 if (!m_Feat.GetData().IsCdregion() || !m_Feat.IsSetProduct()) {
1178 return;
1179 }
1180 if (!m_Imp.IsRefSeq()) {
1181 return;
1182 }
1183 const CSeq_id * cds_sid = m_Feat.GetProduct().GetId();
1184 if (!cds_sid) {
1185 return;
1186 }
1187 CRef<feature::CFeatTree> feat_tree = m_Imp.GetGeneCache().GetFeatTreeFromCache(m_Feat, m_Scope);
1188 if (!feat_tree) {
1189 return;
1190 }
1191 CSeq_feat_Handle fh = m_Scope.GetSeq_featHandle(m_Feat);
1192 if (!fh) {
1193 return;
1194 }
1195 CMappedFeat mrna = feat_tree->GetParent(fh, CSeqFeatData::eSubtype_mRNA);
1196 if (!mrna || !mrna.IsSetProduct()) {
1197 // no mRNA or no mRNA product
1198 return;
1199 }
1200 const CSeq_id * mrna_sid = mrna.GetProduct().GetId();
1201 if (!mrna_sid) {
1202 return;
1203 }
1204 CBioseq_Handle mrna_prod = m_Scope.GetBioseqHandleFromTSE(*mrna_sid, m_LocationBioseq.GetTSE_Handle());
1205 if (mrna_prod) {
1206 // mRNA product is not far
1207 return;
1208 }
1209 mrna_prod = m_Scope.GetBioseqHandle(*mrna_sid);
1210 if (!mrna_prod) {
1211 // can't be fetched, will be reported elsewhere
1212 return;
1213 }
1214 CSeq_entry_Handle far_mrna_nps =
1215 mrna_prod.GetExactComplexityLevel(CBioseq_set::eClass_nuc_prot);
1216 if (!far_mrna_nps) {
1217 PostErr(eDiag_Error, eErr_SEQ_FEAT_CDSmRNAmismatch, "no Far mRNA nuc-prot-set");
1218 } else {
1219 CBioseq_Handle cds_prod = m_Scope.GetBioseqHandleFromTSE(*cds_sid, far_mrna_nps);
1220 if (!cds_prod) {
1221 PostErr(eDiag_Error, eErr_SEQ_FEAT_CDSmRNAmismatch, "Far CDS product and far mRNA product are not packaged together");
1222 m_Imp.PostErr(eDiag_Error, eErr_SEQ_FEAT_CDSmRNAmismatch, "Far CDS product and far mRNA product are not packaged together", *(mrna.GetSeq_feat()));
1223 }
1224 }
1225 }
1226
1227
x_ValidateCDSPeptides()1228 void CCdregionValidator::x_ValidateCDSPeptides()
1229 {
1230 try {
1231 if (!m_Feat.GetData().IsCdregion() || !m_Feat.CanGetProduct()) {
1232 return;
1233 }
1234
1235 CBioseq_Handle prot = m_Scope.GetBioseqHandle(m_Feat.GetProduct());
1236 if (!prot) {
1237 return;
1238 }
1239 CBioseq_Handle nuc = m_Scope.GetBioseqHandle(m_Feat.GetLocation());
1240 if (!nuc) {
1241 return;
1242 }
1243 // check for self-referential CDS feature
1244 if (nuc == prot) {
1245 return;
1246 }
1247
1248 const CGene_ref* cds_ref = 0;
1249
1250 // map from cds product to nucleotide
1251 const string prev = GetDiagFilter(eDiagFilter_Post);
1252 SetDiagFilter(eDiagFilter_All, "!(1305.28,31)");
1253 CSeq_loc_Mapper prot_to_cds(m_Feat, CSeq_loc_Mapper::eProductToLocation, &m_Scope);
1254 SetDiagFilter(eDiagFilter_All, prev.c_str());
1255
1256 for (CFeat_CI it(prot, CSeqFeatData::e_Prot); it; ++it) {
1257 CSeq_feat_Handle curr = it->GetSeq_feat_Handle();
1258 CSeqFeatData::ESubtype subtype = curr.GetFeatSubtype();
1259
1260 if (subtype != CSeqFeatData::eSubtype_preprotein &&
1261 subtype != CSeqFeatData::eSubtype_mat_peptide_aa &&
1262 subtype != CSeqFeatData::eSubtype_sig_peptide_aa &&
1263 subtype != CSeqFeatData::eSubtype_transit_peptide_aa &&
1264 subtype != CSeqFeatData::eSubtype_propeptide_aa) {
1265 continue;
1266 }
1267
1268 // see if already has gene xref
1269 if (curr.GetGeneXref()) {
1270 continue;
1271 }
1272
1273 if (! cds_ref) {
1274 // wait until first mat_peptide found to avoid expensive computation on CDS /gene qualifier
1275 CConstRef<CSeq_feat> cgene = sequence::GetBestOverlappingFeat (m_Feat.GetLocation(), CSeqFeatData::eSubtype_gene, sequence::eOverlap_SubsetRev, m_Scope);
1276 if (cgene && cgene->CanGetData() && cgene->GetData().IsGene()) {
1277 const CGene_ref& cgref = cgene->GetData().GetGene();
1278 cds_ref = &cgref;
1279 } else {
1280 // if CDS does not have overlapping gene, bail out of function
1281 return;
1282 }
1283 }
1284
1285 const CSeq_loc& loc = curr.GetLocation();
1286 // map prot location to nuc location
1287 CRef<CSeq_loc> nloc(prot_to_cds.Map(loc));
1288 if (! nloc) {
1289 continue;
1290 }
1291
1292 const CGene_ref* pep_ref = 0;
1293 CConstRef<CSeq_feat> pgene = sequence::GetBestOverlappingFeat (*nloc, CSeqFeatData::eSubtype_gene, sequence::eOverlap_SubsetRev, m_Scope);
1294 if (pgene && pgene->CanGetData() && pgene->GetData().IsGene()) {
1295 const CGene_ref& pgref = pgene->GetData().GetGene();
1296 pep_ref = &pgref;
1297 }
1298
1299 if (! cds_ref || ! pep_ref) {
1300 continue;
1301 }
1302 if (cds_ref->IsSetLocus_tag() && pep_ref->IsSetLocus_tag()) {
1303 if (cds_ref->GetLocus_tag() == pep_ref->GetLocus_tag()) {
1304 continue;
1305 }
1306 } else if (cds_ref->IsSetLocus() && pep_ref->IsSetLocus()) {
1307 if (cds_ref->GetLocus() == pep_ref->GetLocus()) {
1308 continue;
1309 }
1310 }
1311
1312 if (pgene) {
1313
1314 const CSeq_loc& gloc = pgene->GetLocation();
1315
1316 if (sequence::Compare(*nloc, gloc, nullptr /* scope */, sequence::fCompareOverlapping) == sequence::eSame) {
1317
1318 PostErr(eDiag_Warning, eErr_SEQ_FEAT_GeneOnNucPositionOfPeptide,
1319 "Peptide under CDS matches small Gene");
1320 }
1321 }
1322 }
1323 } catch (CException) {
1324 }
1325 }
1326
1327
x_ValidateCDSPartial()1328 void CCdregionValidator::x_ValidateCDSPartial()
1329 {
1330 if (!m_ProductBioseq || x_BypassCDSPartialTest()) {
1331 return;
1332 }
1333
1334 CSeqdesc_CI sd(m_ProductBioseq, CSeqdesc::e_Molinfo);
1335 if (!sd) {
1336 return;
1337 }
1338 const CMolInfo& molinfo = sd->GetMolinfo();
1339
1340 const CSeq_loc& loc = m_Feat.GetLocation();
1341 bool partial5 = loc.IsPartialStart(eExtreme_Biological);
1342 bool partial3 = loc.IsPartialStop(eExtreme_Biological);
1343
1344 if (molinfo.CanGetCompleteness()) {
1345 switch (molinfo.GetCompleteness()) {
1346 case CMolInfo::eCompleteness_unknown:
1347 break;
1348
1349 case CMolInfo::eCompleteness_complete:
1350 if (partial5 || partial3) {
1351 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
1352 "CDS is partial but protein is complete");
1353 }
1354 break;
1355
1356 case CMolInfo::eCompleteness_partial:
1357 break;
1358
1359 case CMolInfo::eCompleteness_no_left:
1360 if (!partial5) {
1361 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
1362 "CDS is 5' complete but protein is NH2 partial");
1363 }
1364 if (partial3) {
1365 EDiagSev sev = eDiag_Error;
1366 if (x_CDS3primePartialTest())
1367 {
1368 sev = eDiag_Warning;
1369 }
1370 PostErr(sev, eErr_SEQ_FEAT_PartialProblem,
1371 "CDS is 3' partial but protein is NH2 partial");
1372 }
1373 break;
1374
1375 case CMolInfo::eCompleteness_no_right:
1376 if (!partial3) {
1377 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
1378 "CDS is 3' complete but protein is CO2 partial");
1379 }
1380 if (partial5) {
1381 EDiagSev sev = eDiag_Error;
1382 if (x_CDS5primePartialTest())
1383 {
1384 sev = eDiag_Warning;
1385 }
1386 PostErr(sev, eErr_SEQ_FEAT_PartialProblem,
1387 "CDS is 5' partial but protein is CO2 partial");
1388 }
1389 break;
1390
1391 case CMolInfo::eCompleteness_no_ends:
1392 if (partial5 && partial3) {
1393 } else if (partial5) {
1394 EDiagSev sev = eDiag_Error;
1395 if (x_CDS5primePartialTest())
1396 {
1397 sev = eDiag_Warning;
1398 }
1399 PostErr(sev, eErr_SEQ_FEAT_PartialProblem,
1400 "CDS is 5' partial but protein has neither end");
1401 } else if (partial3) {
1402 EDiagSev sev = eDiag_Error;
1403 if (x_CDS3primePartialTest()) {
1404 sev = eDiag_Warning;
1405 }
1406
1407 PostErr(sev, eErr_SEQ_FEAT_PartialProblem,
1408 "CDS is 3' partial but protein has neither end");
1409 } else {
1410 PostErr(eDiag_Error, eErr_SEQ_FEAT_PartialProblem,
1411 "CDS is complete but protein has neither end");
1412 }
1413 break;
1414
1415 case CMolInfo::eCompleteness_has_left:
1416 break;
1417
1418 case CMolInfo::eCompleteness_has_right:
1419 break;
1420
1421 case CMolInfo::eCompleteness_other:
1422 break;
1423
1424 default:
1425 break;
1426 }
1427 }
1428 }
1429
1430
1431 static const char* const sc_BypassCdsPartialCheckText[] = {
1432 "RNA editing",
1433 "annotated by transcript or proteomic data",
1434 "artificial frameshift",
1435 "mismatches in translation",
1436 "rearrangement required for product",
1437 "reasons given in citation",
1438 "translated product replaced",
1439 "unclassified translation discrepancy"
1440 };
1441 typedef CStaticArraySet<const char*, PCase_CStr> TBypassCdsPartialCheckSet;
1442 DEFINE_STATIC_ARRAY_MAP(TBypassCdsPartialCheckSet, sc_BypassCdsPartialCheck, sc_BypassCdsPartialCheckText);
1443
x_BypassCDSPartialTest() const1444 bool CCdregionValidator::x_BypassCDSPartialTest() const
1445 {
1446 if (m_Feat.CanGetExcept() && m_Feat.GetExcept() && m_Feat.CanGetExcept_text()) {
1447 const string& except_text = m_Feat.GetExcept_text();
1448 ITERATE(TBypassCdsPartialCheckSet, it, sc_BypassCdsPartialCheck) {
1449 if (NStr::FindNoCase(except_text, *it) != NPOS) {
1450 return true; // biological exception
1451 }
1452 }
1453 }
1454 return false;
1455 }
1456
1457
x_CDS3primePartialTest() const1458 bool CCdregionValidator::x_CDS3primePartialTest() const
1459 {
1460 CSeq_loc_CI last;
1461 for (CSeq_loc_CI sl_iter(m_Feat.GetLocation()); sl_iter; ++sl_iter) {
1462 last = sl_iter;
1463 }
1464
1465 if (last) {
1466 if (last.GetStrand() == eNa_strand_minus) {
1467 if (last.GetRange().GetFrom() == 0) {
1468 return true;
1469 }
1470 } else {
1471 if (!m_LocationBioseq) {
1472 return false;
1473 }
1474 if (last.GetRange().GetTo() == m_LocationBioseq.GetInst_Length() - 1) {
1475 return true;
1476 }
1477 }
1478 }
1479 return false;
1480 }
1481
1482
x_CDS5primePartialTest() const1483 bool CCdregionValidator::x_CDS5primePartialTest() const
1484 {
1485 CSeq_loc_CI first(m_Feat.GetLocation());
1486
1487 if (first) {
1488 if (first.GetStrand() == eNa_strand_minus) {
1489 if (!m_LocationBioseq) {
1490 return false;
1491 }
1492 if (first.GetRange().GetTo() == m_LocationBioseq.GetInst_Length() - 1) {
1493 return true;
1494 }
1495 } else {
1496 if (first.GetRange().GetFrom() == 0) {
1497 return true;
1498 }
1499 }
1500 }
1501 return false;
1502 }
1503
1504
x_IsProductMisplaced() const1505 bool CCdregionValidator::x_IsProductMisplaced() const
1506 {
1507 // don't calculate if no product or if ORF flag is set
1508 if (!m_Feat.IsSetProduct() ||
1509 m_Feat.GetData().GetCdregion().IsSetOrf()) {
1510 return false;
1511 }
1512 // don't calculate if feature is pseudo
1513 if (s_IsPseudo(m_Feat) || m_GeneIsPseudo) {
1514 return false;
1515 }
1516 if (!m_ProductBioseq) {
1517 return false;
1518 } else if (m_ProductIsFar) {
1519 if (m_Imp.RequireLocalProduct(m_Feat.GetProduct().GetId())) {
1520 return true;
1521 } else {
1522 return false;
1523 }
1524 }
1525
1526 bool found_match = false;
1527
1528 CSeq_entry_Handle prod_nps =
1529 m_ProductBioseq.GetExactComplexityLevel(CBioseq_set::eClass_nuc_prot);
1530 if (!prod_nps) {
1531 return true;
1532 }
1533
1534 for (CSeq_loc_CI loc_i(m_Feat.GetLocation()); loc_i; ++loc_i) {
1535 const CSeq_id& sid = loc_i.GetSeq_id();
1536 if (sid.IsOther() && sid.GetOther().IsSetAccession() && NStr::StartsWith(sid.GetOther().GetAccession(), "NT_")) {
1537 return false;
1538 }
1539 CBioseq_Handle nuc = m_Scope.GetBioseqHandle(loc_i.GetSeq_id());
1540 if (nuc) {
1541 if (s_BioseqHasRefSeqThatStartsWithPrefix(nuc, "NT_")) {
1542 // we don't report this for NT records
1543 return false;
1544 }
1545 CSeq_entry_Handle wgs = nuc.GetExactComplexityLevel(CBioseq_set::eClass_gen_prod_set);
1546 if (wgs) {
1547 // we don't report this for gen-prod-sets
1548 return false;
1549 }
1550
1551 CSeq_entry_Handle nuc_nps =
1552 nuc.GetExactComplexityLevel(CBioseq_set::eClass_nuc_prot);
1553
1554 if (prod_nps == nuc_nps) {
1555 found_match = true;
1556 break;
1557 }
1558 }
1559 }
1560 return !found_match;
1561 }
1562
1563
x_AddToIntronList(vector<CCdregionValidator::TShortIntron> & shortlist,TSeqPos last_start,TSeqPos last_stop,TSeqPos this_start,TSeqPos this_stop)1564 void CCdregionValidator::x_AddToIntronList(vector<CCdregionValidator::TShortIntron>& shortlist, TSeqPos last_start, TSeqPos last_stop, TSeqPos this_start, TSeqPos this_stop)
1565 {
1566 if (abs ((int)this_start - (int)last_stop) < 11) {
1567 shortlist.push_back(TShortIntron(last_stop, this_start));
1568 } else if (abs ((int)this_stop - (int)last_start) < 11) {
1569 shortlist.push_back(TShortIntron(last_start, this_stop));
1570 }
1571 }
1572
1573
x_GetShortIntrons(const CSeq_loc & loc,CScope * scope)1574 vector<CCdregionValidator::TShortIntron> CCdregionValidator::x_GetShortIntrons(const CSeq_loc& loc, CScope* scope)
1575 {
1576 vector<CCdregionValidator::TShortIntron> shortlist;
1577
1578 CSeq_loc_CI li(loc);
1579
1580 TSeqPos last_start = li.GetRange().GetFrom();
1581 TSeqPos last_stop = li.GetRange().GetTo();
1582 CRef<CSeq_id> last_id(new CSeq_id());
1583 last_id->Assign(li.GetSeq_id());
1584
1585 ++li;
1586 while (li) {
1587 TSeqPos this_start = li.GetRange().GetFrom();
1588 TSeqPos this_stop = li.GetRange().GetTo();
1589 if (abs ((int)this_start - (int)last_stop) < 11 || abs ((int)this_stop - (int)last_start) < 11) {
1590 if (li.GetSeq_id().Equals(*last_id)) {
1591 // definitely same bioseq, definitely report
1592 x_AddToIntronList(shortlist, last_start, last_stop, this_start, this_stop);
1593 } else if (scope) {
1594 // only report if definitely on same bioseq
1595 CBioseq_Handle last_bsh = scope->GetBioseqHandle(*last_id);
1596 if (last_bsh) {
1597 for (auto id_it : last_bsh.GetId()) {
1598 if (id_it.GetSeqId()->Equals(li.GetSeq_id())) {
1599 x_AddToIntronList(shortlist, last_start, last_stop, this_start, this_stop);
1600 break;
1601 }
1602 }
1603 }
1604 }
1605 }
1606 last_start = this_start;
1607 last_stop = this_stop;
1608 last_id->Assign(li.GetSeq_id());
1609 ++li;
1610 }
1611 return shortlist;
1612 }
1613
1614
x_FormatIntronInterval(const TShortIntron & interval)1615 string CCdregionValidator::x_FormatIntronInterval(const TShortIntron& interval)
1616 {
1617 return NStr::NumericToString(interval.first + 1) + "-"
1618 + NStr::NumericToString(interval.second + 1);
1619 }
1620
1621
ReportShortIntrons()1622 void CCdregionValidator::ReportShortIntrons()
1623 {
1624 if (m_Feat.IsSetExcept()) {
1625 return;
1626 }
1627
1628 string message;
1629
1630 vector<TShortIntron> shortlist = x_GetShortIntrons(m_Feat.GetLocation(), &m_Scope);
1631 if (shortlist.size() == 0) {
1632 return;
1633 }
1634
1635 // only report if no nonsense introns
1636 vector<CRef<CSeq_loc> > nonsense_introns = CCDSTranslationProblems::GetNonsenseIntrons(m_Feat, m_Scope);
1637 if (nonsense_introns.size() > 0) {
1638 return;
1639 }
1640
1641 if (shortlist.size() == 1) {
1642 message = x_FormatIntronInterval(shortlist.front());
1643 } else if (shortlist.size() == 2) {
1644 message = x_FormatIntronInterval(shortlist.front())
1645 + " and " +
1646 x_FormatIntronInterval(shortlist.back());
1647 } else {
1648 for (size_t i = 0; i < shortlist.size() - 2; i++) {
1649 message += x_FormatIntronInterval(shortlist[i]) + ", ";
1650 }
1651 message += " and " + x_FormatIntronInterval(shortlist.back());
1652 }
1653 PostErr(eDiag_Warning, eErr_SEQ_FEAT_ShortIntron,
1654 "Introns at positions " + message + " should be at least 10 nt long");
1655 }
1656
1657
1658 // non-pseudo CDS must have product
x_ValidateProductId()1659 void CCdregionValidator::x_ValidateProductId()
1660 {
1661 // bail if product exists
1662 if ( m_Feat.IsSetProduct() ) {
1663 return;
1664 }
1665 // bail if location has just stop
1666 if ( m_Feat.IsSetLocation() ) {
1667 const CSeq_loc& loc = m_Feat.GetLocation();
1668 if ( loc.IsPartialStart(eExtreme_Biological) && !loc.IsPartialStop(eExtreme_Biological) ) {
1669 if ( GetLength(loc, &m_Scope) <= 5 ) {
1670 return;
1671 }
1672 }
1673 }
1674 // supress in case of the appropriate exception
1675 if ( m_Feat.IsSetExcept() && m_Feat.IsSetExcept_text() &&
1676 !NStr::IsBlank(m_Feat.GetExcept_text()) ) {
1677 if ( NStr::Find(m_Feat.GetExcept_text(),
1678 "rearrangement required for product") != NPOS ) {
1679 return;
1680 }
1681 }
1682
1683 // non-pseudo CDS must have /product
1684 PostErr(eDiag_Error, eErr_SEQ_FEAT_MissingCDSproduct,
1685 "Expected CDS product absent");
1686 }
1687
1688
x_ValidateConflict()1689 void CCdregionValidator::x_ValidateConflict()
1690 {
1691 if (!m_ProductBioseq) {
1692 return;
1693 }
1694 // translate the coding region
1695 string transl_prot;
1696 try {
1697 CSeqTranslator::Translate(m_Feat, m_Scope, transl_prot,
1698 false, // do not include stop codons
1699 false); // do not remove trailing X/B/Z
1700
1701 } catch ( const runtime_error& ) {
1702 }
1703
1704 CSeqVector prot_vec = m_ProductBioseq.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
1705 prot_vec.SetCoding(CSeq_data::e_Ncbieaa);
1706
1707 string prot_seq;
1708 prot_vec.GetSeqData(0, prot_vec.size(), prot_seq);
1709
1710 if ( transl_prot.empty() || prot_seq.empty() || NStr::Equal(transl_prot, prot_seq) ) {
1711 PostErr(eDiag_Error, eErr_SEQ_FEAT_BadConflictFlag,
1712 "Coding region conflict flag should not be set");
1713 } else {
1714 PostErr(eDiag_Warning, eErr_SEQ_FEAT_ConflictFlagSet,
1715 "Coding region conflict flag is set");
1716 }
1717 }
1718
1719
x_ValidateCommonProduct()1720 void CCdregionValidator::x_ValidateCommonProduct()
1721 {
1722 if ( !m_Feat.IsSetProduct() ) {
1723 return;
1724 }
1725
1726 const CCdregion& cdr = m_Feat.GetData().GetCdregion();
1727 if ( cdr.CanGetOrf() ) {
1728 return;
1729 }
1730
1731 if ( !m_ProductBioseq || m_ProductIsFar) {
1732 const CSeq_id* sid = 0;
1733 try {
1734 sid = &(GetId(m_Feat.GetProduct(), &m_Scope));
1735 } catch (const CObjmgrUtilException&) {}
1736 if (m_Imp.RequireLocalProduct(sid)) {
1737 PostErr(eDiag_Warning, eErr_SEQ_FEAT_MissingCDSproduct,
1738 "Unable to find product Bioseq from CDS feature");
1739 }
1740 return;
1741 }
1742
1743 const CSeq_feat* sfp = GetCDSForProduct(m_ProductBioseq);
1744 if ( sfp == 0 ) {
1745 return;
1746 }
1747
1748 if ( &m_Feat != sfp ) {
1749 // if genomic product set, with one cds on contig and one on cdna,
1750 // do not report.
1751 if ( m_Imp.IsGPS() ) {
1752 // feature packaging test will do final contig vs. cdna check
1753 CBioseq_Handle sfh = m_Scope.GetBioseqHandle(sfp->GetLocation());
1754 if ( m_LocationBioseq != sfh ) {
1755 return;
1756 }
1757 }
1758 PostErr(eDiag_Critical, eErr_SEQ_FEAT_MultipleCDSproducts,
1759 "Same product Bioseq from multiple CDS features");
1760 }
1761 }
1762
1763
x_ValidateProductPartials()1764 void CCdregionValidator::x_ValidateProductPartials()
1765 {
1766 if (!m_ProductBioseq || !m_LocationBioseq) {
1767 return;
1768 }
1769
1770 if (m_LocationBioseq.GetTopLevelEntry() != m_ProductBioseq.GetTopLevelEntry()) {
1771 return;
1772 }
1773 CFeat_CI prot(m_ProductBioseq, CSeqFeatData::eSubtype_prot);
1774 if (!prot) {
1775 return;
1776 }
1777 if (!PartialsSame(m_Feat.GetLocation(), prot->GetLocation())) {
1778 PostErr(eDiag_Warning, eErr_SEQ_FEAT_PartialsInconsistentCDSProtein,
1779 "Coding region and protein feature partials conflict");
1780 }
1781 }
1782
1783
x_CheckPosNOrGap(TSeqPos pos,const CSeqVector & vec)1784 bool CCdregionValidator::x_CheckPosNOrGap(TSeqPos pos, const CSeqVector& vec)
1785 {
1786 if (vec.IsInGap(pos) || vec[pos] == 'N') {
1787 return true;
1788 } else {
1789 return false;
1790 }
1791 }
1792
1793
x_ValidateParentPartialness(const CSeq_loc & parent_loc,const string & parent_name)1794 void CCdregionValidator::x_ValidateParentPartialness(const CSeq_loc& parent_loc, const string& parent_name)
1795 {
1796 if (!m_LocationBioseq) {
1797 return;
1798 }
1799
1800 bool check_gaps = false;
1801 if (m_LocationBioseq.IsSetInst() && m_LocationBioseq.GetInst().IsSetRepr() &&
1802 m_LocationBioseq.GetInst().GetRepr() == CSeq_inst::eRepr_delta) {
1803 check_gaps = true;
1804 }
1805
1806 bool has_abutting_gap = false;
1807 bool is_minus_strand = m_Feat.GetLocation().IsSetStrand() && m_Feat.GetLocation().GetStrand() == eNa_strand_minus;
1808
1809 if (m_Feat.GetLocation().IsPartialStart(eExtreme_Biological) && !parent_loc.IsPartialStart(eExtreme_Biological)) {
1810
1811 if (check_gaps) {
1812 CSeqVector seq_vec(m_LocationBioseq, CBioseq_Handle::eCoding_Iupac);
1813 TSeqPos start = m_Feat.GetLocation().GetStart(eExtreme_Biological),
1814 pos = is_minus_strand ? start + 1 : start - 1;
1815
1816 if (pos < m_LocationBioseq.GetBioseqLength()) {
1817 has_abutting_gap = x_CheckPosNOrGap(pos, seq_vec);
1818 }
1819 }
1820
1821 if (!has_abutting_gap) {
1822 EDiagSev sev = eDiag_Warning;
1823 CConstRef <CSeq_feat> gene = m_Gene;
1824 if (gene && gene->GetData().GetGene().IsSetLocus()) {
1825 string locus = gene->GetData().GetGene().GetLocus();
1826 if ( NStr::EqualNocase (locus, "orf1ab") ) {
1827 sev = eDiag_Info;
1828 }
1829 }
1830 PostErr(sev, eErr_SEQ_FEAT_PartialProblemMismatch5Prime, parent_name + " should not be 5' complete if coding region is 5' partial");
1831 }
1832 }
1833 if (m_Feat.GetLocation().IsPartialStop(eExtreme_Biological) && !parent_loc.IsPartialStop(eExtreme_Biological)) {
1834
1835 if (check_gaps) {
1836
1837 CSeqVector seq_vec(m_LocationBioseq, CBioseq_Handle::eCoding_Iupac);
1838 TSeqPos stop = m_Feat.GetLocation().GetStop(eExtreme_Biological),
1839 pos = is_minus_strand ? stop - 1 : stop + 1;
1840
1841 if (pos < m_LocationBioseq.GetBioseqLength()) {
1842 has_abutting_gap = x_CheckPosNOrGap(pos, seq_vec);
1843 }
1844 }
1845
1846 if (!has_abutting_gap) {
1847 EDiagSev sev = eDiag_Warning;
1848 CConstRef <CSeq_feat> gene = m_Gene;
1849 if (gene && gene->GetData().GetGene().IsSetLocus()) {
1850 string locus = gene->GetData().GetGene().GetLocus();
1851 if ( NStr::EqualNocase (locus, "orf1ab") ) {
1852 sev = eDiag_Info;
1853 }
1854 }
1855 PostErr(sev, eErr_SEQ_FEAT_PartialProblemMismatch3Prime, parent_name + " should not be 3' complete if coding region is 3' partial");
1856 }
1857 }
1858 }
1859
1860
x_ValidateParentPartialness()1861 void CCdregionValidator::x_ValidateParentPartialness()
1862 {
1863 if (!m_Gene) {
1864 return;
1865 }
1866 x_ValidateParentPartialness(m_Gene->GetLocation(), "gene");
1867
1868 CConstRef<CSeq_feat> mrna = GetmRNAforCDS(m_Feat, m_Scope);
1869 if (mrna) {
1870 TFeatScores contained_mrna;
1871 GetOverlappingFeatures(m_Gene->GetLocation(), CSeqFeatData::e_Rna,
1872 CSeqFeatData::eSubtype_mRNA, eOverlap_Contains, contained_mrna, m_Scope);
1873 if (contained_mrna.size() == 1) {
1874 // messy for alternate splicing, so only check if there is only one
1875 x_ValidateParentPartialness(mrna->GetLocation(), "mRNA");
1876 }
1877 }
1878 }
1879
1880
1881 END_SCOPE(validator)
1882 END_SCOPE(objects)
1883 END_NCBI_SCOPE
1884