1 /*  $Id: gene_names.cpp 637429 2021-09-13 13:13:19Z ivanov $
2  * =========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * =========================================================================
25  *
26  * Authors: Sema Kachalo
27  *
28  */
29 
30 #include <ncbi_pch.hpp>
31 #include "discrepancy_core.hpp"
32 #include "utils.hpp"
33 #include <objmgr/util/feature.hpp>
34 #include <objmgr/util/sequence.hpp>
35 
36 BEGIN_NCBI_SCOPE
37 BEGIN_SCOPE(NDiscrepancy)
38 USING_SCOPE(objects);
39 
40 DISCREPANCY_MODULE(gene_names);
41 
42 
43 // BAD_GENE_NAME
HasBadWord(const string & s,string & word)44 static bool HasBadWord(const string& s, string& word)
45 {
46     static const char* BadWords[] = { "putative", "fragment", "gene", "orf", "like" };
47     for (size_t i = 0; i < ArraySize(BadWords); i++) {
48         if (NStr::FindNoCase(s, BadWords[i]) != NPOS) {
49             word = BadWords[i];
50             return true;
51         }
52     }
53     return false;
54 }
55 
56 
// Reports whether a string contains four or more consecutive decimal digits.
//   s - text to scan
// Returns true as soon as a run of four digits has been seen, false
// otherwise (which includes every string shorter than four characters).
static bool Has4Numbers(const string& s)
{
    size_t run = 0;
    for (const char ch : s) {
        // The <cctype> classifiers are only defined for values representable
        // as unsigned char (or EOF); a plain char may be negative for
        // non-ASCII bytes, so convert before classifying.
        if (isdigit(static_cast<unsigned char>(ch))) {
            if (++run >= 4) {
                return true;
            }
        }
        else {
            run = 0;
        }
    }
    return false;
}
65 
66 
67 DISCREPANCY_CASE(BAD_GENE_NAME, FEAT, eDisc | eSubmitter | eSmart, "Bad gene name")
68 {
69     for (const CSeq_feat& feat : context.GetFeat()) {
70         if (feat.IsSetData() && feat.GetData().IsGene() && feat.GetData().GetGene().CanGetLocus()) {
71             string locus = feat.GetData().GetGene().GetLocus();
72             string word;
73             if (locus.size() > 10 || Has4Numbers(locus) || HasBadWord(locus, word)) {
74                 m_Objs[word.empty() ? "[n] gene[s] contain[S] suspect phrase or characters" : "[n] gene[s] contain[S] [(]" + word].Add(*context.SeqFeatObjRef(feat, &feat));
75             }
76         }
77     }
78 }
79 
80 
// Exports the collected m_Objs and stores the sub-items of the exported
// result in m_ReportItems.
DISCREPANCY_SUMMARIZE(BAD_GENE_NAME)
{
    auto exported = m_Objs.Export(*this);
    m_ReportItems = exported->GetSubitems();
}
85 
86 
// Autofix for BAD_GENE_NAME.
// Works on a copy of the reported feature: the current locus (when the
// feature is a gene with a locus) is handed to AddComment(), the locus is
// reset, and the copy replaces the original feature via the context.
// Returns a report counting one fixed gene name, or an empty reference when
// the reported object cannot be resolved to a CSeq_feat.
DISCREPANCY_AUTOFIX(BAD_GENE_NAME)
{
    const CSeq_feat* sf = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
    if (!sf) {
        // The cast yields null when the object is missing or is not a
        // feature; dereferencing it below would be undefined behaviour.
        // NOTE(review): confirm callers accept an empty report reference.
        return CRef<CAutofixReport>();
    }
    CRef<CSeq_feat> new_feat(new CSeq_feat());
    new_feat->Assign(*sf);
    if (sf->IsSetData() && sf->GetData().IsGene() && sf->GetData().GetGene().CanGetLocus()) {
        AddComment(*new_feat, sf->GetData().GetGene().GetLocus());
    }
    new_feat->SetData().SetGene().ResetLocus();
    context.ReplaceSeq_feat(*obj, *sf, *new_feat);
    obj->SetFixed();
    return CRef<CAutofixReport>(new CAutofixReport("BAD_GENE_NAME: [n] gene name[s] fixed", 1));
}
100 
101 
102 // BAD_BACTERIAL_GENE_NAME
103 
104 DISCREPANCY_CASE(BAD_BACTERIAL_GENE_NAME, FEAT, eDisc | eOncaller | eSubmitter | eSmart, "Bad bacterial gene name")
105 {
106     const CSeqdesc* biosrc = context.GetBiosource();
107     if (biosrc) {
108         const CBioSource* src = &biosrc->GetSource();
109         if ((src->IsSetLineage() || !context.GetLineage().empty()) && !context.HasLineage(src, "Eukaryota") && !context.IsViral(src)) {
110             for (const CSeq_feat& feat : context.GetFeat()) {
111                 if (feat.IsSetData() && feat.GetData().IsGene() && feat.GetData().GetGene().CanGetLocus()) {
112                     string locus = feat.GetData().GetGene().GetLocus();
113                     if (!isalpha(locus[0]) || !islower(locus[0])) {
114                         m_Objs["[n] bacterial gene[s] [does] not start with lowercase letter"].Add(*context.SeqFeatObjRef(feat, &feat));
115                     }
116                 }
117             }
118         }
119     }
120 }
121 
122 
// Exports the collected m_Objs and stores the sub-items of the exported
// result in m_ReportItems.
DISCREPANCY_SUMMARIZE(BAD_BACTERIAL_GENE_NAME)
{
    auto exported = m_Objs.Export(*this);
    m_ReportItems = exported->GetSubitems();
}
127 
128 
// Autofix for BAD_BACTERIAL_GENE_NAME.
// Works on a copy of the reported gene feature: the current locus is handed
// to AddComment(), the locus is reset, and the copy replaces the original
// feature via the context.
// Returns a report counting one fixed gene name, or an empty reference when
// the reported object cannot be resolved to a CSeq_feat.
DISCREPANCY_AUTOFIX(BAD_BACTERIAL_GENE_NAME)
{
    const CSeq_feat* sf = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
    if (!sf) {
        // The cast yields null when the object is missing or is not a
        // feature; dereferencing it below would be undefined behaviour.
        // NOTE(review): confirm callers accept an empty report reference.
        return CRef<CAutofixReport>();
    }
    CRef<CSeq_feat> new_feat(new CSeq_feat());
    new_feat->Assign(*sf);
    AddComment(*new_feat, sf->GetData().GetGene().GetLocus());
    new_feat->SetData().SetGene().ResetLocus();
    context.ReplaceSeq_feat(*obj, *sf, *new_feat);
    obj->SetFixed();
    return CRef<CAutofixReport>(new CAutofixReport("BAD_BACTERIAL_GENE_NAME: [n] bacterial gene name[s] fixed", 1));
}
140 
141 
142 // EC_NUMBER_ON_UNKNOWN_PROTEIN
143 
144 DISCREPANCY_CASE(EC_NUMBER_ON_UNKNOWN_PROTEIN, FEAT, eDisc | eSubmitter | eSmart | eFatal, "EC number on unknown protein")
145 {
146     for (const CSeq_feat& feat : context.GetFeat()) {
147         if (feat.IsSetData() && feat.GetData().IsProt() && feat.GetData().GetProt().CanGetName() && feat.GetData().GetProt().CanGetEc() && !feat.GetData().GetProt().GetEc().empty()) {
148             const list <string>& names = feat.GetData().GetProt().GetName();
149             if (!names.empty()) {
150                 string str = *names.begin();
151                 NStr::ToLower(str);
152                 //if (NStr::FindNoCase(*names.begin(), "hypothetical protein") != NPOS || NStr::FindNoCase(*names.begin(), "unknown protein") != NPOS) {
153                 if (str == "hypothetical protein" || str == "unknown protein") {
154                     m_Objs["[n] protein feature[s] [has] an EC number and a protein name of 'unknown protein' or 'hypothetical protein'"].Add(*context.SeqFeatObjRef(feat, &feat)).Fatal();
155                 }
156             }
157         }
158     }
159 }
160 
161 
// Exports the collected m_Objs and stores the sub-items of the exported
// result in m_ReportItems.
DISCREPANCY_SUMMARIZE(EC_NUMBER_ON_UNKNOWN_PROTEIN)
{
    auto exported = m_Objs.Export(*this);
    m_ReportItems = exported->GetSubitems();
}
166 
167 
// Autofix for EC_NUMBER_ON_UNKNOWN_PROTEIN.
// Works on a copy of the reported protein feature: the EC numbers are reset
// on the copy, which then replaces the original feature via the context.
// Returns a report counting one removal, or an empty reference when the
// reported object cannot be resolved to a CSeq_feat.
DISCREPANCY_AUTOFIX(EC_NUMBER_ON_UNKNOWN_PROTEIN)
{
    const CSeq_feat* sf = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));
    if (!sf) {
        // The cast yields null when the object is missing or is not a
        // feature; dereferencing it below would be undefined behaviour.
        // NOTE(review): confirm callers accept an empty report reference.
        return CRef<CAutofixReport>();
    }
    CRef<CSeq_feat> new_feat(new CSeq_feat());
    new_feat->Assign(*sf);
    new_feat->SetData().SetProt().ResetEc();
    context.ReplaceSeq_feat(*obj, *sf, *new_feat);
    obj->SetFixed();
    return CRef<CAutofixReport>(new CAutofixReport("EC_NUMBER_ON_UNKNOWN_PROTEIN: removed [n] EC number[s] from unknown protein[s]", 1));
}
178 
179 
180 // SHOW_HYPOTHETICAL_CDS_HAVING_GENE_NAME
181 
182 DISCREPANCY_CASE(SHOW_HYPOTHETICAL_CDS_HAVING_GENE_NAME, FEAT, eDisc | eSubmitter | eSmart | eFatal, "Hypothetical CDS with gene names")
183 {
184     for (const CSeq_feat& feat : context.GetFeat()) {
185         if (feat.IsSetData() && feat.GetData().IsCdregion() && feat.CanGetProduct()) {
186             const CSeq_feat* gene = context.GetGeneForFeature(feat);
187             if (gene && gene->GetData().GetGene().CanGetLocus() && !gene->GetData().GetGene().GetLocus().empty()) {
188                 CBioseq_Handle bioseq = sequence::GetBioseqFromSeqLoc(feat.GetProduct(), context.GetScope());
189                 if (bioseq) {
190                     CFeat_CI feat_it(bioseq, CSeqFeatData::e_Prot); // consider different implementation
191                     if (feat_it) {
192                         const CProt_ref& prot = feat_it->GetOriginalFeature().GetData().GetProt();
193                         if (prot.CanGetName()) {
194                             const auto& names = prot.GetName();
195                             if (!names.empty() && NStr::FindNoCase(names.front(), "hypothetical protein") != NPOS) {
196                                 m_Objs["[n] hypothetical coding region[s] [has] a gene name"].Fatal().Add(*context.SeqFeatObjRef(feat, gene));
197                             }
198                         }
199                     }
200                 }
201             }
202         }
203     }
204 }
205 
206 
// Exports the collected m_Objs and stores the sub-items of the exported
// result in m_ReportItems.
DISCREPANCY_SUMMARIZE(SHOW_HYPOTHETICAL_CDS_HAVING_GENE_NAME)
{
    auto exported = m_Objs.Export(*this);
    m_ReportItems = exported->GetSubitems();
}
211 
212 
// Autofix for SHOW_HYPOTHETICAL_CDS_HAVING_GENE_NAME.
// Looks the object up with FindObject(*obj, true), copies the resulting
// feature, hands its locus (or an empty string when no locus is set) to
// AddComment(), resets the locus on the copy and replaces the original via
// ReplaceSeq_feat(..., true).
// Returns a report counting one fixed CDS, or an empty reference when the
// lookup cannot be resolved to a CSeq_feat.
DISCREPANCY_AUTOFIX(SHOW_HYPOTHETICAL_CDS_HAVING_GENE_NAME)
{
    const CSeq_feat* sf = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj, true));
    if (!sf) {
        // The cast yields null when the object is missing or is not a
        // feature; dereferencing it below would be undefined behaviour.
        // NOTE(review): confirm callers accept an empty report reference.
        return CRef<CAutofixReport>();
    }
    CRef<CSeq_feat> new_feat(new CSeq_feat());
    new_feat->Assign(*sf);
    AddComment(*new_feat, sf->GetData().GetGene().IsSetLocus() ? sf->GetData().GetGene().GetLocus() : kEmptyStr);
    new_feat->SetData().SetGene().ResetLocus();
    context.ReplaceSeq_feat(*obj, *sf, *new_feat, true);
    obj->SetFixed();
    return CRef<CAutofixReport>(new CAutofixReport("SHOW_HYPOTHETICAL_CDS_HAVING_GENE_NAME: [n] hypothetical CDS fixed", 1));
}
224 
225 
226 // DUPLICATE_LOCUS_TAGS
// Top-level report label under which all duplicate-locus-tag findings are grouped.
static const string kDuplicateLocusTagsTop("[n] gene[s] [has] duplicate locus tags");
// Prefix of the per-tag label; the locus tag and a trailing "." are appended.
static const string kDuplicateLocusTagsStart("[n] gene[s] [has] locus tag ");
// Label for genes that directly follow another gene with the same locus tag.
static const string kDuplicateAdjacent("[n] gene[s] [is] adjacent to another gene with the same locus tag.");
230 
231 
232 DISCREPANCY_CASE(DUPLICATE_LOCUS_TAGS, SEQUENCE, eDisc | eOncaller | eSubmitter | eSmart | eFatal, "Duplicate Locus Tags")
233 {
234     const auto& genes = context.FeatGenes();
235     string last_locus_tag;
236     CRef<CDiscrepancyObject> last_disc_obj;
237     for (const CSeq_feat* gene : genes) {
238         if (gene->GetData().GetGene().IsSetLocus_tag()) {
239             CRef<CDiscrepancyObject> this_disc_obj(context.SeqFeatObjRef(*gene));
240             const string& this_locus_tag = gene->GetData().GetGene().GetLocus_tag();
241             m_Objs[kEmptyStr][this_locus_tag].Add(*this_disc_obj).Fatal();
242             if (last_disc_obj && last_locus_tag == this_locus_tag) {
243                 m_Objs[kDuplicateLocusTagsTop][kDuplicateAdjacent].Add(*last_disc_obj).Fatal();
244                 m_Objs[kDuplicateLocusTagsTop][kDuplicateAdjacent].Add(*this_disc_obj).Fatal();
245             }
246             last_locus_tag = this_locus_tag;
247             last_disc_obj = this_disc_obj;
248         }
249         else {
250             last_locus_tag = kEmptyStr;
251         }
252     }
253 }
254 
255 
// Turns the per-tag collection into report items.
// Every locus tag filed under m_Objs[kEmptyStr] with more than one object
// gets its objects copied to kDuplicateLocusTagsTop under the label
// "<kDuplicateLocusTagsStart><tag>.". The kEmptyStr entry is then erased
// and the remaining nodes are exported into m_ReportItems.
DISCREPANCY_SUMMARIZE(DUPLICATE_LOCUS_TAGS)
{
    auto&& by_tag = m_Objs[kEmptyStr].GetMap();
    for (auto& entry : by_tag) {
        auto&& tagged = entry.second->GetObjects();
        if (tagged.size() <= 1) {
            // A tag used by a single gene is not a duplicate.
            continue;
        }
        const string label = kDuplicateLocusTagsStart + entry.first + ".";
        for (auto& gene_obj : tagged) {
            m_Objs[kDuplicateLocusTagsTop][label].Add(*gene_obj);
        }
    }
    // Drop the per-tag collection so it is not exported.
    m_Objs.GetMap().erase(kEmptyStr);
    auto exported = m_Objs.Export(*this);
    m_ReportItems = exported->GetSubitems();
}
269 
270 
271 END_SCOPE(NDiscrepancy)
272 END_NCBI_SCOPE
273