1 /* $Id: gene_names.cpp 637429 2021-09-13 13:13:19Z ivanov $
2 * =========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * =========================================================================
25 *
26 * Authors: Sema Kachalo
27 *
28 */
29
30 #include <ncbi_pch.hpp>
31 #include "discrepancy_core.hpp"
32 #include "utils.hpp"
33 #include <objmgr/util/feature.hpp>
34 #include <objmgr/util/sequence.hpp>
35
36 BEGIN_NCBI_SCOPE
37 BEGIN_SCOPE(NDiscrepancy)
38 USING_SCOPE(objects);
39
40 DISCREPANCY_MODULE(gene_names);
41
42
43 // BAD_GENE_NAME
HasBadWord(const string & s,string & word)44 static bool HasBadWord(const string& s, string& word)
45 {
46 static const char* BadWords[] = { "putative", "fragment", "gene", "orf", "like" };
47 for (size_t i = 0; i < ArraySize(BadWords); i++) {
48 if (NStr::FindNoCase(s, BadWords[i]) != NPOS) {
49 word = BadWords[i];
50 return true;
51 }
52 }
53 return false;
54 }
55
56
// Returns true when `s` contains a run of at least four consecutive decimal
// digits, and false otherwise (including for an empty string).
static bool Has4Numbers(const string& s)
{
    size_t run = 0;
    // Scanning stops as soon as a run of four digits has been seen.
    for (size_t i = 0; i < s.size() && run < 4; i++) {
        // The <cctype> classifiers have undefined behaviour for negative
        // arguments other than EOF, so a possibly signed char (e.g. a
        // non-ASCII byte) must be converted to unsigned char first.
        run = isdigit(static_cast<unsigned char>(s[i])) ? run + 1 : 0;
    }
    return run >= 4;
}
65
66
67 DISCREPANCY_CASE(BAD_GENE_NAME, FEAT, eDisc | eSubmitter | eSmart, "Bad gene name")
68 {
69 for (const CSeq_feat& feat : context.GetFeat()) {
70 if (feat.IsSetData() && feat.GetData().IsGene() && feat.GetData().GetGene().CanGetLocus()) {
71 string locus = feat.GetData().GetGene().GetLocus();
72 string word;
73 if (locus.size() > 10 || Has4Numbers(locus) || HasBadWord(locus, word)) {
74 m_Objs[word.empty() ? "[n] gene[s] contain[S] suspect phrase or characters" : "[n] gene[s] contain[S] [(]" + word].Add(*context.SeqFeatObjRef(feat, &feat));
75 }
76 }
77 }
78 }
79
80
// Publishes the collected BAD_GENE_NAME findings: the sub-items of the
// exported report tree become this test's report items.
DISCREPANCY_SUMMARIZE(BAD_GENE_NAME)
{
    auto exported = m_Objs.Export(*this);
    m_ReportItems = exported->GetSubitems();
}
85
86
// Autofix for BAD_GENE_NAME: replaces the reported feature with a copy whose
// gene locus has been reset. When the original is a gene with an available
// locus, that locus is first handed to AddComment() for the copy
// (presumably to preserve the old name in the feature comment -- confirm
// against utils.hpp).
DISCREPANCY_AUTOFIX(BAD_GENE_NAME)
{
    // NOTE(review): the dynamic_cast result is used without a null check.
    const CSeq_feat* orig_feat = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));

    CRef<CSeq_feat> fixed_feat(new CSeq_feat());
    fixed_feat->Assign(*orig_feat);

    const bool has_locus = orig_feat->IsSetData()
        && orig_feat->GetData().IsGene()
        && orig_feat->GetData().GetGene().CanGetLocus();
    if (has_locus) {
        AddComment(*fixed_feat, orig_feat->GetData().GetGene().GetLocus());
    }
    // The locus is reset regardless of whether a comment was added.
    fixed_feat->SetData().SetGene().ResetLocus();

    context.ReplaceSeq_feat(*obj, *orig_feat, *fixed_feat);
    obj->SetFixed();

    CRef<CAutofixReport> report(new CAutofixReport("BAD_GENE_NAME: [n] gene name[s] fixed", 1));
    return report;
}
100
101
102 // BAD_BACTERIAL_GENE_NAME
103
104 DISCREPANCY_CASE(BAD_BACTERIAL_GENE_NAME, FEAT, eDisc | eOncaller | eSubmitter | eSmart, "Bad bacterial gene name")
105 {
106 const CSeqdesc* biosrc = context.GetBiosource();
107 if (biosrc) {
108 const CBioSource* src = &biosrc->GetSource();
109 if ((src->IsSetLineage() || !context.GetLineage().empty()) && !context.HasLineage(src, "Eukaryota") && !context.IsViral(src)) {
110 for (const CSeq_feat& feat : context.GetFeat()) {
111 if (feat.IsSetData() && feat.GetData().IsGene() && feat.GetData().GetGene().CanGetLocus()) {
112 string locus = feat.GetData().GetGene().GetLocus();
113 if (!isalpha(locus[0]) || !islower(locus[0])) {
114 m_Objs["[n] bacterial gene[s] [does] not start with lowercase letter"].Add(*context.SeqFeatObjRef(feat, &feat));
115 }
116 }
117 }
118 }
119 }
120 }
121
122
// Publishes the collected BAD_BACTERIAL_GENE_NAME findings: the sub-items of
// the exported report tree become this test's report items.
DISCREPANCY_SUMMARIZE(BAD_BACTERIAL_GENE_NAME)
{
    auto exported = m_Objs.Export(*this);
    m_ReportItems = exported->GetSubitems();
}
127
128
// Autofix for BAD_BACTERIAL_GENE_NAME: replaces the reported feature with a
// copy whose gene locus has been reset, after handing the old locus to
// AddComment() for the copy (presumably to keep the old name in the feature
// comment -- confirm against utils.hpp).
DISCREPANCY_AUTOFIX(BAD_BACTERIAL_GENE_NAME)
{
    // NOTE(review): the dynamic_cast result is used without a null check, and
    // the feature is assumed to be a gene (no IsGene() test here).
    const CSeq_feat* orig_feat = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));

    CRef<CSeq_feat> fixed_feat(new CSeq_feat());
    fixed_feat->Assign(*orig_feat);

    const string& old_locus = orig_feat->GetData().GetGene().GetLocus();
    AddComment(*fixed_feat, old_locus);
    fixed_feat->SetData().SetGene().ResetLocus();

    context.ReplaceSeq_feat(*obj, *orig_feat, *fixed_feat);
    obj->SetFixed();

    CRef<CAutofixReport> report(new CAutofixReport("BAD_BACTERIAL_GENE_NAME: [n] bacterial gene name[s] fixed", 1));
    return report;
}
140
141
142 // EC_NUMBER_ON_UNKNOWN_PROTEIN
143
144 DISCREPANCY_CASE(EC_NUMBER_ON_UNKNOWN_PROTEIN, FEAT, eDisc | eSubmitter | eSmart | eFatal, "EC number on unknown protein")
145 {
146 for (const CSeq_feat& feat : context.GetFeat()) {
147 if (feat.IsSetData() && feat.GetData().IsProt() && feat.GetData().GetProt().CanGetName() && feat.GetData().GetProt().CanGetEc() && !feat.GetData().GetProt().GetEc().empty()) {
148 const list <string>& names = feat.GetData().GetProt().GetName();
149 if (!names.empty()) {
150 string str = *names.begin();
151 NStr::ToLower(str);
152 //if (NStr::FindNoCase(*names.begin(), "hypothetical protein") != NPOS || NStr::FindNoCase(*names.begin(), "unknown protein") != NPOS) {
153 if (str == "hypothetical protein" || str == "unknown protein") {
154 m_Objs["[n] protein feature[s] [has] an EC number and a protein name of 'unknown protein' or 'hypothetical protein'"].Add(*context.SeqFeatObjRef(feat, &feat)).Fatal();
155 }
156 }
157 }
158 }
159 }
160
161
// Publishes the collected EC_NUMBER_ON_UNKNOWN_PROTEIN findings: the
// sub-items of the exported report tree become this test's report items.
DISCREPANCY_SUMMARIZE(EC_NUMBER_ON_UNKNOWN_PROTEIN)
{
    auto exported = m_Objs.Export(*this);
    m_ReportItems = exported->GetSubitems();
}
166
167
// Autofix for EC_NUMBER_ON_UNKNOWN_PROTEIN: replaces the reported feature
// with a copy whose protein EC numbers have been reset.
DISCREPANCY_AUTOFIX(EC_NUMBER_ON_UNKNOWN_PROTEIN)
{
    // NOTE(review): the dynamic_cast result is used without a null check.
    const CSeq_feat* orig_feat = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj));

    CRef<CSeq_feat> fixed_feat(new CSeq_feat());
    fixed_feat->Assign(*orig_feat);
    fixed_feat->SetData().SetProt().ResetEc();

    context.ReplaceSeq_feat(*obj, *orig_feat, *fixed_feat);
    obj->SetFixed();

    CRef<CAutofixReport> report(new CAutofixReport("EC_NUMBER_ON_UNKNOWN_PROTEIN: removed [n] EC number[s] from unknown protein[s]", 1));
    return report;
}
178
179
180 // SHOW_HYPOTHETICAL_CDS_HAVING_GENE_NAME
181
182 DISCREPANCY_CASE(SHOW_HYPOTHETICAL_CDS_HAVING_GENE_NAME, FEAT, eDisc | eSubmitter | eSmart | eFatal, "Hypothetical CDS with gene names")
183 {
184 for (const CSeq_feat& feat : context.GetFeat()) {
185 if (feat.IsSetData() && feat.GetData().IsCdregion() && feat.CanGetProduct()) {
186 const CSeq_feat* gene = context.GetGeneForFeature(feat);
187 if (gene && gene->GetData().GetGene().CanGetLocus() && !gene->GetData().GetGene().GetLocus().empty()) {
188 CBioseq_Handle bioseq = sequence::GetBioseqFromSeqLoc(feat.GetProduct(), context.GetScope());
189 if (bioseq) {
190 CFeat_CI feat_it(bioseq, CSeqFeatData::e_Prot); // consider different implementation
191 if (feat_it) {
192 const CProt_ref& prot = feat_it->GetOriginalFeature().GetData().GetProt();
193 if (prot.CanGetName()) {
194 const auto& names = prot.GetName();
195 if (!names.empty() && NStr::FindNoCase(names.front(), "hypothetical protein") != NPOS) {
196 m_Objs["[n] hypothetical coding region[s] [has] a gene name"].Fatal().Add(*context.SeqFeatObjRef(feat, gene));
197 }
198 }
199 }
200 }
201 }
202 }
203 }
204 }
205
206
// Publishes the collected SHOW_HYPOTHETICAL_CDS_HAVING_GENE_NAME findings:
// the sub-items of the exported report tree become this test's report items.
DISCREPANCY_SUMMARIZE(SHOW_HYPOTHETICAL_CDS_HAVING_GENE_NAME)
{
    auto exported = m_Objs.Export(*this);
    m_ReportItems = exported->GetSubitems();
}
211
212
// Autofix for SHOW_HYPOTHETICAL_CDS_HAVING_GENE_NAME: replaces the looked-up
// feature with a copy whose gene locus has been reset, after handing the old
// locus (or an empty string when none is set) to AddComment() for the copy.
DISCREPANCY_AUTOFIX(SHOW_HYPOTHETICAL_CDS_HAVING_GENE_NAME)
{
    // NOTE(review): the extra `true` argument presumably selects the gene
    // registered alongside the CDS rather than the CDS itself -- confirm
    // against CDiscrepancyContext. The dynamic_cast result is used without a
    // null check.
    const CSeq_feat* orig_feat = dynamic_cast<const CSeq_feat*>(context.FindObject(*obj, true));

    CRef<CSeq_feat> fixed_feat(new CSeq_feat());
    fixed_feat->Assign(*orig_feat);

    const auto& gene_ref = orig_feat->GetData().GetGene();
    AddComment(*fixed_feat, gene_ref.IsSetLocus() ? gene_ref.GetLocus() : kEmptyStr);
    fixed_feat->SetData().SetGene().ResetLocus();

    context.ReplaceSeq_feat(*obj, *orig_feat, *fixed_feat, true);
    obj->SetFixed();

    CRef<CAutofixReport> report(new CAutofixReport("SHOW_HYPOTHETICAL_CDS_HAVING_GENE_NAME: [n] hypothetical CDS fixed", 1));
    return report;
}
224
225
226 // DUPLICATE_LOCUS_TAGS
// Report labels used by the DUPLICATE_LOCUS_TAGS test.
// Top-level label under which all duplicate-locus-tag findings are grouped.
const string kDuplicateLocusTagsTop("[n] gene[s] [has] duplicate locus tags");
// Prefix of the per-tag label; the locus tag and a closing "." are appended.
const string kDuplicateLocusTagsStart("[n] gene[s] [has] locus tag ");
// Label for genes that directly follow a gene with the same locus tag.
const string kDuplicateAdjacent("[n] gene[s] [is] adjacent to another gene with the same locus tag.");
230
231
232 DISCREPANCY_CASE(DUPLICATE_LOCUS_TAGS, SEQUENCE, eDisc | eOncaller | eSubmitter | eSmart | eFatal, "Duplicate Locus Tags")
233 {
234 const auto& genes = context.FeatGenes();
235 string last_locus_tag;
236 CRef<CDiscrepancyObject> last_disc_obj;
237 for (const CSeq_feat* gene : genes) {
238 if (gene->GetData().GetGene().IsSetLocus_tag()) {
239 CRef<CDiscrepancyObject> this_disc_obj(context.SeqFeatObjRef(*gene));
240 const string& this_locus_tag = gene->GetData().GetGene().GetLocus_tag();
241 m_Objs[kEmptyStr][this_locus_tag].Add(*this_disc_obj).Fatal();
242 if (last_disc_obj && last_locus_tag == this_locus_tag) {
243 m_Objs[kDuplicateLocusTagsTop][kDuplicateAdjacent].Add(*last_disc_obj).Fatal();
244 m_Objs[kDuplicateLocusTagsTop][kDuplicateAdjacent].Add(*this_disc_obj).Fatal();
245 }
246 last_locus_tag = this_locus_tag;
247 last_disc_obj = this_disc_obj;
248 }
249 else {
250 last_locus_tag = kEmptyStr;
251 }
252 }
253 }
254
255
// Builds the DUPLICATE_LOCUS_TAGS report. Every locus tag collected under
// the kEmptyStr scratch node that is shared by more than one gene gets its
// own labelled entry under kDuplicateLocusTagsTop; the scratch node is then
// removed so that it does not show up in the exported report.
DISCREPANCY_SUMMARIZE(DUPLICATE_LOCUS_TAGS)
{
    for (auto& tag_entry : m_Objs[kEmptyStr].GetMap()) {
        // Tags seen on a single gene are not duplicates.
        if (tag_entry.second->GetObjects().size() <= 1) {
            continue;
        }
        const string label = kDuplicateLocusTagsStart + tag_entry.first + ".";
        for (auto& gene_obj : tag_entry.second->GetObjects()) {
            m_Objs[kDuplicateLocusTagsTop][label].Add(*gene_obj);
        }
    }
    m_Objs.GetMap().erase(kEmptyStr);

    auto exported = m_Objs.Export(*this);
    m_ReportItems = exported->GetSubitems();
}
269
270
271 END_SCOPE(NDiscrepancy)
272 END_NCBI_SCOPE
273