1 /* $Id: gene_qual_normalization.cpp 632626 2021-06-03 17:38:42Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Colleen Bollin
27 *
28 * File Description:
29 * Code for moving gene qualifiers from gene xrefs on child features to
30 * the actual gene
31 *
32 */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <serial/serialbase.hpp>
36 #include <objects/seqfeat/Seq_feat.hpp>
37 #include <objects/seqfeat/SeqFeatXref.hpp>
38
39 #include <objmgr/object_manager.hpp>
40 #include <objmgr/util/sequence.hpp>
41 #include <objmgr/seqdesc_ci.hpp>
42 #include <objtools/cleanup/cleanup.hpp>
43
44
45 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)46 BEGIN_SCOPE(objects)
47
48
49
50 bool IsMappablePair(const CSeq_feat& cds, const CSeq_feat& gene)
51 {
52 // we're only going to move the qualifiers if the gene has a
53 // locus_tag and no locus
54 if (!gene.GetData().IsGene() ||
55 gene.GetData().GetGene().IsSetLocus() ||
56 !gene.GetData().GetGene().IsSetLocus_tag() ||
57 !cds.IsSetXref()) {
58 return false;
59 }
60
61 CTempString locus;
62 CTempString locus_tag;
63 for (auto it : cds.GetXref()) {
64 if (it->IsSetData() && it->GetData().IsGene()) {
65 if (it->GetData().GetGene().IsSetLocus()) {
66 if (NStr::IsBlank(locus)) {
67 locus = it->GetData().GetGene().GetLocus();
68 } else {
69 // already found a locus value, quit
70 return false;
71 }
72 }
73 if (it->GetData().GetGene().IsSetLocus_tag()) {
74 if (NStr::IsBlank(locus_tag)) {
75 locus_tag = it->GetData().GetGene().GetLocus_tag();
76 } else {
77 // already found a locus-tag value, quit
78 return false;
79 }
80 }
81 }
82 }
83 if (NStr::IsBlank(locus)) {
84 // no locus that we can use
85 return false;
86 }
87
88 if (!NStr::IsBlank(locus_tag) &&
89 gene.GetData().GetGene().IsSetLocus_tag() &&
90 !NStr::Equal(locus_tag, gene.GetData().GetGene().GetLocus_tag())) {
91 return false;
92 }
93
94 return true;
95 }
96
97
NormalizeGeneQuals(CSeq_feat & cds,CSeq_feat & gene)98 bool CCleanup::NormalizeGeneQuals(CSeq_feat& cds, CSeq_feat& gene)
99 {
100 if (!gene.GetData().IsGene() || gene.GetData().GetGene().IsSetLocus()
101 || !cds.IsSetXref()) {
102 return false;
103 }
104 CTempString locus;
105 CTempString locus_tag;
106 CRef<CSeqFeatXref> gene_xref(NULL);
107 for (auto it : cds.SetXref()) {
108 if (it->IsSetData() && it->GetData().IsGene()) {
109 if (it->GetData().GetGene().IsSetLocus()) {
110 if (NStr::IsBlank(locus)) {
111 locus = it->GetData().GetGene().GetLocus();
112 gene_xref = it;
113 } else {
114 // already found a locus value, quit
115 return false;
116 }
117 }
118 if (it->GetData().GetGene().IsSetLocus_tag()) {
119 if (NStr::IsBlank(locus_tag)) {
120 locus_tag = it->GetData().GetGene().GetLocus_tag();
121 } else {
122 // already found a locus-tag value, quit
123 return false;
124 }
125 }
126 }
127 }
128 if (NStr::IsBlank(locus)) {
129 // no locus that we can use
130 return false;
131 }
132
133 if (!NStr::IsBlank(locus_tag) &&
134 gene.GetData().GetGene().IsSetLocus_tag() &&
135 !NStr::Equal(locus_tag, gene.GetData().GetGene().GetLocus_tag())) {
136 return false;
137 }
138
139 if (!NStr::IsBlank(locus)) {
140 gene.SetData().SetGene().SetLocus(locus);
141 if (gene.GetData().GetGene().IsSetLocus_tag() &&
142 !NStr::IsBlank(gene.GetData().GetGene().GetLocus_tag())) {
143 // gene xrefs should point to locus_tag instead of locus
144 gene_xref->SetData().SetGene().ResetLocus();
145 gene_xref->SetData().SetGene().SetLocus_tag(gene.GetData().GetGene().GetLocus_tag());
146 }
147 return true;
148 } else {
149 return false;
150 }
151 }
152
153
GetNormalizableGeneQualPairs(CBioseq_Handle bsh)154 vector<CCleanup::TFeatGenePair> CCleanup::GetNormalizableGeneQualPairs(CBioseq_Handle bsh)
155 {
156 vector<CCleanup::TFeatGenePair> rval;
157
158 // only for nucleotide sequences
159 if (bsh.IsAa()) {
160 return rval;
161 }
162 // only for prokaryotes
163 CSeqdesc_CI src(bsh, CSeqdesc::e_Source);
164 if (!src || !src->IsSource() || !src->GetSource().IsSetLineage() ||
165 (NStr::Find(src->GetSource().GetLineage(), "Bacteria; ") == NPOS &&
166 NStr::Find(src->GetSource().GetLineage(), "Archaea") == NPOS)) {
167 return rval;
168 }
169
170 typedef pair<CSeq_feat_Handle, bool> TFeatOkPair;
171 typedef map<CSeq_feat_Handle, TFeatOkPair > TGeneCDSMap;
172 TGeneCDSMap gene_cds;
173
174 CFeat_CI f(bsh);
175 CRef<feature::CFeatTree> tr(new feature::CFeatTree(f));
176 tr->SetIgnoreMissingGeneXref();
177 while (f) {
178 if (f->GetData().IsCdregion()) {
179 CMappedFeat gene = tr->GetBestGene(*f);
180 if (gene) {
181 TGeneCDSMap::iterator smit = gene_cds.find(gene);
182 if (smit == gene_cds.end()) {
183 // not found before
184 bool ok_to_map = IsMappablePair(f->GetOriginalFeature(), gene.GetOriginalFeature());
185 gene_cds[gene] = { f->GetSeq_feat_Handle(), ok_to_map };
186 } else {
187 // same gene, two different coding regions
188 gene_cds[gene].second = false;
189 }
190 }
191 }
192 ++f;
193 }
194
195 for (auto copy_pair : gene_cds) {
196 if (copy_pair.second.second) {
197 rval.push_back({ copy_pair.second.first, copy_pair.first });
198 }
199 }
200 return rval;
201 }
202
NormalizeGeneQuals(CBioseq_Handle bsh)203 bool CCleanup::NormalizeGeneQuals(CBioseq_Handle bsh)
204 {
205 vector<CCleanup::TFeatGenePair> cds_gene_pairs = GetNormalizableGeneQualPairs(bsh);
206 bool any_change = false;
207 for (auto copy_pair : cds_gene_pairs) {
208 CRef<CSeq_feat> new_cds(new CSeq_feat());
209 new_cds->Assign(*(copy_pair.first.GetOriginalSeq_feat()));
210 CRef<CSeq_feat> new_gene(new CSeq_feat());
211 new_gene->Assign(*(copy_pair.second.GetOriginalSeq_feat()));
212 if (NormalizeGeneQuals(*new_cds, *new_gene)) {
213 CSeq_feat_EditHandle gene_edit = CSeq_feat_EditHandle(copy_pair.second);
214 gene_edit.Replace(*new_gene);
215 CSeq_feat_EditHandle cds_edit = CSeq_feat_EditHandle(copy_pair.first);
216 cds_edit.Replace(*new_cds);
217 any_change = true;
218 }
219 }
220
221 return any_change;
222 }
223
NormalizeGeneQuals(CSeq_entry_Handle seh)224 bool CCleanup::NormalizeGeneQuals(CSeq_entry_Handle seh)
225 {
226 bool any_change = false;
227 CBioseq_CI bi(seh, CSeq_inst::eMol_na);
228 while (bi) {
229 any_change |= NormalizeGeneQuals(*bi);
230 ++bi;
231 }
232 return any_change;
233 }
234
235 END_SCOPE(objects)
236 END_NCBI_SCOPE
237