1 /* $Id: gene_qual_normalization.cpp 632626 2021-06-03 17:38:42Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Colleen Bollin
27  *
28  * File Description:
29  *   Code for moving gene qualifiers from gene xrefs on child features to
30  *   the actual gene
31  *
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <serial/serialbase.hpp>
36 #include <objects/seqfeat/Seq_feat.hpp>
37 #include <objects/seqfeat/SeqFeatXref.hpp>
38 
39 #include <objmgr/object_manager.hpp>
40 #include <objmgr/util/sequence.hpp>
41 #include <objmgr/seqdesc_ci.hpp>
42 #include <objtools/cleanup/cleanup.hpp>
43 
44 
45 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)46 BEGIN_SCOPE(objects)
47 
48 
49 
50 bool IsMappablePair(const CSeq_feat& cds, const CSeq_feat& gene)
51 {
52     // we're only going to move the qualifiers if the gene has a
53     // locus_tag and no locus
54     if (!gene.GetData().IsGene() ||
55         gene.GetData().GetGene().IsSetLocus() ||
56         !gene.GetData().GetGene().IsSetLocus_tag() ||
57         !cds.IsSetXref()) {
58         return false;
59     }
60 
61     CTempString locus;
62     CTempString locus_tag;
63     for (auto it : cds.GetXref()) {
64         if (it->IsSetData() && it->GetData().IsGene()) {
65             if (it->GetData().GetGene().IsSetLocus()) {
66                 if (NStr::IsBlank(locus)) {
67                     locus = it->GetData().GetGene().GetLocus();
68                 } else {
69                     // already found a locus value, quit
70                     return false;
71                 }
72             }
73             if (it->GetData().GetGene().IsSetLocus_tag()) {
74                 if (NStr::IsBlank(locus_tag)) {
75                     locus_tag = it->GetData().GetGene().GetLocus_tag();
76                 } else {
77                     // already found a locus-tag value, quit
78                     return false;
79                 }
80             }
81         }
82     }
83     if (NStr::IsBlank(locus)) {
84         // no locus that we can use
85         return false;
86     }
87 
88     if (!NStr::IsBlank(locus_tag) &&
89         gene.GetData().GetGene().IsSetLocus_tag() &&
90         !NStr::Equal(locus_tag, gene.GetData().GetGene().GetLocus_tag())) {
91         return false;
92     }
93 
94     return true;
95 }
96 
97 
NormalizeGeneQuals(CSeq_feat & cds,CSeq_feat & gene)98 bool CCleanup::NormalizeGeneQuals(CSeq_feat& cds, CSeq_feat& gene)
99 {
100     if (!gene.GetData().IsGene() || gene.GetData().GetGene().IsSetLocus()
101         || !cds.IsSetXref()) {
102         return false;
103     }
104     CTempString locus;
105     CTempString locus_tag;
106     CRef<CSeqFeatXref> gene_xref(NULL);
107     for (auto it : cds.SetXref()) {
108         if (it->IsSetData() && it->GetData().IsGene()) {
109             if (it->GetData().GetGene().IsSetLocus()) {
110                 if (NStr::IsBlank(locus)) {
111                     locus = it->GetData().GetGene().GetLocus();
112                     gene_xref = it;
113                 } else {
114                     // already found a locus value, quit
115                     return false;
116                 }
117             }
118             if (it->GetData().GetGene().IsSetLocus_tag()) {
119                 if (NStr::IsBlank(locus_tag)) {
120                     locus_tag = it->GetData().GetGene().GetLocus_tag();
121                 } else {
122                     // already found a locus-tag value, quit
123                     return false;
124                 }
125             }
126         }
127     }
128     if (NStr::IsBlank(locus)) {
129         // no locus that we can use
130         return false;
131     }
132 
133     if (!NStr::IsBlank(locus_tag) &&
134         gene.GetData().GetGene().IsSetLocus_tag() &&
135         !NStr::Equal(locus_tag, gene.GetData().GetGene().GetLocus_tag())) {
136         return false;
137     }
138 
139     if (!NStr::IsBlank(locus)) {
140         gene.SetData().SetGene().SetLocus(locus);
141         if (gene.GetData().GetGene().IsSetLocus_tag() &&
142             !NStr::IsBlank(gene.GetData().GetGene().GetLocus_tag())) {
143             // gene xrefs should point to locus_tag instead of locus
144             gene_xref->SetData().SetGene().ResetLocus();
145             gene_xref->SetData().SetGene().SetLocus_tag(gene.GetData().GetGene().GetLocus_tag());
146         }
147         return true;
148     } else {
149         return false;
150     }
151 }
152 
153 
GetNormalizableGeneQualPairs(CBioseq_Handle bsh)154 vector<CCleanup::TFeatGenePair> CCleanup::GetNormalizableGeneQualPairs(CBioseq_Handle bsh)
155 {
156     vector<CCleanup::TFeatGenePair> rval;
157 
158     // only for nucleotide sequences
159     if (bsh.IsAa()) {
160         return rval;
161     }
162     // only for prokaryotes
163     CSeqdesc_CI src(bsh, CSeqdesc::e_Source);
164     if (!src || !src->IsSource() || !src->GetSource().IsSetLineage() ||
165         (NStr::Find(src->GetSource().GetLineage(), "Bacteria; ") == NPOS &&
166          NStr::Find(src->GetSource().GetLineage(), "Archaea") == NPOS)) {
167         return rval;
168     }
169 
170     typedef pair<CSeq_feat_Handle, bool> TFeatOkPair;
171     typedef map<CSeq_feat_Handle, TFeatOkPair > TGeneCDSMap;
172     TGeneCDSMap gene_cds;
173 
174     CFeat_CI f(bsh);
175     CRef<feature::CFeatTree> tr(new feature::CFeatTree(f));
176     tr->SetIgnoreMissingGeneXref();
177     while (f) {
178         if (f->GetData().IsCdregion()) {
179             CMappedFeat gene = tr->GetBestGene(*f);
180             if (gene) {
181                 TGeneCDSMap::iterator smit = gene_cds.find(gene);
182                 if (smit == gene_cds.end()) {
183                     // not found before
184                     bool ok_to_map = IsMappablePair(f->GetOriginalFeature(), gene.GetOriginalFeature());
185                     gene_cds[gene] = { f->GetSeq_feat_Handle(), ok_to_map };
186                 } else {
187                     // same gene, two different coding regions
188                     gene_cds[gene].second = false;
189                 }
190             }
191         }
192         ++f;
193     }
194 
195     for (auto copy_pair : gene_cds) {
196         if (copy_pair.second.second) {
197             rval.push_back({ copy_pair.second.first, copy_pair.first });
198         }
199     }
200     return rval;
201 }
202 
NormalizeGeneQuals(CBioseq_Handle bsh)203 bool CCleanup::NormalizeGeneQuals(CBioseq_Handle bsh)
204 {
205     vector<CCleanup::TFeatGenePair> cds_gene_pairs = GetNormalizableGeneQualPairs(bsh);
206     bool any_change = false;
207     for (auto copy_pair : cds_gene_pairs) {
208         CRef<CSeq_feat> new_cds(new CSeq_feat());
209         new_cds->Assign(*(copy_pair.first.GetOriginalSeq_feat()));
210         CRef<CSeq_feat> new_gene(new CSeq_feat());
211         new_gene->Assign(*(copy_pair.second.GetOriginalSeq_feat()));
212         if (NormalizeGeneQuals(*new_cds, *new_gene)) {
213             CSeq_feat_EditHandle gene_edit = CSeq_feat_EditHandle(copy_pair.second);
214             gene_edit.Replace(*new_gene);
215             CSeq_feat_EditHandle cds_edit = CSeq_feat_EditHandle(copy_pair.first);
216             cds_edit.Replace(*new_cds);
217             any_change = true;
218         }
219     }
220 
221     return any_change;
222 }
223 
NormalizeGeneQuals(CSeq_entry_Handle seh)224 bool CCleanup::NormalizeGeneQuals(CSeq_entry_Handle seh)
225 {
226     bool any_change = false;
227     CBioseq_CI bi(seh, CSeq_inst::eMol_na);
228     while (bi) {
229         any_change |= NormalizeGeneQuals(*bi);
230         ++bi;
231     }
232     return any_change;
233 }
234 
235 END_SCOPE(objects)
236 END_NCBI_SCOPE
237