1 /* $Id: fix_feature_id.cpp 632626 2021-06-03 17:38:42Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Igor Filippov
27 */
28
29
30 #include <ncbi_pch.hpp>
31 #include <objects/seqfeat/Feat_id.hpp>
32 #include <objmgr/feat_ci.hpp>
33 #include <objmgr/seq_entry_ci.hpp>
34 #include <objtools/cleanup/fix_feature_id.hpp>
35 #include <unordered_map>
36
37 BEGIN_NCBI_SCOPE
38 USING_SCOPE(objects);
39
40
s_FindHighestFeatureId(const CSeq_entry_Handle & entry)41 CObject_id::TId CFixFeatureId::s_FindHighestFeatureId(const CSeq_entry_Handle& entry)
42 {
43 CObject_id::TId feat_id = 0;
44 for (CFeat_CI feat_it(entry); feat_it; ++feat_it) {
45 if (feat_it->IsSetId()) {
46 const CFeat_id &id = feat_it->GetId();
47 if (id.IsLocal() && id.GetLocal().IsId() && id.GetLocal().GetId() > feat_id) {
48 feat_id = id.GetLocal().GetId();
49 }
50 }
51 }
52 return feat_id;
53 }
54
// Advances `offset` to the next integer that is absent from all three id sets.
//
// The search starts at offset + 1, so the value passed in is never returned,
// even if it is itself unused.
//
// existing_ids      ids to avoid (first set)
// new_existing_ids  ids to avoid (second set)
// current_ids       ids to avoid (third set)
// offset            in: last value tried/used; out: first larger value that is
//                   in none of the sets
static void FindNextOffset(const std::unordered_set<int> &existing_ids, const std::unordered_set<int> &new_existing_ids, const std::unordered_set<int> &current_ids, int &offset)
{
    const auto is_taken = [&](int candidate) {
        return existing_ids.count(candidate) != 0
            || new_existing_ids.count(candidate) != 0
            || current_ids.count(candidate) != 0;
    };

    do
    {
        ++offset;
    } while (is_taken(offset));
}
64
s_UpdateFeatureIds(const CSeq_entry_Handle & entry,map<CSeq_feat_Handle,CRef<CSeq_feat>> & changed_feats,unordered_set<int> & existing_ids,int & offset)65 void CFixFeatureId::s_UpdateFeatureIds(const CSeq_entry_Handle& entry, map<CSeq_feat_Handle, CRef<CSeq_feat> > &changed_feats, unordered_set<int> &existing_ids, int &offset)
66 {
67 unordered_map<int, int> remapped_ids; // map between old id to new id within the current seq-entry only
68 unordered_set<int> new_existing_ids; // id's which were left unchanged in the current seq-entry
69 unordered_set<int> current_ids; // newly created (mapped) ids in the current seq-entry
70 CFeat_CI feat_it(entry);
71 for ( ; feat_it; ++feat_it )
72 {
73 bool modified = false;
74 CRef<CSeq_feat> edited;
75 CSeq_feat_Handle fh = feat_it->GetSeq_feat_Handle();
76 if (changed_feats.find(fh) != changed_feats.end())
77 {
78 edited = changed_feats[fh];
79 }
80 else
81 {
82 edited.Reset(new CSeq_feat);
83 edited->Assign(feat_it->GetOriginalFeature());
84 }
85
86 if (edited->IsSetId() && edited->GetId().IsLocal() && edited->GetId().GetLocal().IsId())
87 {
88 int id = edited->GetId().GetLocal().GetId();
89 if (existing_ids.find(id) != existing_ids.end() ||
90 current_ids.find(id) != current_ids.end()) // remap id if it's found in other seq-entries or among newly created ids in the current seq-entry, do not remap existing duplicate ids within the same seq-entry
91 {
92 auto it = remapped_ids.find(id);
93 if (it != remapped_ids.end())
94 {
95 offset = it->second; // use the same remapped id if a duplicate exists in the current seq-entry and was already remapped
96 }
97 else
98 {
99 FindNextOffset(existing_ids, new_existing_ids, current_ids, offset); // find id which does not exist among either of the 3 sets
100 remapped_ids[id] = offset;
101 }
102 edited->SetId().SetLocal().SetId(offset);
103 current_ids.insert(offset);
104 modified = true;
105 }
106 else
107 {
108 new_existing_ids.insert(id);
109 }
110 }
111 if (modified)
112 {
113 changed_feats[fh] = edited;
114 }
115 }
116 existing_ids.insert(new_existing_ids.begin(), new_existing_ids.end());
117 existing_ids.insert(current_ids.begin(), current_ids.end());
118 feat_it.Rewind();
119 for ( ; feat_it; ++feat_it )
120 {
121 bool modified = false;
122 CRef<CSeq_feat> edited;
123 CSeq_feat_Handle fh = feat_it->GetSeq_feat_Handle();
124 if (changed_feats.find(fh) != changed_feats.end())
125 {
126 edited = changed_feats[fh];
127 }
128 else
129 {
130 edited.Reset(new CSeq_feat);
131 edited->Assign(feat_it->GetOriginalFeature());
132 }
133
134 if (edited->IsSetXref())
135 {
136 CSeq_feat::TXref::iterator xref_it = edited->SetXref().begin();
137 while ( xref_it != edited->SetXref().end() )
138 {
139 if ((*xref_it)-> IsSetId() && (*xref_it)->GetId().IsLocal() && (*xref_it)->GetId().GetLocal().IsId())
140 {
141 int id = (*xref_it)->GetId().GetLocal().GetId();
142 auto it = remapped_ids.find(id);
143 if (it != remapped_ids.end())
144 {
145 (*xref_it)->SetId().SetLocal().SetId(it->second); // remap xrefs if necessary
146 modified = true;
147 }
148 }
149 ++xref_it;
150 }
151 }
152 if (modified)
153 {
154 changed_feats[fh] = edited;
155 }
156 }
157 }
158
159
s_ApplyToSeqInSet(CSeq_entry_Handle tse,map<CSeq_feat_Handle,CRef<CSeq_feat>> & changed_feats)160 void CFixFeatureId::s_ApplyToSeqInSet(CSeq_entry_Handle tse, map<CSeq_feat_Handle, CRef<CSeq_feat> > &changed_feats)
161 {
162 int offset = 0;
163 unordered_set<int> existing_ids;
164 if (tse && tse.IsSet() && tse.GetSet().IsSetClass() && tse.GetSet().GetClass() == CBioseq_set::eClass_genbank)
165 {
166 for(CSeq_entry_CI direct_child_ci( tse.GetSet(), CSeq_entry_CI::eNonRecursive ); direct_child_ci; ++direct_child_ci )
167 {
168 const CSeq_entry_Handle& entry = *direct_child_ci;
169 s_UpdateFeatureIds(entry, changed_feats, existing_ids, offset);
170 }
171 }
172 }
173
174 // This function maps existing feature ids to the sequential ints - 1,2,3,...
s_MakeIDPairs(const CSeq_entry_Handle & entry,map<int,int> & id_pairs)175 void CFixFeatureId::s_MakeIDPairs(const CSeq_entry_Handle& entry, map<int,int> &id_pairs)
176 {
177 int feat_id = 0;
178 for (CFeat_CI feat_it(entry); feat_it; ++feat_it) {
179 if (feat_it->IsSetId()) {
180 const CFeat_id &id = feat_it->GetId();
181 if (id.IsLocal() && id.GetLocal().IsId() && id_pairs.find(id.GetLocal().GetId()) == id_pairs.end()) {
182 id_pairs[id.GetLocal().GetId()] = ++feat_id;
183 }
184 }
185 }
186 }
187
188 // Create a map from the existing feature ids to the sequential ints 1,2,3...
189 // and prepare a map from feature handles to the modified features with the reassigned ids both in the feature id and in the xrefs
s_ReassignFeatureIds(const CSeq_entry_Handle & entry,map<CSeq_feat_Handle,CRef<CSeq_feat>> & changed_feats)190 void CFixFeatureId::s_ReassignFeatureIds(const CSeq_entry_Handle& entry, map<CSeq_feat_Handle, CRef<CSeq_feat> > &changed_feats)
191 {
192 if (!entry)
193 return;
194 map<int,int> id_pairs;
195 CFixFeatureId::s_MakeIDPairs(entry, id_pairs);
196
197 for ( CFeat_CI feat_it(entry); feat_it; ++feat_it )
198 {
199 bool modified = false;
200 CRef<CSeq_feat> edited;
201 CSeq_feat_Handle fh = feat_it->GetSeq_feat_Handle();
202 if (changed_feats.find(fh) != changed_feats.end())
203 {
204 edited = changed_feats[fh];
205 }
206 else
207 {
208 edited.Reset(new CSeq_feat);
209 edited->Assign(feat_it->GetOriginalFeature());
210 }
211
212 if (edited->IsSetId() && edited->GetId().IsLocal() && edited->GetId().GetLocal().IsId())
213 {
214 int id = id_pairs[edited->GetId().GetLocal().GetId()];
215 edited->SetId().SetLocal().SetId(id);
216 modified = true;
217 }
218 if (edited->IsSetXref())
219 {
220 CSeq_feat::TXref::iterator xref_it = edited->SetXref().begin();
221 while ( xref_it != edited->SetXref().end() )
222 {
223 if ((*xref_it)-> IsSetId() && (*xref_it)->GetId().IsLocal() && (*xref_it)->GetId().GetLocal().IsId())
224 {
225 modified = true;
226 if (id_pairs.find((*xref_it)->GetId().GetLocal().GetId()) != id_pairs.end())
227 {
228 int id = id_pairs[(*xref_it)->GetId().GetLocal().GetId()];
229 (*xref_it)->SetId().SetLocal().SetId(id);
230 }
231 else
232 {
233 (*xref_it)->ResetId();
234 xref_it = edited->SetXref().erase(xref_it);
235 continue;
236 }
237 }
238 ++xref_it;
239 }
240 if (edited->SetXref().empty())
241 edited->ResetXref();
242 }
243 if (modified)
244 {
245 changed_feats[fh] = edited;
246 }
247 }
248 }
249
250 END_NCBI_SCOPE
251