1 /* $Id: fix_feature_id.cpp 637425 2021-09-13 13:12:54Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Igor Filippov
27 */
28
29
30 #include <ncbi_pch.hpp>
31 #include <objects/seqfeat/Feat_id.hpp>
32 #include <objmgr/feat_ci.hpp>
33 #include <objmgr/seq_entry_ci.hpp>
34 #include <objtools/cleanup/fix_feature_id.hpp>
35 #include <unordered_map>
36
37 BEGIN_NCBI_SCOPE
38 USING_SCOPE(objects);
39
40
s_FindHighestFeatureId(const CSeq_entry_Handle & entry)41 CFixFeatureId::TId CFixFeatureId::s_FindHighestFeatureId(const CSeq_entry_Handle& entry)
42 {
43 TId feat_id = 0;
44 for (CFeat_CI feat_it(entry); feat_it; ++feat_it) {
45 if (feat_it->IsSetId()) {
46 const CFeat_id &id = feat_it->GetId();
47 if (id.IsLocal() && id.GetLocal().IsId() && id.GetLocal().GetId() > feat_id) {
48 feat_id = id.GetLocal().GetId();
49 }
50 }
51 }
52 return feat_id;
53 }
54
FindNextOffset(const CFixFeatureId::TIdSet & existing_ids,const CFixFeatureId::TIdSet & new_existing_ids,const CFixFeatureId::TIdSet & current_ids,CFixFeatureId::TId & offset)55 static void FindNextOffset(const CFixFeatureId::TIdSet &existing_ids,
56 const CFixFeatureId::TIdSet &new_existing_ids,
57 const CFixFeatureId::TIdSet ¤t_ids,
58 CFixFeatureId::TId &offset)
59 {
60 do
61 {
62 ++offset;
63 } while(existing_ids.find(offset) != existing_ids.end() ||
64 new_existing_ids.find(offset) != new_existing_ids.end() ||
65 current_ids.find(offset) != current_ids.end());
66 }
67
68
UpdateFeatureIds(CSeq_feat & feat,SFeatIdManager & id_manager)69 bool CFixFeatureId::UpdateFeatureIds(CSeq_feat& feat,
70 SFeatIdManager& id_manager)
71 {
72
73 auto& existing_ids = id_manager.existing_ids;
74 auto& offset = id_manager.offset;
75 auto& new_ids = id_manager.new_ids;
76 auto& unchanged_ids = id_manager.unchanged_ids;
77 auto& id_map = id_manager.id_map;
78
79 bool feature_changed = false;
80
81 if (feat.IsSetId() &&
82 feat.GetId().IsLocal() &&
83 feat.GetId().GetLocal().IsId()) {
84 TId feat_id = feat.GetId().GetLocal().GetId();
85 if (existing_ids.find(feat_id) != existing_ids.end() ||
86 new_ids.find(feat_id) != new_ids.end()) {
87
88 auto it = id_map.find(feat_id);
89 if (it != id_map.end()) {
90 feat.SetId().SetLocal().SetId(it->second);
91 }
92 else {
93 FindNextOffset(existing_ids, unchanged_ids, new_ids, offset);
94 id_map[feat_id] = offset;
95 feat.SetId().SetLocal().SetId(offset);
96 new_ids.insert(offset);
97 }
98 feature_changed = true;
99 }
100 else {
101 unchanged_ids.insert(feat_id);
102 }
103 }
104
105 // Loop over xrefs goes here ...
106 if (feat.IsSetXref()) {
107 for (auto pXref : feat.SetXref()) {
108 if (pXref->IsSetId() &&
109 pXref->GetId().IsLocal() &
110 pXref->GetId().GetLocal().IsId()) {
111 TId feat_id = pXref->GetId().GetLocal().GetId();
112 auto it = id_map.find(feat_id);
113 if (it != id_map.end()) {
114 pXref->SetId().SetLocal().SetId(it->second);
115 feature_changed = true;
116 }
117 else if (existing_ids.find(feat_id) != existing_ids.end() ||
118 new_ids.find(feat_id) != new_ids.end()) {
119 FindNextOffset(existing_ids, unchanged_ids, new_ids, offset); // find id that does not exist among either of the 3 sets
120 id_map[feat_id] = offset;
121 new_ids.insert(offset);
122 pXref->SetId().SetLocal().SetId(offset);
123 feature_changed = true;
124 }
125 else {
126 unchanged_ids.insert(feat_id);
127 }
128 }
129 }
130 }
131 return true;
132 }
133
134
UpdateFeatureIds(CSeq_entry_Handle entry_handle,TIdSet & existing_ids,TId & offset)135 bool CFixFeatureId::UpdateFeatureIds(CSeq_entry_Handle entry_handle,
136 TIdSet& existing_ids,
137 TId& offset)
138 {
139 TIdSet new_ids;
140 TIdSet unchanged_ids;
141 unordered_map<TId, TId> id_map;
142 bool any_changes = false;
143
144 for (CFeat_CI feat_it(entry_handle);
145 feat_it;
146 ++feat_it) {
147
148 const auto& feat = feat_it->GetOriginalFeature();
149 CRef<CSeq_feat> pUpdatedFeat(new CSeq_feat());
150 pUpdatedFeat->Assign(feat);
151
152 bool feature_changed = false;
153
154 if (feat.IsSetId() &&
155 feat.GetId().IsLocal() &&
156 feat.GetId().GetLocal().IsId()) {
157 TId feat_id = feat.GetId().GetLocal().GetId();
158 if (existing_ids.find(feat_id) != existing_ids.end() ||
159 new_ids.find(feat_id) != new_ids.end()) {
160
161 auto it = id_map.find(feat_id);
162 if (it != id_map.end()) {
163 pUpdatedFeat->SetId().SetLocal().SetId(it->second);
164 }
165 else {
166 FindNextOffset(existing_ids, unchanged_ids, new_ids, offset); // find id that does not exist among either of the 3 sets
167 id_map[feat_id] = offset;
168 pUpdatedFeat->SetId().SetLocal().SetId(offset);
169 new_ids.insert(offset);
170 }
171 feature_changed = true;
172
173 }
174 else {
175 unchanged_ids.insert(feat_id);
176 }
177 }
178
179 // Loop over xrefs goes here ...
180 if (pUpdatedFeat->IsSetXref()) {
181 for (auto pXref : pUpdatedFeat->SetXref()) {
182 if (pXref->IsSetId() &&
183 pXref->GetId().IsLocal() &
184 pXref->GetId().GetLocal().IsId()) {
185 TId feat_id = pXref->GetId().GetLocal().GetId();
186 auto it = id_map.find(feat_id);
187 if (it != id_map.end()) {
188 pXref->SetId().SetLocal().SetId(it->second);
189 feature_changed = true;
190 }
191 else if (existing_ids.find(feat_id) != existing_ids.end() ||
192 new_ids.find(feat_id) != new_ids.end()) {
193 FindNextOffset(existing_ids, unchanged_ids, new_ids, offset); // find id that does not exist among either of the 3 sets
194 id_map[feat_id] = offset;
195 new_ids.insert(offset);
196 pXref->SetId().SetLocal().SetId(feat_id);
197 feature_changed = true;
198 }
199 else {
200 unchanged_ids.insert(feat_id);
201 }
202 }
203 }
204 }
205
206 if (feature_changed) {
207 CSeq_feat_EditHandle editHandle(*feat_it);
208 editHandle.Replace(*pUpdatedFeat);
209 any_changes = true;
210 }
211 }
212 // Forgot to update existing ids!!
213 return any_changes;
214 }
215
s_UpdateFeatureIds(const CSeq_entry_Handle & entry,map<CSeq_feat_Handle,CRef<CSeq_feat>> & changed_feats,TIdSet & existing_ids,TId & offset)216 void CFixFeatureId::s_UpdateFeatureIds(const CSeq_entry_Handle& entry, map<CSeq_feat_Handle, CRef<CSeq_feat> > &changed_feats,
217 TIdSet &existing_ids, TId &offset)
218 {
219 unordered_map<TId, TId> remapped_ids; // map between old id to new id within the current seq-entry only
220 TIdSet new_existing_ids; // id's which were left unchanged in the current seq-entry
221 TIdSet new_ids; // newly created (mapped) ids in the current seq-entry
222
223
224 CFeat_CI feat_it(entry);
225 for ( ; feat_it; ++feat_it )
226 {
227 CSeq_feat_Handle fh = feat_it->GetSeq_feat_Handle();
228 const auto& feat = feat_it->GetOriginalFeature();
229
230 if (feat.IsSetId() && feat.GetId().IsLocal() && feat.GetId().GetLocal().IsId())
231 {
232 TId id = feat.GetId().GetLocal().GetId();
233 if (existing_ids.find(id) != existing_ids.end() ||
234 new_ids.find(id) != new_ids.end())
235 // remap id if it's found in other seq-entries or among newly created ids in the current seq-entry,
236 // do not remap existing duplicate ids within the same seq-entry
237 {
238 auto it = remapped_ids.find(id);
239 if (it != remapped_ids.end())
240 {
241 offset = it->second; // use the same remapped id if a duplicate exists in the current seq-entry and was already remapped
242 }
243 else
244 {
245 FindNextOffset(existing_ids, new_existing_ids, new_ids, offset); // find id which does not exist among either of the 3 sets
246 remapped_ids[id] = offset;
247 }
248 CRef<CSeq_feat> edited(new CSeq_feat());
249 edited->Assign(feat);
250 edited->SetId().SetLocal().SetId(offset);
251 new_ids.insert(offset);
252 changed_feats[fh] = edited;
253 }
254 else
255 {
256 new_existing_ids.insert(id);
257 }
258 }
259 }
260 existing_ids.insert(new_existing_ids.begin(), new_existing_ids.end());
261 existing_ids.insert(new_ids.begin(), new_ids.end());
262 feat_it.Rewind();
263 for ( ; feat_it; ++feat_it )
264 {
265 CRef<CSeq_feat> edited;
266 CSeq_feat_Handle fh = feat_it->GetSeq_feat_Handle();
267 if (changed_feats.find(fh) != changed_feats.end())
268 {
269 edited = changed_feats[fh];
270 }
271 else
272 {
273 edited.Reset(new CSeq_feat);
274 edited->Assign(feat_it->GetOriginalFeature());
275 }
276
277 if (edited->IsSetXref())
278 {
279 bool modified = false;
280 for (auto pXref : edited->SetXref()) {
281 if (pXref->IsSetId() &&
282 pXref->GetId().IsLocal() &&
283 pXref->GetId().GetLocal().IsId()) {
284 TId id = pXref->GetId().GetLocal().GetId();
285 auto it = remapped_ids.find(id);
286 if (it != remapped_ids.end()){
287 pXref->SetId().SetLocal().SetId(it->second);
288 modified = true;
289 }
290 }
291 if (modified) {
292 changed_feats[fh] = edited;
293 }
294 }
295 }
296 }
297 }
298
299
s_ApplyToSeqInSet(CSeq_entry_Handle tse,map<CSeq_feat_Handle,CRef<CSeq_feat>> & changed_feats)300 void CFixFeatureId::s_ApplyToSeqInSet(CSeq_entry_Handle tse, map<CSeq_feat_Handle, CRef<CSeq_feat> > &changed_feats)
301 {
302 TId offset = 0;
303 TIdSet existing_ids;
304 if (tse && tse.IsSet() && tse.GetSet().IsSetClass() && tse.GetSet().GetClass() == CBioseq_set::eClass_genbank)
305 {
306 for(CSeq_entry_CI direct_child_ci( tse.GetSet(), CSeq_entry_CI::eNonRecursive ); direct_child_ci; ++direct_child_ci )
307 {
308 const CSeq_entry_Handle& entry = *direct_child_ci;
309 s_UpdateFeatureIds(entry, changed_feats, existing_ids, offset);
310 }
311 }
312 }
313
s_ApplyToSeqInSet(CSeq_entry_Handle tse)314 void CFixFeatureId::s_ApplyToSeqInSet(CSeq_entry_Handle tse)
315 {
316 if (tse && tse.IsSet() && tse.GetSet().IsSetClass() && tse.GetSet().GetClass() == CBioseq_set::eClass_genbank)
317 {
318 TId offset = 0;
319 TIdSet existing_ids;
320 for(CSeq_entry_CI direct_child_ci( tse.GetSet(), CSeq_entry_CI::eNonRecursive ); direct_child_ci; ++direct_child_ci )
321 {
322 TFeatMap changed_feats;
323 const CSeq_entry_Handle& entry = *direct_child_ci;
324 s_UpdateFeatureIds(entry, changed_feats, existing_ids, offset);
325 for (auto entry : changed_feats)
326 {
327 auto orig_feat = entry.first;
328 auto new_feat = entry.second;
329 CSeq_feat_EditHandle feh(orig_feat);
330 feh.Replace(*new_feat);
331 }
332 }
333 }
334 }
335
336
337 // This function maps existing feature ids to the sequential ints - 1,2,3,...
s_MakeIDPairs(const CSeq_entry_Handle & entry,map<TId,TId> & id_pairs)338 void CFixFeatureId::s_MakeIDPairs(const CSeq_entry_Handle& entry, map<TId,TId> &id_pairs)
339 {
340 TId feat_id = 0;
341 for (CFeat_CI feat_it(entry); feat_it; ++feat_it) {
342 if (feat_it->IsSetId()) {
343 const CFeat_id &id = feat_it->GetId();
344 if (id.IsLocal() && id.GetLocal().IsId() && id_pairs.find(id.GetLocal().GetId()) == id_pairs.end()) {
345 id_pairs[id.GetLocal().GetId()] = ++feat_id;
346 }
347 }
348 }
349 }
350
351 // Create a map from the existing feature ids to the sequential ints 1,2,3...
352 // and prepare a map from feature handles to the modified features with the reassigned ids both in the feature id and in the xrefs
s_ReassignFeatureIds(const CSeq_entry_Handle & entry,map<CSeq_feat_Handle,CRef<CSeq_feat>> & changed_feats)353 void CFixFeatureId::s_ReassignFeatureIds(const CSeq_entry_Handle& entry, map<CSeq_feat_Handle, CRef<CSeq_feat> > &changed_feats)
354 {
355 if (!entry)
356 return;
357 map<TId,TId> id_pairs;
358 CFixFeatureId::s_MakeIDPairs(entry, id_pairs);
359
360 for ( CFeat_CI feat_it(entry); feat_it; ++feat_it )
361 {
362 bool modified = false;
363 CRef<CSeq_feat> edited;
364 CSeq_feat_Handle fh = feat_it->GetSeq_feat_Handle();
365 if (changed_feats.find(fh) != changed_feats.end())
366 {
367 edited = changed_feats[fh];
368 }
369 else
370 {
371 edited.Reset(new CSeq_feat);
372 edited->Assign(feat_it->GetOriginalFeature());
373 }
374
375 if (edited->IsSetId() && edited->GetId().IsLocal() && edited->GetId().GetLocal().IsId())
376 {
377 TId id = id_pairs[edited->GetId().GetLocal().GetId()];
378 edited->SetId().SetLocal().SetId(id);
379 modified = true;
380 }
381 if (edited->IsSetXref())
382 {
383 CSeq_feat::TXref::iterator xref_it = edited->SetXref().begin();
384 while ( xref_it != edited->SetXref().end() )
385 {
386 if ((*xref_it)-> IsSetId() && (*xref_it)->GetId().IsLocal() && (*xref_it)->GetId().GetLocal().IsId())
387 {
388 modified = true;
389 if (id_pairs.find((*xref_it)->GetId().GetLocal().GetId()) != id_pairs.end())
390 {
391 TId id = id_pairs[(*xref_it)->GetId().GetLocal().GetId()];
392 (*xref_it)->SetId().SetLocal().SetId(id);
393 }
394 else
395 {
396 (*xref_it)->ResetId();
397 xref_it = edited->SetXref().erase(xref_it);
398 continue;
399 }
400 }
401 ++xref_it;
402 }
403 if (edited->SetXref().empty())
404 edited->ResetXref();
405 }
406 if (modified)
407 {
408 changed_feats[fh] = edited;
409 }
410 }
411 }
412
413 END_NCBI_SCOPE
414