1 /*  $Id: fix_feature_id.cpp 637425 2021-09-13 13:12:54Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data,  the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties,  express or implied,  including
19  *  warranties of performance,  merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Igor Filippov
27  */
28 
29 
30 #include <ncbi_pch.hpp>
31 #include <objects/seqfeat/Feat_id.hpp>
32 #include <objmgr/feat_ci.hpp>
33 #include <objmgr/seq_entry_ci.hpp>
34 #include <objtools/cleanup/fix_feature_id.hpp>
35 #include <unordered_map>
36 
37 BEGIN_NCBI_SCOPE
38 USING_SCOPE(objects);
39 
40 
s_FindHighestFeatureId(const CSeq_entry_Handle & entry)41 CFixFeatureId::TId CFixFeatureId::s_FindHighestFeatureId(const CSeq_entry_Handle& entry)
42 {
43     TId feat_id = 0;
44     for (CFeat_CI feat_it(entry); feat_it; ++feat_it) {
45         if (feat_it->IsSetId()) {
46             const CFeat_id &id = feat_it->GetId();
47             if (id.IsLocal() && id.GetLocal().IsId() && id.GetLocal().GetId() > feat_id) {
48                 feat_id = id.GetLocal().GetId();
49             }
50         }
51     }
52     return feat_id;
53 }
54 
FindNextOffset(const CFixFeatureId::TIdSet & existing_ids,const CFixFeatureId::TIdSet & new_existing_ids,const CFixFeatureId::TIdSet & current_ids,CFixFeatureId::TId & offset)55 static void FindNextOffset(const CFixFeatureId::TIdSet &existing_ids,
56         const CFixFeatureId::TIdSet &new_existing_ids,
57         const CFixFeatureId::TIdSet &current_ids,
58         CFixFeatureId::TId &offset)
59 {
60     do
61     {
62         ++offset;
63     } while(existing_ids.find(offset) != existing_ids.end() ||
64             new_existing_ids.find(offset) != new_existing_ids.end() ||
65             current_ids.find(offset) != current_ids.end());
66 }
67 
68 
UpdateFeatureIds(CSeq_feat & feat,SFeatIdManager & id_manager)69 bool CFixFeatureId::UpdateFeatureIds(CSeq_feat& feat,
70         SFeatIdManager& id_manager)
71 {
72 
73     auto& existing_ids = id_manager.existing_ids;
74     auto& offset = id_manager.offset;
75     auto& new_ids = id_manager.new_ids;
76     auto& unchanged_ids = id_manager.unchanged_ids;
77     auto& id_map = id_manager.id_map;
78 
79     bool feature_changed = false;
80 
81     if (feat.IsSetId() &&
82         feat.GetId().IsLocal() &&
83         feat.GetId().GetLocal().IsId()) {
84         TId feat_id = feat.GetId().GetLocal().GetId();
85         if (existing_ids.find(feat_id) != existing_ids.end() ||
86             new_ids.find(feat_id) != new_ids.end()) {
87 
88             auto it = id_map.find(feat_id);
89             if (it != id_map.end()) {
90                 feat.SetId().SetLocal().SetId(it->second);
91             }
92             else {
93                 FindNextOffset(existing_ids, unchanged_ids, new_ids, offset);
94                 id_map[feat_id] = offset;
95                 feat.SetId().SetLocal().SetId(offset);
96                 new_ids.insert(offset);
97             }
98             feature_changed = true;
99         }
100         else {
101             unchanged_ids.insert(feat_id);
102         }
103     }
104 
105         // Loop over xrefs goes here ...
106     if (feat.IsSetXref()) {
107         for (auto pXref : feat.SetXref()) {
108             if (pXref->IsSetId() &&
109                 pXref->GetId().IsLocal() &
110                 pXref->GetId().GetLocal().IsId()) {
111                 TId feat_id = pXref->GetId().GetLocal().GetId();
112                 auto it = id_map.find(feat_id);
113                 if (it != id_map.end()) {
114                     pXref->SetId().SetLocal().SetId(it->second);
115                     feature_changed = true;
116                 }
117                 else if (existing_ids.find(feat_id)  != existing_ids.end() ||
118                          new_ids.find(feat_id) != new_ids.end()) {
119                     FindNextOffset(existing_ids, unchanged_ids, new_ids, offset); // find id that does not exist among either of the 3 sets
120                     id_map[feat_id] = offset;
121                     new_ids.insert(offset);
122                     pXref->SetId().SetLocal().SetId(offset);
123                     feature_changed = true;
124                 }
125                 else {
126                     unchanged_ids.insert(feat_id);
127                 }
128             }
129         }
130     }
131     return true;
132 }
133 
134 
UpdateFeatureIds(CSeq_entry_Handle entry_handle,TIdSet & existing_ids,TId & offset)135 bool CFixFeatureId::UpdateFeatureIds(CSeq_entry_Handle entry_handle,
136         TIdSet& existing_ids,
137         TId& offset)
138 {
139     TIdSet new_ids;
140     TIdSet unchanged_ids;
141     unordered_map<TId, TId> id_map;
142     bool any_changes = false;
143 
144     for (CFeat_CI feat_it(entry_handle);
145         feat_it;
146         ++feat_it) {
147 
148         const auto& feat = feat_it->GetOriginalFeature();
149         CRef<CSeq_feat> pUpdatedFeat(new CSeq_feat());
150         pUpdatedFeat->Assign(feat);
151 
152         bool feature_changed = false;
153 
154         if (feat.IsSetId() &&
155             feat.GetId().IsLocal() &&
156             feat.GetId().GetLocal().IsId()) {
157             TId feat_id = feat.GetId().GetLocal().GetId();
158             if (existing_ids.find(feat_id) != existing_ids.end() ||
159                 new_ids.find(feat_id) != new_ids.end()) {
160 
161                 auto it = id_map.find(feat_id);
162                 if (it != id_map.end()) {
163                     pUpdatedFeat->SetId().SetLocal().SetId(it->second);
164                 }
165                 else {
166                     FindNextOffset(existing_ids, unchanged_ids, new_ids, offset); // find id that does not exist among either of the 3 sets
167                     id_map[feat_id] = offset;
168                     pUpdatedFeat->SetId().SetLocal().SetId(offset);
169                     new_ids.insert(offset);
170                 }
171                 feature_changed = true;
172 
173             }
174             else {
175                 unchanged_ids.insert(feat_id);
176             }
177         }
178 
179         // Loop over xrefs goes here ...
180         if (pUpdatedFeat->IsSetXref()) {
181             for (auto pXref : pUpdatedFeat->SetXref()) {
182                 if (pXref->IsSetId() &&
183                     pXref->GetId().IsLocal() &
184                     pXref->GetId().GetLocal().IsId()) {
185                     TId feat_id = pXref->GetId().GetLocal().GetId();
186                     auto it = id_map.find(feat_id);
187                     if (it != id_map.end()) {
188                         pXref->SetId().SetLocal().SetId(it->second);
189                         feature_changed = true;
190                     }
191                     else if (existing_ids.find(feat_id)  != existing_ids.end() ||
192                              new_ids.find(feat_id) != new_ids.end()) {
193                         FindNextOffset(existing_ids, unchanged_ids, new_ids, offset); // find id that does not exist among either of the 3 sets
194                         id_map[feat_id] = offset;
195                         new_ids.insert(offset);
196                         pXref->SetId().SetLocal().SetId(feat_id);
197                         feature_changed = true;
198                     }
199                     else {
200                         unchanged_ids.insert(feat_id);
201                     }
202                 }
203             }
204         }
205 
206         if (feature_changed) {
207             CSeq_feat_EditHandle editHandle(*feat_it);
208             editHandle.Replace(*pUpdatedFeat);
209             any_changes = true;
210         }
211     }
212     // Forgot to update existing ids!!
213     return any_changes;
214 }
215 
s_UpdateFeatureIds(const CSeq_entry_Handle & entry,map<CSeq_feat_Handle,CRef<CSeq_feat>> & changed_feats,TIdSet & existing_ids,TId & offset)216 void CFixFeatureId::s_UpdateFeatureIds(const CSeq_entry_Handle& entry, map<CSeq_feat_Handle, CRef<CSeq_feat> > &changed_feats,
217         TIdSet &existing_ids, TId &offset)
218 {
219     unordered_map<TId, TId> remapped_ids; // map between old id to new id within the current seq-entry only
220     TIdSet new_existing_ids; // id's which were left unchanged in the current seq-entry
221     TIdSet new_ids; //  newly created (mapped) ids in the current seq-entry
222 
223 
224     CFeat_CI feat_it(entry);
225     for ( ; feat_it; ++feat_it )
226     {
227         CSeq_feat_Handle fh = feat_it->GetSeq_feat_Handle();
228         const auto& feat = feat_it->GetOriginalFeature();
229 
230         if (feat.IsSetId() && feat.GetId().IsLocal() && feat.GetId().GetLocal().IsId())
231         {
232             TId id = feat.GetId().GetLocal().GetId();
233             if (existing_ids.find(id) != existing_ids.end() ||
234                 new_ids.find(id) != new_ids.end())
235                 // remap id if it's found in other seq-entries or among newly created ids in the current seq-entry,
236                 // do not remap existing duplicate ids within the same seq-entry
237             {
238                 auto it = remapped_ids.find(id);
239                 if (it != remapped_ids.end())
240                 {
241                     offset = it->second; // use the same remapped id if a duplicate exists in the current seq-entry and was already remapped
242                 }
243                 else
244                 {
245                     FindNextOffset(existing_ids, new_existing_ids, new_ids, offset); // find id which does not exist among either of the 3 sets
246                     remapped_ids[id] = offset;
247                 }
248                 CRef<CSeq_feat> edited(new CSeq_feat());
249                 edited->Assign(feat);
250                 edited->SetId().SetLocal().SetId(offset);
251                 new_ids.insert(offset);
252                 changed_feats[fh] = edited;
253             }
254             else
255             {
256                 new_existing_ids.insert(id);
257             }
258         }
259     }
260     existing_ids.insert(new_existing_ids.begin(), new_existing_ids.end());
261     existing_ids.insert(new_ids.begin(), new_ids.end());
262     feat_it.Rewind();
263     for ( ; feat_it; ++feat_it )
264     {
265         CRef<CSeq_feat> edited;
266         CSeq_feat_Handle fh = feat_it->GetSeq_feat_Handle();
267         if (changed_feats.find(fh) != changed_feats.end())
268         {
269             edited = changed_feats[fh];
270         }
271         else
272         {
273             edited.Reset(new CSeq_feat);
274             edited->Assign(feat_it->GetOriginalFeature());
275         }
276 
277         if (edited->IsSetXref())
278         {
279             bool modified = false;
280             for (auto pXref : edited->SetXref()) {
281                 if (pXref->IsSetId() &&
282                     pXref->GetId().IsLocal() &&
283                     pXref->GetId().GetLocal().IsId()) {
284                     TId id = pXref->GetId().GetLocal().GetId();
285                     auto it = remapped_ids.find(id);
286                     if (it != remapped_ids.end()){
287                         pXref->SetId().SetLocal().SetId(it->second);
288                         modified = true;
289                     }
290                 }
291                 if (modified) {
292                     changed_feats[fh] = edited;
293                 }
294             }
295         }
296     }
297 }
298 
299 
s_ApplyToSeqInSet(CSeq_entry_Handle tse,map<CSeq_feat_Handle,CRef<CSeq_feat>> & changed_feats)300 void CFixFeatureId::s_ApplyToSeqInSet(CSeq_entry_Handle tse, map<CSeq_feat_Handle, CRef<CSeq_feat> > &changed_feats)
301 {
302     TId offset = 0;
303     TIdSet existing_ids;
304     if (tse && tse.IsSet() && tse.GetSet().IsSetClass() && tse.GetSet().GetClass() == CBioseq_set::eClass_genbank)
305     {
306         for(CSeq_entry_CI direct_child_ci( tse.GetSet(), CSeq_entry_CI::eNonRecursive ); direct_child_ci; ++direct_child_ci )
307         {
308             const CSeq_entry_Handle& entry = *direct_child_ci;
309             s_UpdateFeatureIds(entry, changed_feats, existing_ids, offset);
310         }
311     }
312 }
313 
s_ApplyToSeqInSet(CSeq_entry_Handle tse)314 void CFixFeatureId::s_ApplyToSeqInSet(CSeq_entry_Handle tse)
315 {
316     if (tse && tse.IsSet() && tse.GetSet().IsSetClass() && tse.GetSet().GetClass() == CBioseq_set::eClass_genbank)
317     {
318         TId offset = 0;
319         TIdSet existing_ids;
320         for(CSeq_entry_CI direct_child_ci( tse.GetSet(), CSeq_entry_CI::eNonRecursive ); direct_child_ci; ++direct_child_ci )
321         {
322             TFeatMap changed_feats;
323             const CSeq_entry_Handle& entry = *direct_child_ci;
324             s_UpdateFeatureIds(entry, changed_feats, existing_ids, offset);
325             for (auto entry : changed_feats)
326             {
327                 auto orig_feat = entry.first;
328                 auto new_feat = entry.second;
329                 CSeq_feat_EditHandle feh(orig_feat);
330                 feh.Replace(*new_feat);
331             }
332         }
333     }
334 }
335 
336 
337 // This function maps existing feature ids to the sequential ints - 1,2,3,...
s_MakeIDPairs(const CSeq_entry_Handle & entry,map<TId,TId> & id_pairs)338 void CFixFeatureId::s_MakeIDPairs(const CSeq_entry_Handle& entry, map<TId,TId> &id_pairs)
339 {
340     TId feat_id = 0;
341     for (CFeat_CI feat_it(entry); feat_it; ++feat_it) {
342         if (feat_it->IsSetId()) {
343             const CFeat_id &id = feat_it->GetId();
344             if (id.IsLocal() && id.GetLocal().IsId() && id_pairs.find(id.GetLocal().GetId()) == id_pairs.end()) {
345                 id_pairs[id.GetLocal().GetId()] = ++feat_id;
346             }
347         }
348     }
349 }
350 
351 // Create a map from the existing feature ids to the sequential ints 1,2,3...
352 // and prepare a map from feature handles to the modified features with the reassigned ids both in the feature id and in the xrefs
s_ReassignFeatureIds(const CSeq_entry_Handle & entry,map<CSeq_feat_Handle,CRef<CSeq_feat>> & changed_feats)353 void CFixFeatureId::s_ReassignFeatureIds(const CSeq_entry_Handle& entry, map<CSeq_feat_Handle, CRef<CSeq_feat> > &changed_feats)
354 {
355     if (!entry)
356         return;
357     map<TId,TId> id_pairs;
358     CFixFeatureId::s_MakeIDPairs(entry, id_pairs);
359 
360     for ( CFeat_CI feat_it(entry); feat_it; ++feat_it )
361     {
362         bool modified = false;
363         CRef<CSeq_feat> edited;
364         CSeq_feat_Handle fh = feat_it->GetSeq_feat_Handle();
365         if (changed_feats.find(fh) != changed_feats.end())
366         {
367             edited = changed_feats[fh];
368         }
369         else
370         {
371             edited.Reset(new CSeq_feat);
372             edited->Assign(feat_it->GetOriginalFeature());
373         }
374 
375         if (edited->IsSetId() && edited->GetId().IsLocal() && edited->GetId().GetLocal().IsId())
376         {
377             TId id = id_pairs[edited->GetId().GetLocal().GetId()];
378             edited->SetId().SetLocal().SetId(id);
379             modified = true;
380         }
381        if (edited->IsSetXref())
382         {
383             CSeq_feat::TXref::iterator xref_it = edited->SetXref().begin();
384             while ( xref_it != edited->SetXref().end() )
385             {
386                 if ((*xref_it)-> IsSetId() && (*xref_it)->GetId().IsLocal() && (*xref_it)->GetId().GetLocal().IsId())
387                 {
388                     modified = true;
389                     if (id_pairs.find((*xref_it)->GetId().GetLocal().GetId()) != id_pairs.end())
390                         {
391                             TId id = id_pairs[(*xref_it)->GetId().GetLocal().GetId()];
392                             (*xref_it)->SetId().SetLocal().SetId(id);
393                         }
394                     else
395                         {
396                             (*xref_it)->ResetId();
397                             xref_it = edited->SetXref().erase(xref_it);
398                             continue;
399                         }
400                 }
401                 ++xref_it;
402             }
403             if (edited->SetXref().empty())
404                 edited->ResetXref();
405         }
406        if (modified)
407        {
408            changed_feats[fh] = edited;
409        }
410     }
411 }
412 
413 END_NCBI_SCOPE
414