1 /*  $Id: fix_feature_id.cpp 632626 2021-06-03 17:38:42Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data,  the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties,  express or implied,  including
19  *  warranties of performance,  merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Igor Filippov
27  */
28 
29 
30 #include <ncbi_pch.hpp>
31 #include <objects/seqfeat/Feat_id.hpp>
32 #include <objmgr/feat_ci.hpp>
33 #include <objmgr/seq_entry_ci.hpp>
34 #include <objtools/cleanup/fix_feature_id.hpp>
35 #include <unordered_map>
36 
37 BEGIN_NCBI_SCOPE
38 USING_SCOPE(objects);
39 
40 
s_FindHighestFeatureId(const CSeq_entry_Handle & entry)41 CObject_id::TId CFixFeatureId::s_FindHighestFeatureId(const CSeq_entry_Handle& entry)
42 {
43     CObject_id::TId feat_id = 0;
44     for (CFeat_CI feat_it(entry); feat_it; ++feat_it) {
45         if (feat_it->IsSetId()) {
46             const CFeat_id &id = feat_it->GetId();
47             if (id.IsLocal() && id.GetLocal().IsId() && id.GetLocal().GetId() > feat_id) {
48                 feat_id = id.GetLocal().GetId();
49             }
50         }
51     }
52     return feat_id;
53 }
54 
// Advances offset to the next integer that is absent from all three id sets.
// offset is always incremented at least once, so the value passed in is never
// returned even if it is itself unused.
static void FindNextOffset(const unordered_set<int> &existing_ids, const unordered_set<int> &new_existing_ids, const unordered_set<int> &current_ids, int &offset)
{
    const auto is_taken = [&](int candidate) {
        return existing_ids.count(candidate) != 0
            || new_existing_ids.count(candidate) != 0
            || current_ids.count(candidate) != 0;
    };

    for (++offset; is_taken(offset); ++offset) {
        // keep stepping until a free value is reached
    }
}
64 
s_UpdateFeatureIds(const CSeq_entry_Handle & entry,map<CSeq_feat_Handle,CRef<CSeq_feat>> & changed_feats,unordered_set<int> & existing_ids,int & offset)65 void CFixFeatureId::s_UpdateFeatureIds(const CSeq_entry_Handle& entry, map<CSeq_feat_Handle, CRef<CSeq_feat> > &changed_feats, unordered_set<int> &existing_ids, int &offset)
66 {
67     unordered_map<int, int> remapped_ids; // map between old id to new id within the current seq-entry only
68     unordered_set<int> new_existing_ids; // id's which were left unchanged in the current seq-entry
69     unordered_set<int>  current_ids; //  newly created (mapped) ids in the current seq-entry
70     CFeat_CI feat_it(entry);
71     for ( ; feat_it; ++feat_it )
72     {
73         bool modified = false;
74         CRef<CSeq_feat> edited;
75         CSeq_feat_Handle fh = feat_it->GetSeq_feat_Handle();
76         if (changed_feats.find(fh) != changed_feats.end())
77         {
78             edited = changed_feats[fh];
79         }
80         else
81         {
82             edited.Reset(new CSeq_feat);
83             edited->Assign(feat_it->GetOriginalFeature());
84         }
85 
86         if (edited->IsSetId() && edited->GetId().IsLocal() && edited->GetId().GetLocal().IsId())
87         {
88             int id = edited->GetId().GetLocal().GetId();
89             if (existing_ids.find(id) != existing_ids.end() ||
90                 current_ids.find(id) != current_ids.end())  // remap id if it's found in other seq-entries or among newly created ids in the current seq-entry, do not remap existing duplicate ids within the same seq-entry
91             {
92                 auto it = remapped_ids.find(id);
93                 if (it != remapped_ids.end())
94                 {
95                     offset = it->second; // use the same remapped id if a duplicate exists in the current seq-entry and was already remapped
96                 }
97                 else
98                 {
99                     FindNextOffset(existing_ids, new_existing_ids, current_ids, offset); // find id which does not exist among either of the 3 sets
100                     remapped_ids[id] = offset;
101                 }
102                 edited->SetId().SetLocal().SetId(offset);
103                 current_ids.insert(offset);
104                 modified = true;
105             }
106             else
107             {
108                 new_existing_ids.insert(id);
109             }
110         }
111         if (modified)
112         {
113             changed_feats[fh] = edited;
114         }
115     }
116     existing_ids.insert(new_existing_ids.begin(), new_existing_ids.end());
117     existing_ids.insert(current_ids.begin(), current_ids.end());
118     feat_it.Rewind();
119     for ( ; feat_it; ++feat_it )
120     {
121         bool modified = false;
122         CRef<CSeq_feat> edited;
123         CSeq_feat_Handle fh = feat_it->GetSeq_feat_Handle();
124         if (changed_feats.find(fh) != changed_feats.end())
125         {
126             edited = changed_feats[fh];
127         }
128         else
129         {
130             edited.Reset(new CSeq_feat);
131             edited->Assign(feat_it->GetOriginalFeature());
132         }
133 
134         if (edited->IsSetXref())
135         {
136             CSeq_feat::TXref::iterator xref_it = edited->SetXref().begin();
137             while ( xref_it != edited->SetXref().end() )
138             {
139                 if ((*xref_it)-> IsSetId() && (*xref_it)->GetId().IsLocal() && (*xref_it)->GetId().GetLocal().IsId())
140                 {
141                     int id = (*xref_it)->GetId().GetLocal().GetId();
142                     auto it = remapped_ids.find(id);
143                     if (it != remapped_ids.end())
144                     {
145                         (*xref_it)->SetId().SetLocal().SetId(it->second); // remap xrefs if necessary
146                         modified = true;
147                     }
148                 }
149                 ++xref_it;
150             }
151         }
152         if (modified)
153         {
154             changed_feats[fh] = edited;
155         }
156     }
157 }
158 
159 
s_ApplyToSeqInSet(CSeq_entry_Handle tse,map<CSeq_feat_Handle,CRef<CSeq_feat>> & changed_feats)160 void CFixFeatureId::s_ApplyToSeqInSet(CSeq_entry_Handle tse, map<CSeq_feat_Handle, CRef<CSeq_feat> > &changed_feats)
161 {
162     int offset = 0;
163     unordered_set<int> existing_ids;
164     if (tse && tse.IsSet() && tse.GetSet().IsSetClass() && tse.GetSet().GetClass() == CBioseq_set::eClass_genbank)
165     {
166         for(CSeq_entry_CI direct_child_ci( tse.GetSet(), CSeq_entry_CI::eNonRecursive ); direct_child_ci; ++direct_child_ci )
167         {
168             const CSeq_entry_Handle& entry = *direct_child_ci;
169             s_UpdateFeatureIds(entry, changed_feats, existing_ids, offset);
170         }
171     }
172 }
173 
174 // This function maps existing feature ids to the sequential ints - 1,2,3,...
s_MakeIDPairs(const CSeq_entry_Handle & entry,map<int,int> & id_pairs)175 void CFixFeatureId::s_MakeIDPairs(const CSeq_entry_Handle& entry, map<int,int> &id_pairs)
176 {
177     int feat_id = 0;
178     for (CFeat_CI feat_it(entry); feat_it; ++feat_it) {
179         if (feat_it->IsSetId()) {
180             const CFeat_id &id = feat_it->GetId();
181             if (id.IsLocal() && id.GetLocal().IsId() && id_pairs.find(id.GetLocal().GetId()) == id_pairs.end()) {
182                 id_pairs[id.GetLocal().GetId()] = ++feat_id;
183             }
184         }
185     }
186 }
187 
188 // Create a map from the existing feature ids to the sequential ints 1,2,3...
189 // and prepare a map from feature handles to the modified features with the reassigned ids both in the feature id and in the xrefs
s_ReassignFeatureIds(const CSeq_entry_Handle & entry,map<CSeq_feat_Handle,CRef<CSeq_feat>> & changed_feats)190 void CFixFeatureId::s_ReassignFeatureIds(const CSeq_entry_Handle& entry, map<CSeq_feat_Handle, CRef<CSeq_feat> > &changed_feats)
191 {
192     if (!entry)
193         return;
194     map<int,int> id_pairs;
195     CFixFeatureId::s_MakeIDPairs(entry, id_pairs);
196 
197     for ( CFeat_CI feat_it(entry); feat_it; ++feat_it )
198     {
199         bool modified = false;
200         CRef<CSeq_feat> edited;
201         CSeq_feat_Handle fh = feat_it->GetSeq_feat_Handle();
202         if (changed_feats.find(fh) != changed_feats.end())
203         {
204             edited = changed_feats[fh];
205         }
206         else
207         {
208             edited.Reset(new CSeq_feat);
209             edited->Assign(feat_it->GetOriginalFeature());
210         }
211 
212         if (edited->IsSetId() && edited->GetId().IsLocal() && edited->GetId().GetLocal().IsId())
213         {
214             int id = id_pairs[edited->GetId().GetLocal().GetId()];
215             edited->SetId().SetLocal().SetId(id);
216             modified = true;
217         }
218        if (edited->IsSetXref())
219         {
220             CSeq_feat::TXref::iterator xref_it = edited->SetXref().begin();
221             while ( xref_it != edited->SetXref().end() )
222             {
223                 if ((*xref_it)-> IsSetId() && (*xref_it)->GetId().IsLocal() && (*xref_it)->GetId().GetLocal().IsId())
224                 {
225                     modified = true;
226                     if (id_pairs.find((*xref_it)->GetId().GetLocal().GetId()) != id_pairs.end())
227                         {
228                             int id = id_pairs[(*xref_it)->GetId().GetLocal().GetId()];
229                             (*xref_it)->SetId().SetLocal().SetId(id);
230                         }
231                     else
232                         {
233                             (*xref_it)->ResetId();
234                             xref_it = edited->SetXref().erase(xref_it);
235                             continue;
236                         }
237                 }
238                 ++xref_it;
239             }
240             if (edited->SetXref().empty())
241                 edited->ResetXref();
242         }
243        if (modified)
244        {
245            changed_feats[fh] = edited;
246        }
247     }
248 }
249 
250 END_NCBI_SCOPE
251