1 /* $Id: molecule_identifier.cpp 600628 2020-01-23 19:40:16Z wangjiy $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Paul Thiessen
27 *
28 * File Description:
29 * Class to hold, and factory to generate, general
30 * (instance-independent) identifier for any molecule
31 *
32 * ===========================================================================
33 */
34
35 #include <ncbi_pch.hpp>
36 #include <corelib/ncbistre.hpp>
37 #include <corelib/ncbistl.hpp>
38
39 #include <objects/seqloc/Seq_id.hpp>
40 #include <objects/seqloc/PDB_seq_id.hpp>
41 #include <objects/general/Object_id.hpp>
42 #include <objects/seqloc/Textseq_id.hpp>
43 #include <objects/seqloc/PDB_mol_id.hpp>
44
45 #include "remove_header_conflicts.hpp"
46
47 #include "molecule_identifier.hpp"
48 #include "structure_set.hpp"
49 #include "molecule.hpp"
50 #include "sequence_set.hpp"
51 #include "cn3d_tools.hpp"
52
53 USING_NCBI_SCOPE;
54
55 BEGIN_SCOPE(Cn3D)
56 USING_SCOPE(objects);
57
58 // there is one (global) list of molecule identifiers
59
60 typedef list < MoleculeIdentifier > MoleculeIdentifierList;
61 static MoleculeIdentifierList knownIdentifiers;
62
63 const int MoleculeIdentifier::VALUE_NOT_SET = -1;
64
GetIdentifier(const Molecule * molecule,const SeqIdList & ids)65 const MoleculeIdentifier * MoleculeIdentifier::GetIdentifier(const Molecule *molecule, const SeqIdList& ids)
66 {
67 const StructureObject *object;
68 if (!molecule->GetParentOfType(&object)) return NULL;
69
70 // get or create identifer
71 MoleculeIdentifier *identifier = ((ids.size() > 0) ? GetIdentifier(ids) : GetIdentifier(object->mmdbID, molecule->id));
72 if (!identifier)
73 return NULL;
74
75 // check/assign mmdb id
76 if (object->mmdbID != StructureObject::NO_MMDB_ID) {
77 if ((identifier->mmdbID != VALUE_NOT_SET && identifier->mmdbID != object->mmdbID) ||
78 (identifier->moleculeID != VALUE_NOT_SET && identifier->moleculeID != molecule->id)) {
79 ERRORMSG("MoleculeIdentifier::GetIdentifier() - mmdb/molecule ID mismatch for " << identifier->ToString());
80 } else {
81 identifier->mmdbID = object->mmdbID;
82 identifier->moleculeID = molecule->id;
83 }
84 }
85
86 // check/assign #residues
87 if (identifier->nResidues == 0)
88 identifier->nResidues = molecule->residues.size();
89 else if (identifier->nResidues != molecule->residues.size())
90 ERRORMSG("# residue mismatch in molecule identifier for " << identifier->ToString());
91
92 // check/assign pdb id
93 #ifdef _STRUCTURE_USE_LONG_PDB_CHAINS_
94 string name = molecule->name;
95 if (identifier->pdbID.size() == 0 && identifier->pdbChain.empty()) {
96 identifier->pdbID = object->GetPDBID();
97 identifier->pdbChain = name;
98 } else if (identifier->pdbID != object->GetPDBID() || identifier->pdbChain != name)
99 ERRORMSG("PDB ID mismatch in molecule identifier for " << identifier->ToString());
100 #else
101 int name = ((molecule->name.size() == 1) ? molecule->name[0] : MoleculeIdentifier::VALUE_NOT_SET);
102 if (identifier->pdbID.size() == 0 && identifier->pdbChain == MoleculeIdentifier::VALUE_NOT_SET) {
103 identifier->pdbID = object->GetPDBID();
104 identifier->pdbChain = name;
105 } else if (identifier->pdbID != object->GetPDBID() || identifier->pdbChain != name)
106 ERRORMSG("PDB ID mismatch in molecule identifier for " << identifier->ToString());
107 #endif
108
109 return identifier;
110 }
111
GetIdentifier(const Sequence * sequence,int mmdbID,const SeqIdList & ids)112 const MoleculeIdentifier * MoleculeIdentifier::GetIdentifier(const Sequence *sequence, int mmdbID, const SeqIdList& ids)
113 {
114 // get or create identifer
115 MoleculeIdentifier *identifier = GetIdentifier(ids);
116 if (!identifier)
117 return NULL;
118
119 // check/assign mmdb id
120 if (mmdbID != VALUE_NOT_SET) {
121 if (identifier->mmdbID != VALUE_NOT_SET) {
122 if (identifier->mmdbID != mmdbID)
123 ERRORMSG("MoleculeIdentifier::GetIdentifier() - mmdbID mismatch for " << identifier->ToString());
124 } else {
125 identifier->mmdbID = mmdbID;
126 }
127 }
128
129 // check/assign length
130 if (identifier->nResidues == 0)
131 identifier->nResidues = sequence->Length();
132 else if (identifier->nResidues != sequence->Length())
133 ERRORMSG("Length mismatch in sequence identifier for " << identifier->ToString());
134
135 return identifier;
136 }
137
GetIdentifier(const SeqIdList & ids)138 MoleculeIdentifier * MoleculeIdentifier::GetIdentifier(const SeqIdList& ids)
139 {
140 // first check known identifiers to see if there's a match, and posibly merge in new ids
141 MoleculeIdentifierList::iterator k, ke = knownIdentifiers.end();
142 for (k=knownIdentifiers.begin(); k!=ke; ++k) {
143
144 // for each known, compare lists of Seq-ids, looking for matches and mismatches
145 SeqIdList newIDs;
146 vector < string > matches, mismatches;
147 bool mismatchGIonly = false;
148 SeqIdList::const_iterator o, oe = k->seqIDs.end(), n, ne = ids.end();
149 for (n=ids.begin(); n!=ne; ++n) {
150
151 // does the new (incoming) Seq-id (mis)match any old (existing) Seq-id?
152 bool foundMatch = false, foundMismatch = false;
153 for (o=k->seqIDs.begin(); o!=oe; ++o) {
154 switch ((*o)->Compare(**n)) {
155 case CSeq_id::e_DIFF: // different types, can't compare; do nothing
156 break;
157 case CSeq_id::e_NO: // same type but different id -> mismatch
158 mismatches.push_back((*o)->GetSeqIdString() + " != " + (*n)->GetSeqIdString());
159 foundMismatch = true;
160 if (mismatches.size() == 1) {
161 if ((*n)->IsGi())
162 mismatchGIonly = true;
163 } else {
164 mismatchGIonly = false;
165 }
166 break;
167 case CSeq_id::e_YES: // same type and same id -> match
168 matches.push_back((*o)->GetSeqIdString() + " == " + (*n)->GetSeqIdString());
169 foundMatch = true;
170 break;
171 default:
172 ERRORMSG("Problem comparing Seq-ids " << (*o)->GetSeqIdString() << " and " << (*n)->GetSeqIdString());
173 continue;
174 }
175 }
176
177 // if no match or mismatch is found, this is a potential new id for this known identifier
178 if (!foundMatch && !foundMismatch)
179 newIDs.push_back(*n);
180 }
181
182 // if we have matches and no mismatches, then we've found the identifier; merge in any new ids
183 if (matches.size() > 0 && mismatches.size() == 0) {
184 if (newIDs.size() > 0)
185 k->AddFields(newIDs);
186 return &(*k);
187 }
188
189 // if we have matches *and* mismatches then there's a problem
190 if (matches.size() > 0 && mismatches.size() > 0) {
191
192 // special case: gi (only) is different but something else (presumably an accession) is the same, then
193 // warn about possibly outdated gi; don't merge in new ids
194 if (mismatchGIonly) {
195 ERRORMSG("GetIdentifier(): incoming Seq-id list has a GI mismatch ("
196 << mismatches.front() << ") with sequence " << k->seqIDs.front()->GetSeqIdString()
197 << " but otherwise matches (" << matches.front()
198 << "); please update outdated GI(s) for this sequence");
199 return &(*k);
200 }
201
202 // otherwise, error
203 ERRORMSG("GetIdentifier(): incoming Seq-id list has match(es) ("
204 << matches.front() << ") and mismatch(es) ("
205 << mismatches.front() << ") with identifier " << k->ToString());
206 return NULL;
207 }
208 }
209
210 // if we get here, then this is a new sequence
211 knownIdentifiers.resize(knownIdentifiers.size() + 1, MoleculeIdentifier());
212 MoleculeIdentifier *identifier = &(knownIdentifiers.back());
213 identifier->AddFields(ids);
214 return identifier;
215 }
216
GetIdentifier(int mmdbID,int moleculeID)217 MoleculeIdentifier * MoleculeIdentifier::GetIdentifier(int mmdbID, int moleculeID)
218 {
219 // first check known identifiers to see if there's a match, and posibly merge in new ids
220 MoleculeIdentifierList::iterator k, ke = knownIdentifiers.end();
221 for (k=knownIdentifiers.begin(); k!=ke; ++k) {
222 if (k->mmdbID == mmdbID && k->moleculeID == moleculeID)
223 return &(*k);
224 }
225
226 // if we get here, then this is a new sequence
227 knownIdentifiers.resize(knownIdentifiers.size() + 1, MoleculeIdentifier());
228 MoleculeIdentifier *identifier = &(knownIdentifiers.back());
229 identifier->mmdbID = mmdbID;
230 identifier->moleculeID = moleculeID;
231 return identifier;
232 }
233
AddFields(const SeqIdList & ids)234 void MoleculeIdentifier::AddFields(const SeqIdList& ids)
235 {
236 // save these ids (should already know that the new ids don't overlap any existing ones)
237 seqIDs.insert(seqIDs.end(), ids.begin(), ids.end());
238
239 #ifdef _STRUCTURE_USE_LONG_PDB_CHAINS_
240 bool bPdbChainNotSet = pdbChain.empty();
241 #else
242 bool bPdbChainNotSet = (pdbChain == VALUE_NOT_SET);
243 #endif
244
245 SeqIdList::const_iterator n, ne = ids.end();
246 for (n=ids.begin(); n!=ne; ++n) {
247
248 // pdb
249 if ((*n)->IsPdb()) {
250 string newID = (*n)->GetPdb().GetMol();
251
252 #ifdef _STRUCTURE_USE_LONG_PDB_CHAINS_
253 if (pdbID.size() == 0 && pdbChain.empty()) {
254 pdbID = newID;
255 pdbChain = (*n)->GetPdb().GetEffectiveChain_id();
256 } else if (pdbID != newID || pdbChain != (*n)->GetPdb().GetEffectiveChain_id()) {
257 // special case: for merged structures with multiple pdb ids, allow match to a sequence from a single specific pdb id
258 if (pdbID.size() > 4 && pdbChain == (*n)->GetPdb().GetEffectiveChain_id() && NStr::Find(pdbID, newID) != NPOS)
259 pdbID = newID;
260 else
261 ERRORMSG("AddFields(): identifier conflict, already has pdb ID '" << pdbID << "_" << pdbChain << "'");
262 }
263 #else
264 if (pdbID.size() == 0 && pdbChain == VALUE_NOT_SET) {
265 pdbID = newID;
266 pdbChain = (*n)->GetPdb().GetChain();
267 } else if (pdbID != newID || pdbChain != (*n)->GetPdb().GetChain()) {
268 // special case: for merged structures with multiple pdb ids, allow match to a sequence from a single specific pdb id
269 if (pdbID.size() > 4 && pdbChain == (*n)->GetPdb().GetChain() && NStr::Find(pdbID, newID) != NPOS)
270 pdbID = newID;
271 else
272 ERRORMSG("AddFields(): identifier conflict, already has pdb ID '" << pdbID << "_" << ((char) pdbChain) << "'");
273 }
274 #endif
275 }
276
277 // gi
278 else if ((*n)->IsGi()) {
279 if (gi == VALUE_NOT_SET)
280 gi = (*n)->GetGi();
281 else if (gi != (*n)->GetGi())
282 ERRORMSG("AddFields(): identifier conflict: already has gi " << gi);
283 }
284
285 // special case where local accession is actually a PDB identifier + chain + extra stuff,
286 // separated by spaces: of the format '1ABC X ...' where X can be a chain alphanum character or space
287 //else if (pdbID.size() == 0 && pdbChain == VALUE_NOT_SET &&
288 else if (pdbID.size() == 0 && bPdbChainNotSet &&
289 (*n)->IsLocal() && (*n)->GetLocal().IsStr() &&
290 (*n)->GetLocal().GetStr().size() >= 7 && (*n)->GetLocal().GetStr()[4] == ' ' &&
291 (*n)->GetLocal().GetStr()[6] == ' ' &&
292 (isalnum((unsigned char) (*n)->GetLocal().GetStr()[5]) || (*n)->GetLocal().GetStr()[5] == ' '))
293 {
294 pdbID = (*n)->GetLocal().GetStr().substr(0, 4);
295 #ifdef _STRUCTURE_USE_LONG_PDB_CHAINS_
296 string tmpStr(1, (*n)->GetLocal().GetStr()[5]);
297 pdbChain = tmpStr;
298 #else
299 pdbChain = (*n)->GetLocal().GetStr()[5];
300 #endif
301 }
302 }
303 }
304
FindIdentifier(int mmdbID,int moleculeID)305 const MoleculeIdentifier * MoleculeIdentifier::FindIdentifier(int mmdbID, int moleculeID)
306 {
307 const MoleculeIdentifier *identifier = NULL;
308 MoleculeIdentifierList::const_iterator i, ie = knownIdentifiers.end();
309 for (i=knownIdentifiers.begin(); i!=ie; ++i) {
310 if (mmdbID == i->mmdbID && moleculeID == i->moleculeID) {
311 identifier = &(*i);
312 break;
313 }
314 }
315 return identifier;
316 }
317
ClearIdentifiers(void)318 void MoleculeIdentifier::ClearIdentifiers(void)
319 {
320 knownIdentifiers.clear();
321 }
322
MatchesSeqId(const ncbi::objects::CSeq_id & sid) const323 bool MoleculeIdentifier::MatchesSeqId(const ncbi::objects::CSeq_id& sid) const
324 {
325 SeqIdList::const_iterator i, ie = seqIDs.end();
326 for (i=seqIDs.begin(); i!=ie; ++i)
327 if (sid.Match(**i))
328 return true;
329
330 return false;
331 }
332
CompareIdentifiers(const MoleculeIdentifier * a,const MoleculeIdentifier * b)333 bool MoleculeIdentifier::CompareIdentifiers(const MoleculeIdentifier *a, const MoleculeIdentifier *b)
334 {
335 // identifier sort - float sequences with PDB id's to the top, then gi's, then accessions
336 if (a->pdbID.size() > 0) {
337 if (b->pdbID.size() > 0) {
338 if (a->pdbID < b->pdbID)
339 return true;
340 else if (a->pdbID > b->pdbID)
341 return false;
342 else {
343 #ifdef _STRUCTURE_USE_LONG_PDB_CHAINS_
344 return (a->pdbChain.compare(b->pdbChain) < 0);
345 #else
346 return (a->pdbChain < b->pdbChain);
347 #endif
348 }
349 } else
350 return true;
351 }
352
353 else if (a->gi != VALUE_NOT_SET) {
354 if (b->pdbID.size() > 0)
355 return false;
356 else if (b->gi != VALUE_NOT_SET)
357 return (a->gi < b->gi);
358 else
359 return true;
360 }
361
362 else if (b->pdbID.size() > 0 || b->gi != VALUE_NOT_SET)
363 return false;
364
365 else if (a->seqIDs.size() > 0 && b->seqIDs.size() > 0)
366 return (a->seqIDs.front()->GetSeqIdString() < b->seqIDs.front()->GetSeqIdString());
367
368 ERRORMSG("Don't know how to compare identifiers " << a->ToString() << " and " << b->ToString());
369 return false;
370 }
371
ToString(void) const372 string MoleculeIdentifier::ToString(void) const
373 {
374 CNcbiOstrstream oss;
375
376 #ifdef _STRUCTURE_USE_LONG_PDB_CHAINS_
377 bool bPdbChainNotSet = pdbChain.empty();
378 #else
379 bool bPdbChainNotSet = (pdbChain == VALUE_NOT_SET);
380 #endif
381
382 if (pdbID.size() == 4 && !bPdbChainNotSet) {
383 oss << pdbID;
384
385 #ifdef _STRUCTURE_USE_LONG_PDB_CHAINS_
386 if (pdbChain != " ") {
387 oss << '_' << pdbChain;
388 }
389 #else
390 if (pdbChain != ' ') {
391 oss << '_' << (char) pdbChain;
392 }
393 #endif
394 } else if (gi != VALUE_NOT_SET) {
395 oss << "gi " << gi;
396 } else if (mmdbID != VALUE_NOT_SET && moleculeID != VALUE_NOT_SET) {
397 oss << "mmdb " << mmdbID << " molecule " << moleculeID;
398 } else if (seqIDs.size() > 0) {
399 oss << seqIDs.front()->GetSeqIdString();
400 } else {
401 oss << '?';
402 }
403 return (string) CNcbiOstrstreamToString(oss);
404 }
405
406 END_SCOPE(Cn3D)
407