1 /*  $Id: molecule_identifier.cpp 600628 2020-01-23 19:40:16Z wangjiy $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors:  Paul Thiessen
27 *
28 * File Description:
29 *      Class to hold, and factory to generate, general
30 *      (instance-independent) identifier for any molecule
31 *
32 * ===========================================================================
33 */
34 
35 #include <ncbi_pch.hpp>
36 #include <corelib/ncbistre.hpp>
37 #include <corelib/ncbistl.hpp>
38 
39 #include <objects/seqloc/Seq_id.hpp>
40 #include <objects/seqloc/PDB_seq_id.hpp>
41 #include <objects/general/Object_id.hpp>
42 #include <objects/seqloc/Textseq_id.hpp>
43 #include <objects/seqloc/PDB_mol_id.hpp>
44 
45 #include "remove_header_conflicts.hpp"
46 
47 #include "molecule_identifier.hpp"
48 #include "structure_set.hpp"
49 #include "molecule.hpp"
50 #include "sequence_set.hpp"
51 #include "cn3d_tools.hpp"
52 
53 USING_NCBI_SCOPE;
54 
55 BEGIN_SCOPE(Cn3D)
56 USING_SCOPE(objects);
57 
58 // there is one (global) list of molecule identifiers
59 
60 typedef list < MoleculeIdentifier > MoleculeIdentifierList;
61 static MoleculeIdentifierList knownIdentifiers;
62 
63 const int MoleculeIdentifier::VALUE_NOT_SET = -1;
64 
GetIdentifier(const Molecule * molecule,const SeqIdList & ids)65 const MoleculeIdentifier * MoleculeIdentifier::GetIdentifier(const Molecule *molecule, const SeqIdList& ids)
66 {
67     const StructureObject *object;
68     if (!molecule->GetParentOfType(&object)) return NULL;
69 
70     // get or create identifer
71     MoleculeIdentifier *identifier = ((ids.size() > 0) ? GetIdentifier(ids) : GetIdentifier(object->mmdbID, molecule->id));
72     if (!identifier)
73         return NULL;
74 
75     // check/assign mmdb id
76     if (object->mmdbID != StructureObject::NO_MMDB_ID) {
77         if ((identifier->mmdbID != VALUE_NOT_SET && identifier->mmdbID != object->mmdbID) ||
78                 (identifier->moleculeID != VALUE_NOT_SET && identifier->moleculeID != molecule->id)) {
79             ERRORMSG("MoleculeIdentifier::GetIdentifier() - mmdb/molecule ID mismatch for " << identifier->ToString());
80         } else {
81             identifier->mmdbID = object->mmdbID;
82             identifier->moleculeID = molecule->id;
83         }
84     }
85 
86     // check/assign #residues
87     if (identifier->nResidues == 0)
88         identifier->nResidues = molecule->residues.size();
89     else if (identifier->nResidues != molecule->residues.size())
90         ERRORMSG("# residue mismatch in molecule identifier for " << identifier->ToString());
91 
92     // check/assign pdb id
93 	#ifdef _STRUCTURE_USE_LONG_PDB_CHAINS_
94 		string name = molecule->name;
95 		if (identifier->pdbID.size() == 0 && identifier->pdbChain.empty()) {
96 			identifier->pdbID = object->GetPDBID();
97 			identifier->pdbChain = name;
98 		} else if (identifier->pdbID != object->GetPDBID() || identifier->pdbChain != name)
99 			ERRORMSG("PDB ID mismatch in molecule identifier for " << identifier->ToString());
100 	#else
101 		int name = ((molecule->name.size() == 1) ? molecule->name[0] : MoleculeIdentifier::VALUE_NOT_SET);
102 		if (identifier->pdbID.size() == 0 && identifier->pdbChain == MoleculeIdentifier::VALUE_NOT_SET) {
103 			identifier->pdbID = object->GetPDBID();
104 			identifier->pdbChain = name;
105 		} else if (identifier->pdbID != object->GetPDBID() || identifier->pdbChain != name)
106 			ERRORMSG("PDB ID mismatch in molecule identifier for " << identifier->ToString());
107 	#endif
108 
109     return identifier;
110 }
111 
GetIdentifier(const Sequence * sequence,int mmdbID,const SeqIdList & ids)112 const MoleculeIdentifier * MoleculeIdentifier::GetIdentifier(const Sequence *sequence, int mmdbID, const SeqIdList& ids)
113 {
114     // get or create identifer
115     MoleculeIdentifier *identifier = GetIdentifier(ids);
116     if (!identifier)
117         return NULL;
118 
119     // check/assign mmdb id
120     if (mmdbID != VALUE_NOT_SET) {
121         if (identifier->mmdbID != VALUE_NOT_SET) {
122             if (identifier->mmdbID != mmdbID)
123                 ERRORMSG("MoleculeIdentifier::GetIdentifier() - mmdbID mismatch for " << identifier->ToString());
124         } else {
125             identifier->mmdbID = mmdbID;
126         }
127     }
128 
129     // check/assign length
130     if (identifier->nResidues == 0)
131         identifier->nResidues = sequence->Length();
132     else if (identifier->nResidues != sequence->Length())
133         ERRORMSG("Length mismatch in sequence identifier for " << identifier->ToString());
134 
135     return identifier;
136 }
137 
GetIdentifier(const SeqIdList & ids)138 MoleculeIdentifier * MoleculeIdentifier::GetIdentifier(const SeqIdList& ids)
139 {
140     // first check known identifiers to see if there's a match, and posibly merge in new ids
141     MoleculeIdentifierList::iterator k, ke = knownIdentifiers.end();
142     for (k=knownIdentifiers.begin(); k!=ke; ++k) {
143 
144         // for each known, compare lists of Seq-ids, looking for matches and mismatches
145         SeqIdList newIDs;
146         vector < string > matches, mismatches;
147         bool mismatchGIonly = false;
148         SeqIdList::const_iterator o, oe = k->seqIDs.end(), n, ne = ids.end();
149         for (n=ids.begin(); n!=ne; ++n) {
150 
151             // does the new (incoming) Seq-id (mis)match any old (existing) Seq-id?
152             bool foundMatch = false, foundMismatch = false;
153             for (o=k->seqIDs.begin(); o!=oe; ++o) {
154                 switch ((*o)->Compare(**n)) {
155                     case CSeq_id::e_DIFF:   // different types, can't compare; do nothing
156                         break;
157                     case CSeq_id::e_NO:     // same type but different id -> mismatch
158                         mismatches.push_back((*o)->GetSeqIdString() + " != " + (*n)->GetSeqIdString());
159                         foundMismatch = true;
160                         if (mismatches.size() == 1) {
161                             if ((*n)->IsGi())
162                                 mismatchGIonly = true;
163                         } else {
164                             mismatchGIonly = false;
165                         }
166                         break;
167                     case CSeq_id::e_YES:    // same type and same id -> match
168                         matches.push_back((*o)->GetSeqIdString() + " == " + (*n)->GetSeqIdString());
169                         foundMatch = true;
170                         break;
171                    default:
172                         ERRORMSG("Problem comparing Seq-ids " << (*o)->GetSeqIdString() << " and " << (*n)->GetSeqIdString());
173                         continue;
174                 }
175             }
176 
177             // if no match or mismatch is found, this is a potential new id for this known identifier
178             if (!foundMatch && !foundMismatch)
179                 newIDs.push_back(*n);
180         }
181 
182         // if we have matches and no mismatches, then we've found the identifier; merge in any new ids
183         if (matches.size() > 0 && mismatches.size() == 0) {
184             if (newIDs.size() > 0)
185                 k->AddFields(newIDs);
186             return &(*k);
187         }
188 
189         // if we have matches *and* mismatches then there's a problem
190         if (matches.size() > 0 && mismatches.size() > 0) {
191 
192             // special case: gi (only) is different but something else (presumably an accession) is the same, then
193             // warn about possibly outdated gi; don't merge in new ids
194             if (mismatchGIonly) {
195                 ERRORMSG("GetIdentifier(): incoming Seq-id list has a GI mismatch ("
196                     << mismatches.front() << ") with sequence " << k->seqIDs.front()->GetSeqIdString()
197                     << " but otherwise matches (" << matches.front()
198                     << "); please update outdated GI(s) for this sequence");
199                 return &(*k);
200             }
201 
202             // otherwise, error
203             ERRORMSG("GetIdentifier(): incoming Seq-id list has match(es) ("
204                 << matches.front() << ") and mismatch(es) ("
205                 << mismatches.front() << ") with identifier " << k->ToString());
206             return NULL;
207         }
208     }
209 
210     // if we get here, then this is a new sequence
211     knownIdentifiers.resize(knownIdentifiers.size() + 1, MoleculeIdentifier());
212     MoleculeIdentifier *identifier = &(knownIdentifiers.back());
213     identifier->AddFields(ids);
214     return identifier;
215 }
216 
GetIdentifier(int mmdbID,int moleculeID)217 MoleculeIdentifier * MoleculeIdentifier::GetIdentifier(int mmdbID, int moleculeID)
218 {
219     // first check known identifiers to see if there's a match, and posibly merge in new ids
220     MoleculeIdentifierList::iterator k, ke = knownIdentifiers.end();
221     for (k=knownIdentifiers.begin(); k!=ke; ++k) {
222         if (k->mmdbID == mmdbID && k->moleculeID == moleculeID)
223             return &(*k);
224     }
225 
226     // if we get here, then this is a new sequence
227     knownIdentifiers.resize(knownIdentifiers.size() + 1, MoleculeIdentifier());
228     MoleculeIdentifier *identifier = &(knownIdentifiers.back());
229     identifier->mmdbID = mmdbID;
230     identifier->moleculeID = moleculeID;
231     return identifier;
232 }
233 
AddFields(const SeqIdList & ids)234 void MoleculeIdentifier::AddFields(const SeqIdList& ids)
235 {
236     // save these ids (should already know that the new ids don't overlap any existing ones)
237     seqIDs.insert(seqIDs.end(), ids.begin(), ids.end());
238 
239 	#ifdef _STRUCTURE_USE_LONG_PDB_CHAINS_
240 	  bool bPdbChainNotSet = pdbChain.empty();
241 	#else
242 	  bool bPdbChainNotSet = (pdbChain == VALUE_NOT_SET);
243 	#endif
244 
245     SeqIdList::const_iterator n, ne = ids.end();
246     for (n=ids.begin(); n!=ne; ++n) {
247 
248         // pdb
249         if ((*n)->IsPdb()) {
250             string newID = (*n)->GetPdb().GetMol();
251 
252 			#ifdef _STRUCTURE_USE_LONG_PDB_CHAINS_
253 				if (pdbID.size() == 0 && pdbChain.empty()) {
254 					pdbID = newID;
255 					pdbChain = (*n)->GetPdb().GetEffectiveChain_id();
256 				} else if (pdbID != newID || pdbChain != (*n)->GetPdb().GetEffectiveChain_id()) {
257 					// special case: for merged structures with multiple pdb ids, allow match to a sequence from a single specific pdb id
258 					if (pdbID.size() > 4 && pdbChain == (*n)->GetPdb().GetEffectiveChain_id() && NStr::Find(pdbID, newID) != NPOS)
259 							pdbID = newID;
260 					else
261 						ERRORMSG("AddFields(): identifier conflict, already has pdb ID '" << pdbID << "_" << pdbChain << "'");
262 				}
263 			#else
264 				if (pdbID.size() == 0 && pdbChain == VALUE_NOT_SET) {
265 					pdbID = newID;
266 					pdbChain = (*n)->GetPdb().GetChain();
267 				} else if (pdbID != newID || pdbChain != (*n)->GetPdb().GetChain()) {
268 					// special case: for merged structures with multiple pdb ids, allow match to a sequence from a single specific pdb id
269 					if (pdbID.size() > 4 && pdbChain == (*n)->GetPdb().GetChain() && NStr::Find(pdbID, newID) != NPOS)
270 							pdbID = newID;
271 					else
272 						ERRORMSG("AddFields(): identifier conflict, already has pdb ID '" << pdbID << "_" << ((char) pdbChain) << "'");
273 				}
274 			#endif
275         }
276 
277         // gi
278         else if ((*n)->IsGi()) {
279             if (gi == VALUE_NOT_SET)
280                 gi = (*n)->GetGi();
281             else if (gi != (*n)->GetGi())
282                 ERRORMSG("AddFields(): identifier conflict: already has gi " << gi);
283         }
284 
285         // special case where local accession is actually a PDB identifier + chain + extra stuff,
286         // separated by spaces: of the format '1ABC X ...' where X can be a chain alphanum character or space
287         //else if (pdbID.size() == 0 && pdbChain == VALUE_NOT_SET &&
288         else if (pdbID.size() == 0 && bPdbChainNotSet &&
289             (*n)->IsLocal() && (*n)->GetLocal().IsStr() &&
290             (*n)->GetLocal().GetStr().size() >= 7 && (*n)->GetLocal().GetStr()[4] == ' ' &&
291             (*n)->GetLocal().GetStr()[6] == ' ' &&
292             (isalnum((unsigned char) (*n)->GetLocal().GetStr()[5]) || (*n)->GetLocal().GetStr()[5] == ' '))
293         {
294             pdbID = (*n)->GetLocal().GetStr().substr(0, 4);
295 			#ifdef _STRUCTURE_USE_LONG_PDB_CHAINS_
296 			  string tmpStr(1, (*n)->GetLocal().GetStr()[5]);
297 			  pdbChain = tmpStr;
298 			#else
299 			  pdbChain = (*n)->GetLocal().GetStr()[5];
300 			#endif
301         }
302     }
303 }
304 
FindIdentifier(int mmdbID,int moleculeID)305 const MoleculeIdentifier * MoleculeIdentifier::FindIdentifier(int mmdbID, int moleculeID)
306 {
307     const MoleculeIdentifier *identifier = NULL;
308     MoleculeIdentifierList::const_iterator i, ie = knownIdentifiers.end();
309     for (i=knownIdentifiers.begin(); i!=ie; ++i) {
310         if (mmdbID == i->mmdbID && moleculeID == i->moleculeID) {
311             identifier = &(*i);
312             break;
313         }
314     }
315     return identifier;
316 }
317 
ClearIdentifiers(void)318 void MoleculeIdentifier::ClearIdentifiers(void)
319 {
320     knownIdentifiers.clear();
321 }
322 
MatchesSeqId(const ncbi::objects::CSeq_id & sid) const323 bool MoleculeIdentifier::MatchesSeqId(const ncbi::objects::CSeq_id& sid) const
324 {
325     SeqIdList::const_iterator i, ie = seqIDs.end();
326     for (i=seqIDs.begin(); i!=ie; ++i)
327         if (sid.Match(**i))
328             return true;
329 
330     return false;
331 }
332 
CompareIdentifiers(const MoleculeIdentifier * a,const MoleculeIdentifier * b)333 bool MoleculeIdentifier::CompareIdentifiers(const MoleculeIdentifier *a, const MoleculeIdentifier *b)
334 {
335     // identifier sort - float sequences with PDB id's to the top, then gi's, then accessions
336     if (a->pdbID.size() > 0) {
337         if (b->pdbID.size() > 0) {
338             if (a->pdbID < b->pdbID)
339                 return true;
340             else if (a->pdbID > b->pdbID)
341                 return false;
342             else {
343 				#ifdef _STRUCTURE_USE_LONG_PDB_CHAINS_
344 				  return (a->pdbChain.compare(b->pdbChain) < 0);
345 				#else
346 				  return (a->pdbChain < b->pdbChain);
347 				#endif
348 			}
349         } else
350             return true;
351     }
352 
353     else if (a->gi != VALUE_NOT_SET) {
354         if (b->pdbID.size() > 0)
355             return false;
356         else if (b->gi != VALUE_NOT_SET)
357             return (a->gi < b->gi);
358         else
359             return true;
360     }
361 
362     else if (b->pdbID.size() > 0 || b->gi != VALUE_NOT_SET)
363         return false;
364 
365     else if (a->seqIDs.size() > 0 && b->seqIDs.size() > 0)
366         return (a->seqIDs.front()->GetSeqIdString() < b->seqIDs.front()->GetSeqIdString());
367 
368     ERRORMSG("Don't know how to compare identifiers " << a->ToString() << " and " << b->ToString());
369     return false;
370 }
371 
ToString(void) const372 string MoleculeIdentifier::ToString(void) const
373 {
374     CNcbiOstrstream oss;
375 
376 	#ifdef _STRUCTURE_USE_LONG_PDB_CHAINS_
377 	  bool bPdbChainNotSet = pdbChain.empty();
378 	#else
379 	  bool bPdbChainNotSet = (pdbChain == VALUE_NOT_SET);
380 	#endif
381 
382     if (pdbID.size() == 4 && !bPdbChainNotSet) {
383         oss << pdbID;
384 
385 		#ifdef _STRUCTURE_USE_LONG_PDB_CHAINS_
386 			if (pdbChain != " ") {
387 				oss <<  '_' << pdbChain;
388 			}
389 		#else
390 			if (pdbChain != ' ') {
391 				oss <<  '_' << (char) pdbChain;
392 			}
393 		#endif
394     } else if (gi != VALUE_NOT_SET) {
395         oss << "gi " << gi;
396     } else if (mmdbID != VALUE_NOT_SET && moleculeID != VALUE_NOT_SET) {
397         oss << "mmdb " << mmdbID << " molecule " << moleculeID;
398     } else if (seqIDs.size() > 0) {
399         oss << seqIDs.front()->GetSeqIdString();
400     } else {
401         oss << '?';
402     }
403     return (string) CNcbiOstrstreamToString(oss);
404 }
405 
406 END_SCOPE(Cn3D)
407