/**********************************************************************
chains.h - Parse for macromolecule chains and residues

Copyright (C) 1998-2001 by OpenEye Scientific Software, Inc.
Some portions Copyright (C) 2001-2006 by Geoffrey R. Hutchison
Some portions Copyright (C) 2008 by Tim Vandermeersch

This file is part of the Open Babel project.
For more information, see <http://openbabel.org/>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
***********************************************************************/

21 #ifndef OB_CHAINS_H
22 #define OB_CHAINS_H
23 
24 #define MaxMonoAtom 20
25 #define MaxMonoBond 20
26 
27 #include <openbabel/babelconfig.h>
28 #include <vector>
29 
30 namespace OpenBabel
31 {
32 
33   class OBAtom;
34   class OBMol;
35 
36   //! Structure template for atomic patterns in residues for OBChainsParser
37   // implementation in chains.cpp
38   struct Template;
39   typedef struct Template Template;
40 
41   /** @class OBChainsParser chains.h <openbabel/chains.h>
42       @brief Perceives peptide or nucleotide chains and residues in an OBMol
43 
44       Perceive peptide or nucleotide chains and residues from atom connectivity.
45       Based on original RasMol code by Roger Sayle and modified by Joe Corkery.
46       For more on Roger's original talk, see:
47       http://www.daylight.com/meetings/mug96/sayle/sayle.html
48    */
49   class OBAPI OBChainsParser
50   {
51     public:
52 
53       OBChainsParser(void);
54       ~OBChainsParser(void);
55 
56       /**
57        * Perceive macromolecular (peptide and nucleotide) residues and chains
58        * @param mol The molecule to parse and update
59        * @param nukeSingleResidue If only one residue is found, clear information
60        * default = false  -- single residue files should still be recognized.
61        */
62       bool PerceiveChains(OBMol &mol, bool nukeSingleResidue = false);
63 
64     private: // internal methods
65 
66       //! @name Step 1: Determine hetero atoms
67       //@{
68       /**
69        * Determine HETATOM records for all atoms with a heavy valance of 0.
70        * This includes HOH, Cl, Fe, ...
71        *
72        * Sets resids[i] & hetflags[i] for these atoms.
73        * @todo add ions (Cl, Fe, ...)
74        */
75       bool DetermineHetAtoms(OBMol &);
76       //@}
77 
78       //! @name Step 2: Determine connected chains
79       //@{
80       /**
81        * Determine connected chains (e.g., subunits). Chains will be labeled A, B, C, ...
82        * Ligands also get assigned a chain label. The chain for ligands will later be
83        * replaced by ' '. The residue numbers will also be updated in this process to
84        * make sure all ligands, HOH, ions, etc. have a unique residue number in the ' '
85        * chain.
86        *
87        * Sets chains[i] for all atoms. (through RecurseChain())
88        */
89       bool DetermineConnectedChains(OBMol &);
90       /**
91        * Perform the actual work for DetermineConnectedChains(). Set chains[i]
92        * to @p c for all atoms of the recursed chain.
93        * @param mol The molecule.
94        * @param i Index for the current atom. (RecurseChain() will be called for all neighbours)
95        * @param c The chain which we are recusring. ('A' + count)
96        * @return The number of heavy atoms in the recursed chain.
97        */
98       unsigned int RecurseChain(OBMol &mol, unsigned int i, int c);
99       //@}
100 
101       //! @name Step 3: Determine peptide backbone
102       //@{
103       /**
104        * Walk a peptide "backbone" atom sequence, from one residue to the next. This
105        * function will look for N-CA-C-O sequences and mark these atoms.
106        *
107        * Sets bitmaks[i] for these atoms. (through ConstrainBackbone())
108        * Sets resnos[i] for these atoms. (through TracePeptideChain())
109        */
110       bool DeterminePeptideBackbone(OBMol &);
111       /**
112        * First the bitmasks[i] will be OR-ed with Template::flag for all atoms based on
113        * on Template::element and Template::count.
114        *
115        * Next, the bitmasks[i] are iteratively resolved by matching the
116        * constraints in OpenBabel::Peptide or OpenBabel::Nucleotide.
117        * @param mol The molecule.
118        * @param templ OpenBabel::Peptide or OpenBabel::Nucleotide
119        * @param tmax Number of entries in @p templ
120        */
121       void ConstrainBackbone(OBMol &mol, Template *templ, int tmax);
122       /**
123        * @return True if the bitmasks[i] for @p atom matches @p mask.
124        */
125       bool MatchConstraint(OBAtom *atom, int mask);
126       /**
127        * @return True if atom @p na and @p nb match the Template::n1 and
128        * Template::n2.
129        */
130       bool Match2Constraints(Template *templ, OBAtom *na, OBAtom *nb);
131       /**
132        * @return True if atom @p na, @p nb and @p nc match the Template::n1,
133        * Template::n2 and Template::n3.
134        */
135       bool Match3Constraints(Template *templ, OBAtom *na, OBAtom *nb, OBAtom *nc);
136       /**
137        * @return True if atom @p na, @p nb, @p nc and @p nd match the Template::n1,
138        * Template::n2, Template::n3 and Template::n4.
139        */
140       bool Match4Constraints(Template *templ, OBAtom *na, OBAtom *nb, OBAtom *nc, OBAtom *nd);
141       /**
142        * Now we have the constrained bitmaks[i], trace N-CA-C-O-... and set
143        * resnos[i] and atomids[i] for each N-CA-C-O sequence.
144        *
145        * Also adds BF_DOUBLE to flags[b] for< each carbonyl bond in N-CA-C=O.
146        * @param mol The molecule.
147        * @param i Index for the current atom. (TracePeptideChain() will be called for all neighbours)
148        * @param r The residue number which we are tracing.
149        */
150       void TracePeptideChain(OBMol &mol, unsigned int i, int r);
151       //@}
152 
153       //! @name Step 4: Determine peptide side chains
154       //@{
155       /**
156        * Look for atoms with atomids[i] CA and identify their side chain.
157        *
158        * Sets resnos[i] and resids[i] for all identified residues (including the N-CA-C-O).
159        * (through IdentifyResidue() and AssignResidue())
160        */
161       bool  DeterminePeptideSidechains(OBMol &);
162       /**
163        * Identify a residue based on the @p tree ByteCode.
164        *
165        * Sets resnos[i] for all sidechain atoms to the residue number of
166        * the seed CA/C1 atom.
167        * @param tree Bytecode for the residues. (OBChainsParser::PDecisionTree or OBChainsParser::NDecisionTree)
168        * @param mol The molecule.
169        * @param seed Atom index for the CA (peptides) or C1 (nucleotides) atom.
170        * @param resno The residue number for this residue.
171        * @return The resids[i] for the identified residue.
172        */
173       int IdentifyResidue(void *tree, OBMol &mol, unsigned int seed, int resno); // ByteCode *
174       /**
175        * Set resids[i] for all atoms where resids[i] = @p r and chains[i] = @p c.
176        * @param mol The molecule.
177        * @param r The residue number.
178        * @param c The chain number.
179        * @param i The residue id (resids[i] returned by IdentifyResidue())
180        */
181       void  AssignResidue(OBMol &mol, int r, int c, int i);
182       //@}
183 
184       //! @name Step 5: Assign hydrogens
185       //@{
186       /**
187        * Assign the resids[i], resnos[i], ... for all hydrogens based on the
188        * atom they are bound to.
189        */
190       bool  DetermineHydrogens(OBMol &);
191       //@}
192 
193       //! @name Step 6: Set the residue information
194       //@{
195       /**
196        * Convert the private data vectors to OBResidue objects and add them to @p mol.
197        * @param mol The molecule to parse and update
198        * @param nukeSingleResidue If only one residue is found, clear information
199        * default = false  -- single residue files should still be recognized.
200        */
201       void  SetResidueInformation(OBMol &, bool nukeSingleResidue);
202       //@}
203 
204       //! @name Nucleic acids (analog to peptides)
205       //@{
206       /**
207        * Walk a nucleic "backbone" atom sequence, from one residue to the next. This
208        * function will look for ribose-5-P sequences and mark these atoms.
209        *
210        * Sets bitmaks[i] for these atoms. (through ConstrainBackbone())
211        * Sets resnos[i] for these atoms. (through TraceNucleicChain())
212        */
213       bool  DetermineNucleicBackbone(OBMol &);
214       /**
215        * Now we have the constrained bitmaks[i], trace nucleic backbone and set
216        * resnos[i] and atomids[i] for each ribose-5-P sequence.
217        * @param mol The molecule.
218        * @param i Index for the current atom. (TraceNucleicChain() will be called for all neighbours)
219        * @param r The residue number which we are tracing.
220        */
221       void  TraceNucleicChain(OBMol &, unsigned int i, int r);
222       /**
223        * Look for atoms with atomids[i] C1 and identify their side chain.
224        *
225        * Sets resnos[i] and resids[i] for all identified residues.
226        * (through IdentifyResidue() and AssignResidue())
227        */
228       bool  DetermineNucleicSidechains(OBMol &);
229       //@}
230 
231       /**
232        * Set up the chain perception to operate on the supplied molecule
233        * by resizing and initializing the private data vectors.
234        */
235       void  SetupMol(OBMol &);
236       /**
237        * Delete all residues in @p mol
238        */
239       void  ClearResidueInformation(OBMol &mol);
240       /**
241        * Clear all private data vectors
242        */
243       void CleanupMol();
244       /**
245        * Construct and add ByteCode to the @p tree for a single residue.
246        * @param tree Bytecode for the residues. (OBChainsParser::PDecisionTree or OBChainsParser::NDecisionTree)
247        * @param resid The residue id.
248        * @param smiles The pseudo-smiles string (from OpenBabel::AminoAcids or OpenBabel::Nucleotides)
249        */
250       void  DefineMonomer(void **tree, int resid, const char *smiles); // ByteCode **
251       /**
252        * @param ptr Element id (from OpenBabel::ChainsAtomName)
253        * @return The element number.
254        */
255       int   IdentifyElement(char *ptr);
256       /**
257        * Parse a pseudo smiles from OpenBabel::AminoAcids or OpenBabel::Nucleotides.
258        * @param smiles The pseudo-smiles string.
259        * @param prev The previous position (used for recursing, use -1 to start).
260        */
261       const char *ParseSmiles(const char *smiles, int prev);
262       /**
263        * Debugging function.
264        */
265       void DumpState();
266 
267       void *PDecisionTree; //!< ByteCode decision tree for peptides
268       void *NDecisionTree; //!< ByteCode decision tree for nucleotides
269 
270       int   ResMonoAtom[MaxMonoAtom];
271       int   ResMonoBond[MaxMonoBond];
272 
273       std::vector<unsigned short> bitmasks;
274       std::vector<bool>           visits;   //!< mark visits to prevent looping
275       std::vector<unsigned char>  resids;
276       std::vector<unsigned char>  flags;
277       std::vector<bool>           hetflags;
278       std::vector<int>            atomids;
279       std::vector<short>          resnos;
280       std::vector<short>          sernos;   //!< array of residue serial numbers
281       std::vector<char>           hcounts;
282       std::vector<char>           chains;
283     };
284 
285     //! Global OBChainsParser for detecting macromolecular chains and residues
286     OB_EXTERN  OBChainsParser   chainsparser;
287 
288 }
289 #endif // OB_CHAINS_H
290 
//! \file chains.h
//! \brief Parse for macromolecule chains and residues.
