1 /**********************************************************************
2 kekulize.cpp - Kekulize a molecule
3 
4 Copyright (C) 2017 Noel M. O'Boyle
5 
6 This file is part of the Open Babel project.
7 For more information, see <http://openbabel.org/>
8 
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation version 2 of the License.
12 
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 GNU General Public License for more details.
17 ***********************************************************************/
18 
19 #include <openbabel/babelconfig.h>
20 
21 #include <openbabel/mol.h>
22 #include <openbabel/atom.h>
23 #include <openbabel/bond.h>
24 #include <openbabel/obiter.h>
25 #include <openbabel/kekulize.h>
26 #include <cstdlib>
27 #include <cstring>
28 
29 namespace OpenBabel
30 {
GetMaxAtomIdx(OBMol * mol)31   static unsigned int GetMaxAtomIdx(OBMol* mol)
32   {
33     return mol->NumAtoms() + 1;
34   }
35 
GetMaxBondIdx(OBMol * mol)36   static unsigned int GetMaxBondIdx(OBMol* mol)
37   {
38     return mol->NumBonds();
39   }
40 
41   class Kekulizer
42   {
43   public:
Kekulizer(OBMol * mol)44     Kekulizer(OBMol* mol) : m_mol(mol), needs_dbl_bond(nullptr), doubleBonds(nullptr), kekule_system(nullptr)
45     {
46       atomArraySize = GetMaxAtomIdx(m_mol) + 1;
47       bondArraySize = GetMaxBondIdx(m_mol) + 1;
48     }
~Kekulizer()49     ~Kekulizer() {
50       delete needs_dbl_bond;
51       delete doubleBonds;
52       delete kekule_system;
53     }
54     bool GreedyMatch();
55     bool BackTrack();
56     void AssignDoubleBonds();
57   private:
58     bool FindPath(unsigned int atomidx, bool isDoubleBond, OBBitVec &visited);
59     OBMol* m_mol;
60     OBBitVec *needs_dbl_bond;
61     OBBitVec *doubleBonds;
62     OBBitVec *kekule_system;
63     unsigned int atomArraySize;
64     unsigned int bondArraySize;
65     std::vector<unsigned int> m_path;
66   };
67 
IsSpecialCase(OBAtom * atom)68   static bool IsSpecialCase(OBAtom* atom)
69   {
70     switch(atom->GetAtomicNum()) {
71     case 7:
72       // Any exo-cyclic double bond from a N
73       // e.g. pyridine N-oxide as the double bond form
74       if (atom->GetTotalDegree() == 3 && atom->GetFormalCharge() == 0)
75         return true;
76       break;
77     case 16: // e.g. Cs1(=O)ccccn1 but not O=s1(=O)cccn1
78       if (atom->GetTotalDegree() == 4 && atom->GetFormalCharge() == 0
79           && atom->GetTotalValence() < 6)
80         return true;
81       break;
82     }
83     return false;
84   }
85 
NeedsDoubleBond(OBAtom * atom)86   static bool NeedsDoubleBond(OBAtom* atom)
87   {
88     if (!atom->IsAromatic())
89       return false;
90 
91     // Does it already have an explicit double bond?
92     FOR_BONDS_OF_ATOM(bond, atom) {
93       if (bond->IsAromatic()) continue;
94       OBAtom *nbr = bond->GetNbrAtom(atom);
95       switch (bond->GetBondOrder()) {
96       case 0: case 1:
97         continue;
98       case 2:
99         if (IsSpecialCase(atom))
100           return true;
101         return false;
102       default: // bond order > 2
103         return false;
104       }
105     }
106 
107     // Is it one of the cases where we know that it only has single bonds?
108     int chg = atom->GetFormalCharge();
109     int deg = atom->GetTotalDegree();
110     switch (atom->GetAtomicNum()) {
111     case 6:
112       if (deg == 3 && (chg == 1 || chg == -1))
113         return false;
114       break;
115     case 5: case 7: case 15: case 33: case 51: case 83:
116       switch (chg) {
117       case 0: // e.g. a pyrrole-type nitrogen
118         if (deg == 3 || deg > 4)
119           return false;
120         break;
121       case -1:
122         if (deg == 2)
123           return false;
124         break;
125       case 1:
126         if (deg > 3)
127           return false;
128       }
129       break;
130     case 8: case 16: case 34: case 52:
131       switch (chg) {
132       case 0:
133         if (deg == 2 || deg == 4 || deg > 5)
134           return false;
135         break;
136       case -1: case 1:
137         if (deg == 3 || deg == 5 || deg > 6)
138           return false;
139       }
140     }
141 
142     return true; // It needs a double bond
143   }
144 
145   class NodeIterator
146   {
147   public:
NodeIterator(unsigned int * & degrees,unsigned int atomArraySize)148     NodeIterator(unsigned int *&degrees, unsigned int atomArraySize) :
149       m_degrees(degrees), m_atomArraySize(atomArraySize),
150       m_counter(0), finishedDegTwo(false)
151     { }
next()152     unsigned int next()
153     {
154       m_counter++;
155 
156       if (!finishedDegTwo) { // return deg 2 nodes first
157         for (; m_counter < m_atomArraySize; ++m_counter) {
158           if (m_degrees[m_counter] == 2) {
159             return m_counter;
160           }
161         }
162         finishedDegTwo = true;
163         m_counter = 1; // first atom has idx 1
164       }
165 
166       // return nodes with degree > 2
167       for (; m_counter < m_atomArraySize; ++m_counter) {
168         if (m_degrees[m_counter] > 2) {
169           return m_counter;
170         }
171       }
172 
173       // Finished - return 0 signalling the end of iteration
174       return 0;
175     }
176   private:
177     unsigned int *&m_degrees;
178     unsigned int m_atomArraySize;
179     unsigned int m_counter;
180     bool finishedDegTwo;
181   };
182 
AssignDoubleBonds()183   void Kekulizer::AssignDoubleBonds()
184   {
185     int bit;
186     for (bit = doubleBonds->FirstBit(); bit != doubleBonds->EndBit(); bit = doubleBonds->NextBit(bit)) {
187       m_mol->GetBond(bit)->SetBondOrder(2);
188     }
189   }
190 
GreedyMatch()191   bool Kekulizer::GreedyMatch()
192   {
193 
194     // What atoms need a double bond? The job of kekulization is
195     // to give all of these atoms a single double bond.
196     needs_dbl_bond = new OBBitVec(atomArraySize); // defaults to all False
197     FOR_ATOMS_OF_MOL(atom, m_mol) {
198       if (NeedsDoubleBond(&*atom)) {
199         needs_dbl_bond->SetBitOn(atom->GetIdx());
200       }
201     }
202     // Make a copy of needs_dbl_bond, to restrict the traversal in BackTrack()
203     kekule_system = new OBBitVec(*needs_dbl_bond);
204 
205     // Create lookup of degrees
206     unsigned int *degrees = (unsigned int*)malloc(sizeof(unsigned int)*atomArraySize);
207     memset(degrees, 0, sizeof(unsigned int)*atomArraySize);
208     std::vector<OBAtom*> degreeOneAtoms;
209     FOR_ATOMS_OF_MOL(atom, m_mol) {
210       unsigned int atom_idx = atom->GetIdx();
211       if (!needs_dbl_bond->BitIsSet(atom_idx)) {
212         degrees[atom_idx] = 0;
213         continue;
214       }
215       unsigned int mdeg = 0;
216       FOR_BONDS_OF_ATOM(bond, &*atom) {
217         if (!bond->IsAromatic()) continue;
218         OBAtom *nbr = bond->GetNbrAtom(&*atom);
219         if (needs_dbl_bond->BitIsSet(nbr->GetIdx()))
220           mdeg++;
221       }
222       degrees[atom_idx] = mdeg;
223       if (mdeg == 1)
224         degreeOneAtoms.push_back(&*atom);
225     }
226 
227     // Location of assigned double bonds
228     doubleBonds = new OBBitVec(bondArraySize); // defaults to all False
229 
230     bool finished = false;
231     while (true) { // Main loop
232 
233       // Complete all of the degree one nodes
234       while (!degreeOneAtoms.empty()) {
235         OBAtom* atom = degreeOneAtoms.back();
236         degreeOneAtoms.pop_back();
237         // some nodes may already have been handled
238         if (!needs_dbl_bond->BitIsSet(atom->GetIdx())) continue;
239         FOR_BONDS_OF_ATOM(bond, atom) {
240           if (!bond->IsAromatic()) continue;
241           OBAtom *nbr = bond->GetNbrAtom(&*atom);
242           if (!needs_dbl_bond->BitIsSet(nbr->GetIdx())) continue;
243           // create a double bond from atom -> nbr
244           doubleBonds->SetBitOn(bond->GetIdx());
245           needs_dbl_bond->SetBitOff(atom->GetIdx());
246           needs_dbl_bond->SetBitOff(nbr->GetIdx());
247           // now update degree information for nbr's neighbors
248           FOR_BONDS_OF_ATOM(nbrbond, nbr) {
249             if (&(*nbrbond) == &(*bond) || !nbrbond->IsAromatic()) continue;
250             OBAtom* nbrnbr = nbrbond->GetNbrAtom(nbr);
251             unsigned int nbrnbrIdx = nbrnbr->GetIdx();
252             if (!needs_dbl_bond->BitIsSet(nbrnbrIdx)) continue;
253             degrees[nbrnbrIdx]--;
254             if (degrees[nbrnbrIdx] == 1)
255               degreeOneAtoms.push_back(nbrnbr);
256           }
257           // only a single double bond can be made to atom so we can break here
258           break;
259         }
260       }
261 
262       if (needs_dbl_bond->IsEmpty()) {
263         finished = true;
264         break;
265       }
266 
267       // Now handle any remaining degree 2 or 3 nodes
268       // We handle deg 2 nodes first and then 3, and the iteration over these nodes
269       // is abstracted away. Once a double-bond is added that generates more
270       // degree one nodes, then the iterator is exited
271       NodeIterator iterator(degrees, atomArraySize);
272       bool change = false;
273       while (unsigned int atomIdx = iterator.next()) {
274         if (!needs_dbl_bond->BitIsSet(atomIdx)) continue;
275         // The following is almost identical to the code above for deg 1 atoms
276         // except for handling the variable 'change'
277         OBAtom *atom = m_mol->GetAtom(atomIdx);
278         FOR_BONDS_OF_ATOM(bond, atom) {
279           if (!bond->IsAromatic()) continue;
280           OBAtom *nbr = bond->GetNbrAtom(&*atom);
281           if (!needs_dbl_bond->BitIsSet(nbr->GetIdx())) continue;
282           // create a double bond from atom -> nbr
283           doubleBonds->SetBitOn(bond->GetIdx());
284           needs_dbl_bond->SetBitOff(atomIdx);
285           needs_dbl_bond->SetBitOff(nbr->GetIdx());
286           // now update degree information for both atom's and nbr's neighbors
287           for(int N=0; N<2; N++) {
288             OBAtom *ref = N == 0 ? atom : nbr;
289             FOR_BONDS_OF_ATOM(nbrbond, ref) {
290               if (&(*nbrbond) == &(*bond) || !nbrbond->IsAromatic()) continue;
291               OBAtom* nbrnbr = nbrbond->GetNbrAtom(ref);
292               unsigned int nbrnbrIdx = nbrnbr->GetIdx();
293               if (!needs_dbl_bond->BitIsSet(nbrnbrIdx)) continue;
294               degrees[nbrnbrIdx]--;
295               if (degrees[nbrnbrIdx] == 1) {
296                 degreeOneAtoms.push_back(nbrnbr);
297                 change = true;
298               }
299             }
300           }
301           // only a single double bond can be made to atom so we can break here
302           break;
303         }
304         if (change)
305           break; // exit the iterator once we have actually set a double bond
306       }
307 
308       // We exit if we are finished or if no degree 2/3 nodes can be set
309       if (!change)
310         break;
311     }
312 
313     // Tidy up
314     free(degrees);
315 
316     return finished;
317   }
318 
319   // The isDoubleBond alternates between double and single, as we need to find
320   // an alternating path
FindPath(unsigned int atomidx,bool isDoubleBond,OBBitVec & visited)321   bool Kekulizer::FindPath(unsigned int atomidx, bool isDoubleBond, OBBitVec &visited)
322   {
323     if (needs_dbl_bond->BitIsSet(atomidx))
324       return true;
325     visited.SetBitOn(atomidx);
326     OBAtom* atom = m_mol->GetAtom(atomidx);
327     FOR_BONDS_OF_ATOM(bond, atom) {
328       if (!bond->IsAromatic()) continue;
329       OBAtom *nbr = bond->GetNbrAtom(atom);
330       if (!kekule_system->BitIsSet(nbr->GetIdx())) continue;
331       if (doubleBonds->BitIsSet(bond->GetIdx()) == isDoubleBond) {
332         if (visited.BitIsSet(nbr->GetIdx())) continue;
333         bool found_path = FindPath(nbr->GetIdx(), !isDoubleBond, visited);
334         if (found_path) {
335           m_path.push_back(nbr->GetIdx());
336           return true;
337         }
338       }
339     }
340     visited.SetBitOff(atomidx);
341     return false;
342   }
343 
BackTrack()344   bool Kekulizer::BackTrack()
345   {
346     // With an odd number of bits, it's never going to kekulize fully, but let's fill in as many as we can
347     unsigned int count = needs_dbl_bond->CountBits();
348 
349     unsigned int total_handled = 0;
350     int idx;
351     for (idx = needs_dbl_bond->FirstBit(); idx != needs_dbl_bond->EndBit(); idx = needs_dbl_bond->NextBit(idx)) {
352       total_handled++;
353       // If there is no additional bit available to match this bit, then terminate
354       if (total_handled == count)
355         return false;
356 
357       // Our goal is to find an alternating path to another atom
358       // that needs a double bond
359       needs_dbl_bond->SetBitOff(idx); // to avoid the trivial null path being found
360       OBBitVec visited(atomArraySize);
361       m_path.clear();
362       bool found_path = FindPath(idx, false, visited);
363       if (!found_path) { // could only happen if not kekulizable
364         needs_dbl_bond->SetBitOn(idx); // reset
365         continue;
366       }
367       total_handled++;
368       m_path.push_back(idx);
369       needs_dbl_bond->SetBitOff(m_path[0]);
370       // Flip all of the bond orders on the path from double<-->single
371       for (unsigned int i = 0; i < m_path.size()-1; ++i) {
372         OBBond *bond = m_mol->GetBond(m_path[i], m_path[i + 1]);
373         if (i % 2 == 0)
374           doubleBonds->SetBitOn(bond->GetIdx());
375         else
376           doubleBonds->SetBitOff(bond->GetIdx());
377       }
378     }
379     return needs_dbl_bond->IsEmpty();
380   }
381 
382 // I'd like to thank John Mayfield for many helpful discussions on the topic of
383 // kekulization, without which this implementation would not have been
384 // possible.
385 //
386 // OBKekulize() implements a two-step kekulization:
387 //   Step one: try a greedy match
388 //   Step two: try an exhaustive backtracking (using the results of step one)
389 //
390 // The greedy match algorithm is outlined in the thesis of John May
391 // and indeed NeedsDoubleBond() is based on the implementation in Beam.
392 // The greedy algorithm almost always works. But when it doesn't, step two is needed.
393 //
394 // The goal of the exhaustive backtracking is find a path of alternating single/double
395 // bonds between two radicals and flip those bonds. For more information, read about
396 // augmenting paths in the context of perfect matching. John's thesis instead describes
397 // the use of Edmond's Blossom algorithm which scales better - this may or may not be
398 // faster in practice for typical chemical graphs.
399 //
400 // Potential speedups:
401 //   * Is OBBitVec performant? I don't know - it seems to do a lot of bounds checking.
402 //     You could try replacing all usages theoreof with
403 //     std::vector<char>, where the char could possibly handle several flags.
404 //   * Before trying the exhaustive search, try a BFS. I have a feeling that this would work
405 //     90% of the time.
406 //   * There's a lot of switching between atoms and atom indices (and similar for bonds).
407 //     Was this completely necessary?
408 //   * The iterator over degree 2 and 3 nodes may iterate twice - it would have been
409 //     faster if I just took the first degree 2 or 3 node I came across, but would
410 //     it have worked as well?
411 
OBKekulize(OBMol * mol)412   bool OBKekulize(OBMol* mol)
413   {
414     Kekulizer kekulizer(mol);
415     bool success = kekulizer.GreedyMatch();
416     if (!success) {
417       success = kekulizer.BackTrack();
418     }
419 
420     kekulizer.AssignDoubleBonds();
421 
422     return success;
423   }
424 
425 
426 } // end namespace OpenBabel
427 
428 //! \file kekulize.cpp
429 //! \brief Algorithm to kekulize a molecule
430