1 /********************************************************************** 2 kekulize.cpp - Kekulize a molecule 3 4 Copyright (C) 2017 Noel M. O'Boyle 5 6 This file is part of the Open Babel project. 7 For more information, see <http://openbabel.org/> 8 9 This program is free software; you can redistribute it and/or modify 10 it under the terms of the GNU General Public License as published by 11 the Free Software Foundation version 2 of the License. 12 13 This program is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 ***********************************************************************/ 18 19 #include <openbabel/babelconfig.h> 20 21 #include <openbabel/mol.h> 22 #include <openbabel/atom.h> 23 #include <openbabel/bond.h> 24 #include <openbabel/obiter.h> 25 #include <openbabel/kekulize.h> 26 #include <cstdlib> 27 #include <cstring> 28 29 namespace OpenBabel 30 { GetMaxAtomIdx(OBMol * mol)31 static unsigned int GetMaxAtomIdx(OBMol* mol) 32 { 33 return mol->NumAtoms() + 1; 34 } 35 GetMaxBondIdx(OBMol * mol)36 static unsigned int GetMaxBondIdx(OBMol* mol) 37 { 38 return mol->NumBonds(); 39 } 40 41 class Kekulizer 42 { 43 public: Kekulizer(OBMol * mol)44 Kekulizer(OBMol* mol) : m_mol(mol), needs_dbl_bond(nullptr), doubleBonds(nullptr), kekule_system(nullptr) 45 { 46 atomArraySize = GetMaxAtomIdx(m_mol) + 1; 47 bondArraySize = GetMaxBondIdx(m_mol) + 1; 48 } ~Kekulizer()49 ~Kekulizer() { 50 delete needs_dbl_bond; 51 delete doubleBonds; 52 delete kekule_system; 53 } 54 bool GreedyMatch(); 55 bool BackTrack(); 56 void AssignDoubleBonds(); 57 private: 58 bool FindPath(unsigned int atomidx, bool isDoubleBond, OBBitVec &visited); 59 OBMol* m_mol; 60 OBBitVec *needs_dbl_bond; 61 OBBitVec *doubleBonds; 62 OBBitVec *kekule_system; 63 unsigned int atomArraySize; 64 unsigned int bondArraySize; 65 std::vector<unsigned int> m_path; 66 }; 67 IsSpecialCase(OBAtom * atom)68 static bool IsSpecialCase(OBAtom* atom) 69 { 70 switch(atom->GetAtomicNum()) { 71 case 7: 72 // Any exo-cyclic double bond from a N 73 // e.g. pyridine N-oxide as the double bond form 74 if (atom->GetTotalDegree() == 3 && atom->GetFormalCharge() == 0) 75 return true; 76 break; 77 case 16: // e.g. Cs1(=O)ccccn1 but not O=s1(=O)cccn1 78 if (atom->GetTotalDegree() == 4 && atom->GetFormalCharge() == 0 79 && atom->GetTotalValence() < 6) 80 return true; 81 break; 82 } 83 return false; 84 } 85 NeedsDoubleBond(OBAtom * atom)86 static bool NeedsDoubleBond(OBAtom* atom) 87 { 88 if (!atom->IsAromatic()) 89 return false; 90 91 // Does it already have an explicit double bond? 92 FOR_BONDS_OF_ATOM(bond, atom) { 93 if (bond->IsAromatic()) continue; 94 OBAtom *nbr = bond->GetNbrAtom(atom); 95 switch (bond->GetBondOrder()) { 96 case 0: case 1: 97 continue; 98 case 2: 99 if (IsSpecialCase(atom)) 100 return true; 101 return false; 102 default: // bond order > 2 103 return false; 104 } 105 } 106 107 // Is it one of the cases where we know that it only has single bonds? 108 int chg = atom->GetFormalCharge(); 109 int deg = atom->GetTotalDegree(); 110 switch (atom->GetAtomicNum()) { 111 case 6: 112 if (deg == 3 && (chg == 1 || chg == -1)) 113 return false; 114 break; 115 case 5: case 7: case 15: case 33: case 51: case 83: 116 switch (chg) { 117 case 0: // e.g. a pyrrole-type nitrogen 118 if (deg == 3 || deg > 4) 119 return false; 120 break; 121 case -1: 122 if (deg == 2) 123 return false; 124 break; 125 case 1: 126 if (deg > 3) 127 return false; 128 } 129 break; 130 case 8: case 16: case 34: case 52: 131 switch (chg) { 132 case 0: 133 if (deg == 2 || deg == 4 || deg > 5) 134 return false; 135 break; 136 case -1: case 1: 137 if (deg == 3 || deg == 5 || deg > 6) 138 return false; 139 } 140 } 141 142 return true; // It needs a double bond 143 } 144 145 class NodeIterator 146 { 147 public: NodeIterator(unsigned int * & degrees,unsigned int atomArraySize)148 NodeIterator(unsigned int *°rees, unsigned int atomArraySize) : 149 m_degrees(degrees), m_atomArraySize(atomArraySize), 150 m_counter(0), finishedDegTwo(false) 151 { } next()152 unsigned int next() 153 { 154 m_counter++; 155 156 if (!finishedDegTwo) { // return deg 2 nodes first 157 for (; m_counter < m_atomArraySize; ++m_counter) { 158 if (m_degrees[m_counter] == 2) { 159 return m_counter; 160 } 161 } 162 finishedDegTwo = true; 163 m_counter = 1; // first atom has idx 1 164 } 165 166 // return nodes with degree > 2 167 for (; m_counter < m_atomArraySize; ++m_counter) { 168 if (m_degrees[m_counter] > 2) { 169 return m_counter; 170 } 171 } 172 173 // Finished - return 0 signalling the end of iteration 174 return 0; 175 } 176 private: 177 unsigned int *&m_degrees; 178 unsigned int m_atomArraySize; 179 unsigned int m_counter; 180 bool finishedDegTwo; 181 }; 182 AssignDoubleBonds()183 void Kekulizer::AssignDoubleBonds() 184 { 185 int bit; 186 for (bit = doubleBonds->FirstBit(); bit != doubleBonds->EndBit(); bit = doubleBonds->NextBit(bit)) { 187 m_mol->GetBond(bit)->SetBondOrder(2); 188 } 189 } 190 GreedyMatch()191 bool Kekulizer::GreedyMatch() 192 { 193 194 // What atoms need a double bond? The job of kekulization is 195 // to give all of these atoms a single double bond. 196 needs_dbl_bond = new OBBitVec(atomArraySize); // defaults to all False 197 FOR_ATOMS_OF_MOL(atom, m_mol) { 198 if (NeedsDoubleBond(&*atom)) { 199 needs_dbl_bond->SetBitOn(atom->GetIdx()); 200 } 201 } 202 // Make a copy of needs_dbl_bond, to restrict the traversal in BackTrack() 203 kekule_system = new OBBitVec(*needs_dbl_bond); 204 205 // Create lookup of degrees 206 unsigned int *degrees = (unsigned int*)malloc(sizeof(unsigned int)*atomArraySize); 207 memset(degrees, 0, sizeof(unsigned int)*atomArraySize); 208 std::vector<OBAtom*> degreeOneAtoms; 209 FOR_ATOMS_OF_MOL(atom, m_mol) { 210 unsigned int atom_idx = atom->GetIdx(); 211 if (!needs_dbl_bond->BitIsSet(atom_idx)) { 212 degrees[atom_idx] = 0; 213 continue; 214 } 215 unsigned int mdeg = 0; 216 FOR_BONDS_OF_ATOM(bond, &*atom) { 217 if (!bond->IsAromatic()) continue; 218 OBAtom *nbr = bond->GetNbrAtom(&*atom); 219 if (needs_dbl_bond->BitIsSet(nbr->GetIdx())) 220 mdeg++; 221 } 222 degrees[atom_idx] = mdeg; 223 if (mdeg == 1) 224 degreeOneAtoms.push_back(&*atom); 225 } 226 227 // Location of assigned double bonds 228 doubleBonds = new OBBitVec(bondArraySize); // defaults to all False 229 230 bool finished = false; 231 while (true) { // Main loop 232 233 // Complete all of the degree one nodes 234 while (!degreeOneAtoms.empty()) { 235 OBAtom* atom = degreeOneAtoms.back(); 236 degreeOneAtoms.pop_back(); 237 // some nodes may already have been handled 238 if (!needs_dbl_bond->BitIsSet(atom->GetIdx())) continue; 239 FOR_BONDS_OF_ATOM(bond, atom) { 240 if (!bond->IsAromatic()) continue; 241 OBAtom *nbr = bond->GetNbrAtom(&*atom); 242 if (!needs_dbl_bond->BitIsSet(nbr->GetIdx())) continue; 243 // create a double bond from atom -> nbr 244 doubleBonds->SetBitOn(bond->GetIdx()); 245 needs_dbl_bond->SetBitOff(atom->GetIdx()); 246 needs_dbl_bond->SetBitOff(nbr->GetIdx()); 247 // now update degree information for nbr's neighbors 248 FOR_BONDS_OF_ATOM(nbrbond, nbr) { 249 if (&(*nbrbond) == &(*bond) || !nbrbond->IsAromatic()) continue; 250 OBAtom* nbrnbr = nbrbond->GetNbrAtom(nbr); 251 unsigned int nbrnbrIdx = nbrnbr->GetIdx(); 252 if (!needs_dbl_bond->BitIsSet(nbrnbrIdx)) continue; 253 degrees[nbrnbrIdx]--; 254 if (degrees[nbrnbrIdx] == 1) 255 degreeOneAtoms.push_back(nbrnbr); 256 } 257 // only a single double bond can be made to atom so we can break here 258 break; 259 } 260 } 261 262 if (needs_dbl_bond->IsEmpty()) { 263 finished = true; 264 break; 265 } 266 267 // Now handle any remaining degree 2 or 3 nodes 268 // We handle deg 2 nodes first and then 3, and the iteration over these nodes 269 // is abstracted away. Once a double-bond is added that generates more 270 // degree one nodes, then the iterator is exited 271 NodeIterator iterator(degrees, atomArraySize); 272 bool change = false; 273 while (unsigned int atomIdx = iterator.next()) { 274 if (!needs_dbl_bond->BitIsSet(atomIdx)) continue; 275 // The following is almost identical to the code above for deg 1 atoms 276 // except for handling the variable 'change' 277 OBAtom *atom = m_mol->GetAtom(atomIdx); 278 FOR_BONDS_OF_ATOM(bond, atom) { 279 if (!bond->IsAromatic()) continue; 280 OBAtom *nbr = bond->GetNbrAtom(&*atom); 281 if (!needs_dbl_bond->BitIsSet(nbr->GetIdx())) continue; 282 // create a double bond from atom -> nbr 283 doubleBonds->SetBitOn(bond->GetIdx()); 284 needs_dbl_bond->SetBitOff(atomIdx); 285 needs_dbl_bond->SetBitOff(nbr->GetIdx()); 286 // now update degree information for both atom's and nbr's neighbors 287 for(int N=0; N<2; N++) { 288 OBAtom *ref = N == 0 ? atom : nbr; 289 FOR_BONDS_OF_ATOM(nbrbond, ref) { 290 if (&(*nbrbond) == &(*bond) || !nbrbond->IsAromatic()) continue; 291 OBAtom* nbrnbr = nbrbond->GetNbrAtom(ref); 292 unsigned int nbrnbrIdx = nbrnbr->GetIdx(); 293 if (!needs_dbl_bond->BitIsSet(nbrnbrIdx)) continue; 294 degrees[nbrnbrIdx]--; 295 if (degrees[nbrnbrIdx] == 1) { 296 degreeOneAtoms.push_back(nbrnbr); 297 change = true; 298 } 299 } 300 } 301 // only a single double bond can be made to atom so we can break here 302 break; 303 } 304 if (change) 305 break; // exit the iterator once we have actually set a double bond 306 } 307 308 // We exit if we are finished or if no degree 2/3 nodes can be set 309 if (!change) 310 break; 311 } 312 313 // Tidy up 314 free(degrees); 315 316 return finished; 317 } 318 319 // The isDoubleBond alternates between double and single, as we need to find 320 // an alternating path FindPath(unsigned int atomidx,bool isDoubleBond,OBBitVec & visited)321 bool Kekulizer::FindPath(unsigned int atomidx, bool isDoubleBond, OBBitVec &visited) 322 { 323 if (needs_dbl_bond->BitIsSet(atomidx)) 324 return true; 325 visited.SetBitOn(atomidx); 326 OBAtom* atom = m_mol->GetAtom(atomidx); 327 FOR_BONDS_OF_ATOM(bond, atom) { 328 if (!bond->IsAromatic()) continue; 329 OBAtom *nbr = bond->GetNbrAtom(atom); 330 if (!kekule_system->BitIsSet(nbr->GetIdx())) continue; 331 if (doubleBonds->BitIsSet(bond->GetIdx()) == isDoubleBond) { 332 if (visited.BitIsSet(nbr->GetIdx())) continue; 333 bool found_path = FindPath(nbr->GetIdx(), !isDoubleBond, visited); 334 if (found_path) { 335 m_path.push_back(nbr->GetIdx()); 336 return true; 337 } 338 } 339 } 340 visited.SetBitOff(atomidx); 341 return false; 342 } 343 BackTrack()344 bool Kekulizer::BackTrack() 345 { 346 // With an odd number of bits, it's never going to kekulize fully, but let's fill in as many as we can 347 unsigned int count = needs_dbl_bond->CountBits(); 348 349 unsigned int total_handled = 0; 350 int idx; 351 for (idx = needs_dbl_bond->FirstBit(); idx != needs_dbl_bond->EndBit(); idx = needs_dbl_bond->NextBit(idx)) { 352 total_handled++; 353 // If there is no additional bit available to match this bit, then terminate 354 if (total_handled == count) 355 return false; 356 357 // Our goal is to find an alternating path to another atom 358 // that needs a double bond 359 needs_dbl_bond->SetBitOff(idx); // to avoid the trivial null path being found 360 OBBitVec visited(atomArraySize); 361 m_path.clear(); 362 bool found_path = FindPath(idx, false, visited); 363 if (!found_path) { // could only happen if not kekulizable 364 needs_dbl_bond->SetBitOn(idx); // reset 365 continue; 366 } 367 total_handled++; 368 m_path.push_back(idx); 369 needs_dbl_bond->SetBitOff(m_path[0]); 370 // Flip all of the bond orders on the path from double<-->single 371 for (unsigned int i = 0; i < m_path.size()-1; ++i) { 372 OBBond *bond = m_mol->GetBond(m_path[i], m_path[i + 1]); 373 if (i % 2 == 0) 374 doubleBonds->SetBitOn(bond->GetIdx()); 375 else 376 doubleBonds->SetBitOff(bond->GetIdx()); 377 } 378 } 379 return needs_dbl_bond->IsEmpty(); 380 } 381 382 // I'd like to thank John Mayfield for many helpful discussions on the topic of 383 // kekulization, without which this implementation would not have been 384 // possible. 385 // 386 // OBKekulize() implements a two-step kekulization: 387 // Step one: try a greedy match 388 // Step two: try an exhaustive backtracking (using the results of step one) 389 // 390 // The greedy match algorithm is outlined in the thesis of John May 391 // and indeed NeedsDoubleBond() is based on the implementation in Beam. 392 // The greedy algorithm almost always works. But when it doesn't, step two is needed. 393 // 394 // The goal of the exhaustive backtracking is find a path of alternating single/double 395 // bonds between two radicals and flip those bonds. For more information, read about 396 // augmenting paths in the context of perfect matching. John's thesis instead describes 397 // the use of Edmond's Blossom algorithm which scales better - this may or may not be 398 // faster in practice for typical chemical graphs. 399 // 400 // Potential speedups: 401 // * Is OBBitVec performant? I don't know - it seems to do a lot of bounds checking. 402 // You could try replacing all usages theoreof with 403 // std::vector<char>, where the char could possibly handle several flags. 404 // * Before trying the exhaustive search, try a BFS. I have a feeling that this would work 405 // 90% of the time. 406 // * There's a lot of switching between atoms and atom indices (and similar for bonds). 407 // Was this completely necessary? 408 // * The iterator over degree 2 and 3 nodes may iterate twice - it would have been 409 // faster if I just took the first degree 2 or 3 node I came across, but would 410 // it have worked as well? 411 OBKekulize(OBMol * mol)412 bool OBKekulize(OBMol* mol) 413 { 414 Kekulizer kekulizer(mol); 415 bool success = kekulizer.GreedyMatch(); 416 if (!success) { 417 success = kekulizer.BackTrack(); 418 } 419 420 kekulizer.AssignDoubleBonds(); 421 422 return success; 423 } 424 425 426 } // end namespace OpenBabel 427 428 //! \file kekulize.cpp 429 //! \brief Algorithm to kekulize a molecule 430