1 //
2 // Copyright (C) 2018-2020 Susan H. Leung and Greg Landrum
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #include "Tautomer.h"
11 #include "Fragment.h"
12 #include <GraphMol/MolStandardize/FragmentCatalog/FragmentCatalogUtils.h>
13 #include <GraphMol/SmilesParse/SmilesParse.h>
14 #include <GraphMol/SmilesParse/SmilesWrite.h>
15 #include <GraphMol/Substruct/SubstructMatch.h>
16 #include <boost/dynamic_bitset.hpp>
17 #include <algorithm>
18 #include <limits>
19
20 #include <boost/flyweight.hpp>
21 #include <boost/flyweight/key_value.hpp>
22 #include <boost/flyweight/no_tracking.hpp>
23 #include <utility>
24
25 // #define VERBOSE_ENUMERATION 1
26
27 #ifdef VERBOSE_ENUMERATION
28 #include <GraphMol/SmilesParse/SmartsWrite.h>
29 #endif
30
31 using namespace RDKit;
32
33 namespace RDKit {
34
35 namespace MolStandardize {
36
37 namespace TautomerScoringFunctions {
scoreRings(const ROMol & mol)38 int scoreRings(const ROMol &mol) {
39 int score = 0;
40 auto ringInfo = mol.getRingInfo();
41 std::unique_ptr<ROMol> cp;
42 if (!ringInfo->isInitialized()) {
43 cp.reset(new ROMol(mol));
44 MolOps::symmetrizeSSSR(*cp);
45 ringInfo = cp->getRingInfo();
46 }
47 boost::dynamic_bitset<> isArom(mol.getNumBonds());
48 boost::dynamic_bitset<> bothCarbon(mol.getNumBonds());
49 for (const auto &bnd : mol.bonds()) {
50 if (bnd->getIsAromatic()) {
51 isArom.set(bnd->getIdx());
52 if (bnd->getBeginAtom()->getAtomicNum() == 6 &&
53 bnd->getEndAtom()->getAtomicNum() == 6) {
54 bothCarbon.set(bnd->getIdx());
55 }
56 }
57 }
58 for (const auto &bring : ringInfo->bondRings()) {
59 bool allC = true;
60 bool allAromatic = true;
61 for (const auto bidx : bring) {
62 if (!isArom[bidx]) {
63 allAromatic = false;
64 break;
65 }
66 if (!bothCarbon[bidx]) {
67 allC = false;
68 }
69 }
70 if (allAromatic) {
71 score += 100;
72 if (allC) {
73 score += 150;
74 }
75 }
76 }
77 return score;
78 };
79
80 struct smarts_mol_holder {
81 std::string d_smarts;
82 ROMOL_SPTR dp_mol;
smarts_mol_holderRDKit::MolStandardize::TautomerScoringFunctions::smarts_mol_holder83 smarts_mol_holder(const std::string &smarts) : d_smarts(smarts) {
84 dp_mol.reset(SmartsToMol(smarts));
85 }
86 };
87
88 typedef boost::flyweight<
89 boost::flyweights::key_value<std::string, smarts_mol_holder>,
90 boost::flyweights::no_tracking>
91 smarts_mol_flyweight;
92
93 struct SubstructTerm {
94 std::string name;
95 std::string smarts;
96 int score;
97 ROMOL_SPTR matcher;
SubstructTermRDKit::MolStandardize::TautomerScoringFunctions::SubstructTerm98 SubstructTerm(std::string aname, std::string asmarts, int ascore)
99 : name(std::move(aname)), smarts(std::move(asmarts)), score(ascore) {
100 matcher = smarts_mol_flyweight(smarts).get().dp_mol;
101 };
102 };
103
scoreSubstructs(const ROMol & mol)104 int scoreSubstructs(const ROMol &mol) {
105 // a note on efficiency here: we'll construct the SubstructTerm objects here
106 // repeatedly, but the SMARTS parsing for each entry will only be done once
107 // since we're using the boost::flyweights above to cache them
108 const std::vector<SubstructTerm> substructureTerms{
109 {"benzoquinone", "[#6]1([#6]=[#6][#6]([#6]=[#6]1)=,:[N,S,O])=,:[N,S,O]",
110 25},
111 {"oxim", "[#6]=[N][OH]", 4},
112 {"C=O", "[#6]=,:[#8]", 2},
113 {"N=O", "[#7]=,:[#8]", 2},
114 {"P=O", "[#15]=,:[#8]", 2},
115 {"C=hetero", "[C]=[!#1;!#6]", 1},
116 {"C(=hetero)-hetero", "[C](=[!#1;!#6])[!#1;!#6]", 2},
117 {"aromatic C = exocyclic N", "[c]=!@[N]", -1},
118 {"methyl", "[CX4H3]", 1},
119 {"guanidine terminal=N", "[#7]C(=[NR0])[#7H0]", 1},
120 {"guanidine endocyclic=N", "[#7;R][#6;R]([N])=[#7;R]", 2},
121 {"aci-nitro", "[#6]=[N+]([O-])[OH]", -4}};
122 int score = 0;
123 for (const auto &term : substructureTerms) {
124 if (!term.matcher) {
125 BOOST_LOG(rdErrorLog) << " matcher for term " << term.name
126 << " is invalid, ignoring it." << std::endl;
127 continue;
128 }
129 SubstructMatchParameters params;
130 const auto matches = SubstructMatch(mol, *term.matcher, params);
131 // if (!matches.empty()) {
132 // std::cerr << " " << matches.size() << " matches to " << term.name
133 // << std::endl;
134 // }
135 score += static_cast<int>(matches.size()) * term.score;
136 }
137 return score;
138 }
139
scoreHeteroHs(const ROMol & mol)140 int scoreHeteroHs(const ROMol &mol) {
141 int score = 0;
142 for (const auto &at : mol.atoms()) {
143 int anum = at->getAtomicNum();
144 if (anum == 15 || anum == 16 || anum == 34 || anum == 52) {
145 score -= at->getTotalNumHs();
146 }
147 }
148 return score;
149 }
150 } // namespace TautomerScoringFunctions
151
TautomerEnumerator(const CleanupParameters & params)152 TautomerEnumerator::TautomerEnumerator(const CleanupParameters ¶ms)
153 : d_maxTautomers(params.maxTautomers),
154 d_maxTransforms(params.maxTransforms),
155 d_removeSp3Stereo(params.tautomerRemoveSp3Stereo),
156 d_removeBondStereo(params.tautomerRemoveBondStereo),
157 d_removeIsotopicHs(params.tautomerRemoveIsotopicHs),
158 d_reassignStereo(params.tautomerReassignStereo) {
159 TautomerCatalogParams tautParams(params.tautomerTransforms);
160 dp_catalog.reset(new TautomerCatalog(&tautParams));
161 }
162
setTautomerStereoAndIsoHs(const ROMol & mol,ROMol & taut,const TautomerEnumeratorResult & res) const163 bool TautomerEnumerator::setTautomerStereoAndIsoHs(
164 const ROMol &mol, ROMol &taut, const TautomerEnumeratorResult &res) const {
165 bool modified = false;
166 for (auto atom : mol.atoms()) {
167 auto atomIdx = atom->getIdx();
168 if (!res.d_modifiedAtoms.test(atomIdx)) {
169 continue;
170 }
171 auto tautAtom = taut.getAtomWithIdx(atomIdx);
172 // clear chiral tag on sp2 atoms (also sp3 if d_removeSp3Stereo is true)
173 if (tautAtom->getHybridization() == Atom::SP2 || d_removeSp3Stereo) {
174 modified |= (tautAtom->getChiralTag() != Atom::CHI_UNSPECIFIED);
175 tautAtom->setChiralTag(Atom::CHI_UNSPECIFIED);
176 if (tautAtom->hasProp(common_properties::_CIPCode)) {
177 tautAtom->clearProp(common_properties::_CIPCode);
178 }
179 } else {
180 modified |= (tautAtom->getChiralTag() != atom->getChiralTag());
181 tautAtom->setChiralTag(atom->getChiralTag());
182 if (atom->hasProp(common_properties::_CIPCode)) {
183 tautAtom->setProp(
184 common_properties::_CIPCode,
185 atom->getProp<std::string>(common_properties::_CIPCode));
186 }
187 }
188 // remove isotopic Hs if present (and if d_removeIsotopicHs is true)
189 if (tautAtom->hasProp(common_properties::_isotopicHs) &&
190 (d_removeIsotopicHs || !tautAtom->getTotalNumHs())) {
191 tautAtom->clearProp(common_properties::_isotopicHs);
192 }
193 }
194 // remove stereochemistry on bonds that are part of a tautomeric path
195 for (auto bond : mol.bonds()) {
196 auto bondIdx = bond->getIdx();
197 if (!res.d_modifiedBonds.test(bondIdx)) {
198 continue;
199 }
200 std::vector<unsigned int> bondsToClearDirs;
201 if (bond->getBondType() == Bond::DOUBLE &&
202 bond->getStereo() > Bond::STEREOANY) {
203 // look around the beginning and end atoms and check for bonds with
204 // direction set
205 for (auto atom : {bond->getBeginAtom(), bond->getEndAtom()}) {
206 for (const auto &nbri :
207 boost::make_iterator_range(mol.getAtomBonds(atom))) {
208 const auto &obnd = mol[nbri];
209 if (obnd->getBondDir() == Bond::ENDDOWNRIGHT ||
210 obnd->getBondDir() == Bond::ENDUPRIGHT) {
211 bondsToClearDirs.push_back(obnd->getIdx());
212 }
213 }
214 }
215 }
216 auto tautBond = taut.getBondWithIdx(bondIdx);
217 if (tautBond->getBondType() != Bond::DOUBLE || d_removeBondStereo) {
218 modified |= (tautBond->getStereo() != Bond::STEREONONE);
219 tautBond->setStereo(Bond::STEREONONE);
220 tautBond->getStereoAtoms().clear();
221 for (auto bi : bondsToClearDirs) {
222 taut.getBondWithIdx(bi)->setBondDir(Bond::NONE);
223 }
224 } else {
225 const INT_VECT &sa = bond->getStereoAtoms();
226 modified |= (tautBond->getStereo() != bond->getStereo() ||
227 sa.size() != tautBond->getStereoAtoms().size());
228 if (sa.size() == 2) {
229 tautBond->setStereoAtoms(sa.front(), sa.back());
230 }
231 tautBond->setStereo(bond->getStereo());
232 for (auto bi : bondsToClearDirs) {
233 taut.getBondWithIdx(bi)->setBondDir(
234 mol.getBondWithIdx(bi)->getBondDir());
235 }
236 }
237 }
238 if (d_reassignStereo) {
239 static const bool cleanIt = true;
240 static const bool force = true;
241 MolOps::assignStereochemistry(taut, cleanIt, force);
242 } else {
243 taut.setProp(common_properties::_StereochemDone, 1);
244 }
245 return modified;
246 }
247
enumerate(const ROMol & mol,boost::dynamic_bitset<> * modifiedAtoms,boost::dynamic_bitset<> * modifiedBonds) const248 std::vector<ROMOL_SPTR> TautomerEnumerator::enumerate(
249 const ROMol &mol, boost::dynamic_bitset<> *modifiedAtoms,
250 boost::dynamic_bitset<> *modifiedBonds) const {
251 TautomerEnumeratorResult tresult = enumerate(mol);
252 if (modifiedAtoms) {
253 *modifiedAtoms = tresult.modifiedAtoms();
254 }
255 if (modifiedBonds) {
256 *modifiedBonds = tresult.modifiedBonds();
257 }
258 return tresult.tautomers();
259 }
260
enumerate(const ROMol & mol) const261 TautomerEnumeratorResult TautomerEnumerator::enumerate(const ROMol &mol) const {
262 #ifdef VERBOSE_ENUMERATION
263 std::cout << "**********************************" << std::endl;
264 #endif
265 PRECONDITION(dp_catalog, "no catalog!");
266 const TautomerCatalogParams *tautparams = dp_catalog->getCatalogParams();
267 PRECONDITION(tautparams, "");
268
269 TautomerEnumeratorResult res;
270
271 const std::vector<TautomerTransform> &transforms =
272 tautparams->getTransforms();
273
274 // Enumerate all possible tautomers and return them as a vector.
275 // smi is the input molecule SMILES
276 std::string smi = MolToSmiles(mol, true);
277 // taut is a copy of the input molecule
278 ROMOL_SPTR taut(new ROMol(mol));
279 // Create a kekulized form of the molecule to match the SMARTS against
280 RWMOL_SPTR kekulized(new RWMol(mol));
281 MolOps::Kekulize(*kekulized, false);
282 res.d_tautomers = {{smi, Tautomer(taut, kekulized, 0, 0)}};
283 res.d_modifiedAtoms.resize(mol.getNumAtoms());
284 res.d_modifiedBonds.resize(mol.getNumBonds());
285 bool completed = false;
286 bool bailOut = false;
287 unsigned int nTransforms = 0;
288 static const std::array<const char *, 4> statusMsg{
289 "completed", "max tautomers reached", "max transforms reached",
290 "canceled"};
291
292 while (!completed && !bailOut) {
293 // std::map automatically sorts res.d_tautomers into alphabetical order
294 // (SMILES)
295 for (auto &smilesTautomerPair : res.d_tautomers) {
296 #ifdef VERBOSE_ENUMERATION
297 std::cout << "Current tautomers: " << std::endl;
298 for (const auto &smilesTautomerPair : res.d_tautomers) {
299 std::cout << smilesTautomerPair.first << " done "
300 << smilesTautomerPair.second.d_done << std::endl;
301 }
302 #endif
303 std::string tsmiles;
304 if (smilesTautomerPair.second.d_done) {
305 #ifdef VERBOSE_ENUMERATION
306 std::cout << "Skipping " << smilesTautomerPair.first << " as already done" << std::endl;
307 #endif
308 continue;
309 }
310 #ifdef VERBOSE_ENUMERATION
311 std::cout << "Looking at tautomer: " << smilesTautomerPair.first
312 << std::endl;
313 #endif
314 // tautomer not yet done
315 for (const auto &transform : transforms) {
316 if (bailOut) {
317 break;
318 }
319 // kmol is the kekulized version of the tautomer
320 const auto &kmol = smilesTautomerPair.second.kekulized;
321 std::vector<MatchVectType> matches;
322 unsigned int matched =
323 SubstructMatch(*kmol, *(transform.Mol), matches);
324
325 if (!matched) {
326 continue;
327 }
328 ++nTransforms;
329 #ifdef VERBOSE_ENUMERATION
330 std::string name;
331 (transform.Mol)->getProp(common_properties::_Name, name);
332 std::cout << "kmol for " << smilesTautomerPair.first << " : " << MolToSmiles(*kmol) << std::endl;
333 std::cout << "transform mol: " << MolToSmarts(*(transform.Mol))
334 << std::endl;
335
336 std::cout << "Matched: " << name << std::endl;
337 #endif
338 // loop over transform matches
339 for (const auto &match : matches) {
340 if (nTransforms >= d_maxTransforms) {
341 res.d_status = TautomerEnumeratorStatus::MaxTransformsReached;
342 bailOut = true;
343 } else if (res.d_tautomers.size() >= d_maxTautomers) {
344 res.d_status = TautomerEnumeratorStatus::MaxTautomersReached;
345 bailOut = true;
346 } else if (d_callback.get() && !(*d_callback)(mol, res)) {
347 res.d_status = TautomerEnumeratorStatus::Canceled;
348 bailOut = true;
349 }
350 if (bailOut) {
351 break;
352 }
353 // Create a copy of in the input molecule so we can modify it
354 // Use kekule form so bonds are explicitly single/double instead of
355 // aromatic
356 RWMOL_SPTR product(new RWMol(*kmol));
357 // Remove a hydrogen from the first matched atom and add one to the
358 // last
359 int firstIdx = match.front().second;
360 int lastIdx = match.back().second;
361 Atom *first = product->getAtomWithIdx(firstIdx);
362 Atom *last = product->getAtomWithIdx(lastIdx);
363 res.d_modifiedAtoms.set(firstIdx);
364 res.d_modifiedAtoms.set(lastIdx);
365 first->setNumExplicitHs(
366 std::max(0, static_cast<int>(first->getTotalNumHs()) - 1));
367 last->setNumExplicitHs(last->getTotalNumHs() + 1);
368 // Remove any implicit hydrogens from the first and last atoms
369 // now we have set the count explicitly
370 first->setNoImplicit(true);
371 last->setNoImplicit(true);
372 // Adjust bond orders
373 unsigned int bi = 0;
374 for (size_t i = 0; i < match.size() - 1; ++i) {
375 Bond *bond = product->getBondBetweenAtoms(match[i].second,
376 match[i + 1].second);
377 ASSERT_INVARIANT(bond, "required bond not found");
378 // check if bonds is specified in tautomer.in file
379 if (!transform.BondTypes.empty()) {
380 bond->setBondType(transform.BondTypes[bi]);
381 ++bi;
382 } else {
383 Bond::BondType bondtype = bond->getBondType();
384 #ifdef VERBOSE_ENUMERATION
385 std::cout << "Bond as double: " << bond->getBondTypeAsDouble()
386 << std::endl;
387 std::cout << bondtype << std::endl;
388 #endif
389 if (bondtype == Bond::SINGLE) {
390 bond->setBondType(Bond::DOUBLE);
391 #ifdef VERBOSE_ENUMERATION
392 std::cout << "Set bond to double" << std::endl;
393 #endif
394 }
395 if (bondtype == Bond::DOUBLE) {
396 bond->setBondType(Bond::SINGLE);
397 #ifdef VERBOSE_ENUMERATION
398 std::cout << "Set bond to single" << std::endl;
399 #endif
400 }
401 }
402 res.d_modifiedBonds.set(bond->getIdx());
403 }
404 // TODO adjust charges
405 if (!transform.Charges.empty()) {
406 unsigned int ci = 0;
407 for (const auto &pair : match) {
408 Atom *atom = product->getAtomWithIdx(pair.second);
409 atom->setFormalCharge(atom->getFormalCharge() +
410 transform.Charges[ci++]);
411 }
412 }
413
414 unsigned int failedOp;
415 MolOps::sanitizeMol(*product, failedOp,
416 MolOps::SANITIZE_KEKULIZE |
417 MolOps::SANITIZE_SETAROMATICITY |
418 MolOps::SANITIZE_SETCONJUGATION |
419 MolOps::SANITIZE_SETHYBRIDIZATION |
420 MolOps::SANITIZE_ADJUSTHS);
421 #ifdef VERBOSE_ENUMERATION
422 std::cout << "pre-setTautomerStereo: " << MolToSmiles(*product, true)
423 << std::endl;
424 #endif
425 setTautomerStereoAndIsoHs(mol, *product, res);
426 tsmiles = MolToSmiles(*product, true);
427 #ifdef VERBOSE_ENUMERATION
428 (transform.Mol)->getProp(common_properties::_Name, name);
429 std::cout << "Applied rule: " << name << " to " << smilesTautomerPair.first
430 << std::endl;
431 #endif
432 if (res.d_tautomers.find(tsmiles) != res.d_tautomers.end()) {
433 #ifdef VERBOSE_ENUMERATION
434 std::cout << "Previous tautomer produced again: " << tsmiles
435 << std::endl;
436 #endif
437 continue;
438 }
439 // in addition to the above transformations, sanitization may modify
440 // bonds, e.g. Cc1nc2ccccc2[nH]1
441 for (size_t i = 0; i < mol.getNumBonds(); i++) {
442 auto molBondType = mol.getBondWithIdx(i)->getBondType();
443 auto tautBondType = product->getBondWithIdx(i)->getBondType();
444 if (molBondType != tautBondType && !res.d_modifiedBonds.test(i)) {
445 #ifdef VERBOSE_ENUMERATION
446 std::cout << "Sanitization has modified bond " << i
447 << std::endl;
448 #endif
449 res.d_modifiedBonds.set(i);
450 }
451 }
452 RWMOL_SPTR kekulized_product(new RWMol(*product));
453 MolOps::Kekulize(*kekulized_product, false);
454 #ifdef VERBOSE_ENUMERATION
455 auto it = res.d_tautomers.find(tsmiles);
456 if (it == res.d_tautomers.end()) {
457 std::cout << "New tautomer added as ";
458 } else {
459 std::cout << "New tautomer replaced for ";
460 }
461 std::cout << tsmiles << ", taut: " << MolToSmiles(*product)
462 << ", kek: " << MolToSmiles(*kekulized_product, true, true)
463 << std::endl;
464 #endif
465 res.d_tautomers[tsmiles] = Tautomer(
466 std::move(product), std::move(kekulized_product),
467 res.d_modifiedAtoms.count(), res.d_modifiedBonds.count());
468 }
469 }
470 smilesTautomerPair.second.d_done = true;
471 }
472 completed = true;
473 size_t maxNumModifiedAtoms = res.d_modifiedAtoms.count();
474 size_t maxNumModifiedBonds = res.d_modifiedBonds.count();
475 for (auto it = res.d_tautomers.begin(); it != res.d_tautomers.end();) {
476 auto &taut = it->second;
477 if (!taut.d_done) {
478 completed = false;
479 }
480 if ((taut.d_numModifiedAtoms < maxNumModifiedAtoms ||
481 taut.d_numModifiedBonds < maxNumModifiedBonds) &&
482 setTautomerStereoAndIsoHs(mol, *taut.tautomer, res)) {
483 Tautomer tautStored = std::move(taut);
484 it = res.d_tautomers.erase(it);
485 tautStored.d_numModifiedAtoms = maxNumModifiedAtoms;
486 tautStored.d_numModifiedBonds = maxNumModifiedBonds;
487 auto insertRes = res.d_tautomers.insert(std::make_pair(
488 MolToSmiles(*tautStored.tautomer), std::move(tautStored)));
489 if (insertRes.second) {
490 it = insertRes.first;
491 }
492 } else {
493 ++it;
494 }
495 }
496 if (bailOut && res.d_tautomers.size() < d_maxTautomers &&
497 res.d_status == TautomerEnumeratorStatus::MaxTautomersReached) {
498 res.d_status = TautomerEnumeratorStatus::Completed;
499 bailOut = false;
500 }
501 } // while
502 res.fillTautomersItVec();
503 if (!completed) {
504 BOOST_LOG(rdWarningLog)
505 << "Tautomer enumeration stopped at " << res.d_tautomers.size()
506 << " tautomers: " << statusMsg.at(static_cast<size_t>(res.d_status))
507 << std::endl;
508 }
509
510 return res;
511 }
512
513 // pickCanonical non-templated overload that avoids recomputing SMILES
pickCanonical(const TautomerEnumeratorResult & tautRes,boost::function<int (const ROMol & mol)> scoreFunc) const514 ROMol *TautomerEnumerator::pickCanonical(
515 const TautomerEnumeratorResult &tautRes,
516 boost::function<int(const ROMol &mol)> scoreFunc) const {
517 ROMOL_SPTR bestMol;
518 if (tautRes.d_tautomers.size() == 1) {
519 bestMol = tautRes.d_tautomers.begin()->second.tautomer;
520 } else {
521 // Calculate score for each tautomer
522 int bestScore = std::numeric_limits<int>::min();
523 std::string bestSmiles = "";
524 for (const auto &t : tautRes.d_tautomers) {
525 auto score = scoreFunc(*t.second.tautomer);
526 #ifdef VERBOSE_ENUMERATION
527 std::cerr << " " << t.first << " " << score << std::endl;
528 #endif
529 if (score > bestScore) {
530 bestScore = score;
531 bestSmiles = t.first;
532 bestMol = t.second.tautomer;
533 } else if (score == bestScore) {
534 if (t.first < bestSmiles) {
535 bestSmiles = t.first;
536 bestMol = t.second.tautomer;
537 }
538 }
539 }
540 }
541 ROMol *res = new ROMol(*bestMol);
542 static const bool cleanIt = true;
543 static const bool force = true;
544 MolOps::assignStereochemistry(*res, cleanIt, force);
545
546 return res;
547 }
548
canonicalize(const ROMol & mol,boost::function<int (const ROMol & mol)> scoreFunc) const549 ROMol *TautomerEnumerator::canonicalize(
550 const ROMol &mol, boost::function<int(const ROMol &mol)> scoreFunc) const {
551 auto thisCopy = TautomerEnumerator(*this);
552 thisCopy.setReassignStereo(false);
553 auto res = thisCopy.enumerate(mol);
554 if (res.empty()) {
555 BOOST_LOG(rdWarningLog)
556 << "no tautomers found, returning input molecule" << std::endl;
557 return new ROMol(mol);
558 }
559 return pickCanonical(res, scoreFunc);
560 }
561
562 } // namespace MolStandardize
563 } // namespace RDKit
564