1 //
2 // Copyright (C) 2002-2019 Greg Landrum and Rational Discovery LLC
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #include "SmartsWrite.h"
11 #include <sstream>
12 #include <cstdint>
13 #include <boost/algorithm/string.hpp>
14 #include "SmilesWrite.h"
15 #include <GraphMol/RDKitBase.h>
16 #include <GraphMol/RDKitQueries.h>
17 #include <GraphMol/Canon.h>
18 #include <GraphMol/new_canon.h>
19 #include <RDGeneral/Exceptions.h>
20 #include <RDGeneral/RDLog.h>
21
22 namespace RDKit {
23 using namespace Canon;
24
25 // local utility namespace
26 namespace {
27
28 enum class QueryBoolFeatures {
29 HAS_AND = 0x1,
30 HAS_LOWAND = 0x2,
31 HAS_OR = 0x4,
32 HAS_RECURSION = 0x8
33 };
34
35 std::string _recurseGetSmarts(const QueryAtom *qatom,
36 const QueryAtom::QUERYATOM_QUERY *node,
37 bool negate, unsigned int &features);
38 std::string _recurseBondSmarts(const Bond *bond,
39 const QueryBond::QUERYBOND_QUERY *node,
40 bool negate, int atomToLeftIdx,
41 unsigned int &features);
42
_combineChildSmarts(std::string cs1,unsigned int features1,std::string cs2,unsigned int features2,std::string descrip,unsigned int & features)43 std::string _combineChildSmarts(std::string cs1, unsigned int features1,
44 std::string cs2, unsigned int features2,
45 std::string descrip, unsigned int &features) {
46 std::string res = "";
47 if ((descrip.find("Or") > 0) && (descrip.find("Or") < descrip.length())) {
48 // if either of child smarts already have a "," and ";" we can't have one
49 // more OR here
50 if ((features1 & static_cast<unsigned int>(QueryBoolFeatures::HAS_LOWAND) &&
51 features1 & static_cast<unsigned int>(QueryBoolFeatures::HAS_OR)) ||
52 (features2 & static_cast<unsigned int>(QueryBoolFeatures::HAS_LOWAND) &&
53 features2 & static_cast<unsigned int>(QueryBoolFeatures::HAS_OR))) {
54 throw ValueErrorException(
55 "This is a non-smartable query - OR above and below AND in the "
56 "binary tree");
57 }
58 res += cs1;
59 if (!(cs1.empty() || cs2.empty())) {
60 res += ",";
61 }
62 res += cs2;
63 features |= static_cast<unsigned int>(QueryBoolFeatures::HAS_OR);
64 } else if ((descrip.find("And") > 0) &&
65 (descrip.find("And") < descrip.length())) {
66 std::string symb;
67 if (features1 & static_cast<unsigned int>(QueryBoolFeatures::HAS_OR) ||
68 features2 & static_cast<unsigned int>(QueryBoolFeatures::HAS_OR)) {
69 symb = ";";
70 features |= static_cast<unsigned int>(QueryBoolFeatures::HAS_LOWAND);
71 } else {
72 symb = "&";
73 features |= static_cast<unsigned int>(QueryBoolFeatures::HAS_AND);
74 }
75 res += cs1;
76 if (!(cs1.empty() || cs2.empty())) {
77 res += symb;
78 }
79 res += cs2;
80 } else {
81 std::stringstream err;
82 err << "Don't know how to combine using " << descrip;
83 throw ValueErrorException(err.str());
84 }
85 features |= features1;
86 features |= features2;
87
88 return res;
89 } // namespace
90
91 template <typename T>
describeQuery(const T * query,std::string leader="\\t")92 void describeQuery(const T *query, std::string leader = "\t") {
93 // BOOST_LOG(rdInfoLog) << leader << query->getDescription() << std::endl;
94 typename T::CHILD_VECT_CI iter;
95 for (iter = query->beginChildren(); iter != query->endChildren(); ++iter) {
96 describeQuery(iter->get(), leader + "\t");
97 }
98 }
99
100 const static std::string _qatomHasStereoSet = "_qatomHasStereoSet";
getAtomSmartsSimple(const QueryAtom * qatom,const ATOM_EQUALS_QUERY * query,bool & needParen)101 std::string getAtomSmartsSimple(const QueryAtom *qatom,
102 const ATOM_EQUALS_QUERY *query,
103 bool &needParen) {
104 PRECONDITION(query, "bad query");
105
106 std::string descrip = query->getDescription();
107 bool hasVal = false;
108 enum class Modifiers : std::uint8_t { NONE, RANGE, LESS, GREATER };
109 Modifiers mods = Modifiers::NONE;
110 if (boost::starts_with(descrip, "range_")) {
111 mods = Modifiers::RANGE;
112 descrip = descrip.substr(6);
113 } else if (boost::starts_with(descrip, "less_")) {
114 mods = Modifiers::LESS;
115 descrip = descrip.substr(5);
116 } else if (boost::starts_with(descrip, "greater_")) {
117 mods = Modifiers::GREATER;
118 descrip = descrip.substr(8);
119 }
120 std::stringstream res;
121 if (descrip == "AtomImplicitHCount") {
122 res << "h";
123 hasVal = true;
124 needParen = true;
125 } else if (descrip == "AtomHasImplicitH") {
126 res << "h";
127 needParen = true;
128 } else if (descrip == "AtomTotalValence") {
129 res << "v";
130 hasVal = true;
131 needParen = true;
132 } else if (descrip == "AtomAtomicNum") {
133 res << "#";
134 hasVal = true;
135 needParen = true;
136 } else if (descrip == "AtomExplicitDegree") {
137 res << "D";
138 hasVal = true;
139 needParen = true;
140 } else if (descrip == "AtomNonHydrogenDegree") {
141 res << "d";
142 hasVal = true;
143 needParen = true;
144 } else if (descrip == "AtomTotalDegree") {
145 res << "X";
146 hasVal = true;
147 needParen = true;
148 } else if (descrip == "AtomHasRingBond") {
149 res << "x";
150 needParen = true;
151 } else if (descrip == "AtomHCount") {
152 res << "H";
153 hasVal = true;
154 needParen = true;
155 } else if (descrip == "AtomIsAliphatic") {
156 res << "A";
157 needParen = false;
158 } else if (descrip == "AtomIsAromatic") {
159 res << "a";
160 needParen = false;
161 } else if (descrip == "AtomNull") {
162 res << "*";
163 needParen = false;
164 } else if (descrip == "AtomInRing") {
165 res << "R";
166 needParen = true;
167 } else if (descrip == "AtomMinRingSize") {
168 res << "r";
169 hasVal = true;
170 needParen = true;
171 } else if (descrip == "AtomInNRings") {
172 res << "R";
173 if (mods == Modifiers::NONE && query->getVal() >= 0) {
174 hasVal = true;
175 }
176 needParen = true;
177 } else if (descrip == "AtomHasHeteroatomNeighbors") {
178 res << "z";
179 needParen = true;
180 } else if (descrip == "AtomNumHeteroatomNeighbors") {
181 res << "z";
182 hasVal = true;
183 needParen = true;
184 } else if (descrip == "AtomHasAliphaticHeteroatomNeighbors") {
185 res << "Z";
186 needParen = true;
187 } else if (descrip == "AtomNumAliphaticHeteroatomNeighbors") {
188 res << "Z";
189 hasVal = true;
190 needParen = true;
191 } else if (descrip == "AtomFormalCharge") {
192 int val = query->getVal();
193 if (val < 0) {
194 res << "-";
195 } else {
196 res << "+";
197 }
198 if (mods == Modifiers::NONE && abs(val) != 1) {
199 res << abs(val);
200 }
201 needParen = true;
202 } else if (descrip == "AtomNegativeFormalCharge") {
203 int val = query->getVal();
204 if (val < 0) {
205 res << "+";
206 } else {
207 res << "-";
208 }
209 if (mods == Modifiers::NONE && abs(val) != 1) {
210 res << abs(val);
211 }
212 needParen = true;
213 } else if (descrip == "AtomHybridization") {
214 res << "^";
215 switch (query->getVal()) {
216 case Atom::S:
217 res << "0";
218 break;
219 case Atom::SP:
220 res << "1";
221 break;
222 case Atom::SP2:
223 res << "2";
224 break;
225 case Atom::SP3:
226 res << "3";
227 break;
228 case Atom::SP3D:
229 res << "4";
230 break;
231 case Atom::SP3D2:
232 res << "5";
233 break;
234 }
235 needParen = true;
236 } else if (descrip == "AtomMass") {
237 res << query->getVal() / massIntegerConversionFactor << "*";
238 needParen = true;
239 } else if (descrip == "AtomIsotope") {
240 res << query->getVal() << "*";
241 needParen = true;
242 } else if (descrip == "AtomRingBondCount") {
243 res << "x";
244 hasVal = true;
245 needParen = true;
246 } else if (descrip == "AtomUnsaturated") {
247 res << "$(*=,:,#*)";
248 needParen = true;
249 } else if (descrip == "AtomType") {
250 int atNum;
251 bool isAromatic;
252 parseAtomType(query->getVal(), atNum, isAromatic);
253 std::string symbol = PeriodicTable::getTable()->getElementSymbol(atNum);
254 if (isAromatic) {
255 symbol[0] += ('a' - 'A');
256 }
257 res << symbol;
258 if (!SmilesWrite::inOrganicSubset(atNum)) {
259 needParen = true;
260 }
261 } else {
262 BOOST_LOG(rdWarningLog)
263 << "Cannot write SMARTS for query type : " << descrip
264 << ". Ignoring it." << std::endl;
265 res << "*";
266 }
267
268 if (mods != Modifiers::NONE) {
269 res << "{";
270 switch (mods) {
271 case Modifiers::LESS:
272 res << ((const ATOM_LESSEQUAL_QUERY *)query)->getVal() << "-";
273 break;
274 case Modifiers::RANGE:
275 res << ((const ATOM_RANGE_QUERY *)query)->getLower() << "-"
276 << ((const ATOM_RANGE_QUERY *)query)->getUpper();
277 break;
278 case Modifiers::GREATER:
279 res << "-" << ((const ATOM_GREATEREQUAL_QUERY *)query)->getVal();
280 break;
281 default:
282 break;
283 }
284 res << "}";
285 } else if (hasVal) {
286 res << query->getVal();
287 }
288
289 // handle atomic stereochemistry
290 if (qatom->hasOwningMol() &&
291 qatom->getOwningMol().hasProp(common_properties::_doIsoSmiles)) {
292 if (qatom->getChiralTag() != Atom::CHI_UNSPECIFIED &&
293 !qatom->hasProp(_qatomHasStereoSet) &&
294 !qatom->hasProp(common_properties::_brokenChirality)) {
295 qatom->setProp(_qatomHasStereoSet, 1);
296 switch (qatom->getChiralTag()) {
297 case Atom::CHI_TETRAHEDRAL_CW:
298 res << "@@";
299 needParen = true;
300 break;
301 case Atom::CHI_TETRAHEDRAL_CCW:
302 res << "@";
303 needParen = true;
304 break;
305 default:
306 break;
307 }
308 }
309 }
310
311 return res.str();
312 }
313
getRecursiveStructureQuerySmarts(const QueryAtom::QUERYATOM_QUERY * query)314 std::string getRecursiveStructureQuerySmarts(
315 const QueryAtom::QUERYATOM_QUERY *query) {
316 PRECONDITION(query, "bad query");
317 PRECONDITION(query->getDescription() == "RecursiveStructure", "bad query");
318 const auto *rquery = static_cast<const RecursiveStructureQuery *>(query);
319 auto *qmol = const_cast<ROMol *>(rquery->getQueryMol());
320 std::string res = MolToSmarts(*qmol);
321 res = "$(" + res + ")";
322 if (rquery->getNegation()) {
323 res = "!" + res;
324 }
325 return res;
326 }
327
getBasicBondRepr(Bond::BondType typ,Bond::BondDir dir,bool doIsomericSmiles,bool reverseDative)328 std::string getBasicBondRepr(Bond::BondType typ, Bond::BondDir dir,
329 bool doIsomericSmiles, bool reverseDative) {
330 std::string res;
331 switch (typ) {
332 case Bond::SINGLE:
333 res = "-";
334 if (doIsomericSmiles) {
335 if (dir == Bond::ENDDOWNRIGHT) {
336 res = "\\";
337 } else if (dir == Bond::ENDUPRIGHT) {
338 res = "/";
339 }
340 }
341 break;
342 case Bond::DOUBLE:
343 res = "=";
344 break;
345 case Bond::TRIPLE:
346 res = "#";
347 break;
348 case Bond::AROMATIC:
349 res = ":";
350 break;
351 case Bond::DATIVE:
352 if (reverseDative) {
353 res = "<-";
354 } else {
355 res = "->";
356 }
357 break;
358 default:
359 res = "";
360 }
361 return res;
362 } // namespace
363
getBondSmartsSimple(const Bond * bond,const BOND_EQUALS_QUERY * bquery,int atomToLeftIdx)364 std::string getBondSmartsSimple(const Bond *bond,
365 const BOND_EQUALS_QUERY *bquery,
366 int atomToLeftIdx) {
367 PRECONDITION(bond, "bad bond");
368
369 PRECONDITION(bquery, "bad query");
370 std::string descrip = bquery->getDescription();
371 std::string res = "";
372 if (descrip == "BondNull") {
373 res += "~";
374 } else if (descrip == "BondInRing") {
375 res += "@";
376 } else if (descrip == "SingleOrAromaticBond") {
377 // don't need to do anything here... :-)
378 } else if (descrip == "SingleOrDoubleBond") {
379 res += "-,=";
380 } else if (descrip == "DoubleOrAromaticBond") {
381 res += "=,:";
382 } else if (descrip == "SingleOrDoubleOrAromaticBond") {
383 res += "-,=,:";
384 } else if (descrip == "BondDir") {
385 int val = bquery->getVal();
386 if (val == static_cast<int>(Bond::ENDDOWNRIGHT)) {
387 res += "\\";
388 } else if (val == static_cast<int>(Bond::ENDUPRIGHT)) {
389 res += "/";
390 } else {
391 throw "Can't write smarts for this bond dir type";
392 }
393 } else if (descrip == "BondOrder") {
394 bool reverseDative =
395 (atomToLeftIdx >= 0 &&
396 bond->getBeginAtomIdx() != static_cast<unsigned int>(atomToLeftIdx));
397 bool doIsoSmiles =
398 !bond->hasOwningMol() ||
399 bond->getOwningMol().hasProp(common_properties::_doIsoSmiles);
400 res += getBasicBondRepr(static_cast<Bond::BondType>(bquery->getVal()),
401 bond->getBondDir(), doIsoSmiles, reverseDative);
402 } else {
403 std::stringstream msg;
404 msg << "Can't write smarts for this query bond type: " << descrip;
405 throw msg.str().c_str();
406 }
407 return res;
408 }
409
_recurseGetSmarts(const QueryAtom * qatom,const QueryAtom::QUERYATOM_QUERY * node,bool negate,unsigned int & features)410 std::string _recurseGetSmarts(const QueryAtom *qatom,
411 const QueryAtom::QUERYATOM_QUERY *node,
412 bool negate, unsigned int &features) {
413 PRECONDITION(node, "bad node");
414 // the algorithm goes something like this
415 // - recursively get the smarts for the child queries
416 // - combine the child smarts using the following rules:
417 // - if we are currently at an OR query, combine the subqueries with a
418 // ",",
419 // but only if neither of child smarts do not contain "," and ";"
420 // This situation leads to a no smartable situation and throw an
421 // error
422 // - if we are currently at an and query, combine the child smarts with
423 // "&"
424 // if neither of the child smarts contain a "," - otherwise combine
425 // them
426 // the child smarts with a ";"
427 //
428 // There is an additional complication with composite nodes that carry a
429 // negation - in this
430 // case we will propagate the negation to the child nodes using the
431 // following rules
432 // NOT (a AND b) = ( NOT (a)) AND ( NOT (b))
433 // NOT (a OR b) = ( NOT (a)) OR ( NOT (b))
434
435 auto descrip = node->getDescription();
436
437 unsigned int child1Features = 0;
438 unsigned int child2Features = 0;
439 auto chi = node->beginChildren();
440 auto child1 = chi->get();
441 auto dsc1 = child1->getDescription();
442
443 ++chi;
444 CHECK_INVARIANT(chi != node->endChildren(),
445 "Not enough children on the query");
446
447 bool needParen;
448 std::string csmarts1;
449 // deal with the first child
450 if (dsc1 == "RecursiveStructure") {
451 csmarts1 = getRecursiveStructureQuerySmarts(child1);
452 features |= static_cast<unsigned int>(QueryBoolFeatures::HAS_RECURSION);
453 } else if ((dsc1 != "AtomOr") && (dsc1 != "AtomAnd")) {
454 // child 1 is a simple node
455 const auto *tchild = static_cast<const ATOM_EQUALS_QUERY *>(child1);
456 csmarts1 = getAtomSmartsSimple(qatom, tchild, needParen);
457 bool nneg = (negate) ^ (tchild->getNegation());
458 if (nneg) {
459 csmarts1 = "!" + csmarts1;
460 }
461 } else {
462 // child 1 is composite node - recurse
463 bool nneg = (negate) ^ (child1->getNegation());
464 csmarts1 = _recurseGetSmarts(qatom, child1, nneg, child1Features);
465 }
466 // ok if we have a negation and we have an OR , we have to change to
467 // an AND since we propagated the negation
468 // i.e NOT (A OR B) = (NOT (A)) AND (NOT(B))
469 if (negate) {
470 if (descrip == "AtomOr") {
471 descrip = "AtomAnd";
472 } else if (descrip == "AtomAnd") {
473 descrip = "AtomOr";
474 }
475 }
476 auto res = csmarts1;
477 while (chi != node->endChildren()) {
478 auto child2 = chi->get();
479 ++chi;
480
481 auto dsc2 = child2->getDescription();
482 std::string csmarts2;
483
484 // deal with the next child
485 if (dsc2 == "RecursiveStructure") {
486 csmarts2 = getRecursiveStructureQuerySmarts(child2);
487 features |= static_cast<unsigned int>(QueryBoolFeatures::HAS_RECURSION);
488 } else if ((dsc2 != "AtomOr") && (dsc2 != "AtomAnd")) {
489 // child 2 is a simple node
490 const auto *tchild = static_cast<const ATOM_EQUALS_QUERY *>(child2);
491 csmarts2 = getAtomSmartsSimple(qatom, tchild, needParen);
492 bool nneg = (negate) ^ (tchild->getNegation());
493 if (nneg) {
494 csmarts2 = "!" + csmarts2;
495 }
496 } else {
497 bool nneg = (negate) ^ (child2->getNegation());
498 csmarts2 = _recurseGetSmarts(qatom, child2, nneg, child2Features);
499 }
500
501 res = _combineChildSmarts(res, child1Features, csmarts2, child2Features,
502 descrip, features);
503 }
504 return res;
505 }
506
_recurseBondSmarts(const Bond * bond,const QueryBond::QUERYBOND_QUERY * node,bool negate,int atomToLeftIdx,unsigned int & features)507 std::string _recurseBondSmarts(const Bond *bond,
508 const QueryBond::QUERYBOND_QUERY *node,
509 bool negate, int atomToLeftIdx,
510 unsigned int &features) {
511 // the algorithm goes something like this
512 // - recursively get the smarts for the child query bonds
513 // - combine the child smarts using the following rules:
514 // - if we are currently at an OR query, combine the subqueries with a
515 // ",",
516 // but only if neither of child smarts do not contain "," and ";"
517 // This situation leads to a no smartable situation and throw an
518 // error
519 // - if we are currently at an and query, combine the child smarts with
520 // "&"
521 // if neither of the child smarts contain a "," - otherwise combine
522 // them
523 // the child smarts with a ";"
524 //
525 // There is an additional complication with composite nodes that carry a
526 // negation - in this
527 // case we will propagate the negation to the child nodes using the
528 // following rules
529 // NOT (a AND b) = ( NOT (a)) AND ( NOT (b))
530 // NOT (a OR b) = ( NOT (a)) OR ( NOT (b))
531 PRECONDITION(bond, "bad bond");
532 PRECONDITION(node, "bad node");
533 std::string descrip = node->getDescription();
534 std::string res = "";
535
536 const QueryBond::QUERYBOND_QUERY *child1;
537 const QueryBond::QUERYBOND_QUERY *child2;
538 unsigned int child1Features = 0;
539 unsigned int child2Features = 0;
540 QueryBond::QUERYBOND_QUERY::CHILD_VECT_CI chi;
541
542 chi = node->beginChildren();
543 child1 = chi->get();
544 chi++;
545 child2 = chi->get();
546 chi++;
547 // OK we should be at the end of vector by now - since we can have only two
548 // children,
549 // well - at least in this case
550 CHECK_INVARIANT(chi == node->endChildren(), "Too many children on the query");
551
552 std::string dsc1, dsc2;
553 dsc1 = child1->getDescription();
554 dsc2 = child2->getDescription();
555 std::string csmarts1, csmarts2;
556
557 if ((dsc1 != "BondOr") && (dsc1 != "BondAnd")) {
558 // child1 is simple node get the smarts directly
559 const auto *tchild = static_cast<const BOND_EQUALS_QUERY *>(child1);
560 csmarts1 = getBondSmartsSimple(bond, tchild, atomToLeftIdx);
561 bool nneg = (negate) ^ (tchild->getNegation());
562 if (nneg) {
563 csmarts1 = "!" + csmarts1;
564 }
565 } else {
566 // child1 is a composite node recurse further
567 bool nneg = (negate) ^ (child1->getNegation());
568 csmarts1 =
569 _recurseBondSmarts(bond, child1, nneg, atomToLeftIdx, child1Features);
570 }
571
572 // now deal with the second child
573 if ((dsc2 != "BondOr") && (dsc2 != "BondAnd")) {
574 // child 2 is a simple node
575 const auto *tchild = static_cast<const BOND_EQUALS_QUERY *>(child2);
576 csmarts2 = getBondSmartsSimple(bond, tchild, atomToLeftIdx);
577 bool nneg = (negate) ^ (tchild->getNegation());
578 if (nneg) {
579 csmarts2 = "!" + csmarts2;
580 }
581 } else {
582 // child two is a composite node - recurse
583 bool nneg = (negate) ^ (child2->getNegation());
584 csmarts1 =
585 _recurseBondSmarts(bond, child2, nneg, atomToLeftIdx, child2Features);
586 }
587
588 // ok if we have a negation and we have to change the underlying logic,
589 // since we propagated the negation i.e NOT (A OR B) = (NOT (A)) AND
590 // (NOT(B))
591 if (negate) {
592 if (descrip == "BondOr") {
593 descrip = "BondAnd";
594 } else if (descrip == "BondAnd") {
595 descrip = "BondOr";
596 }
597 }
598 res += _combineChildSmarts(csmarts1, child1Features, csmarts2, child2Features,
599 descrip, features);
600 return res;
601 }
602
FragmentSmartsConstruct(ROMol & mol,unsigned int atomIdx,std::vector<Canon::AtomColors> & colors,UINT_VECT & ranks,bool doIsomericSmiles,const boost::dynamic_bitset<> * bondsInPlay=nullptr)603 std::string FragmentSmartsConstruct(
604 ROMol &mol, unsigned int atomIdx, std::vector<Canon::AtomColors> &colors,
605 UINT_VECT &ranks, bool doIsomericSmiles,
606 const boost::dynamic_bitset<> *bondsInPlay = nullptr) {
607 Canon::MolStack molStack;
608 molStack.reserve(mol.getNumAtoms() + mol.getNumBonds());
609 std::stringstream res;
610
611 // this is dirty trick get around the fact that canonicalizeFragment
612 // thinks we already called findSSSR - to do some atom ranking
613 // but for smarts we are going to ignore that part. We will artificially
614 // set the "SSSR" property to an empty property
615 mol.getRingInfo()->reset();
616 mol.getRingInfo()->initialize();
617 for (auto &atom : mol.atoms()) {
618 atom->updatePropertyCache(false);
619 }
620
621 // Another dirty trick to avoid reordering of lead chiral atoms in
622 // canonicalizeFragment: since we are writing SMARTS, there won't be a
623 // reordering on parsing.
624 // The trick is as easy as choosing the first non-chiral atom we can find as
625 // root of the string. This should not be a problem, since SMARTS do not get
626 // canonicalized.
627 if (molStack.empty()) {
628 for (const auto atom : mol.atoms()) {
629 if (colors[atom->getIdx()] == Canon::WHITE_NODE &&
630 atom->getChiralTag() != Atom::CHI_TETRAHEDRAL_CCW &&
631 atom->getChiralTag() != Atom::CHI_TETRAHEDRAL_CW) {
632 atomIdx = atom->getIdx();
633 break;
634 }
635 }
636 }
637
638 Canon::canonicalizeFragment(mol, atomIdx, colors, ranks, molStack,
639 bondsInPlay, nullptr, doIsomericSmiles);
640
641 // now clear the "SSSR" property
642 mol.getRingInfo()->reset();
643 for (auto &msCI : molStack) {
644 switch (msCI.type) {
645 case Canon::MOL_STACK_ATOM: {
646 auto *qatm = static_cast<QueryAtom *>(msCI.obj.atom);
647 res << SmartsWrite::GetAtomSmarts(qatm);
648 break;
649 }
650 case Canon::MOL_STACK_BOND: {
651 auto *qbnd = static_cast<QueryBond *>(msCI.obj.bond);
652 res << SmartsWrite::GetBondSmarts(qbnd, msCI.number);
653 break;
654 }
655 case Canon::MOL_STACK_RING: {
656 if (msCI.number < 10) {
657 res << msCI.number;
658 } else {
659 res << "%" << msCI.number;
660 }
661 break;
662 }
663 case Canon::MOL_STACK_BRANCH_OPEN: {
664 res << "(";
665 break;
666 }
667 case Canon::MOL_STACK_BRANCH_CLOSE: {
668 res << ")";
669 break;
670 }
671 default:
672 break;
673 }
674 }
675 return res.str();
676 }
677
678 // this is the used when converting a SMILES or
679 // non-query atom from a mol file into SMARTS.
getNonQueryAtomSmarts(const QueryAtom * qatom)680 std::string getNonQueryAtomSmarts(const QueryAtom *qatom) {
681 PRECONDITION(qatom, "bad atom");
682 PRECONDITION(!qatom->hasQuery(), "atom should not have query");
683 std::stringstream res;
684 res << "[";
685
686 int isotope = qatom->getIsotope();
687 if (isotope) {
688 res << isotope;
689 }
690
691 if (SmilesWrite::inOrganicSubset(qatom->getAtomicNum())) {
692 res << "#" << qatom->getAtomicNum();
693 } else {
694 res << qatom->getSymbol();
695 }
696
697 if (qatom->hasOwningMol() &&
698 qatom->getOwningMol().hasProp(common_properties::_doIsoSmiles)) {
699 if (qatom->getChiralTag() != Atom::CHI_UNSPECIFIED &&
700 !qatom->hasProp(_qatomHasStereoSet) &&
701 !qatom->hasProp(common_properties::_brokenChirality)) {
702 qatom->setProp(_qatomHasStereoSet, 1);
703 switch (qatom->getChiralTag()) {
704 case Atom::CHI_TETRAHEDRAL_CW:
705 res << "@@";
706 break;
707 case Atom::CHI_TETRAHEDRAL_CCW:
708 res << "@";
709 break;
710 default:
711 break;
712 }
713 }
714 }
715
716 auto hs = qatom->getNumExplicitHs();
717 // FIX: probably should be smarter about Hs:
718 if (hs) {
719 res << "H";
720 if (hs > 1) {
721 res << hs;
722 }
723 }
724 auto chg = qatom->getFormalCharge();
725 if (chg) {
726 if (chg == -1) {
727 res << "-";
728 } else if (chg == 1) {
729 res << "+";
730 } else if (chg < 0) {
731 res << qatom->getFormalCharge();
732 } else {
733 res << "+" << qatom->getFormalCharge();
734 }
735 }
736 int mapNum;
737 if (qatom->getPropIfPresent(common_properties::molAtomMapNumber, mapNum)) {
738 res << ":";
739 res << mapNum;
740 }
741 res << "]";
742 return res.str();
743 }
744
745 // this is the used when converting a SMILES or
746 // non-query bond from a mol file into SMARTS.
getNonQueryBondSmarts(const QueryBond * qbond,int atomToLeftIdx)747 std::string getNonQueryBondSmarts(const QueryBond *qbond, int atomToLeftIdx) {
748 PRECONDITION(qbond, "bad bond");
749 PRECONDITION(!qbond->hasQuery(), "bond should not have query");
750 RDUNUSED_PARAM(atomToLeftIdx);
751 std::string res;
752
753 if (qbond->getIsAromatic()) {
754 res = ":";
755 } else {
756 bool reverseDative =
757 (atomToLeftIdx >= 0 &&
758 qbond->getBeginAtomIdx() != static_cast<unsigned int>(atomToLeftIdx));
759 bool doIsoSmiles =
760 !qbond->hasOwningMol() ||
761 qbond->getOwningMol().hasProp(common_properties::_doIsoSmiles);
762 res = getBasicBondRepr(qbond->getBondType(), qbond->getBondDir(),
763 doIsoSmiles, reverseDative);
764 }
765
766 return res;
767 }
768
molToSmarts(const ROMol & inmol,bool doIsomericSmiles,std::vector<AtomColors> && colors,const boost::dynamic_bitset<> * bondsInPlay=nullptr)769 std::string molToSmarts(const ROMol &inmol, bool doIsomericSmiles,
770 std::vector<AtomColors> &&colors,
771 const boost::dynamic_bitset<> *bondsInPlay = nullptr) {
772 ROMol mol(inmol);
773 const unsigned int nAtoms = mol.getNumAtoms();
774 UINT_VECT ranks;
775 ranks.reserve(nAtoms);
776 // For smiles writing we would be canonicalizing but we will not do that here.
777 // We will simply use the atom indices as the rank
778 for (const auto &atom : mol.atoms()) {
779 ranks.push_back(atom->getIdx());
780 }
781
782 if (doIsomericSmiles) {
783 mol.setProp(common_properties::_doIsoSmiles, 1);
784 }
785
786 std::string res;
787 auto colorIt = std::find(colors.begin(), colors.end(), Canon::WHITE_NODE);
788 while (colorIt != colors.end()) {
789 unsigned int nextAtomIdx = 0;
790 unsigned int nextRank;
791 std::string subSmi;
792 nextRank = nAtoms + 1;
793 for (unsigned int i = 0; i < nAtoms; ++i) {
794 if (colors[i] == Canon::WHITE_NODE && ranks[i] < nextRank) {
795 nextRank = ranks[i];
796 nextAtomIdx = i;
797 }
798 }
799
800 subSmi = FragmentSmartsConstruct(mol, nextAtomIdx, colors, ranks,
801 doIsomericSmiles, bondsInPlay);
802 res += subSmi;
803
804 colorIt = std::find(colors.begin(), colors.end(), Canon::WHITE_NODE);
805 if (colorIt != colors.end()) {
806 res += ".";
807 }
808 }
809 return res;
810 }
811
812 } // namespace
813
814 namespace SmartsWrite {
GetAtomSmarts(const QueryAtom * qatom)815 std::string GetAtomSmarts(const QueryAtom *qatom) {
816 PRECONDITION(qatom, "bad atom");
817 std::string res;
818 bool needParen = false;
819
820 // BOOST_LOG(rdInfoLog)<<"Atom: " <<qatom->getIdx()<<std::endl;
821 if (!qatom->hasQuery()) {
822 res = getNonQueryAtomSmarts(qatom);
823 // BOOST_LOG(rdInfoLog)<<"\tno query:" <<res;
824 return res;
825 }
826 const auto query = qatom->getQuery();
827 PRECONDITION(query, "atom has no query");
828 // describeQuery(query);
829 unsigned int queryFeatures = 0;
830 std::string descrip = qatom->getQuery()->getDescription();
831 if (descrip.empty()) {
832 // we have simple atom - just generate the smiles and return
833 res = SmilesWrite::GetAtomSmiles(qatom);
834 if (res[0] == '[') {
835 // chop the brackets off, we'll put them back on later:
836 needParen = true;
837 res = res.substr(1, res.size() - 2);
838 }
839 } else if ((descrip == "AtomOr") || (descrip == "AtomAnd")) {
840 // we have a composite query
841 needParen = true;
842 res = _recurseGetSmarts(qatom, query, query->getNegation(), queryFeatures);
843 if (res.length() == 1) { // single atom symbol we don't need parens
844 needParen = false;
845 }
846 } else if (descrip == "RecursiveStructure") {
847 // it's a bare recursive structure query:
848 res = getRecursiveStructureQuerySmarts(query);
849 needParen = true;
850 } else { // we have a simple smarts
851 auto *tquery = static_cast<ATOM_EQUALS_QUERY *>(qatom->getQuery());
852 res = getAtomSmartsSimple(qatom, tquery, needParen);
853 if (tquery->getNegation()) {
854 res = "!" + res;
855 }
856 }
857 std::string mapNum;
858 if (qatom->getPropIfPresent(common_properties::molAtomMapNumber, mapNum)) {
859 needParen = true;
860 res += ":" + mapNum;
861 }
862 if (needParen) {
863 res = "[" + res + "]";
864 }
865 return res;
866 }
867
GetBondSmarts(const QueryBond * bond,int atomToLeftIdx)868 std::string GetBondSmarts(const QueryBond *bond, int atomToLeftIdx) {
869 PRECONDITION(bond, "bad bond");
870 std::string res = "";
871
872 // BOOST_LOG(rdInfoLog) << "bond: " << bond->getIdx() << std::endl;
873 ;
874 // it is possible that we are regular single bond and we don't need to write
875 // anything
876 if (!bond->hasQuery()) {
877 res = getNonQueryBondSmarts(bond, atomToLeftIdx);
878 // BOOST_LOG(rdInfoLog) << "\tno query:" << res << std::endl;
879 return res;
880 }
881 // describeQuery(bond->getQuery());
882 if ((typeid(*bond) == typeid(Bond)) &&
883 ((bond->getBondType() == Bond::SINGLE) ||
884 (bond->getBondType() == Bond::AROMATIC))) {
885 BOOST_LOG(rdInfoLog) << "\tbasic:" << res << std::endl;
886 return res;
887 }
888 const auto query = bond->getQuery();
889 PRECONDITION(query, "bond has no query");
890
891 unsigned int queryFeatures = 0;
892 auto descrip = query->getDescription();
893 if ((descrip == "BondAnd") || (descrip == "BondOr")) {
894 // composite query
895 res = _recurseBondSmarts(bond, query, query->getNegation(), atomToLeftIdx,
896 queryFeatures);
897 } else {
898 // simple query
899 if (query->getNegation()) {
900 res = "!";
901 }
902 const auto *tquery = static_cast<const BOND_EQUALS_QUERY *>(query);
903 res += getBondSmartsSimple(bond, tquery, atomToLeftIdx);
904 }
905 // BOOST_LOG(rdInfoLog) << "\t query:" << descrip << " " << res << std::endl;
906 return res;
907 }
908
909 } // end of namespace SmartsWrite
910
MolToSmarts(const ROMol & mol,bool doIsomericSmarts)911 std::string MolToSmarts(const ROMol &mol, bool doIsomericSmarts) {
912 const unsigned int nAtoms = mol.getNumAtoms();
913 if (!nAtoms) {
914 return "";
915 }
916
917 std::vector<AtomColors> colors(nAtoms, Canon::WHITE_NODE);
918 return molToSmarts(mol, doIsomericSmarts, std::move(colors));
919 }
920
MolFragmentToSmarts(const ROMol & mol,const std::vector<int> & atomsToUse,const std::vector<int> * bondsToUse,bool doIsomericSmarts)921 std::string MolFragmentToSmarts(const ROMol &mol,
922 const std::vector<int> &atomsToUse,
923 const std::vector<int> *bondsToUse,
924 bool doIsomericSmarts) {
925 PRECONDITION(!atomsToUse.empty(), "no atoms provided");
926 PRECONDITION(!bondsToUse || !bondsToUse->empty(), "no bonds provided");
927
928 auto nAtoms = mol.getNumAtoms();
929 if (!nAtoms) {
930 return "";
931 }
932
933 std::unique_ptr<boost::dynamic_bitset<>> bondsInPlay(nullptr);
934 if (bondsToUse != nullptr) {
935 bondsInPlay.reset(new boost::dynamic_bitset<>(mol.getNumBonds(), 0));
936 for (auto bidx : *bondsToUse) {
937 bondsInPlay->set(bidx);
938 }
939 }
940
941 // Mark all atoms except the ones in atomIndices as already processed.
942 // white: unprocessed
943 // grey: partial
944 // black: complete
945 std::vector<AtomColors> colors(nAtoms, Canon::BLACK_NODE);
946 for (const auto &idx : atomsToUse) {
947 colors[idx] = Canon::WHITE_NODE;
948 }
949
950 return molToSmarts(mol, doIsomericSmarts, std::move(colors),
951 bondsInPlay.get());
952 }
953
954 } // namespace RDKit