1 //
2 //  Copyright (C) 2002-2019 Greg Landrum and Rational Discovery LLC
3 //
4 //   @@ All Rights Reserved @@
5 //  This file is part of the RDKit.
6 //  The contents are covered by the terms of the BSD license
7 //  which is included in the file license.txt, found at the root
8 //  of the RDKit source tree.
9 //
10 #include "SmartsWrite.h"
11 #include <sstream>
12 #include <cstdint>
13 #include <boost/algorithm/string.hpp>
14 #include "SmilesWrite.h"
15 #include <GraphMol/RDKitBase.h>
16 #include <GraphMol/RDKitQueries.h>
17 #include <GraphMol/Canon.h>
18 #include <GraphMol/new_canon.h>
19 #include <RDGeneral/Exceptions.h>
20 #include <RDGeneral/RDLog.h>
21 
22 namespace RDKit {
23 using namespace Canon;
24 
25 // local utility namespace
26 namespace {
27 
28 enum class QueryBoolFeatures {
29   HAS_AND = 0x1,
30   HAS_LOWAND = 0x2,
31   HAS_OR = 0x4,
32   HAS_RECURSION = 0x8
33 };
34 
35 std::string _recurseGetSmarts(const QueryAtom *qatom,
36                               const QueryAtom::QUERYATOM_QUERY *node,
37                               bool negate, unsigned int &features);
38 std::string _recurseBondSmarts(const Bond *bond,
39                                const QueryBond::QUERYBOND_QUERY *node,
40                                bool negate, int atomToLeftIdx,
41                                unsigned int &features);
42 
_combineChildSmarts(std::string cs1,unsigned int features1,std::string cs2,unsigned int features2,std::string descrip,unsigned int & features)43 std::string _combineChildSmarts(std::string cs1, unsigned int features1,
44                                 std::string cs2, unsigned int features2,
45                                 std::string descrip, unsigned int &features) {
46   std::string res = "";
47   if ((descrip.find("Or") > 0) && (descrip.find("Or") < descrip.length())) {
48     // if either of child smarts already have a "," and ";" we can't have one
49     // more OR here
50     if ((features1 & static_cast<unsigned int>(QueryBoolFeatures::HAS_LOWAND) &&
51          features1 & static_cast<unsigned int>(QueryBoolFeatures::HAS_OR)) ||
52         (features2 & static_cast<unsigned int>(QueryBoolFeatures::HAS_LOWAND) &&
53          features2 & static_cast<unsigned int>(QueryBoolFeatures::HAS_OR))) {
54       throw ValueErrorException(
55           "This is a non-smartable query - OR above and below AND in the "
56           "binary tree");
57     }
58     res += cs1;
59     if (!(cs1.empty() || cs2.empty())) {
60       res += ",";
61     }
62     res += cs2;
63     features |= static_cast<unsigned int>(QueryBoolFeatures::HAS_OR);
64   } else if ((descrip.find("And") > 0) &&
65              (descrip.find("And") < descrip.length())) {
66     std::string symb;
67     if (features1 & static_cast<unsigned int>(QueryBoolFeatures::HAS_OR) ||
68         features2 & static_cast<unsigned int>(QueryBoolFeatures::HAS_OR)) {
69       symb = ";";
70       features |= static_cast<unsigned int>(QueryBoolFeatures::HAS_LOWAND);
71     } else {
72       symb = "&";
73       features |= static_cast<unsigned int>(QueryBoolFeatures::HAS_AND);
74     }
75     res += cs1;
76     if (!(cs1.empty() || cs2.empty())) {
77       res += symb;
78     }
79     res += cs2;
80   } else {
81     std::stringstream err;
82     err << "Don't know how to combine using " << descrip;
83     throw ValueErrorException(err.str());
84   }
85   features |= features1;
86   features |= features2;
87 
88   return res;
89 }  // namespace
90 
91 template <typename T>
describeQuery(const T * query,std::string leader="\\t")92 void describeQuery(const T *query, std::string leader = "\t") {
93   // BOOST_LOG(rdInfoLog) << leader << query->getDescription() << std::endl;
94   typename T::CHILD_VECT_CI iter;
95   for (iter = query->beginChildren(); iter != query->endChildren(); ++iter) {
96     describeQuery(iter->get(), leader + "\t");
97   }
98 }
99 
100 const static std::string _qatomHasStereoSet = "_qatomHasStereoSet";
getAtomSmartsSimple(const QueryAtom * qatom,const ATOM_EQUALS_QUERY * query,bool & needParen)101 std::string getAtomSmartsSimple(const QueryAtom *qatom,
102                                 const ATOM_EQUALS_QUERY *query,
103                                 bool &needParen) {
104   PRECONDITION(query, "bad query");
105 
106   std::string descrip = query->getDescription();
107   bool hasVal = false;
108   enum class Modifiers : std::uint8_t { NONE, RANGE, LESS, GREATER };
109   Modifiers mods = Modifiers::NONE;
110   if (boost::starts_with(descrip, "range_")) {
111     mods = Modifiers::RANGE;
112     descrip = descrip.substr(6);
113   } else if (boost::starts_with(descrip, "less_")) {
114     mods = Modifiers::LESS;
115     descrip = descrip.substr(5);
116   } else if (boost::starts_with(descrip, "greater_")) {
117     mods = Modifiers::GREATER;
118     descrip = descrip.substr(8);
119   }
120   std::stringstream res;
121   if (descrip == "AtomImplicitHCount") {
122     res << "h";
123     hasVal = true;
124     needParen = true;
125   } else if (descrip == "AtomHasImplicitH") {
126     res << "h";
127     needParen = true;
128   } else if (descrip == "AtomTotalValence") {
129     res << "v";
130     hasVal = true;
131     needParen = true;
132   } else if (descrip == "AtomAtomicNum") {
133     res << "#";
134     hasVal = true;
135     needParen = true;
136   } else if (descrip == "AtomExplicitDegree") {
137     res << "D";
138     hasVal = true;
139     needParen = true;
140   } else if (descrip == "AtomNonHydrogenDegree") {
141     res << "d";
142     hasVal = true;
143     needParen = true;
144   } else if (descrip == "AtomTotalDegree") {
145     res << "X";
146     hasVal = true;
147     needParen = true;
148   } else if (descrip == "AtomHasRingBond") {
149     res << "x";
150     needParen = true;
151   } else if (descrip == "AtomHCount") {
152     res << "H";
153     hasVal = true;
154     needParen = true;
155   } else if (descrip == "AtomIsAliphatic") {
156     res << "A";
157     needParen = false;
158   } else if (descrip == "AtomIsAromatic") {
159     res << "a";
160     needParen = false;
161   } else if (descrip == "AtomNull") {
162     res << "*";
163     needParen = false;
164   } else if (descrip == "AtomInRing") {
165     res << "R";
166     needParen = true;
167   } else if (descrip == "AtomMinRingSize") {
168     res << "r";
169     hasVal = true;
170     needParen = true;
171   } else if (descrip == "AtomInNRings") {
172     res << "R";
173     if (mods == Modifiers::NONE && query->getVal() >= 0) {
174       hasVal = true;
175     }
176     needParen = true;
177   } else if (descrip == "AtomHasHeteroatomNeighbors") {
178     res << "z";
179     needParen = true;
180   } else if (descrip == "AtomNumHeteroatomNeighbors") {
181     res << "z";
182     hasVal = true;
183     needParen = true;
184   } else if (descrip == "AtomHasAliphaticHeteroatomNeighbors") {
185     res << "Z";
186     needParen = true;
187   } else if (descrip == "AtomNumAliphaticHeteroatomNeighbors") {
188     res << "Z";
189     hasVal = true;
190     needParen = true;
191   } else if (descrip == "AtomFormalCharge") {
192     int val = query->getVal();
193     if (val < 0) {
194       res << "-";
195     } else {
196       res << "+";
197     }
198     if (mods == Modifiers::NONE && abs(val) != 1) {
199       res << abs(val);
200     }
201     needParen = true;
202   } else if (descrip == "AtomNegativeFormalCharge") {
203     int val = query->getVal();
204     if (val < 0) {
205       res << "+";
206     } else {
207       res << "-";
208     }
209     if (mods == Modifiers::NONE && abs(val) != 1) {
210       res << abs(val);
211     }
212     needParen = true;
213   } else if (descrip == "AtomHybridization") {
214     res << "^";
215     switch (query->getVal()) {
216       case Atom::S:
217         res << "0";
218         break;
219       case Atom::SP:
220         res << "1";
221         break;
222       case Atom::SP2:
223         res << "2";
224         break;
225       case Atom::SP3:
226         res << "3";
227         break;
228       case Atom::SP3D:
229         res << "4";
230         break;
231       case Atom::SP3D2:
232         res << "5";
233         break;
234     }
235     needParen = true;
236   } else if (descrip == "AtomMass") {
237     res << query->getVal() / massIntegerConversionFactor << "*";
238     needParen = true;
239   } else if (descrip == "AtomIsotope") {
240     res << query->getVal() << "*";
241     needParen = true;
242   } else if (descrip == "AtomRingBondCount") {
243     res << "x";
244     hasVal = true;
245     needParen = true;
246   } else if (descrip == "AtomUnsaturated") {
247     res << "$(*=,:,#*)";
248     needParen = true;
249   } else if (descrip == "AtomType") {
250     int atNum;
251     bool isAromatic;
252     parseAtomType(query->getVal(), atNum, isAromatic);
253     std::string symbol = PeriodicTable::getTable()->getElementSymbol(atNum);
254     if (isAromatic) {
255       symbol[0] += ('a' - 'A');
256     }
257     res << symbol;
258     if (!SmilesWrite::inOrganicSubset(atNum)) {
259       needParen = true;
260     }
261   } else {
262     BOOST_LOG(rdWarningLog)
263         << "Cannot write SMARTS for query type : " << descrip
264         << ". Ignoring it." << std::endl;
265     res << "*";
266   }
267 
268   if (mods != Modifiers::NONE) {
269     res << "{";
270     switch (mods) {
271       case Modifiers::LESS:
272         res << ((const ATOM_LESSEQUAL_QUERY *)query)->getVal() << "-";
273         break;
274       case Modifiers::RANGE:
275         res << ((const ATOM_RANGE_QUERY *)query)->getLower() << "-"
276             << ((const ATOM_RANGE_QUERY *)query)->getUpper();
277         break;
278       case Modifiers::GREATER:
279         res << "-" << ((const ATOM_GREATEREQUAL_QUERY *)query)->getVal();
280         break;
281       default:
282         break;
283     }
284     res << "}";
285   } else if (hasVal) {
286     res << query->getVal();
287   }
288 
289   // handle atomic stereochemistry
290   if (qatom->hasOwningMol() &&
291       qatom->getOwningMol().hasProp(common_properties::_doIsoSmiles)) {
292     if (qatom->getChiralTag() != Atom::CHI_UNSPECIFIED &&
293         !qatom->hasProp(_qatomHasStereoSet) &&
294         !qatom->hasProp(common_properties::_brokenChirality)) {
295       qatom->setProp(_qatomHasStereoSet, 1);
296       switch (qatom->getChiralTag()) {
297         case Atom::CHI_TETRAHEDRAL_CW:
298           res << "@@";
299           needParen = true;
300           break;
301         case Atom::CHI_TETRAHEDRAL_CCW:
302           res << "@";
303           needParen = true;
304           break;
305         default:
306           break;
307       }
308     }
309   }
310 
311   return res.str();
312 }
313 
getRecursiveStructureQuerySmarts(const QueryAtom::QUERYATOM_QUERY * query)314 std::string getRecursiveStructureQuerySmarts(
315     const QueryAtom::QUERYATOM_QUERY *query) {
316   PRECONDITION(query, "bad query");
317   PRECONDITION(query->getDescription() == "RecursiveStructure", "bad query");
318   const auto *rquery = static_cast<const RecursiveStructureQuery *>(query);
319   auto *qmol = const_cast<ROMol *>(rquery->getQueryMol());
320   std::string res = MolToSmarts(*qmol);
321   res = "$(" + res + ")";
322   if (rquery->getNegation()) {
323     res = "!" + res;
324   }
325   return res;
326 }
327 
getBasicBondRepr(Bond::BondType typ,Bond::BondDir dir,bool doIsomericSmiles,bool reverseDative)328 std::string getBasicBondRepr(Bond::BondType typ, Bond::BondDir dir,
329                              bool doIsomericSmiles, bool reverseDative) {
330   std::string res;
331   switch (typ) {
332     case Bond::SINGLE:
333       res = "-";
334       if (doIsomericSmiles) {
335         if (dir == Bond::ENDDOWNRIGHT) {
336           res = "\\";
337         } else if (dir == Bond::ENDUPRIGHT) {
338           res = "/";
339         }
340       }
341       break;
342     case Bond::DOUBLE:
343       res = "=";
344       break;
345     case Bond::TRIPLE:
346       res = "#";
347       break;
348     case Bond::AROMATIC:
349       res = ":";
350       break;
351     case Bond::DATIVE:
352       if (reverseDative) {
353         res = "<-";
354       } else {
355         res = "->";
356       }
357       break;
358     default:
359       res = "";
360   }
361   return res;
362 }  // namespace
363 
getBondSmartsSimple(const Bond * bond,const BOND_EQUALS_QUERY * bquery,int atomToLeftIdx)364 std::string getBondSmartsSimple(const Bond *bond,
365                                 const BOND_EQUALS_QUERY *bquery,
366                                 int atomToLeftIdx) {
367   PRECONDITION(bond, "bad bond");
368 
369   PRECONDITION(bquery, "bad query");
370   std::string descrip = bquery->getDescription();
371   std::string res = "";
372   if (descrip == "BondNull") {
373     res += "~";
374   } else if (descrip == "BondInRing") {
375     res += "@";
376   } else if (descrip == "SingleOrAromaticBond") {
377     // don't need to do anything here... :-)
378   } else if (descrip == "SingleOrDoubleBond") {
379     res += "-,=";
380   } else if (descrip == "DoubleOrAromaticBond") {
381     res += "=,:";
382   } else if (descrip == "SingleOrDoubleOrAromaticBond") {
383     res += "-,=,:";
384   } else if (descrip == "BondDir") {
385     int val = bquery->getVal();
386     if (val == static_cast<int>(Bond::ENDDOWNRIGHT)) {
387       res += "\\";
388     } else if (val == static_cast<int>(Bond::ENDUPRIGHT)) {
389       res += "/";
390     } else {
391       throw "Can't write smarts for this bond dir type";
392     }
393   } else if (descrip == "BondOrder") {
394     bool reverseDative =
395         (atomToLeftIdx >= 0 &&
396          bond->getBeginAtomIdx() != static_cast<unsigned int>(atomToLeftIdx));
397     bool doIsoSmiles =
398         !bond->hasOwningMol() ||
399         bond->getOwningMol().hasProp(common_properties::_doIsoSmiles);
400     res += getBasicBondRepr(static_cast<Bond::BondType>(bquery->getVal()),
401                             bond->getBondDir(), doIsoSmiles, reverseDative);
402   } else {
403     std::stringstream msg;
404     msg << "Can't write smarts for this query bond type: " << descrip;
405     throw msg.str().c_str();
406   }
407   return res;
408 }
409 
_recurseGetSmarts(const QueryAtom * qatom,const QueryAtom::QUERYATOM_QUERY * node,bool negate,unsigned int & features)410 std::string _recurseGetSmarts(const QueryAtom *qatom,
411                               const QueryAtom::QUERYATOM_QUERY *node,
412                               bool negate, unsigned int &features) {
413   PRECONDITION(node, "bad node");
414   // the algorithm goes something like this
415   // - recursively get the smarts for the child queries
416   // - combine the child smarts using the following rules:
417   //      - if we are currently at an OR query, combine the subqueries with a
418   //      ",",
419   //        but only if neither of child smarts do not contain "," and ";"
420   //        This situation leads to a no smartable situation and throw an
421   //        error
422   //      - if we are currently at an and query, combine the child smarts with
423   //      "&"
424   //        if neither of the child smarts contain a "," - otherwise combine
425   //        them
426   //        the child smarts with a ";"
427   //
428   // There is an additional complication with composite nodes that carry a
429   // negation - in this
430   // case we will propagate the negation to the child nodes using the
431   // following rules
432   //   NOT (a AND b) = ( NOT (a)) AND ( NOT (b))
433   //   NOT (a OR b) = ( NOT (a)) OR ( NOT (b))
434 
435   auto descrip = node->getDescription();
436 
437   unsigned int child1Features = 0;
438   unsigned int child2Features = 0;
439   auto chi = node->beginChildren();
440   auto child1 = chi->get();
441   auto dsc1 = child1->getDescription();
442 
443   ++chi;
444   CHECK_INVARIANT(chi != node->endChildren(),
445                   "Not enough children on the query");
446 
447   bool needParen;
448   std::string csmarts1;
449   // deal with the first child
450   if (dsc1 == "RecursiveStructure") {
451     csmarts1 = getRecursiveStructureQuerySmarts(child1);
452     features |= static_cast<unsigned int>(QueryBoolFeatures::HAS_RECURSION);
453   } else if ((dsc1 != "AtomOr") && (dsc1 != "AtomAnd")) {
454     // child 1 is a simple node
455     const auto *tchild = static_cast<const ATOM_EQUALS_QUERY *>(child1);
456     csmarts1 = getAtomSmartsSimple(qatom, tchild, needParen);
457     bool nneg = (negate) ^ (tchild->getNegation());
458     if (nneg) {
459       csmarts1 = "!" + csmarts1;
460     }
461   } else {
462     // child 1 is composite node - recurse
463     bool nneg = (negate) ^ (child1->getNegation());
464     csmarts1 = _recurseGetSmarts(qatom, child1, nneg, child1Features);
465   }
466   // ok if we have a negation and we have an OR , we have to change to
467   // an AND since we propagated the negation
468   // i.e NOT (A OR B) = (NOT (A)) AND (NOT(B))
469   if (negate) {
470     if (descrip == "AtomOr") {
471       descrip = "AtomAnd";
472     } else if (descrip == "AtomAnd") {
473       descrip = "AtomOr";
474     }
475   }
476   auto res = csmarts1;
477   while (chi != node->endChildren()) {
478     auto child2 = chi->get();
479     ++chi;
480 
481     auto dsc2 = child2->getDescription();
482     std::string csmarts2;
483 
484     // deal with the next child
485     if (dsc2 == "RecursiveStructure") {
486       csmarts2 = getRecursiveStructureQuerySmarts(child2);
487       features |= static_cast<unsigned int>(QueryBoolFeatures::HAS_RECURSION);
488     } else if ((dsc2 != "AtomOr") && (dsc2 != "AtomAnd")) {
489       // child 2 is a simple node
490       const auto *tchild = static_cast<const ATOM_EQUALS_QUERY *>(child2);
491       csmarts2 = getAtomSmartsSimple(qatom, tchild, needParen);
492       bool nneg = (negate) ^ (tchild->getNegation());
493       if (nneg) {
494         csmarts2 = "!" + csmarts2;
495       }
496     } else {
497       bool nneg = (negate) ^ (child2->getNegation());
498       csmarts2 = _recurseGetSmarts(qatom, child2, nneg, child2Features);
499     }
500 
501     res = _combineChildSmarts(res, child1Features, csmarts2, child2Features,
502                               descrip, features);
503   }
504   return res;
505 }
506 
_recurseBondSmarts(const Bond * bond,const QueryBond::QUERYBOND_QUERY * node,bool negate,int atomToLeftIdx,unsigned int & features)507 std::string _recurseBondSmarts(const Bond *bond,
508                                const QueryBond::QUERYBOND_QUERY *node,
509                                bool negate, int atomToLeftIdx,
510                                unsigned int &features) {
511   // the algorithm goes something like this
512   // - recursively get the smarts for the child query bonds
513   // - combine the child smarts using the following rules:
514   //      - if we are currently at an OR query, combine the subqueries with a
515   //      ",",
516   //        but only if neither of child smarts do not contain "," and ";"
517   //        This situation leads to a no smartable situation and throw an
518   //        error
519   //      - if we are currently at an and query, combine the child smarts with
520   //      "&"
521   //        if neither of the child smarts contain a "," - otherwise combine
522   //        them
523   //        the child smarts with a ";"
524   //
525   // There is an additional complication with composite nodes that carry a
526   // negation - in this
527   // case we will propagate the negation to the child nodes using the
528   // following rules
529   //   NOT (a AND b) = ( NOT (a)) AND ( NOT (b))
530   //   NOT (a OR b) = ( NOT (a)) OR ( NOT (b))
531   PRECONDITION(bond, "bad bond");
532   PRECONDITION(node, "bad node");
533   std::string descrip = node->getDescription();
534   std::string res = "";
535 
536   const QueryBond::QUERYBOND_QUERY *child1;
537   const QueryBond::QUERYBOND_QUERY *child2;
538   unsigned int child1Features = 0;
539   unsigned int child2Features = 0;
540   QueryBond::QUERYBOND_QUERY::CHILD_VECT_CI chi;
541 
542   chi = node->beginChildren();
543   child1 = chi->get();
544   chi++;
545   child2 = chi->get();
546   chi++;
547   // OK we should be at the end of vector by now - since we can have only two
548   // children,
549   // well - at least in this case
550   CHECK_INVARIANT(chi == node->endChildren(), "Too many children on the query");
551 
552   std::string dsc1, dsc2;
553   dsc1 = child1->getDescription();
554   dsc2 = child2->getDescription();
555   std::string csmarts1, csmarts2;
556 
557   if ((dsc1 != "BondOr") && (dsc1 != "BondAnd")) {
558     // child1 is  simple node get the smarts directly
559     const auto *tchild = static_cast<const BOND_EQUALS_QUERY *>(child1);
560     csmarts1 = getBondSmartsSimple(bond, tchild, atomToLeftIdx);
561     bool nneg = (negate) ^ (tchild->getNegation());
562     if (nneg) {
563       csmarts1 = "!" + csmarts1;
564     }
565   } else {
566     // child1 is a composite node recurse further
567     bool nneg = (negate) ^ (child1->getNegation());
568     csmarts1 =
569         _recurseBondSmarts(bond, child1, nneg, atomToLeftIdx, child1Features);
570   }
571 
572   // now deal with the second child
573   if ((dsc2 != "BondOr") && (dsc2 != "BondAnd")) {
574     // child 2 is a simple node
575     const auto *tchild = static_cast<const BOND_EQUALS_QUERY *>(child2);
576     csmarts2 = getBondSmartsSimple(bond, tchild, atomToLeftIdx);
577     bool nneg = (negate) ^ (tchild->getNegation());
578     if (nneg) {
579       csmarts2 = "!" + csmarts2;
580     }
581   } else {
582     // child two is a composite node - recurse
583     bool nneg = (negate) ^ (child2->getNegation());
584     csmarts1 =
585         _recurseBondSmarts(bond, child2, nneg, atomToLeftIdx, child2Features);
586   }
587 
588   // ok if we have a negation and we have to change the underlying logic,
589   // since we propagated the negation i.e NOT (A OR B) = (NOT (A)) AND
590   // (NOT(B))
591   if (negate) {
592     if (descrip == "BondOr") {
593       descrip = "BondAnd";
594     } else if (descrip == "BondAnd") {
595       descrip = "BondOr";
596     }
597   }
598   res += _combineChildSmarts(csmarts1, child1Features, csmarts2, child2Features,
599                              descrip, features);
600   return res;
601 }
602 
FragmentSmartsConstruct(ROMol & mol,unsigned int atomIdx,std::vector<Canon::AtomColors> & colors,UINT_VECT & ranks,bool doIsomericSmiles,const boost::dynamic_bitset<> * bondsInPlay=nullptr)603 std::string FragmentSmartsConstruct(
604     ROMol &mol, unsigned int atomIdx, std::vector<Canon::AtomColors> &colors,
605     UINT_VECT &ranks, bool doIsomericSmiles,
606     const boost::dynamic_bitset<> *bondsInPlay = nullptr) {
607   Canon::MolStack molStack;
608   molStack.reserve(mol.getNumAtoms() + mol.getNumBonds());
609   std::stringstream res;
610 
611   // this is dirty trick get around the fact that canonicalizeFragment
612   // thinks we already called findSSSR - to do some atom ranking
613   // but for smarts we are going to ignore that part. We will artificially
614   // set the "SSSR" property to an empty property
615   mol.getRingInfo()->reset();
616   mol.getRingInfo()->initialize();
617   for (auto &atom : mol.atoms()) {
618     atom->updatePropertyCache(false);
619   }
620 
621   // Another dirty trick to avoid reordering of lead chiral atoms in
622   // canonicalizeFragment: since we are writing SMARTS, there won't be a
623   // reordering on parsing.
624   // The trick is as easy as choosing the first non-chiral atom we can find as
625   // root of the string. This should not be a problem, since SMARTS do not get
626   // canonicalized.
627   if (molStack.empty()) {
628     for (const auto atom : mol.atoms()) {
629       if (colors[atom->getIdx()] == Canon::WHITE_NODE &&
630           atom->getChiralTag() != Atom::CHI_TETRAHEDRAL_CCW &&
631           atom->getChiralTag() != Atom::CHI_TETRAHEDRAL_CW) {
632         atomIdx = atom->getIdx();
633         break;
634       }
635     }
636   }
637 
638   Canon::canonicalizeFragment(mol, atomIdx, colors, ranks, molStack,
639                               bondsInPlay, nullptr, doIsomericSmiles);
640 
641   // now clear the "SSSR" property
642   mol.getRingInfo()->reset();
643   for (auto &msCI : molStack) {
644     switch (msCI.type) {
645       case Canon::MOL_STACK_ATOM: {
646         auto *qatm = static_cast<QueryAtom *>(msCI.obj.atom);
647         res << SmartsWrite::GetAtomSmarts(qatm);
648         break;
649       }
650       case Canon::MOL_STACK_BOND: {
651         auto *qbnd = static_cast<QueryBond *>(msCI.obj.bond);
652         res << SmartsWrite::GetBondSmarts(qbnd, msCI.number);
653         break;
654       }
655       case Canon::MOL_STACK_RING: {
656         if (msCI.number < 10) {
657           res << msCI.number;
658         } else {
659           res << "%" << msCI.number;
660         }
661         break;
662       }
663       case Canon::MOL_STACK_BRANCH_OPEN: {
664         res << "(";
665         break;
666       }
667       case Canon::MOL_STACK_BRANCH_CLOSE: {
668         res << ")";
669         break;
670       }
671       default:
672         break;
673     }
674   }
675   return res.str();
676 }
677 
678 // this is the used when converting a SMILES or
679 // non-query atom from a mol file into SMARTS.
getNonQueryAtomSmarts(const QueryAtom * qatom)680 std::string getNonQueryAtomSmarts(const QueryAtom *qatom) {
681   PRECONDITION(qatom, "bad atom");
682   PRECONDITION(!qatom->hasQuery(), "atom should not have query");
683   std::stringstream res;
684   res << "[";
685 
686   int isotope = qatom->getIsotope();
687   if (isotope) {
688     res << isotope;
689   }
690 
691   if (SmilesWrite::inOrganicSubset(qatom->getAtomicNum())) {
692     res << "#" << qatom->getAtomicNum();
693   } else {
694     res << qatom->getSymbol();
695   }
696 
697   if (qatom->hasOwningMol() &&
698       qatom->getOwningMol().hasProp(common_properties::_doIsoSmiles)) {
699     if (qatom->getChiralTag() != Atom::CHI_UNSPECIFIED &&
700         !qatom->hasProp(_qatomHasStereoSet) &&
701         !qatom->hasProp(common_properties::_brokenChirality)) {
702       qatom->setProp(_qatomHasStereoSet, 1);
703       switch (qatom->getChiralTag()) {
704         case Atom::CHI_TETRAHEDRAL_CW:
705           res << "@@";
706           break;
707         case Atom::CHI_TETRAHEDRAL_CCW:
708           res << "@";
709           break;
710         default:
711           break;
712       }
713     }
714   }
715 
716   auto hs = qatom->getNumExplicitHs();
717   // FIX: probably should be smarter about Hs:
718   if (hs) {
719     res << "H";
720     if (hs > 1) {
721       res << hs;
722     }
723   }
724   auto chg = qatom->getFormalCharge();
725   if (chg) {
726     if (chg == -1) {
727       res << "-";
728     } else if (chg == 1) {
729       res << "+";
730     } else if (chg < 0) {
731       res << qatom->getFormalCharge();
732     } else {
733       res << "+" << qatom->getFormalCharge();
734     }
735   }
736   int mapNum;
737   if (qatom->getPropIfPresent(common_properties::molAtomMapNumber, mapNum)) {
738     res << ":";
739     res << mapNum;
740   }
741   res << "]";
742   return res.str();
743 }
744 
745 // this is the used when converting a SMILES or
746 // non-query bond from a mol file into SMARTS.
getNonQueryBondSmarts(const QueryBond * qbond,int atomToLeftIdx)747 std::string getNonQueryBondSmarts(const QueryBond *qbond, int atomToLeftIdx) {
748   PRECONDITION(qbond, "bad bond");
749   PRECONDITION(!qbond->hasQuery(), "bond should not have query");
750   RDUNUSED_PARAM(atomToLeftIdx);
751   std::string res;
752 
753   if (qbond->getIsAromatic()) {
754     res = ":";
755   } else {
756     bool reverseDative =
757         (atomToLeftIdx >= 0 &&
758          qbond->getBeginAtomIdx() != static_cast<unsigned int>(atomToLeftIdx));
759     bool doIsoSmiles =
760         !qbond->hasOwningMol() ||
761         qbond->getOwningMol().hasProp(common_properties::_doIsoSmiles);
762     res = getBasicBondRepr(qbond->getBondType(), qbond->getBondDir(),
763                            doIsoSmiles, reverseDative);
764   }
765 
766   return res;
767 }
768 
molToSmarts(const ROMol & inmol,bool doIsomericSmiles,std::vector<AtomColors> && colors,const boost::dynamic_bitset<> * bondsInPlay=nullptr)769 std::string molToSmarts(const ROMol &inmol, bool doIsomericSmiles,
770                         std::vector<AtomColors> &&colors,
771                         const boost::dynamic_bitset<> *bondsInPlay = nullptr) {
772   ROMol mol(inmol);
773   const unsigned int nAtoms = mol.getNumAtoms();
774   UINT_VECT ranks;
775   ranks.reserve(nAtoms);
776   // For smiles writing we would be canonicalizing but we will not do that here.
777   // We will simply use the atom indices as the rank
778   for (const auto &atom : mol.atoms()) {
779     ranks.push_back(atom->getIdx());
780   }
781 
782   if (doIsomericSmiles) {
783     mol.setProp(common_properties::_doIsoSmiles, 1);
784   }
785 
786   std::string res;
787   auto colorIt = std::find(colors.begin(), colors.end(), Canon::WHITE_NODE);
788   while (colorIt != colors.end()) {
789     unsigned int nextAtomIdx = 0;
790     unsigned int nextRank;
791     std::string subSmi;
792     nextRank = nAtoms + 1;
793     for (unsigned int i = 0; i < nAtoms; ++i) {
794       if (colors[i] == Canon::WHITE_NODE && ranks[i] < nextRank) {
795         nextRank = ranks[i];
796         nextAtomIdx = i;
797       }
798     }
799 
800     subSmi = FragmentSmartsConstruct(mol, nextAtomIdx, colors, ranks,
801                                      doIsomericSmiles, bondsInPlay);
802     res += subSmi;
803 
804     colorIt = std::find(colors.begin(), colors.end(), Canon::WHITE_NODE);
805     if (colorIt != colors.end()) {
806       res += ".";
807     }
808   }
809   return res;
810 }
811 
812 }  // namespace
813 
814 namespace SmartsWrite {
GetAtomSmarts(const QueryAtom * qatom)815 std::string GetAtomSmarts(const QueryAtom *qatom) {
816   PRECONDITION(qatom, "bad atom");
817   std::string res;
818   bool needParen = false;
819 
820   // BOOST_LOG(rdInfoLog)<<"Atom: " <<qatom->getIdx()<<std::endl;
821   if (!qatom->hasQuery()) {
822     res = getNonQueryAtomSmarts(qatom);
823     // BOOST_LOG(rdInfoLog)<<"\tno query:" <<res;
824     return res;
825   }
826   const auto query = qatom->getQuery();
827   PRECONDITION(query, "atom has no query");
828   // describeQuery(query);
829   unsigned int queryFeatures = 0;
830   std::string descrip = qatom->getQuery()->getDescription();
831   if (descrip.empty()) {
832     // we have simple atom - just generate the smiles and return
833     res = SmilesWrite::GetAtomSmiles(qatom);
834     if (res[0] == '[') {
835       // chop the brackets off, we'll put them back on later:
836       needParen = true;
837       res = res.substr(1, res.size() - 2);
838     }
839   } else if ((descrip == "AtomOr") || (descrip == "AtomAnd")) {
840     // we have a composite query
841     needParen = true;
842     res = _recurseGetSmarts(qatom, query, query->getNegation(), queryFeatures);
843     if (res.length() == 1) {  // single atom symbol we don't need parens
844       needParen = false;
845     }
846   } else if (descrip == "RecursiveStructure") {
847     // it's a bare recursive structure query:
848     res = getRecursiveStructureQuerySmarts(query);
849     needParen = true;
850   } else {  // we have a simple smarts
851     auto *tquery = static_cast<ATOM_EQUALS_QUERY *>(qatom->getQuery());
852     res = getAtomSmartsSimple(qatom, tquery, needParen);
853     if (tquery->getNegation()) {
854       res = "!" + res;
855     }
856   }
857   std::string mapNum;
858   if (qatom->getPropIfPresent(common_properties::molAtomMapNumber, mapNum)) {
859     needParen = true;
860     res += ":" + mapNum;
861   }
862   if (needParen) {
863     res = "[" + res + "]";
864   }
865   return res;
866 }
867 
GetBondSmarts(const QueryBond * bond,int atomToLeftIdx)868 std::string GetBondSmarts(const QueryBond *bond, int atomToLeftIdx) {
869   PRECONDITION(bond, "bad bond");
870   std::string res = "";
871 
872   // BOOST_LOG(rdInfoLog) << "bond: " << bond->getIdx() << std::endl;
873   ;
874   // it is possible that we are regular single bond and we don't need to write
875   // anything
876   if (!bond->hasQuery()) {
877     res = getNonQueryBondSmarts(bond, atomToLeftIdx);
878     // BOOST_LOG(rdInfoLog) << "\tno query:" << res << std::endl;
879     return res;
880   }
881   // describeQuery(bond->getQuery());
882   if ((typeid(*bond) == typeid(Bond)) &&
883       ((bond->getBondType() == Bond::SINGLE) ||
884        (bond->getBondType() == Bond::AROMATIC))) {
885     BOOST_LOG(rdInfoLog) << "\tbasic:" << res << std::endl;
886     return res;
887   }
888   const auto query = bond->getQuery();
889   PRECONDITION(query, "bond has no query");
890 
891   unsigned int queryFeatures = 0;
892   auto descrip = query->getDescription();
893   if ((descrip == "BondAnd") || (descrip == "BondOr")) {
894     // composite query
895     res = _recurseBondSmarts(bond, query, query->getNegation(), atomToLeftIdx,
896                              queryFeatures);
897   } else {
898     // simple query
899     if (query->getNegation()) {
900       res = "!";
901     }
902     const auto *tquery = static_cast<const BOND_EQUALS_QUERY *>(query);
903     res += getBondSmartsSimple(bond, tquery, atomToLeftIdx);
904   }
905   // BOOST_LOG(rdInfoLog) << "\t  query:" << descrip << " " << res << std::endl;
906   return res;
907 }
908 
909 }  // end of namespace SmartsWrite
910 
MolToSmarts(const ROMol & mol,bool doIsomericSmarts)911 std::string MolToSmarts(const ROMol &mol, bool doIsomericSmarts) {
912   const unsigned int nAtoms = mol.getNumAtoms();
913   if (!nAtoms) {
914     return "";
915   }
916 
917   std::vector<AtomColors> colors(nAtoms, Canon::WHITE_NODE);
918   return molToSmarts(mol, doIsomericSmarts, std::move(colors));
919 }
920 
MolFragmentToSmarts(const ROMol & mol,const std::vector<int> & atomsToUse,const std::vector<int> * bondsToUse,bool doIsomericSmarts)921 std::string MolFragmentToSmarts(const ROMol &mol,
922                                 const std::vector<int> &atomsToUse,
923                                 const std::vector<int> *bondsToUse,
924                                 bool doIsomericSmarts) {
925   PRECONDITION(!atomsToUse.empty(), "no atoms provided");
926   PRECONDITION(!bondsToUse || !bondsToUse->empty(), "no bonds provided");
927 
928   auto nAtoms = mol.getNumAtoms();
929   if (!nAtoms) {
930     return "";
931   }
932 
933   std::unique_ptr<boost::dynamic_bitset<>> bondsInPlay(nullptr);
934   if (bondsToUse != nullptr) {
935     bondsInPlay.reset(new boost::dynamic_bitset<>(mol.getNumBonds(), 0));
936     for (auto bidx : *bondsToUse) {
937       bondsInPlay->set(bidx);
938     }
939   }
940 
941   // Mark all atoms except the ones in atomIndices as already processed.
942   // white: unprocessed
943   // grey: partial
944   // black: complete
945   std::vector<AtomColors> colors(nAtoms, Canon::BLACK_NODE);
946   for (const auto &idx : atomsToUse) {
947     colors[idx] = Canon::WHITE_NODE;
948   }
949 
950   return molToSmarts(mol, doIsomericSmarts, std::move(colors),
951                      bondsInPlay.get());
952 }
953 
954 }  // namespace RDKit