1 /* Copyright (C) 2007  Rajarshi Guha <rajarshi@users.sourceforge.net>
2  *
3  * Contact: cdk-devel@lists.sourceforge.net
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU Lesser General Public License
7  * as published by the Free Software Foundation; either version 2.1
8  * of the License, or (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU Lesser General Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18  */
19 package org.openscience.cdk.smiles.smarts;
20 
21 import com.google.common.collect.FluentIterable;
22 import com.google.common.collect.Sets;
23 import com.google.common.primitives.Ints;
24 import org.openscience.cdk.aromaticity.Aromaticity;
25 import org.openscience.cdk.aromaticity.ElectronDonation;
26 import org.openscience.cdk.exception.CDKException;
27 import org.openscience.cdk.graph.Cycles;
28 import org.openscience.cdk.interfaces.IAtom;
29 import org.openscience.cdk.interfaces.IAtomContainer;
30 import org.openscience.cdk.interfaces.IBond;
31 import org.openscience.cdk.interfaces.IChemObjectBuilder;
32 import org.openscience.cdk.interfaces.IRingSet;
33 import org.openscience.cdk.isomorphism.SmartsStereoMatch;
34 import org.openscience.cdk.isomorphism.VentoFoggia;
35 import org.openscience.cdk.isomorphism.matchers.IQueryAtom;
36 import org.openscience.cdk.isomorphism.matchers.QueryAtomContainer;
37 import org.openscience.cdk.isomorphism.matchers.smarts.SmartsMatchers;
38 import org.openscience.cdk.isomorphism.mcss.RMap;
39 import org.openscience.cdk.smiles.smarts.parser.SMARTSParser;
40 import org.openscience.cdk.smiles.smarts.parser.TokenMgrError;
41 import org.openscience.cdk.tools.ILoggingTool;
42 import org.openscience.cdk.tools.LoggingToolFactory;
43 
44 import java.util.ArrayList;
45 import java.util.BitSet;
46 import java.util.LinkedHashMap;
47 import java.util.List;
48 import java.util.Map;
49 import java.util.Set;
50 import java.util.TreeSet;
51 
52 import static com.google.common.base.Preconditions.checkNotNull;
53 
54 /**
55  * This class provides a easy to use wrapper around SMARTS matching functionality.  User code that wants to do
56  * SMARTS matching should use this rather than using SMARTSParser (and UniversalIsomorphismTester) directly. Example
57  * usage would be
58  *
59  * <pre>{@code
60  * SmilesParser sp = new SmilesParser(DefaultChemObjectBuilder.getInstance());
61  * IAtomContainer atomContainer = sp.parseSmiles("CC(=O)OC(=O)C");
62  * SMARTSQueryTool querytool = new SMARTSQueryTool("O=CO");
63  * boolean status = querytool.matches(atomContainer);
64  * if (status) {
65  *    int nmatch = querytool.countMatches();
66  *    List mappings = querytool.getMatchingAtoms();
67  *    for (int i = 0; i < nmatch; i++) {
68  *       List atomIndices = (List) mappings.get(i);
69  *    }
70  * }
71  * }</pre>
72  * <h3>SMARTS Extensions</h3>
73  *
74  * Currently the CDK supports the following SMARTS symbols, that are not described in the Daylight specification.
75  * However they are supported by other packages and are noted as such.
76  *
77  * <table border=1 cellpadding=3><caption>Table 1 - Supported Extensions</caption> <thead>
78  * <tr> <th>Symbol</th><th>Meaning</th><th>Default</th><th>Notes</th> </tr>
79  * </thead> <tbody> <tr> <td>Gx</td><td>Periodic group number</td><td>None</td><td>x must be specified and must be a
80  * number between 1 and 18. This symbol is supported by the MOE SMARTS implementation</td> <tr> <td>#X</td><td>Any
81  * non-carbon heavy element</td><td>None</td><td>This symbol is supported by the MOE SMARTS implementation</td> </tr>
82  * <tr> <td>^x</td><td>Any atom with the a specified hybridization state</td><td>None</td><td>x must be specified and
83  * should be between 1 and 8 (inclusive), corresponding to SP1, SP2, SP3, SP3D1, SP3D2 SP3D3, SP3D4 and SP3D5. Supported
84  * by the OpenEye SMARTS implementation</td> </tr> </tbody> </table>
85  *
86  * <h3>Notes</h3> <ul> <li>As <a href="http://sourceforge.net/mailarchive/message.php?msg_name=4964F605.1070502%40emolecules.com">described</a>
87  * by Craig James the <code>h&lt;n&gt;</code> SMARTS pattern should not be used. It was included in the Daylight spec
88  * for backwards compatibility. To match hydrogens, use the <code>H&lt;n&gt;</code> pattern.</li> <li>The wild card
89  * pattern (<code>*</code>) will not match hydrogens (explicit or implicit) unless an isotope is specified. In other
90  * words, <code>*</code> gives two hits against <code>C[2H]</code> but 1 hit against <code>C[H]</code>. This also means
91  * that it gives no hits against <code>[H][H]</code>. This is contrary to what is shown by Daylights <a
92  * href="http://www.daylight.com/daycgi_tutorials/depictmatch.cgi">depictmatch</a> service, but is based on this <a
93  * href="https://sourceforge.net/mailarchive/message.php?msg_name=4964FF9D.3040004%40emolecules.com">discussion</a>. A
94  * work around to get <code>*</code> to match <code>[H][H]</code> is to write it in the form <code>[1H][1H]</code>.
95  *
96  * It's not entirely clear what the behavior of * should be with respect to hydrogens. it is possible that the code will
97  * be updated so that <code>*</code> will not match <i>any</i> hydrogen in the future.</li> <li>The
98  * org.openscience.cdk.aromaticity.CDKHueckelAromaticityDetector only considers single rings and two fused non-spiro
99  * rings. As a result, it does not properly detect aromaticity in polycyclic systems such as
100  * <code>[O-]C(=O)c1ccccc1c2c3ccc([O-])cc3oc4cc(=O)ccc24</code>. Thus SMARTS patterns that depend on proper aromaticity
101  * detection may not work correctly in such polycyclic systems</li> </ul>
102  *
103  * @author Rajarshi Guha
104  * @cdk.created 2007-04-08
105  * @cdk.module smarts
106  * @cdk.githash
107  * @cdk.keyword SMARTS
108  * @cdk.keyword substructure search
109  * @deprecated use {@link org.openscience.cdk.smarts.SmartsPattern}
110  */
111 @Deprecated
112 public class SMARTSQueryTool {
113 
114     private static ILoggingTool logger        = LoggingToolFactory.createLoggingTool(SMARTSQueryTool.class);
115     private String              smarts;
116     private IAtomContainer      atomContainer = null;
117     private QueryAtomContainer  query         = null;
118     private List<int[]>         mappings;
119 
120     /**
121      * Defines which set of rings to define rings in the target.
122      */
123     private enum RingSet {
124 
125         /**
126          * Smallest Set of Smallest Rings (or Minimum Cycle Basis - but not
127          * strictly the same). Defines what is typically thought of as a 'ring'
128          * however the non-uniqueness leads to ambiguous matching.
129          */
130         SmallestSetOfSmallestRings {
131 
132             @Override
ringSet(IAtomContainer m)133             IRingSet ringSet(IAtomContainer m) {
134                 return Cycles.sssr(m).toRingSet();
135             }
136         },
137 
138         /**
139          * Intersect of all Minimum Cycle Bases (or SSSR) and thus is a subset.
140          * The set is unique but may excludes rings (e.g. from bridged systems).
141          */
142         EssentialRings {
143 
144             @Override
ringSet(IAtomContainer m)145             IRingSet ringSet(IAtomContainer m) {
146                 return Cycles.essential(m).toRingSet();
147             }
148         },
149 
150         /**
151          * Union of all Minimum Cycle Bases (or SSSR) and thus is a superset.
152          * The set is unique but may include more rings then is necessary.
153          */
154         RelevantRings {
155 
156             @Override
ringSet(IAtomContainer m)157             IRingSet ringSet(IAtomContainer m) {
158                 return Cycles.relevant(m).toRingSet();
159             }
160         };
161 
162         /**
163          * Compute a ring set for a molecule.
164          *
165          * @param m molecule
166          * @return the ring set for the molecule
167          */
ringSet(IAtomContainer m)168         abstract IRingSet ringSet(IAtomContainer m);
169     }
170 
171     /** Which short cyclic set should be used. */
172     private RingSet                  ringSet         = RingSet.EssentialRings;
173 
174     private final IChemObjectBuilder builder;
175 
176     /**
177      * Aromaticity perception - dealing with SMARTS we should use the Daylight
178      * model. This can be set to a different model using {@link #setAromaticity(Aromaticity)}.
179      */
180     private Aromaticity              aromaticity     = new Aromaticity(ElectronDonation.daylight(),
181                                                              Cycles.allOrVertexShort());
182 
183     /**
184      * Logical flag indicates whether the aromaticity model should be skipped.
185      * Generally this should be left as false to ensure the structures being
186      * matched are all treated the same. The flag can however be turned off if
187      * the molecules being tests are known to all have the same aromaticity
188      * model.
189      */
190     private boolean                  skipAromaticity = false;
191 
192     // a simplistic cache to store parsed SMARTS queries
193     private int                      MAX_ENTRIES     = 20;
194     Map<String, QueryAtomContainer>  cache           = new LinkedHashMap<String, QueryAtomContainer>(MAX_ENTRIES + 1,
195                                                              .75F, true) {
196 
197                                                          @Override
198                                                          public boolean removeEldestEntry(Map.Entry eldest) {
199                                                              return size() > MAX_ENTRIES;
200                                                          }
201                                                      };
202 
203     /**
204      * Create a new SMARTS query tool for the specified SMARTS string. Query
205      * objects will contain a reference to the specified {@link
206      * IChemObjectBuilder}.
207      *
208      * @param smarts SMARTS query string
209      * @throws IllegalArgumentException if the SMARTS string can not be handled
210      */
SMARTSQueryTool(String smarts, IChemObjectBuilder builder)211     public SMARTSQueryTool(String smarts, IChemObjectBuilder builder) {
212         this.builder = builder;
213         this.smarts = smarts;
214         try {
215             initializeQuery();
216         } catch (TokenMgrError error) {
217             throw new IllegalArgumentException("Error parsing SMARTS", error);
218         } catch (CDKException error) {
219             throw new IllegalArgumentException("Error parsing SMARTS", error);
220         }
221     }
222 
223     /**
224      * Set the maximum size of the query cache.
225      *
226      * @param maxEntries The maximum number of entries
227      */
setQueryCacheSize(int maxEntries)228     public void setQueryCacheSize(int maxEntries) {
229         MAX_ENTRIES = maxEntries;
230     }
231 
232     /**
233      * Indicates that ring properties should use the Smallest Set of Smallest
234      * Rings. The set is not unique and may lead to ambiguous matches.
235      * @see #useEssentialRings()
236      * @see #useRelevantRings()
237      */
useSmallestSetOfSmallestRings()238     public void useSmallestSetOfSmallestRings() {
239         this.ringSet = RingSet.SmallestSetOfSmallestRings;
240     }
241 
242     /**
243      * Indicates that ring properties should use the Relevant Rings. The set is
244      * unique and includes all of the SSSR but may be exponential in size.
245      *
246      * @see #useSmallestSetOfSmallestRings()
247      * @see #useEssentialRings()
248      */
useRelevantRings()249     public void useRelevantRings() {
250         this.ringSet = RingSet.RelevantRings;
251     }
252 
253     /**
254      * Indicates that ring properties should use the Essential Rings (default).
255      * The set is unique but only includes a subset of the SSSR.
256      *
257      * @see #useSmallestSetOfSmallestRings()
258      * @see #useEssentialRings()
259      */
useEssentialRings()260     public void useEssentialRings() {
261         this.ringSet = RingSet.EssentialRings;
262     }
263 
264     /**
265      * Set the aromaticity perception to use. Different aromaticity models
266      * may required certain attributes to be set (e.g. atom typing). These
267      * will not be automatically configured and should be preset before matching.
268      *
269      * <blockquote><pre>
270      * SMARTSQueryTool sqt = new SMARTSQueryTool(...);
271      * sqt.setAromaticity(new Aromaticity(ElectronDonation.cdk(),
272      *                                    Cycles.cdkAromaticSet));
273      * for (IAtomContainer molecule : molecules) {
274      *
275      *     // CDK Aromatic model needs atom types
276      *     AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(molecule);
277      *
278      *     sqt.matches(molecule);
279      * }
280      * </pre></blockquote>
281      *
282      * @param aromaticity the new aromaticity perception
283      * @see ElectronDonation
284      * @see Cycles
285      */
setAromaticity(Aromaticity aromaticity)286     public void setAromaticity(Aromaticity aromaticity) {
287         this.aromaticity = checkNotNull(aromaticity, "aromaticity was not provided");
288     }
289 
290     /**
291      * Returns the current SMARTS pattern being used.
292      *
293      * @return The SMARTS pattern
294      */
getSmarts()295     public String getSmarts() {
296         return smarts;
297     }
298 
299     /**
300      * Set a new SMARTS pattern.
301      *
302      * @param smarts The new SMARTS pattern
303      * @throws CDKException if there is an error in parsing the pattern
304      */
setSmarts(String smarts)305     public void setSmarts(String smarts) throws CDKException {
306         this.smarts = smarts;
307         initializeQuery();
308     }
309 
310     /**
311      * Perform a SMARTS match and check whether the query is present in the target molecule.  This function simply
312      * checks whether the query pattern matches the specified molecule. However the function will also, internally, save
313      * the mapping of query atoms to the target molecule
314      *
315      * <b>Note</b>: This method performs a simple caching scheme, by comparing the current molecule to the previous
316      * molecule by reference. If you repeatedly match different SMARTS on the same molecule, this method will avoid
317      * initializing ( ring perception, aromaticity etc.) the molecule each time. If however, you modify the molecule
318      * between such multiple matchings you should use the other form of this method to force initialization.
319      *
320      * @param atomContainer The target moleculoe
321      * @return true if the pattern is found in the target molecule, false otherwise
322      * @throws CDKException if there is an error in ring, aromaticity or isomorphism perception
323      * @see #getMatchingAtoms()
324      * @see #countMatches()
325      * @see #matches(org.openscience.cdk.interfaces.IAtomContainer, boolean)
326      */
matches(IAtomContainer atomContainer)327     public boolean matches(IAtomContainer atomContainer) throws CDKException {
328         return matches(atomContainer, false);
329     }
330 
331     /**
332      * Perform a SMARTS match and check whether the query is present in the target molecule.  This function simply
333      * checks whether the query pattern matches the specified molecule. However the function will also, internally, save
334      * the mapping of query atoms to the target molecule
335      *
336      * @param atomContainer       The target moleculoe
337      * @param forceInitialization If true, then the molecule is initialized (ring perception, aromaticity etc). If
338      *                            false, the molecule is only initialized if it is different (in terms of object
339      *                            reference) than one supplied in a previous call to this method.
340      * @return true if the pattern is found in the target molecule, false otherwise
341      * @throws CDKException if there is an error in ring, aromaticity or isomorphism perception
342      * @see #getMatchingAtoms()
343      * @see #countMatches()
344      * @see #matches(org.openscience.cdk.interfaces.IAtomContainer)
345      */
matches(IAtomContainer atomContainer, boolean forceInitialization)346     public boolean matches(IAtomContainer atomContainer, boolean forceInitialization) throws CDKException {
347 
348         if (this.atomContainer == atomContainer) {
349             if (forceInitialization) initializeMolecule();
350         } else {
351             this.atomContainer = atomContainer;
352             initializeMolecule();
353         }
354 
355         // lets see if we have a single atom query
356         if (query.getAtomCount() == 1) {
357             // lets get the query atom
358             IQueryAtom queryAtom = (IQueryAtom) query.getAtom(0);
359 
360             mappings = new ArrayList<int[]>();
361             for (int i = 0; i < atomContainer.getAtomCount(); i++) {
362                 if (queryAtom.matches(atomContainer.getAtom(i))) {
363                     mappings.add(new int[]{i});
364                 }
365             }
366         } else {
367             mappings = FluentIterable.from(VentoFoggia.findSubstructure(query)
368                                                       .matchAll(atomContainer)
369                                                       .filter(new SmartsStereoMatch(query, atomContainer)))
370                                      .toList();
371         }
372 
373         return !mappings.isEmpty();
374     }
375 
376     /**
377      * Returns the number of times the pattern was found in the target molecule.  This function should be called
378      * after {@link #matches(org.openscience.cdk.interfaces.IAtomContainer)}. If not, the results may be undefined.
379      *
380      * @return The number of times the pattern was found in the target molecule
381      */
countMatches()382     public int countMatches() {
383         return mappings.size();
384     }
385 
386     /**
387      * Get the atoms in the target molecule that match the query pattern.  Since there may be multiple matches, the
388      * return value is a List of List objects. Each List object contains the indices of the atoms in the target
389      * molecule, that match the query pattern
390      *
391      * @return A List of List of atom indices in the target molecule
392      */
getMatchingAtoms()393     public List<List<Integer>> getMatchingAtoms() {
394         List<List<Integer>> matched = new ArrayList<List<Integer>>(mappings.size());
395         for (int[] mapping : mappings)
396             matched.add(Ints.asList(mapping));
397         return matched;
398     }
399 
400     /**
401      * Get the atoms in the target molecule that match the query pattern.  Since there may be multiple matches, the
402      * return value is a List of List objects. Each List object contains the unique set of indices of the atoms in the
403      * target molecule, that match the query pattern
404      *
405      * @return A List of List of atom indices in the target molecule
406      */
getUniqueMatchingAtoms()407     public List<List<Integer>> getUniqueMatchingAtoms() {
408         List<List<Integer>> matched = new ArrayList<List<Integer>>(mappings.size());
409         Set<BitSet> atomSets = Sets.newHashSetWithExpectedSize(mappings.size());
410         for (int[] mapping : mappings) {
411             BitSet atomSet = new BitSet();
412             for (int x : mapping)
413                 atomSet.set(x);
414             if (atomSets.add(atomSet)) matched.add(Ints.asList(mapping));
415         }
416         return matched;
417     }
418 
419     /**
420      * Prepare the target molecule for analysis.  We perform ring perception and aromaticity detection and set up
421      * the appropriate properties. Right now, this function is called each time we need to do a query and this is
422      * inefficient.
423      *
424      * @throws CDKException if there is a problem in ring perception or aromaticity detection, which is usually related
425      *                      to a timeout in the ring finding code.
426      */
initializeMolecule()427     private void initializeMolecule() throws CDKException {
428 
429         // initialise required invariants - the query has ISINRING set if
430         // the query contains ring queries [R?] [r?] [x?] etc.
431         SmartsMatchers.prepare(atomContainer, true);
432 
433         // providing skip aromaticity has not be set apply the desired
434         // aromaticity model
435         try {
436             if (!skipAromaticity) {
437                 aromaticity.apply(atomContainer);
438             }
439         } catch (CDKException e) {
440             logger.debug(e.toString());
441             throw new CDKException(e.toString(), e);
442         }
443     }
444 
initializeQuery()445     private void initializeQuery() throws CDKException {
446         mappings = null;
447         query = cache.get(smarts);
448         if (query == null) {
449             query = SMARTSParser.parse(smarts, builder);
450             cache.put(smarts, query);
451         }
452     }
453 
matchedAtoms(List<List<RMap>> bondMapping, IAtomContainer atomContainer)454     private List<Set<Integer>> matchedAtoms(List<List<RMap>> bondMapping, IAtomContainer atomContainer) {
455 
456         List<Set<Integer>> atomMapping = new ArrayList<Set<Integer>>();
457         // loop over each mapping
458         for (List<RMap> mapping : bondMapping) {
459 
460             Set<Integer> tmp = new TreeSet<Integer>();
461             IAtom atom1 = null;
462             IAtom atom2 = null;
463             // loop over this mapping
464             for (RMap map : mapping) {
465 
466                 int bondID = map.getId1();
467 
468                 // get the atoms in this bond
469                 IBond bond = atomContainer.getBond(bondID);
470                 atom1 = bond.getBegin();
471                 atom2 = bond.getEnd();
472 
473                 Integer idx1 = atomContainer.indexOf(atom1);
474                 Integer idx2 = atomContainer.indexOf(atom2);
475 
476                 if (!tmp.contains(idx1)) tmp.add(idx1);
477                 if (!tmp.contains(idx2)) tmp.add(idx2);
478             }
479             if (tmp.size() == query.getAtomCount()) atomMapping.add(tmp);
480 
481             // If there is only one bond, check if it matches both ways.
482             if (mapping.size() == 1 && atom1.getAtomicNumber().equals(atom2.getAtomicNumber())) {
483                 atomMapping.add(new TreeSet<Integer>(tmp));
484             }
485         }
486 
487         return atomMapping;
488     }
489 }
490