1 /* Copyright (C) 2007 Rajarshi Guha <rajarshi@users.sourceforge.net> 2 * 3 * Contact: cdk-devel@lists.sourceforge.net 4 * 5 * This program is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU Lesser General Public License 7 * as published by the Free Software Foundation; either version 2.1 8 * of the License, or (at your option) any later version. 9 * 10 * This program is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU Lesser General Public License for more details. 14 * 15 * You should have received a copy of the GNU Lesser General Public License 16 * along with this program; if not, write to the Free Software 17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 */ 19 package org.openscience.cdk.smiles.smarts; 20 21 import com.google.common.collect.FluentIterable; 22 import com.google.common.collect.Sets; 23 import com.google.common.primitives.Ints; 24 import org.openscience.cdk.aromaticity.Aromaticity; 25 import org.openscience.cdk.aromaticity.ElectronDonation; 26 import org.openscience.cdk.exception.CDKException; 27 import org.openscience.cdk.graph.Cycles; 28 import org.openscience.cdk.interfaces.IAtom; 29 import org.openscience.cdk.interfaces.IAtomContainer; 30 import org.openscience.cdk.interfaces.IBond; 31 import org.openscience.cdk.interfaces.IChemObjectBuilder; 32 import org.openscience.cdk.interfaces.IRingSet; 33 import org.openscience.cdk.isomorphism.SmartsStereoMatch; 34 import org.openscience.cdk.isomorphism.VentoFoggia; 35 import org.openscience.cdk.isomorphism.matchers.IQueryAtom; 36 import org.openscience.cdk.isomorphism.matchers.QueryAtomContainer; 37 import org.openscience.cdk.isomorphism.matchers.smarts.SmartsMatchers; 38 import org.openscience.cdk.isomorphism.mcss.RMap; 39 import org.openscience.cdk.smiles.smarts.parser.SMARTSParser; 40 import org.openscience.cdk.smiles.smarts.parser.TokenMgrError; 41 import org.openscience.cdk.tools.ILoggingTool; 42 import org.openscience.cdk.tools.LoggingToolFactory; 43 44 import java.util.ArrayList; 45 import java.util.BitSet; 46 import java.util.LinkedHashMap; 47 import java.util.List; 48 import java.util.Map; 49 import java.util.Set; 50 import java.util.TreeSet; 51 52 import static com.google.common.base.Preconditions.checkNotNull; 53 54 /** 55 * This class provides a easy to use wrapper around SMARTS matching functionality. User code that wants to do 56 * SMARTS matching should use this rather than using SMARTSParser (and UniversalIsomorphismTester) directly. Example 57 * usage would be 58 * 59 * <pre>{@code 60 * SmilesParser sp = new SmilesParser(DefaultChemObjectBuilder.getInstance()); 61 * IAtomContainer atomContainer = sp.parseSmiles("CC(=O)OC(=O)C"); 62 * SMARTSQueryTool querytool = new SMARTSQueryTool("O=CO"); 63 * boolean status = querytool.matches(atomContainer); 64 * if (status) { 65 * int nmatch = querytool.countMatches(); 66 * List mappings = querytool.getMatchingAtoms(); 67 * for (int i = 0; i < nmatch; i++) { 68 * List atomIndices = (List) mappings.get(i); 69 * } 70 * } 71 * }</pre> 72 * <h3>SMARTS Extensions</h3> 73 * 74 * Currently the CDK supports the following SMARTS symbols, that are not described in the Daylight specification. 75 * However they are supported by other packages and are noted as such. 76 * 77 * <table border=1 cellpadding=3><caption>Table 1 - Supported Extensions</caption> <thead> 78 * <tr> <th>Symbol</th><th>Meaning</th><th>Default</th><th>Notes</th> </tr> 79 * </thead> <tbody> <tr> <td>Gx</td><td>Periodic group number</td><td>None</td><td>x must be specified and must be a 80 * number between 1 and 18. This symbol is supported by the MOE SMARTS implementation</td> <tr> <td>#X</td><td>Any 81 * non-carbon heavy element</td><td>None</td><td>This symbol is supported by the MOE SMARTS implementation</td> </tr> 82 * <tr> <td>^x</td><td>Any atom with the a specified hybridization state</td><td>None</td><td>x must be specified and 83 * should be between 1 and 8 (inclusive), corresponding to SP1, SP2, SP3, SP3D1, SP3D2 SP3D3, SP3D4 and SP3D5. Supported 84 * by the OpenEye SMARTS implementation</td> </tr> </tbody> </table> 85 * 86 * <h3>Notes</h3> <ul> <li>As <a href="http://sourceforge.net/mailarchive/message.php?msg_name=4964F605.1070502%40emolecules.com">described</a> 87 * by Craig James the <code>h<n></code> SMARTS pattern should not be used. It was included in the Daylight spec 88 * for backwards compatibility. To match hydrogens, use the <code>H<n></code> pattern.</li> <li>The wild card 89 * pattern (<code>*</code>) will not match hydrogens (explicit or implicit) unless an isotope is specified. In other 90 * words, <code>*</code> gives two hits against <code>C[2H]</code> but 1 hit against <code>C[H]</code>. This also means 91 * that it gives no hits against <code>[H][H]</code>. This is contrary to what is shown by Daylights <a 92 * href="http://www.daylight.com/daycgi_tutorials/depictmatch.cgi">depictmatch</a> service, but is based on this <a 93 * href="https://sourceforge.net/mailarchive/message.php?msg_name=4964FF9D.3040004%40emolecules.com">discussion</a>. A 94 * work around to get <code>*</code> to match <code>[H][H]</code> is to write it in the form <code>[1H][1H]</code>. 95 * 96 * It's not entirely clear what the behavior of * should be with respect to hydrogens. it is possible that the code will 97 * be updated so that <code>*</code> will not match <i>any</i> hydrogen in the future.</li> <li>The 98 * org.openscience.cdk.aromaticity.CDKHueckelAromaticityDetector only considers single rings and two fused non-spiro 99 * rings. As a result, it does not properly detect aromaticity in polycyclic systems such as 100 * <code>[O-]C(=O)c1ccccc1c2c3ccc([O-])cc3oc4cc(=O)ccc24</code>. Thus SMARTS patterns that depend on proper aromaticity 101 * detection may not work correctly in such polycyclic systems</li> </ul> 102 * 103 * @author Rajarshi Guha 104 * @cdk.created 2007-04-08 105 * @cdk.module smarts 106 * @cdk.githash 107 * @cdk.keyword SMARTS 108 * @cdk.keyword substructure search 109 * @deprecated use {@link org.openscience.cdk.smarts.SmartsPattern} 110 */ 111 @Deprecated 112 public class SMARTSQueryTool { 113 114 private static ILoggingTool logger = LoggingToolFactory.createLoggingTool(SMARTSQueryTool.class); 115 private String smarts; 116 private IAtomContainer atomContainer = null; 117 private QueryAtomContainer query = null; 118 private List<int[]> mappings; 119 120 /** 121 * Defines which set of rings to define rings in the target. 122 */ 123 private enum RingSet { 124 125 /** 126 * Smallest Set of Smallest Rings (or Minimum Cycle Basis - but not 127 * strictly the same). Defines what is typically thought of as a 'ring' 128 * however the non-uniqueness leads to ambiguous matching. 129 */ 130 SmallestSetOfSmallestRings { 131 132 @Override ringSet(IAtomContainer m)133 IRingSet ringSet(IAtomContainer m) { 134 return Cycles.sssr(m).toRingSet(); 135 } 136 }, 137 138 /** 139 * Intersect of all Minimum Cycle Bases (or SSSR) and thus is a subset. 140 * The set is unique but may excludes rings (e.g. from bridged systems). 141 */ 142 EssentialRings { 143 144 @Override ringSet(IAtomContainer m)145 IRingSet ringSet(IAtomContainer m) { 146 return Cycles.essential(m).toRingSet(); 147 } 148 }, 149 150 /** 151 * Union of all Minimum Cycle Bases (or SSSR) and thus is a superset. 152 * The set is unique but may include more rings then is necessary. 153 */ 154 RelevantRings { 155 156 @Override ringSet(IAtomContainer m)157 IRingSet ringSet(IAtomContainer m) { 158 return Cycles.relevant(m).toRingSet(); 159 } 160 }; 161 162 /** 163 * Compute a ring set for a molecule. 164 * 165 * @param m molecule 166 * @return the ring set for the molecule 167 */ ringSet(IAtomContainer m)168 abstract IRingSet ringSet(IAtomContainer m); 169 } 170 171 /** Which short cyclic set should be used. */ 172 private RingSet ringSet = RingSet.EssentialRings; 173 174 private final IChemObjectBuilder builder; 175 176 /** 177 * Aromaticity perception - dealing with SMARTS we should use the Daylight 178 * model. This can be set to a different model using {@link #setAromaticity(Aromaticity)}. 179 */ 180 private Aromaticity aromaticity = new Aromaticity(ElectronDonation.daylight(), 181 Cycles.allOrVertexShort()); 182 183 /** 184 * Logical flag indicates whether the aromaticity model should be skipped. 185 * Generally this should be left as false to ensure the structures being 186 * matched are all treated the same. The flag can however be turned off if 187 * the molecules being tests are known to all have the same aromaticity 188 * model. 189 */ 190 private boolean skipAromaticity = false; 191 192 // a simplistic cache to store parsed SMARTS queries 193 private int MAX_ENTRIES = 20; 194 Map<String, QueryAtomContainer> cache = new LinkedHashMap<String, QueryAtomContainer>(MAX_ENTRIES + 1, 195 .75F, true) { 196 197 @Override 198 public boolean removeEldestEntry(Map.Entry eldest) { 199 return size() > MAX_ENTRIES; 200 } 201 }; 202 203 /** 204 * Create a new SMARTS query tool for the specified SMARTS string. Query 205 * objects will contain a reference to the specified {@link 206 * IChemObjectBuilder}. 207 * 208 * @param smarts SMARTS query string 209 * @throws IllegalArgumentException if the SMARTS string can not be handled 210 */ SMARTSQueryTool(String smarts, IChemObjectBuilder builder)211 public SMARTSQueryTool(String smarts, IChemObjectBuilder builder) { 212 this.builder = builder; 213 this.smarts = smarts; 214 try { 215 initializeQuery(); 216 } catch (TokenMgrError error) { 217 throw new IllegalArgumentException("Error parsing SMARTS", error); 218 } catch (CDKException error) { 219 throw new IllegalArgumentException("Error parsing SMARTS", error); 220 } 221 } 222 223 /** 224 * Set the maximum size of the query cache. 225 * 226 * @param maxEntries The maximum number of entries 227 */ setQueryCacheSize(int maxEntries)228 public void setQueryCacheSize(int maxEntries) { 229 MAX_ENTRIES = maxEntries; 230 } 231 232 /** 233 * Indicates that ring properties should use the Smallest Set of Smallest 234 * Rings. The set is not unique and may lead to ambiguous matches. 235 * @see #useEssentialRings() 236 * @see #useRelevantRings() 237 */ useSmallestSetOfSmallestRings()238 public void useSmallestSetOfSmallestRings() { 239 this.ringSet = RingSet.SmallestSetOfSmallestRings; 240 } 241 242 /** 243 * Indicates that ring properties should use the Relevant Rings. The set is 244 * unique and includes all of the SSSR but may be exponential in size. 245 * 246 * @see #useSmallestSetOfSmallestRings() 247 * @see #useEssentialRings() 248 */ useRelevantRings()249 public void useRelevantRings() { 250 this.ringSet = RingSet.RelevantRings; 251 } 252 253 /** 254 * Indicates that ring properties should use the Essential Rings (default). 255 * The set is unique but only includes a subset of the SSSR. 256 * 257 * @see #useSmallestSetOfSmallestRings() 258 * @see #useEssentialRings() 259 */ useEssentialRings()260 public void useEssentialRings() { 261 this.ringSet = RingSet.EssentialRings; 262 } 263 264 /** 265 * Set the aromaticity perception to use. Different aromaticity models 266 * may required certain attributes to be set (e.g. atom typing). These 267 * will not be automatically configured and should be preset before matching. 268 * 269 * <blockquote><pre> 270 * SMARTSQueryTool sqt = new SMARTSQueryTool(...); 271 * sqt.setAromaticity(new Aromaticity(ElectronDonation.cdk(), 272 * Cycles.cdkAromaticSet)); 273 * for (IAtomContainer molecule : molecules) { 274 * 275 * // CDK Aromatic model needs atom types 276 * AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(molecule); 277 * 278 * sqt.matches(molecule); 279 * } 280 * </pre></blockquote> 281 * 282 * @param aromaticity the new aromaticity perception 283 * @see ElectronDonation 284 * @see Cycles 285 */ setAromaticity(Aromaticity aromaticity)286 public void setAromaticity(Aromaticity aromaticity) { 287 this.aromaticity = checkNotNull(aromaticity, "aromaticity was not provided"); 288 } 289 290 /** 291 * Returns the current SMARTS pattern being used. 292 * 293 * @return The SMARTS pattern 294 */ getSmarts()295 public String getSmarts() { 296 return smarts; 297 } 298 299 /** 300 * Set a new SMARTS pattern. 301 * 302 * @param smarts The new SMARTS pattern 303 * @throws CDKException if there is an error in parsing the pattern 304 */ setSmarts(String smarts)305 public void setSmarts(String smarts) throws CDKException { 306 this.smarts = smarts; 307 initializeQuery(); 308 } 309 310 /** 311 * Perform a SMARTS match and check whether the query is present in the target molecule. This function simply 312 * checks whether the query pattern matches the specified molecule. However the function will also, internally, save 313 * the mapping of query atoms to the target molecule 314 * 315 * <b>Note</b>: This method performs a simple caching scheme, by comparing the current molecule to the previous 316 * molecule by reference. If you repeatedly match different SMARTS on the same molecule, this method will avoid 317 * initializing ( ring perception, aromaticity etc.) the molecule each time. If however, you modify the molecule 318 * between such multiple matchings you should use the other form of this method to force initialization. 319 * 320 * @param atomContainer The target moleculoe 321 * @return true if the pattern is found in the target molecule, false otherwise 322 * @throws CDKException if there is an error in ring, aromaticity or isomorphism perception 323 * @see #getMatchingAtoms() 324 * @see #countMatches() 325 * @see #matches(org.openscience.cdk.interfaces.IAtomContainer, boolean) 326 */ matches(IAtomContainer atomContainer)327 public boolean matches(IAtomContainer atomContainer) throws CDKException { 328 return matches(atomContainer, false); 329 } 330 331 /** 332 * Perform a SMARTS match and check whether the query is present in the target molecule. This function simply 333 * checks whether the query pattern matches the specified molecule. However the function will also, internally, save 334 * the mapping of query atoms to the target molecule 335 * 336 * @param atomContainer The target moleculoe 337 * @param forceInitialization If true, then the molecule is initialized (ring perception, aromaticity etc). If 338 * false, the molecule is only initialized if it is different (in terms of object 339 * reference) than one supplied in a previous call to this method. 340 * @return true if the pattern is found in the target molecule, false otherwise 341 * @throws CDKException if there is an error in ring, aromaticity or isomorphism perception 342 * @see #getMatchingAtoms() 343 * @see #countMatches() 344 * @see #matches(org.openscience.cdk.interfaces.IAtomContainer) 345 */ matches(IAtomContainer atomContainer, boolean forceInitialization)346 public boolean matches(IAtomContainer atomContainer, boolean forceInitialization) throws CDKException { 347 348 if (this.atomContainer == atomContainer) { 349 if (forceInitialization) initializeMolecule(); 350 } else { 351 this.atomContainer = atomContainer; 352 initializeMolecule(); 353 } 354 355 // lets see if we have a single atom query 356 if (query.getAtomCount() == 1) { 357 // lets get the query atom 358 IQueryAtom queryAtom = (IQueryAtom) query.getAtom(0); 359 360 mappings = new ArrayList<int[]>(); 361 for (int i = 0; i < atomContainer.getAtomCount(); i++) { 362 if (queryAtom.matches(atomContainer.getAtom(i))) { 363 mappings.add(new int[]{i}); 364 } 365 } 366 } else { 367 mappings = FluentIterable.from(VentoFoggia.findSubstructure(query) 368 .matchAll(atomContainer) 369 .filter(new SmartsStereoMatch(query, atomContainer))) 370 .toList(); 371 } 372 373 return !mappings.isEmpty(); 374 } 375 376 /** 377 * Returns the number of times the pattern was found in the target molecule. This function should be called 378 * after {@link #matches(org.openscience.cdk.interfaces.IAtomContainer)}. If not, the results may be undefined. 379 * 380 * @return The number of times the pattern was found in the target molecule 381 */ countMatches()382 public int countMatches() { 383 return mappings.size(); 384 } 385 386 /** 387 * Get the atoms in the target molecule that match the query pattern. Since there may be multiple matches, the 388 * return value is a List of List objects. Each List object contains the indices of the atoms in the target 389 * molecule, that match the query pattern 390 * 391 * @return A List of List of atom indices in the target molecule 392 */ getMatchingAtoms()393 public List<List<Integer>> getMatchingAtoms() { 394 List<List<Integer>> matched = new ArrayList<List<Integer>>(mappings.size()); 395 for (int[] mapping : mappings) 396 matched.add(Ints.asList(mapping)); 397 return matched; 398 } 399 400 /** 401 * Get the atoms in the target molecule that match the query pattern. Since there may be multiple matches, the 402 * return value is a List of List objects. Each List object contains the unique set of indices of the atoms in the 403 * target molecule, that match the query pattern 404 * 405 * @return A List of List of atom indices in the target molecule 406 */ getUniqueMatchingAtoms()407 public List<List<Integer>> getUniqueMatchingAtoms() { 408 List<List<Integer>> matched = new ArrayList<List<Integer>>(mappings.size()); 409 Set<BitSet> atomSets = Sets.newHashSetWithExpectedSize(mappings.size()); 410 for (int[] mapping : mappings) { 411 BitSet atomSet = new BitSet(); 412 for (int x : mapping) 413 atomSet.set(x); 414 if (atomSets.add(atomSet)) matched.add(Ints.asList(mapping)); 415 } 416 return matched; 417 } 418 419 /** 420 * Prepare the target molecule for analysis. We perform ring perception and aromaticity detection and set up 421 * the appropriate properties. Right now, this function is called each time we need to do a query and this is 422 * inefficient. 423 * 424 * @throws CDKException if there is a problem in ring perception or aromaticity detection, which is usually related 425 * to a timeout in the ring finding code. 426 */ initializeMolecule()427 private void initializeMolecule() throws CDKException { 428 429 // initialise required invariants - the query has ISINRING set if 430 // the query contains ring queries [R?] [r?] [x?] etc. 431 SmartsMatchers.prepare(atomContainer, true); 432 433 // providing skip aromaticity has not be set apply the desired 434 // aromaticity model 435 try { 436 if (!skipAromaticity) { 437 aromaticity.apply(atomContainer); 438 } 439 } catch (CDKException e) { 440 logger.debug(e.toString()); 441 throw new CDKException(e.toString(), e); 442 } 443 } 444 initializeQuery()445 private void initializeQuery() throws CDKException { 446 mappings = null; 447 query = cache.get(smarts); 448 if (query == null) { 449 query = SMARTSParser.parse(smarts, builder); 450 cache.put(smarts, query); 451 } 452 } 453 matchedAtoms(List<List<RMap>> bondMapping, IAtomContainer atomContainer)454 private List<Set<Integer>> matchedAtoms(List<List<RMap>> bondMapping, IAtomContainer atomContainer) { 455 456 List<Set<Integer>> atomMapping = new ArrayList<Set<Integer>>(); 457 // loop over each mapping 458 for (List<RMap> mapping : bondMapping) { 459 460 Set<Integer> tmp = new TreeSet<Integer>(); 461 IAtom atom1 = null; 462 IAtom atom2 = null; 463 // loop over this mapping 464 for (RMap map : mapping) { 465 466 int bondID = map.getId1(); 467 468 // get the atoms in this bond 469 IBond bond = atomContainer.getBond(bondID); 470 atom1 = bond.getBegin(); 471 atom2 = bond.getEnd(); 472 473 Integer idx1 = atomContainer.indexOf(atom1); 474 Integer idx2 = atomContainer.indexOf(atom2); 475 476 if (!tmp.contains(idx1)) tmp.add(idx1); 477 if (!tmp.contains(idx2)) tmp.add(idx2); 478 } 479 if (tmp.size() == query.getAtomCount()) atomMapping.add(tmp); 480 481 // If there is only one bond, check if it matches both ways. 482 if (mapping.size() == 1 && atom1.getAtomicNumber().equals(atom2.getAtomicNumber())) { 483 atomMapping.add(new TreeSet<Integer>(tmp)); 484 } 485 } 486 487 return atomMapping; 488 } 489 } 490