1 /* 2 * Copyright (c) 2013 European Bioinformatics Institute (EMBL-EBI) 3 * John May <jwmay@users.sf.net> 4 * 5 * Contact: cdk-devel@lists.sourceforge.net 6 * 7 * This program is free software; you can redistribute it and/or modify it 8 * under the terms of the GNU Lesser General Public License as published by 9 * the Free Software Foundation; either version 2.1 of the License, or (at 10 * your option) any later version. All we ask is that proper credit is given 11 * for our work, which includes - but is not limited to - adding the above 12 * copyright notice to the beginning of your source code files, and to any 13 * copyright notice that you may distribute with programs based on this work. 14 * 15 * This program is distributed in the hope that it will be useful, but WITHOUT 16 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 17 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 18 * License for more details. 19 * 20 * You should have received a copy of the GNU Lesser General Public License 21 * along with this program; if not, write to the Free Software 22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 U 23 */ 24 25 package org.openscience.cdk.graph.invariant; 26 27 import org.openscience.cdk.interfaces.IAtom; 28 import org.openscience.cdk.interfaces.IAtomContainer; 29 import org.openscience.cdk.interfaces.IPseudoAtom; 30 import org.openscience.cdk.tools.manipulator.AtomContainerManipulator; 31 32 import java.io.BufferedReader; 33 import java.io.IOException; 34 import java.io.InputStreamReader; 35 import java.util.Arrays; 36 import java.util.Collections; 37 import java.util.Comparator; 38 39 /** 40 * An implementation based on the canon algorithm {@cdk.cite WEI89}. The 41 * algorithm uses an initial set of of invariants which are assigned a rank. 42 * Equivalent ranks are then shattered using an unambiguous function (in this 43 * case, the product of primes of adjacent ranks). Once no more equivalent ranks 44 * can be shattered ties are artificially broken and rank shattering continues. 45 * Unlike the original description rank stability is not maintained reducing 46 * the number of values to rank at each stage to only those which are equivalent. 47 * 48 * 49 * The initial set of invariants is basic and are - <i> 50 * "sufficient for the purpose of obtaining unique notation for simple SMILES, 51 * but it is not necessarily a “complete” set. No “perfect” set of invariants 52 * is known that will distinguish all possible graph asymmetries. However, 53 * for any given set of structures, a set of invariants can be devised to 54 * provide the necessary discrimination"</i> {@cdk.cite WEI89}. As such this 55 * producer should not be considered a complete canonical labelled but in 56 * practice performs well. For a more accurate and computationally expensive 57 * labelling, please using the {@link InChINumbersTools}. 58 * 59 * <blockquote><pre> 60 * IAtomContainer m = ...; 61 * int[][] g = GraphUtil.toAdjList(m); 62 * 63 * // obtain canon labelling 64 * long[] labels = Canon.label(m, g); 65 * 66 * // obtain symmetry classes 67 * long[] labels = Canon.symmetry(m, g); 68 * </pre></blockquote> 69 * 70 * @author John May 71 * @cdk.module standard 72 * @cdk.githash 73 */ 74 public final class Canon { 75 76 private static final int N_PRIMES = 10000; 77 /** 78 * Graph, adjacency list representation. 79 */ 80 private final int[][] g; 81 82 /** 83 * Storage of canon labelling and symmetry classes. 84 */ 85 private final long[] labelling, symmetry; 86 87 /** Only compute the symmetry classes. */ 88 private boolean symOnly = false; 89 90 /** 91 * Create a canon labelling for the graph (g) with the specified 92 * invariants. 93 * 94 * @param g a graph (adjacency list representation) 95 * @param hydrogens binary vector of terminal hydrogens 96 * @param partition an initial partition of the vertices 97 */ Canon(int[][] g, long[] partition, boolean[] hydrogens, boolean symOnly)98 private Canon(int[][] g, long[] partition, boolean[] hydrogens, boolean symOnly) { 99 this.g = g; 100 this.symOnly = symOnly; 101 labelling = partition.clone(); 102 symmetry = refine(labelling, hydrogens); 103 } 104 105 /** 106 * Compute the canonical labels for the provided structure. The labelling 107 * does not consider isomer information or stereochemistry. The current 108 * implementation does not fully distinguish all structure topologies 109 * but in practise performs well in the majority of cases. A complete 110 * canonical labelling can be obtained using the {@link InChINumbersTools} 111 * but is computationally much more expensive. 112 * 113 * @param container structure 114 * @param g adjacency list graph representation 115 * @return the canonical labelling 116 * @see EquivalentClassPartitioner 117 * @see InChINumbersTools 118 */ label(IAtomContainer container, int[][] g)119 public static long[] label(IAtomContainer container, int[][] g) { 120 return label(container, g, basicInvariants(container, g)); 121 } 122 123 /** 124 * Compute the canonical labels for the provided structure. The labelling 125 * does not consider isomer information or stereochemistry. This method 126 * allows provision of a custom array of initial invariants. 127 * 128 * 129 * The current 130 * implementation does not fully distinguish all structure topologies 131 * but in practise performs well in the majority of cases. A complete 132 * canonical labelling can be obtained using the {@link InChINumbersTools} 133 * but is computationally much more expensive. 134 * 135 * @param container structure 136 * @param g adjacency list graph representation 137 * @param initial initial seed invariants 138 * @return the canonical labelling 139 * @see EquivalentClassPartitioner 140 * @see InChINumbersTools 141 */ label(IAtomContainer container, int[][] g, long[] initial)142 public static long[] label(IAtomContainer container, int[][] g, long[] initial) { 143 if (initial.length != g.length) 144 throw new IllegalArgumentException("number of initial != number of atoms"); 145 return new Canon(g, initial, terminalHydrogens(container, g), false).labelling; 146 } 147 148 /** 149 * Compute the canonical labels for the provided structure. The initial 150 * labelling is seed-ed with the provided atom comparator <code>cmp</code> 151 * allowing arbitary properties to be distinguished or ignored. 152 * 153 * @param container structure 154 * @param g adjacency list graph representation 155 * @param cmp comparator to compare atoms 156 * @return the canonical labelling 157 */ label(IAtomContainer container, int[][] g, Comparator<IAtom> cmp)158 public static long[] label(IAtomContainer container, 159 int[][] g, 160 Comparator<IAtom> cmp) { 161 if (g.length == 0) 162 return new long[0]; 163 IAtom[] atoms = AtomContainerManipulator.getAtomArray(container); 164 Arrays.sort(atoms, cmp); 165 long[] initial = new long[atoms.length]; 166 long part = 1; 167 initial[container.indexOf(atoms[0])] = part; 168 for (int i=1; i<atoms.length; i++) { 169 if (cmp.compare(atoms[i], atoms[i-1]) != 0) 170 ++part; 171 initial[container.indexOf(atoms[i])] = part; 172 } 173 return label(container, g, initial); 174 } 175 176 /** 177 * Compute the symmetry classes for the provided structure. There are known 178 * examples where symmetry is incorrectly found. The {@link 179 * EquivalentClassPartitioner} gives more accurate symmetry perception but 180 * this method is very quick and in practise successfully portions the 181 * majority of chemical structures. 182 * 183 * @param container structure 184 * @param g adjacency list graph representation 185 * @return symmetry classes 186 * @see EquivalentClassPartitioner 187 */ symmetry(IAtomContainer container, int[][] g)188 public static long[] symmetry(IAtomContainer container, int[][] g) { 189 return new Canon(g, basicInvariants(container, g), terminalHydrogens(container, g), true).symmetry; 190 } 191 192 /** 193 * Internal - refine invariants to a canonical labelling and 194 * symmetry classes. 195 * 196 * @param invariants the invariants to refine (canonical labelling gets 197 * written here) 198 * @param hydrogens binary vector of terminal hydrogens 199 * @return the symmetry classes 200 */ refine(long[] invariants, boolean[] hydrogens)201 private long[] refine(long[] invariants, boolean[] hydrogens) { 202 203 int ord = g.length; 204 205 InvariantRanker ranker = new InvariantRanker(ord); 206 207 // current/next vertices, these only hold the vertices which are 208 // equivalent 209 int[] currVs = new int[ord]; 210 int[] nextVs = new int[ord]; 211 212 // fill with identity (also set number of non-unique) 213 int nnu = ord; 214 for (int i = 0; i < ord; i++) 215 currVs[i] = i; 216 217 long[] prev = invariants; 218 long[] curr = Arrays.copyOf(invariants, ord); 219 220 // initially all labels are 1, the input invariants are then used to 221 // refine this coarse partition 222 Arrays.fill(prev, 1L); 223 224 // number of ranks 225 int n = 0, m = 0; 226 227 // storage of symmetry classes 228 long[] symmetry = null; 229 230 while (n < ord) { 231 232 // refine the initial invariants using product of primes from 233 // adjacent ranks 234 while ((n = ranker.rank(currVs, nextVs, nnu, curr, prev)) > m && n < ord) { 235 nnu = 0; 236 for (int i = 0; i < ord && nextVs[i] >= 0; i++) { 237 int v = nextVs[i]; 238 currVs[nnu++] = v; 239 curr[v] = hydrogens[v] ? prev[v] : primeProduct(g[v], prev, hydrogens); 240 } 241 m = n; 242 } 243 244 if (symmetry == null) { 245 246 // After symmetry classes have been found without hydrogens we add 247 // back in the hydrogens and assign ranks. We don't refine the 248 // partition until the next time round the while loop to avoid 249 // artificially splitting due to hydrogen representation, for example 250 // the two hydrogens are equivalent in this SMILES for ethane '[H]CC' 251 for (int i = 0; i < g.length; i++) { 252 if (hydrogens[i]) { 253 curr[i] = prev[g[i][0]]; 254 hydrogens[i] = false; 255 } 256 } 257 n = ranker.rank(currVs, nextVs, nnu, curr, prev); 258 symmetry = Arrays.copyOf(prev, ord); 259 260 // Update the buffer of non-unique vertices as hydrogens next 261 // to discrete heavy atoms are also discrete (and removed from 262 // 'nextVs' during ranking. 263 nnu = 0; 264 for (int i = 0; i < ord && nextVs[i] >= 0; i++) { 265 currVs[nnu++] = nextVs[i]; 266 } 267 } 268 269 // partition is discrete or only symmetry classes are needed 270 if (symOnly || n == ord) return symmetry; 271 272 // artificially split the lowest cell, we perturb the value 273 // of all vertices with equivalent rank to the lowest non-unique 274 // vertex 275 int lo = nextVs[0]; 276 for (int i = 1; i < ord && nextVs[i] >= 0 && prev[nextVs[i]] == prev[lo]; i++) 277 prev[nextVs[i]]++; 278 279 // could also swap but this is cleaner 280 System.arraycopy(nextVs, 0, currVs, 0, nnu); 281 } 282 283 return symmetry; 284 } 285 286 /** 287 * Compute the prime product of the values (ranks) for the given 288 * adjacent neighbors (ws). 289 * 290 * @param ws indices (adjacent neighbors) 291 * @param ranks invariant ranks 292 * @return the prime product 293 */ primeProduct(int[] ws, long[] ranks, boolean[] hydrogens)294 private long primeProduct(int[] ws, long[] ranks, boolean[] hydrogens) { 295 long prod = 1; 296 for (int w : ws) { 297 if (!hydrogens[w]) { 298 prod *= PRIMES[(int) ranks[w]]; 299 } 300 } 301 return prod; 302 } 303 304 /** 305 * Generate the initial invariants for each atom in the {@code container}. 306 * The labels use the invariants described in {@cdk.cite WEI89}. 307 * 308 * The bits in the low 32-bits are: {@code 0000000000xxxxXXXXeeeeeeescchhhh} 309 * where: 310 * <ul> 311 * <li>0: padding</li> 312 * <li>x: number of connections</li> 313 * <li>X: number of non-hydrogens bonds</li> 314 * <li>e: atomic number</li> 315 * <li>s: sign of charge</li> 316 * <li>c: absolute charge</li> 317 * <li>h: number of attached hydrogens</li> 318 * </ul> 319 * 320 * <b>Important: These invariants are <i>basic</i> and there are known 321 * examples they don't distinguish. One trivial example to consider is 322 * {@code [O]C=O} where both oxygens have no hydrogens and a single 323 * connection but the atoms are not equivalent. Including a better 324 * initial partition is more expensive</b> 325 * 326 * @param container an atom container to generate labels for 327 * @param graph graph representation (adjacency list) 328 * @return initial invariants 329 * @throws NullPointerException an atom had unset atomic number, hydrogen 330 * count or formal charge 331 */ basicInvariants(IAtomContainer container, int[][] graph)332 public static long[] basicInvariants(IAtomContainer container, int[][] graph) { 333 334 long[] labels = new long[graph.length]; 335 336 for (int v = 0; v < graph.length; v++) { 337 IAtom atom = container.getAtom(v); 338 339 int deg = graph[v].length; 340 int impH = implH(atom); 341 int expH = 0; 342 int elem = atomicNumber(atom); 343 int chg = charge(atom); 344 345 // count non-suppressed (explicit) hydrogens 346 for (int w : graph[v]) 347 if (atomicNumber(container.getAtom(w)) == 1) expH++; 348 349 long label = 0; // connectivity (first in) 350 label |= deg + impH & 0xf; 351 label <<= 4; // connectivity (heavy) <= 15 (4 bits) 352 label |= deg - expH & 0xf; 353 label <<= 7; // atomic number <= 127 (7 bits) 354 label |= elem & 0x7f; 355 label <<= 1; // charge sign == 1 (1 bit) 356 label |= chg >> 31 & 0x1; 357 label <<= 2; // charge <= 3 (2 bits) 358 label |= Math.abs(chg) & 0x3; 359 label <<= 4; // hydrogen count <= 15 (4 bits) 360 label |= impH + expH & 0xf; 361 362 labels[v] = label; 363 } 364 return labels; 365 } 366 367 /** 368 * Access atomic number of atom defaulting to 0 for pseudo atoms. 369 * 370 * @param atom an atom 371 * @return the atomic number 372 * @throws NullPointerException the atom was non-pseudo at did not have an 373 * atomic number 374 */ atomicNumber(IAtom atom)375 private static int atomicNumber(IAtom atom) { 376 Integer elem = atom.getAtomicNumber(); 377 if (elem != null) return elem; 378 if (atom instanceof IPseudoAtom) return 0; 379 throw new NullPointerException("a non-pseudoatom had unset atomic number"); 380 } 381 382 /** 383 * Access implicit hydrogen count of the atom defaulting to 0 for pseudo 384 * atoms. 385 * 386 * @param atom an atom 387 * @return the implicit hydrogen count 388 * @throws NullPointerException the atom was non-pseudo at did not have an 389 * implicit hydrogen count 390 */ implH(IAtom atom)391 private static int implH(IAtom atom) { 392 Integer h = atom.getImplicitHydrogenCount(); 393 if (h != null) return h; 394 if (atom instanceof IPseudoAtom) return 0; 395 throw new NullPointerException("a non-pseudoatom had unset hydrogen count"); 396 } 397 398 /** 399 * Access formal charge of an atom defaulting to 0 if undefined. 400 * 401 * @param atom an atom 402 * @return the formal charge 403 */ charge(IAtom atom)404 private static int charge(IAtom atom) { 405 Integer charge = atom.getFormalCharge(); 406 if (charge != null) return charge; 407 return 0; 408 } 409 410 /** 411 * Locate explicit hydrogens that are attached to exactly one other atom. 412 * 413 * @param ac a structure 414 * @return binary set of terminal hydrogens 415 */ terminalHydrogens(final IAtomContainer ac, final int[][] g)416 static boolean[] terminalHydrogens(final IAtomContainer ac, final int[][] g) { 417 418 final boolean[] hydrogens = new boolean[ac.getAtomCount()]; 419 420 // we specifically don't check for null atomic number, this must be set. 421 // if not, something major is wrong 422 for (int i = 0; i < ac.getAtomCount(); i++) { 423 IAtom atom = ac.getAtom(i); 424 hydrogens[i] = atom.getAtomicNumber() == 1 && 425 atom.getMassNumber() == null && 426 g[i].length == 1; 427 } 428 429 return hydrogens; 430 } 431 432 /** 433 * The first 10,000 primes. 434 */ 435 private static final int[] PRIMES = loadPrimes(); 436 loadPrimes()437 private static int[] loadPrimes() { 438 try (BufferedReader br = new BufferedReader(new InputStreamReader(Canon.class.getResourceAsStream("primes.dat")))) { 439 int[] primes = new int[N_PRIMES]; 440 int i = 0; 441 String line = null; 442 while ((line = br.readLine()) != null) { 443 primes[i++] = Integer.parseInt(line); 444 } 445 assert i == N_PRIMES; 446 return primes; 447 } catch (NumberFormatException | IOException e) { 448 System.err.println("Critical - could not load primes table for canonical labelling!"); 449 return new int[0]; 450 } 451 } 452 } 453