1 /*
2  * Copyright (c) 2013 European Bioinformatics Institute (EMBL-EBI)
3  *                    John May <jwmay@users.sf.net>
4  *
5  * Contact: cdk-devel@lists.sourceforge.net
6  *
7  * This program is free software; you can redistribute it and/or modify it
8  * under the terms of the GNU Lesser General Public License as published by
9  * the Free Software Foundation; either version 2.1 of the License, or (at
10  * your option) any later version. All we ask is that proper credit is given
11  * for our work, which includes - but is not limited to - adding the above
12  * copyright notice to the beginning of your source code files, and to any
13  * copyright notice that you may distribute with programs based on this work.
14  *
15  * This program is distributed in the hope that it will be useful, but WITHOUT
16  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
17  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
18  * License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public License
21  * along with this program; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 U
23  */
24 
25 package org.openscience.cdk.graph.invariant;
26 
27 import org.openscience.cdk.interfaces.IAtom;
28 import org.openscience.cdk.interfaces.IAtomContainer;
29 import org.openscience.cdk.interfaces.IPseudoAtom;
30 import org.openscience.cdk.tools.manipulator.AtomContainerManipulator;
31 
32 import java.io.BufferedReader;
33 import java.io.IOException;
34 import java.io.InputStreamReader;
35 import java.util.Arrays;
36 import java.util.Collections;
37 import java.util.Comparator;
38 
39 /**
40  * An implementation based on the canon algorithm {@cdk.cite WEI89}. The
41  * algorithm uses an initial set of of invariants which are assigned a rank.
42  * Equivalent ranks are then shattered using an unambiguous function (in this
43  * case, the product of primes of adjacent ranks). Once no more equivalent ranks
44  * can be shattered ties are artificially broken and rank shattering continues.
45  * Unlike the original description rank stability is not maintained reducing
46  * the number of values to rank at each stage to only those which are equivalent.
47  *
48  *
49  * The initial set of invariants is basic and are - <i>
50  * "sufficient for the purpose of obtaining unique notation for simple SMILES,
51  *  but it is not necessarily a “complete” set. No “perfect” set of invariants
52  *  is known that will distinguish all possible graph asymmetries. However,
53  *  for any given set of structures, a set of invariants can be devised to
54  *  provide the necessary discrimination"</i> {@cdk.cite WEI89}. As such this
55  *  producer should not be considered a complete canonical labelled but in
56  *  practice performs well. For a more accurate and computationally expensive
57  *  labelling, please using the {@link InChINumbersTools}.
58  *
59  * <blockquote><pre>
60  * IAtomContainer m = ...;
61  * int[][]        g = GraphUtil.toAdjList(m);
62  *
63  * // obtain canon labelling
64  * long[] labels = Canon.label(m, g);
65  *
66  * // obtain symmetry classes
67  * long[] labels = Canon.symmetry(m, g);
68  * </pre></blockquote>
69  *
70  * @author John May
71  * @cdk.module standard
72  * @cdk.githash
73  */
74 public final class Canon {
75 
76     private static final int N_PRIMES = 10000;
77     /**
78      * Graph, adjacency list representation.
79      */
80     private final int[][] g;
81 
82     /**
83      * Storage of canon labelling and symmetry classes.
84      */
85     private final long[] labelling, symmetry;
86 
87     /** Only compute the symmetry classes. */
88     private boolean symOnly = false;
89 
90     /**
91      * Create a canon labelling for the graph (g) with the specified
92      * invariants.
93      *
94      * @param g         a graph (adjacency list representation)
95      * @param hydrogens binary vector of terminal hydrogens
96      * @param partition an initial partition of the vertices
97      */
Canon(int[][] g, long[] partition, boolean[] hydrogens, boolean symOnly)98     private Canon(int[][] g, long[] partition, boolean[] hydrogens, boolean symOnly) {
99         this.g = g;
100         this.symOnly = symOnly;
101         labelling = partition.clone();
102         symmetry = refine(labelling, hydrogens);
103     }
104 
105     /**
106      * Compute the canonical labels for the provided structure. The labelling
107      * does not consider isomer information or stereochemistry. The current
108      * implementation does not fully distinguish all structure topologies
109      * but in practise performs well in the majority of cases. A complete
110      * canonical labelling can be obtained using the {@link InChINumbersTools}
111      * but is computationally much more expensive.
112      *
113      * @param container structure
114      * @param g         adjacency list graph representation
115      * @return the canonical labelling
116      * @see EquivalentClassPartitioner
117      * @see InChINumbersTools
118      */
label(IAtomContainer container, int[][] g)119     public static long[] label(IAtomContainer container, int[][] g) {
120         return label(container, g, basicInvariants(container, g));
121     }
122 
123     /**
124      * Compute the canonical labels for the provided structure. The labelling
125      * does not consider isomer information or stereochemistry. This method
126      * allows provision of a custom array of initial invariants.
127      *
128      *
129      * The current
130      * implementation does not fully distinguish all structure topologies
131      * but in practise performs well in the majority of cases. A complete
132      * canonical labelling can be obtained using the {@link InChINumbersTools}
133      * but is computationally much more expensive.
134      *
135      * @param container  structure
136      * @param g          adjacency list graph representation
137      * @param initial    initial seed invariants
138      * @return the canonical labelling
139      * @see EquivalentClassPartitioner
140      * @see InChINumbersTools
141      */
label(IAtomContainer container, int[][] g, long[] initial)142     public static long[] label(IAtomContainer container, int[][] g, long[] initial) {
143         if (initial.length != g.length)
144             throw new IllegalArgumentException("number of initial != number of atoms");
145         return new Canon(g, initial, terminalHydrogens(container, g), false).labelling;
146     }
147 
148     /**
149      * Compute the canonical labels for the provided structure. The initial
150      * labelling is seed-ed with the provided atom comparator <code>cmp</code>
151      * allowing arbitary properties to be distinguished or ignored.
152      *
153      * @param container  structure
154      * @param g          adjacency list graph representation
155      * @param cmp        comparator to compare atoms
156      * @return the canonical labelling
157      */
label(IAtomContainer container, int[][] g, Comparator<IAtom> cmp)158     public static long[] label(IAtomContainer    container,
159                                int[][]           g,
160                                Comparator<IAtom> cmp) {
161         if (g.length == 0)
162             return new long[0];
163         IAtom[] atoms = AtomContainerManipulator.getAtomArray(container);
164         Arrays.sort(atoms, cmp);
165         long[] initial = new long[atoms.length];
166         long   part    = 1;
167         initial[container.indexOf(atoms[0])] = part;
168         for (int i=1; i<atoms.length; i++) {
169             if (cmp.compare(atoms[i], atoms[i-1]) != 0)
170                 ++part;
171             initial[container.indexOf(atoms[i])] = part;
172         }
173         return label(container, g, initial);
174     }
175 
176     /**
177      * Compute the symmetry classes for the provided structure. There are known
178      * examples where symmetry is incorrectly found. The {@link
179      * EquivalentClassPartitioner} gives more accurate symmetry perception but
180      * this method is very quick and in practise successfully portions the
181      * majority of chemical structures.
182      *
183      * @param container structure
184      * @param g         adjacency list graph representation
185      * @return symmetry classes
186      * @see EquivalentClassPartitioner
187      */
symmetry(IAtomContainer container, int[][] g)188     public static long[] symmetry(IAtomContainer container, int[][] g) {
189         return new Canon(g, basicInvariants(container, g), terminalHydrogens(container, g), true).symmetry;
190     }
191 
192     /**
193      * Internal - refine invariants to a canonical labelling and
194      * symmetry classes.
195      *
196      * @param invariants the invariants to refine (canonical labelling gets
197      *                   written here)
198      * @param hydrogens  binary vector of terminal hydrogens
199      * @return the symmetry classes
200      */
refine(long[] invariants, boolean[] hydrogens)201     private long[] refine(long[] invariants, boolean[] hydrogens) {
202 
203         int ord = g.length;
204 
205         InvariantRanker ranker = new InvariantRanker(ord);
206 
207         // current/next vertices, these only hold the vertices which are
208         // equivalent
209         int[] currVs = new int[ord];
210         int[] nextVs = new int[ord];
211 
212         // fill with identity (also set number of non-unique)
213         int nnu = ord;
214         for (int i = 0; i < ord; i++)
215             currVs[i] = i;
216 
217         long[] prev = invariants;
218         long[] curr = Arrays.copyOf(invariants, ord);
219 
220         // initially all labels are 1, the input invariants are then used to
221         // refine this coarse partition
222         Arrays.fill(prev, 1L);
223 
224         // number of ranks
225         int n = 0, m = 0;
226 
227         // storage of symmetry classes
228         long[] symmetry = null;
229 
230         while (n < ord) {
231 
232             // refine the initial invariants using product of primes from
233             // adjacent ranks
234             while ((n = ranker.rank(currVs, nextVs, nnu, curr, prev)) > m && n < ord) {
235                 nnu = 0;
236                 for (int i = 0; i < ord && nextVs[i] >= 0; i++) {
237                     int v = nextVs[i];
238                     currVs[nnu++] = v;
239                     curr[v] = hydrogens[v] ? prev[v] : primeProduct(g[v], prev, hydrogens);
240                 }
241                 m = n;
242             }
243 
244             if (symmetry == null) {
245 
246                 // After symmetry classes have been found without hydrogens we add
247                 // back in the hydrogens and assign ranks. We don't refine the
248                 // partition until the next time round the while loop to avoid
249                 // artificially splitting due to hydrogen representation, for example
250                 // the two hydrogens are equivalent in this SMILES for ethane '[H]CC'
251                 for (int i = 0; i < g.length; i++) {
252                     if (hydrogens[i]) {
253                         curr[i] = prev[g[i][0]];
254                         hydrogens[i] = false;
255                     }
256                 }
257                 n = ranker.rank(currVs, nextVs, nnu, curr, prev);
258                 symmetry = Arrays.copyOf(prev, ord);
259 
260                 // Update the buffer of non-unique vertices as hydrogens next
261                 // to discrete heavy atoms are also discrete (and removed from
262                 // 'nextVs' during ranking.
263                 nnu = 0;
264                 for (int i = 0; i < ord && nextVs[i] >= 0; i++) {
265                     currVs[nnu++] = nextVs[i];
266                 }
267             }
268 
269             // partition is discrete or only symmetry classes are needed
270             if (symOnly || n == ord) return symmetry;
271 
272             // artificially split the lowest cell, we perturb the value
273             // of all vertices with equivalent rank to the lowest non-unique
274             // vertex
275             int lo = nextVs[0];
276             for (int i = 1; i < ord && nextVs[i] >= 0 && prev[nextVs[i]] == prev[lo]; i++)
277                 prev[nextVs[i]]++;
278 
279             // could also swap but this is cleaner
280             System.arraycopy(nextVs, 0, currVs, 0, nnu);
281         }
282 
283         return symmetry;
284     }
285 
286     /**
287      * Compute the prime product of the values (ranks) for the given
288      * adjacent neighbors (ws).
289      *
290      * @param ws    indices (adjacent neighbors)
291      * @param ranks invariant ranks
292      * @return the prime product
293      */
primeProduct(int[] ws, long[] ranks, boolean[] hydrogens)294     private long primeProduct(int[] ws, long[] ranks, boolean[] hydrogens) {
295         long prod = 1;
296         for (int w : ws) {
297             if (!hydrogens[w]) {
298                 prod *= PRIMES[(int) ranks[w]];
299             }
300         }
301         return prod;
302     }
303 
304     /**
305      * Generate the initial invariants for each atom in the {@code container}.
306      * The labels use the invariants described in {@cdk.cite WEI89}.
307      *
308      * The bits in the low 32-bits are: {@code 0000000000xxxxXXXXeeeeeeescchhhh}
309      * where:
310      * <ul>
311      *     <li>0: padding</li>
312      *     <li>x: number of connections</li>
313      *     <li>X: number of non-hydrogens bonds</li>
314      *     <li>e: atomic number</li>
315      *     <li>s: sign of charge</li>
316      *     <li>c: absolute charge</li>
317      *     <li>h: number of attached hydrogens</li>
318      * </ul>
319      *
320      * <b>Important: These invariants are <i>basic</i> and there are known
321      * examples they don't distinguish. One trivial example to consider is
322      * {@code [O]C=O} where both oxygens have no hydrogens and a single
323      * connection but the atoms are not equivalent. Including a better
324      * initial partition is more expensive</b>
325      *
326      * @param container an atom container to generate labels for
327      * @param graph     graph representation (adjacency list)
328      * @return initial invariants
329      * @throws NullPointerException an atom had unset atomic number, hydrogen
330      *                              count or formal charge
331      */
basicInvariants(IAtomContainer container, int[][] graph)332     public static long[] basicInvariants(IAtomContainer container, int[][] graph) {
333 
334         long[] labels = new long[graph.length];
335 
336         for (int v = 0; v < graph.length; v++) {
337             IAtom atom = container.getAtom(v);
338 
339             int deg = graph[v].length;
340             int impH = implH(atom);
341             int expH = 0;
342             int elem = atomicNumber(atom);
343             int chg = charge(atom);
344 
345             // count non-suppressed (explicit) hydrogens
346             for (int w : graph[v])
347                 if (atomicNumber(container.getAtom(w)) == 1) expH++;
348 
349             long label = 0; // connectivity (first in)
350             label |= deg + impH & 0xf;
351             label <<= 4; // connectivity (heavy) <= 15 (4 bits)
352             label |= deg - expH & 0xf;
353             label <<= 7; // atomic number <= 127 (7 bits)
354             label |= elem & 0x7f;
355             label <<= 1; // charge sign == 1 (1 bit)
356             label |= chg >> 31 & 0x1;
357             label <<= 2; // charge <= 3 (2 bits)
358             label |= Math.abs(chg) & 0x3;
359             label <<= 4; // hydrogen count <= 15 (4 bits)
360             label |= impH + expH & 0xf;
361 
362             labels[v] = label;
363         }
364         return labels;
365     }
366 
367     /**
368      * Access atomic number of atom defaulting to 0 for pseudo atoms.
369      *
370      * @param atom an atom
371      * @return the atomic number
372      * @throws NullPointerException the atom was non-pseudo at did not have an
373      *                              atomic number
374      */
atomicNumber(IAtom atom)375     private static int atomicNumber(IAtom atom) {
376         Integer elem = atom.getAtomicNumber();
377         if (elem != null) return elem;
378         if (atom instanceof IPseudoAtom) return 0;
379         throw new NullPointerException("a non-pseudoatom had unset atomic number");
380     }
381 
382     /**
383      * Access implicit hydrogen count of the atom defaulting to 0 for pseudo
384      * atoms.
385      *
386      * @param atom an atom
387      * @return the implicit hydrogen count
388      * @throws NullPointerException the atom was non-pseudo at did not have an
389      *                              implicit hydrogen count
390      */
implH(IAtom atom)391     private static int implH(IAtom atom) {
392         Integer h = atom.getImplicitHydrogenCount();
393         if (h != null) return h;
394         if (atom instanceof IPseudoAtom) return 0;
395         throw new NullPointerException("a non-pseudoatom had unset hydrogen count");
396     }
397 
398     /**
399      * Access formal charge of an atom defaulting to 0 if undefined.
400      *
401      * @param atom an atom
402      * @return the formal charge
403      */
charge(IAtom atom)404     private static int charge(IAtom atom) {
405         Integer charge = atom.getFormalCharge();
406         if (charge != null) return charge;
407         return 0;
408     }
409 
410     /**
411      * Locate explicit hydrogens that are attached to exactly one other atom.
412      *
413      * @param ac a structure
414      * @return binary set of terminal hydrogens
415      */
terminalHydrogens(final IAtomContainer ac, final int[][] g)416     static boolean[] terminalHydrogens(final IAtomContainer ac, final int[][] g) {
417 
418         final boolean[] hydrogens = new boolean[ac.getAtomCount()];
419 
420         // we specifically don't check for null atomic number, this must be set.
421         // if not, something major is wrong
422         for (int i = 0; i < ac.getAtomCount(); i++) {
423             IAtom atom = ac.getAtom(i);
424             hydrogens[i] = atom.getAtomicNumber() == 1 &&
425                            atom.getMassNumber() == null &&
426                            g[i].length == 1;
427         }
428 
429         return hydrogens;
430     }
431 
432     /**
433      * The first 10,000 primes.
434      */
435     private static final int[] PRIMES = loadPrimes();
436 
loadPrimes()437     private static int[] loadPrimes() {
438         try (BufferedReader br = new BufferedReader(new InputStreamReader(Canon.class.getResourceAsStream("primes.dat")))) {
439             int[] primes = new int[N_PRIMES];
440             int i = 0;
441             String line = null;
442             while ((line = br.readLine()) != null) {
443                 primes[i++] = Integer.parseInt(line);
444             }
445             assert i == N_PRIMES;
446             return primes;
447         } catch (NumberFormatException | IOException e) {
448             System.err.println("Critical - could not load primes table for canonical labelling!");
449             return new int[0];
450         }
451     }
452 }
453