1 //
2 // Copyright (C) 2007-2013 Greg Landrum
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10
11 #include <GraphMol/RDKitBase.h>
12 #include <GraphMol/Fingerprints/AtomPairs.h>
13 #include <GraphMol/Subgraphs/Subgraphs.h>
14 #include <DataStructs/SparseIntVect.h>
15 #include <RDGeneral/hash/hash.hpp>
16 #include <cstdint>
17 #include <boost/dynamic_bitset.hpp>
18 #include <GraphMol/Fingerprints/FingerprintUtil.h>
19
20 namespace RDKit {
21 namespace AtomPairs {
22
23 template <typename T1, typename T2>
updateElement(SparseIntVect<T1> & v,T2 elem)24 void updateElement(SparseIntVect<T1> &v, T2 elem) {
25 v.setVal(elem, v.getVal(elem) + 1);
26 }
27
28 template <typename T1>
updateElement(ExplicitBitVect & v,T1 elem)29 void updateElement(ExplicitBitVect &v, T1 elem) {
30 v.setBit(elem % v.getNumBits());
31 }
32
33 template <typename T>
setAtomPairBit(std::uint32_t i,std::uint32_t j,std::uint32_t nAtoms,const std::vector<std::uint32_t> & atomCodes,const double * dm,T * bv,unsigned int minLength,unsigned int maxLength,bool includeChirality)34 void setAtomPairBit(std::uint32_t i, std::uint32_t j, std::uint32_t nAtoms,
35 const std::vector<std::uint32_t> &atomCodes,
36 const double *dm, T *bv, unsigned int minLength,
37 unsigned int maxLength, bool includeChirality) {
38 auto dist = static_cast<unsigned int>(floor(dm[i * nAtoms + j]));
39 if (dist >= minLength && dist <= maxLength) {
40 std::uint32_t bitId =
41 getAtomPairCode(atomCodes[i], atomCodes[j], dist, includeChirality);
42 updateElement(*bv, static_cast<std::uint32_t>(bitId));
43 }
44 }
45
getAtomPairFingerprint(const ROMol & mol,const std::vector<std::uint32_t> * fromAtoms,const std::vector<std::uint32_t> * ignoreAtoms,const std::vector<std::uint32_t> * atomInvariants,bool includeChirality,bool use2D,int confId)46 SparseIntVect<std::int32_t> *getAtomPairFingerprint(
47 const ROMol &mol, const std::vector<std::uint32_t> *fromAtoms,
48 const std::vector<std::uint32_t> *ignoreAtoms,
49 const std::vector<std::uint32_t> *atomInvariants, bool includeChirality,
50 bool use2D, int confId) {
51 return getAtomPairFingerprint(mol, 1, maxPathLen - 1, fromAtoms, ignoreAtoms,
52 atomInvariants, includeChirality, use2D,
53 confId);
54 };
55
getAtomPairFingerprint(const ROMol & mol,unsigned int minLength,unsigned int maxLength,const std::vector<std::uint32_t> * fromAtoms,const std::vector<std::uint32_t> * ignoreAtoms,const std::vector<std::uint32_t> * atomInvariants,bool includeChirality,bool use2D,int confId)56 SparseIntVect<std::int32_t> *getAtomPairFingerprint(
57 const ROMol &mol, unsigned int minLength, unsigned int maxLength,
58 const std::vector<std::uint32_t> *fromAtoms,
59 const std::vector<std::uint32_t> *ignoreAtoms,
60 const std::vector<std::uint32_t> *atomInvariants, bool includeChirality,
61 bool use2D, int confId) {
62 PRECONDITION(minLength <= maxLength, "bad lengths provided");
63 PRECONDITION(!atomInvariants || atomInvariants->size() >= mol.getNumAtoms(),
64 "bad atomInvariants size");
65
66 const ROMol *lmol = &mol;
67 std::unique_ptr<ROMol> tmol;
68 if (includeChirality && !mol.hasProp(common_properties::_StereochemDone)) {
69 tmol = std::unique_ptr<ROMol>(new ROMol(mol));
70 MolOps::assignStereochemistry(*tmol);
71 lmol = tmol.get();
72 }
73
74 auto *res = new SparseIntVect<std::int32_t>(
75 1 << (numAtomPairFingerprintBits + 2 * (includeChirality ? 2 : 0)));
76 const double *dm;
77 if (use2D) {
78 dm = MolOps::getDistanceMat(*lmol);
79 } else {
80 dm = MolOps::get3DDistanceMat(*lmol, confId);
81 }
82 const unsigned int nAtoms = lmol->getNumAtoms();
83
84 std::vector<std::uint32_t> atomCodes;
85 for (ROMol::ConstAtomIterator atomItI = lmol->beginAtoms();
86 atomItI != lmol->endAtoms(); ++atomItI) {
87 if (!atomInvariants) {
88 atomCodes.push_back(getAtomCode(*atomItI, 0, includeChirality));
89 } else {
90 atomCodes.push_back((*atomInvariants)[(*atomItI)->getIdx()] %
91 ((1 << codeSize) - 1));
92 }
93 }
94
95 for (ROMol::ConstAtomIterator atomItI = lmol->beginAtoms();
96 atomItI != lmol->endAtoms(); ++atomItI) {
97 unsigned int i = (*atomItI)->getIdx();
98 if (ignoreAtoms && std::find(ignoreAtoms->begin(), ignoreAtoms->end(), i) !=
99 ignoreAtoms->end()) {
100 continue;
101 }
102 if (!fromAtoms) {
103 for (ROMol::ConstAtomIterator atomItJ = atomItI + 1;
104 atomItJ != lmol->endAtoms(); ++atomItJ) {
105 unsigned int j = (*atomItJ)->getIdx();
106 if (ignoreAtoms && std::find(ignoreAtoms->begin(), ignoreAtoms->end(),
107 j) != ignoreAtoms->end()) {
108 continue;
109 }
110 setAtomPairBit(i, j, nAtoms, atomCodes, dm, res, minLength, maxLength,
111 includeChirality);
112 }
113 } else {
114 for (auto j : *fromAtoms) {
115 if (j != i) {
116 if (ignoreAtoms && std::find(ignoreAtoms->begin(), ignoreAtoms->end(),
117 j) != ignoreAtoms->end()) {
118 continue;
119 }
120 setAtomPairBit(i, j, nAtoms, atomCodes, dm, res, minLength, maxLength,
121 includeChirality);
122 }
123 }
124 }
125 }
126 return res;
127 }
128
getHashedAtomPairFingerprint(const ROMol & mol,unsigned int nBits,unsigned int minLength,unsigned int maxLength,const std::vector<std::uint32_t> * fromAtoms,const std::vector<std::uint32_t> * ignoreAtoms,const std::vector<std::uint32_t> * atomInvariants,bool includeChirality,bool use2D,int confId)129 SparseIntVect<std::int32_t> *getHashedAtomPairFingerprint(
130 const ROMol &mol, unsigned int nBits, unsigned int minLength,
131 unsigned int maxLength, const std::vector<std::uint32_t> *fromAtoms,
132 const std::vector<std::uint32_t> *ignoreAtoms,
133 const std::vector<std::uint32_t> *atomInvariants, bool includeChirality,
134 bool use2D, int confId) {
135 PRECONDITION(minLength <= maxLength, "bad lengths provided");
136 PRECONDITION(!atomInvariants || atomInvariants->size() >= mol.getNumAtoms(),
137 "bad atomInvariants size");
138 const ROMol *lmol = &mol;
139 std::unique_ptr<ROMol> tmol;
140 if (includeChirality && !mol.hasProp(common_properties::_StereochemDone)) {
141 tmol = std::unique_ptr<ROMol>(new ROMol(mol));
142 MolOps::assignStereochemistry(*tmol);
143 lmol = tmol.get();
144 }
145 auto *res = new SparseIntVect<std::int32_t>(nBits);
146 const double *dm;
147 try {
148 if (use2D) {
149 dm = MolOps::getDistanceMat(*lmol);
150 } else {
151 dm = MolOps::get3DDistanceMat(*lmol, confId);
152 }
153 } catch (const ConformerException &) {
154 delete res;
155 throw;
156 }
157
158 const unsigned int nAtoms = lmol->getNumAtoms();
159
160 std::vector<std::uint32_t> atomCodes;
161 atomCodes.reserve(nAtoms);
162 for (ROMol::ConstAtomIterator atomItI = lmol->beginAtoms();
163 atomItI != lmol->endAtoms(); ++atomItI) {
164 if (!atomInvariants) {
165 atomCodes.push_back(getAtomCode(*atomItI, 0, includeChirality));
166 } else {
167 atomCodes.push_back((*atomInvariants)[(*atomItI)->getIdx()]);
168 }
169 }
170
171 for (ROMol::ConstAtomIterator atomItI = lmol->beginAtoms();
172 atomItI != lmol->endAtoms(); ++atomItI) {
173 unsigned int i = (*atomItI)->getIdx();
174 if (ignoreAtoms && std::find(ignoreAtoms->begin(), ignoreAtoms->end(), i) !=
175 ignoreAtoms->end()) {
176 continue;
177 }
178 if (!fromAtoms) {
179 for (ROMol::ConstAtomIterator atomItJ = atomItI + 1;
180 atomItJ != lmol->endAtoms(); ++atomItJ) {
181 unsigned int j = (*atomItJ)->getIdx();
182 if (ignoreAtoms && std::find(ignoreAtoms->begin(), ignoreAtoms->end(),
183 j) != ignoreAtoms->end()) {
184 continue;
185 }
186 auto dist = static_cast<unsigned int>(floor(dm[i * nAtoms + j]));
187 if (dist >= minLength && dist <= maxLength) {
188 std::uint32_t bit = 0;
189 gboost::hash_combine(bit, std::min(atomCodes[i], atomCodes[j]));
190 gboost::hash_combine(bit, dist);
191 gboost::hash_combine(bit, std::max(atomCodes[i], atomCodes[j]));
192 updateElement(*res, static_cast<std::int32_t>(bit % nBits));
193 }
194 }
195 } else {
196 for (auto j : *fromAtoms) {
197 if (j != i) {
198 if (ignoreAtoms && std::find(ignoreAtoms->begin(), ignoreAtoms->end(),
199 j) != ignoreAtoms->end()) {
200 continue;
201 }
202 auto dist = static_cast<unsigned int>(floor(dm[i * nAtoms + j]));
203 if (dist >= minLength && dist <= maxLength) {
204 std::uint32_t bit = 0;
205 gboost::hash_combine(bit, std::min(atomCodes[i], atomCodes[j]));
206 gboost::hash_combine(bit, dist);
207 gboost::hash_combine(bit, std::max(atomCodes[i], atomCodes[j]));
208 updateElement(*res, static_cast<std::int32_t>(bit % nBits));
209 }
210 }
211 }
212 }
213 }
214 return res;
215 }
216
getHashedAtomPairFingerprintAsBitVect(const ROMol & mol,unsigned int nBits,unsigned int minLength,unsigned int maxLength,const std::vector<std::uint32_t> * fromAtoms,const std::vector<std::uint32_t> * ignoreAtoms,const std::vector<std::uint32_t> * atomInvariants,unsigned int nBitsPerEntry,bool includeChirality,bool use2D,int confId)217 ExplicitBitVect *getHashedAtomPairFingerprintAsBitVect(
218 const ROMol &mol, unsigned int nBits, unsigned int minLength,
219 unsigned int maxLength, const std::vector<std::uint32_t> *fromAtoms,
220 const std::vector<std::uint32_t> *ignoreAtoms,
221 const std::vector<std::uint32_t> *atomInvariants,
222 unsigned int nBitsPerEntry, bool includeChirality, bool use2D, int confId) {
223 PRECONDITION(minLength <= maxLength, "bad lengths provided");
224 PRECONDITION(!atomInvariants || atomInvariants->size() >= mol.getNumAtoms(),
225 "bad atomInvariants size");
226 static int bounds[4] = {1, 2, 4, 8};
227
228 unsigned int blockLength = nBits / nBitsPerEntry;
229 SparseIntVect<std::int32_t> *sres = getHashedAtomPairFingerprint(
230 mol, blockLength, minLength, maxLength, fromAtoms, ignoreAtoms,
231 atomInvariants, includeChirality, use2D, confId);
232 auto *res = new ExplicitBitVect(nBits);
233 if (nBitsPerEntry != 4) {
234 for (auto val : sres->getNonzeroElements()) {
235 for (unsigned int i = 0; i < nBitsPerEntry; ++i) {
236 if (val.second > static_cast<int>(i)) {
237 res->setBit(val.first * nBitsPerEntry + i);
238 }
239 }
240 }
241 } else {
242 for (auto val : sres->getNonzeroElements()) {
243 for (unsigned int i = 0; i < nBitsPerEntry; ++i) {
244 if (val.second >= bounds[i]) {
245 res->setBit(val.first * nBitsPerEntry + i);
246 }
247 }
248 }
249 }
250 delete sres;
251 return res;
252 }
253
getTopologicalTorsionFingerprint(const ROMol & mol,unsigned int targetSize,const std::vector<std::uint32_t> * fromAtoms,const std::vector<std::uint32_t> * ignoreAtoms,const std::vector<std::uint32_t> * atomInvariants,bool includeChirality)254 SparseIntVect<boost::int64_t> *getTopologicalTorsionFingerprint(
255 const ROMol &mol, unsigned int targetSize,
256 const std::vector<std::uint32_t> *fromAtoms,
257 const std::vector<std::uint32_t> *ignoreAtoms,
258 const std::vector<std::uint32_t> *atomInvariants, bool includeChirality) {
259 PRECONDITION(!atomInvariants || atomInvariants->size() >= mol.getNumAtoms(),
260 "bad atomInvariants size");
261 const ROMol *lmol = &mol;
262 std::unique_ptr<ROMol> tmol;
263 if (includeChirality && !mol.hasProp(common_properties::_StereochemDone)) {
264 tmol = std::unique_ptr<ROMol>(new ROMol(mol));
265 MolOps::assignStereochemistry(*tmol);
266 lmol = tmol.get();
267 }
268 boost::uint64_t sz = 1;
269 sz = (sz << (targetSize *
270 (codeSize + (includeChirality ? numChiralBits : 0))));
271 // NOTE: this -1 is incorrect but it's needed for backwards compatibility.
272 // hopefully we'll never have a case with a torsion that hits this.
273 //
274 // mmm, bug compatible.
275 sz -= 1;
276 auto *res = new SparseIntVect<boost::int64_t>(sz);
277
278 std::vector<std::uint32_t> atomCodes;
279 atomCodes.reserve(lmol->getNumAtoms());
280 for (ROMol::ConstAtomIterator atomItI = lmol->beginAtoms();
281 atomItI != lmol->endAtoms(); ++atomItI) {
282 if (!atomInvariants) {
283 atomCodes.push_back(getAtomCode(*atomItI, 0, includeChirality));
284 } else {
285 // need to add to the atomCode here because we subtract off up to 2 below
286 // as part of the branch correction
287 atomCodes.push_back(
288 (*atomInvariants)[(*atomItI)->getIdx()] % ((1 << codeSize) - 1) + 2);
289 }
290 }
291
292 boost::dynamic_bitset<> *fromAtomsBV = nullptr;
293 if (fromAtoms) {
294 fromAtomsBV = new boost::dynamic_bitset<>(lmol->getNumAtoms());
295 for (auto fAt : *fromAtoms) {
296 fromAtomsBV->set(fAt);
297 }
298 }
299 boost::dynamic_bitset<> *ignoreAtomsBV = nullptr;
300 if (ignoreAtoms) {
301 ignoreAtomsBV = new boost::dynamic_bitset<>(mol.getNumAtoms());
302 for (auto fAt : *ignoreAtoms) {
303 ignoreAtomsBV->set(fAt);
304 }
305 }
306 boost::dynamic_bitset<> pAtoms(lmol->getNumAtoms());
307 PATH_LIST paths = findAllPathsOfLengthN(*lmol, targetSize, false);
308 for (PATH_LIST::const_iterator pathIt = paths.begin(); pathIt != paths.end();
309 ++pathIt) {
310 bool keepIt = true;
311 if (fromAtomsBV) {
312 keepIt = false;
313 }
314 std::vector<std::uint32_t> pathCodes;
315 const PATH_TYPE &path = *pathIt;
316 if (fromAtomsBV) {
317 if (fromAtomsBV->test(static_cast<std::uint32_t>(path.front())) ||
318 fromAtomsBV->test(static_cast<std::uint32_t>(path.back()))) {
319 keepIt = true;
320 }
321 }
322 if (keepIt && ignoreAtomsBV) {
323 for (auto pElem : path) {
324 if (ignoreAtomsBV->test(pElem)) {
325 keepIt = false;
326 break;
327 }
328 }
329 }
330 if (keepIt) {
331 pAtoms.reset();
332 for (auto pIt = path.begin(); pIt < path.end(); ++pIt) {
333 // look for a cycle that doesn't start at the first atom
334 // we can't effectively canonicalize these at the moment
335 // (was github #811)
336 if (pIt != path.begin() && *pIt != *(path.begin()) && pAtoms[*pIt]) {
337 pathCodes.clear();
338 break;
339 }
340 pAtoms.set(*pIt);
341 unsigned int code = atomCodes[*pIt] - 1;
342 // subtract off the branching number:
343 if (pIt != path.begin() && pIt + 1 != path.end()) {
344 --code;
345 }
346 pathCodes.push_back(code);
347 }
348 if (pathCodes.size()) {
349 boost::int64_t code =
350 getTopologicalTorsionCode(pathCodes, includeChirality);
351 updateElement(*res, code);
352 }
353 }
354 }
355 delete fromAtomsBV;
356 delete ignoreAtomsBV;
357
358 return res;
359 }
360
361 namespace {
362 template <typename T>
TorsionFpCalc(T * res,const ROMol & mol,unsigned int nBits,unsigned int targetSize,const std::vector<std::uint32_t> * fromAtoms,const std::vector<std::uint32_t> * ignoreAtoms,const std::vector<std::uint32_t> * atomInvariants,bool includeChirality)363 void TorsionFpCalc(T *res, const ROMol &mol, unsigned int nBits,
364 unsigned int targetSize,
365 const std::vector<std::uint32_t> *fromAtoms,
366 const std::vector<std::uint32_t> *ignoreAtoms,
367 const std::vector<std::uint32_t> *atomInvariants,
368 bool includeChirality) {
369 PRECONDITION(!atomInvariants || atomInvariants->size() >= mol.getNumAtoms(),
370 "bad atomInvariants size");
371 const ROMol *lmol = &mol;
372 std::unique_ptr<ROMol> tmol;
373 if (includeChirality && !mol.hasProp(common_properties::_StereochemDone)) {
374 tmol = std::unique_ptr<ROMol>(new ROMol(mol));
375 MolOps::assignStereochemistry(*tmol);
376 lmol = tmol.get();
377 }
378 std::vector<std::uint32_t> atomCodes;
379 atomCodes.reserve(lmol->getNumAtoms());
380 for (ROMol::ConstAtomIterator atomItI = lmol->beginAtoms();
381 atomItI != lmol->endAtoms(); ++atomItI) {
382 if (!atomInvariants) {
383 atomCodes.push_back(getAtomCode(*atomItI, 0, includeChirality));
384 } else {
385 // need to add to the atomCode here because we subtract off up to 2 below
386 // as part of the branch correction
387 atomCodes.push_back(((*atomInvariants)[(*atomItI)->getIdx()] << 1) + 1);
388 }
389 }
390
391 boost::dynamic_bitset<> *fromAtomsBV = nullptr;
392 if (fromAtoms) {
393 fromAtomsBV = new boost::dynamic_bitset<>(lmol->getNumAtoms());
394 for (auto fAt : *fromAtoms) {
395 fromAtomsBV->set(fAt);
396 }
397 }
398 boost::dynamic_bitset<> *ignoreAtomsBV = nullptr;
399 if (ignoreAtoms) {
400 ignoreAtomsBV = new boost::dynamic_bitset<>(lmol->getNumAtoms());
401 for (auto fAt : *ignoreAtoms) {
402 ignoreAtomsBV->set(fAt);
403 }
404 }
405
406 PATH_LIST paths = findAllPathsOfLengthN(*lmol, targetSize, false);
407 for (PATH_LIST::const_iterator pathIt = paths.begin(); pathIt != paths.end();
408 ++pathIt) {
409 bool keepIt = true;
410 if (fromAtomsBV) {
411 keepIt = false;
412 }
413 const PATH_TYPE &path = *pathIt;
414 if (fromAtomsBV) {
415 if (fromAtomsBV->test(static_cast<std::uint32_t>(path.front())) ||
416 fromAtomsBV->test(static_cast<std::uint32_t>(path.back()))) {
417 keepIt = true;
418 }
419 }
420 if (keepIt && ignoreAtomsBV) {
421 for (auto pElem : path) {
422 if (ignoreAtomsBV->test(pElem)) {
423 keepIt = false;
424 break;
425 }
426 }
427 }
428 if (keepIt) {
429 std::vector<std::uint32_t> pathCodes(targetSize);
430 for (unsigned int i = 0; i < targetSize; ++i) {
431 unsigned int code = atomCodes[path[i]] - 1;
432 // subtract off the branching number:
433 if (i > 0 && i < targetSize - 1) {
434 --code;
435 }
436 pathCodes[i] = code;
437 }
438 size_t bit = getTopologicalTorsionHash(pathCodes);
439 updateElement(*res, bit % nBits);
440 }
441 }
442 delete fromAtomsBV;
443 delete ignoreAtomsBV;
444 }
445 } // namespace
getHashedTopologicalTorsionFingerprint(const ROMol & mol,unsigned int nBits,unsigned int targetSize,const std::vector<std::uint32_t> * fromAtoms,const std::vector<std::uint32_t> * ignoreAtoms,const std::vector<std::uint32_t> * atomInvariants,bool includeChirality)446 SparseIntVect<boost::int64_t> *getHashedTopologicalTorsionFingerprint(
447 const ROMol &mol, unsigned int nBits, unsigned int targetSize,
448 const std::vector<std::uint32_t> *fromAtoms,
449 const std::vector<std::uint32_t> *ignoreAtoms,
450 const std::vector<std::uint32_t> *atomInvariants, bool includeChirality) {
451 PRECONDITION(!atomInvariants || atomInvariants->size() >= mol.getNumAtoms(),
452 "bad atomInvariants size");
453 auto *res = new SparseIntVect<boost::int64_t>(nBits);
454 TorsionFpCalc(res, mol, nBits, targetSize, fromAtoms, ignoreAtoms,
455 atomInvariants, includeChirality);
456 return res;
457 }
458
getHashedTopologicalTorsionFingerprintAsBitVect(const ROMol & mol,unsigned int nBits,unsigned int targetSize,const std::vector<std::uint32_t> * fromAtoms,const std::vector<std::uint32_t> * ignoreAtoms,const std::vector<std::uint32_t> * atomInvariants,unsigned int nBitsPerEntry,bool includeChirality)459 ExplicitBitVect *getHashedTopologicalTorsionFingerprintAsBitVect(
460 const ROMol &mol, unsigned int nBits, unsigned int targetSize,
461 const std::vector<std::uint32_t> *fromAtoms,
462 const std::vector<std::uint32_t> *ignoreAtoms,
463 const std::vector<std::uint32_t> *atomInvariants,
464 unsigned int nBitsPerEntry, bool includeChirality) {
465 PRECONDITION(!atomInvariants || atomInvariants->size() >= mol.getNumAtoms(),
466 "bad atomInvariants size");
467 static int bounds[4] = {1, 2, 4, 8};
468 unsigned int blockLength = nBits / nBitsPerEntry;
469 auto *sres = new SparseIntVect<boost::int64_t>(blockLength);
470 TorsionFpCalc(sres, mol, blockLength, targetSize, fromAtoms, ignoreAtoms,
471 atomInvariants, includeChirality);
472 auto *res = new ExplicitBitVect(nBits);
473
474 if (nBitsPerEntry != 4) {
475 for (auto val : sres->getNonzeroElements()) {
476 for (unsigned int i = 0; i < nBitsPerEntry; ++i) {
477 if (val.second > static_cast<int>(i)) {
478 res->setBit(val.first * nBitsPerEntry + i);
479 }
480 }
481 }
482 } else {
483 for (auto val : sres->getNonzeroElements()) {
484 for (unsigned int i = 0; i < nBitsPerEntry; ++i) {
485 if (val.second >= bounds[i]) {
486 res->setBit(val.first * nBitsPerEntry + i);
487 }
488 }
489 }
490 }
491 delete sres;
492 return res;
493 }
494 } // end of namespace AtomPairs
495 } // end of namespace RDKit
496