1 // $Id$
2 //
3 // Copyright (C) 2007-2008 Greg Landrum
4 //
5 // @@ All Rights Reserved @@
6 // This file is part of the RDKit.
7 // The contents are covered by the terms of the BSD license
8 // which is included in the file license.txt, found at the root
9 // of the RDKit source tree.
10 //
11 #include <RDGeneral/export.h>
12 #ifndef __RD_SPARSE_INT_VECT_20070921__
13 #define __RD_SPARSE_INT_VECT_20070921__
14
15 #include <map>
16 #include <string>
17 #include <RDGeneral/Invariant.h>
18 #include <sstream>
19 #include <RDGeneral/Exceptions.h>
20 #include <RDGeneral/StreamOps.h>
21 #include <cstdint>
22
23 const int ci_SPARSEINTVECT_VERSION =
24 0x0001; //!< version number to use in pickles
25 namespace RDKit {
26 //! a class for efficiently storing sparse vectors of ints
27 template <typename IndexType>
28 class SparseIntVect {
29 public:
30 typedef std::map<IndexType, int> StorageType;
31
SparseIntVect()32 SparseIntVect() : d_length(0){};
33
34 //! initialize with a particular length
SparseIntVect(IndexType length)35 SparseIntVect(IndexType length) : d_length(length){};
36
37 //! Copy constructor
SparseIntVect(const SparseIntVect<IndexType> & other)38 SparseIntVect(const SparseIntVect<IndexType> &other) {
39 d_length = other.d_length;
40 d_data.clear();
41 d_data.insert(other.d_data.begin(), other.d_data.end());
42 }
43
44 //! constructor from a pickle
SparseIntVect(const std::string & pkl)45 SparseIntVect(const std::string &pkl) {
46 initFromText(pkl.c_str(), pkl.size());
47 };
48 //! constructor from a pickle
SparseIntVect(const char * pkl,const unsigned int len)49 SparseIntVect(const char *pkl, const unsigned int len) {
50 initFromText(pkl, len);
51 };
52
53 SparseIntVect &operator=(const SparseIntVect<IndexType> &other) {
54 if (this == &other) {
55 return *this;
56 }
57 d_length = other.d_length;
58 d_data.clear();
59 d_data.insert(other.d_data.begin(), other.d_data.end());
60 return *this;
61 }
62
63 //! destructor (doesn't need to do anything)
~SparseIntVect()64 ~SparseIntVect() {}
65
66 #ifdef __clang__
67 #pragma clang diagnostic push
68 #pragma clang diagnostic ignored "-Wtautological-compare"
69 #elif (defined(__GNUC__) || defined(__GNUG__)) && \
70 (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 1))
71 #if (__GNUC__ > 4 || __GNUC_MINOR__ > 5)
72 #pragma GCC diagnostic push
73 #endif
74 #pragma GCC diagnostic ignored "-Wtype-limits"
75 #endif
76 //! return the value at an index
getVal(IndexType idx)77 int getVal(IndexType idx) const {
78 if (idx < 0 || idx >= d_length) {
79 throw IndexErrorException(static_cast<int>(idx));
80 }
81 int res = 0;
82 typename StorageType::const_iterator iter = d_data.find(idx);
83 if (iter != d_data.end()) {
84 res = iter->second;
85 }
86 return res;
87 };
88
89 //! set the value at an index
setVal(IndexType idx,int val)90 void setVal(IndexType idx, int val) {
91 if (idx < 0 || idx >= d_length) {
92 throw IndexErrorException(static_cast<int>(idx));
93 }
94 if (val != 0) {
95 d_data[idx] = val;
96 } else {
97 d_data.erase(idx);
98 }
99 };
100 #ifdef __clang__
101 #pragma clang diagnostic pop
102 #elif (defined(__GNUC__) || defined(__GNUG__)) && \
103 (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 5))
104 #pragma GCC diagnostic pop
105 #endif
106 //! support indexing using []
107 int operator[](IndexType idx) const { return getVal(idx); };
108
109 //! returns the length
getLength()110 IndexType getLength() const { return d_length; };
111
112 //! returns the sum of all the elements in the vect
113 //! the doAbs argument toggles summing the absolute values of the elements
114 int getTotalVal(bool doAbs = false) const {
115 int res = 0;
116 typename StorageType::const_iterator iter;
117 for (iter = d_data.begin(); iter != d_data.end(); ++iter) {
118 if (!doAbs)
119 res += iter->second;
120 else
121 res += abs(iter->second);
122 }
123 return res;
124 };
125 //! returns the length
size()126 unsigned int size() const { return getLength(); };
127
128 //! returns our nonzero elements as a map(IndexType->int)
getNonzeroElements()129 const StorageType &getNonzeroElements() const { return d_data; }
130
131 //! this is a "fuzzy" intesection, the final value
132 //! of each element is equal to the minimum from
133 //! the two vects.
134 SparseIntVect<IndexType> &operator&=(const SparseIntVect<IndexType> &other) {
135 if (other.d_length != d_length) {
136 throw ValueErrorException("SparseIntVect size mismatch");
137 }
138
139 typename StorageType::iterator iter = d_data.begin();
140 typename StorageType::const_iterator oIter = other.d_data.begin();
141 while (iter != d_data.end()) {
142 // we're relying on the fact that the maps are sorted:
143 while (oIter != other.d_data.end() && oIter->first < iter->first) {
144 ++oIter;
145 }
146 if (oIter != other.d_data.end() && oIter->first == iter->first) {
147 // found it:
148 if (oIter->second < iter->second) {
149 iter->second = oIter->second;
150 }
151 ++oIter;
152 ++iter;
153 } else {
154 // not there; our value is zero, which means
155 // we should remove this value:
156 typename StorageType::iterator tmpIter = iter;
157 ++tmpIter;
158 d_data.erase(iter);
159 iter = tmpIter;
160 }
161 }
162 return *this;
163 };
164 const SparseIntVect<IndexType> operator&(
165 const SparseIntVect<IndexType> &other) const {
166 SparseIntVect<IndexType> res(*this);
167 return res &= other;
168 }
169
170 //! this is a "fuzzy" union, the final value
171 //! of each element is equal to the maximum from
172 //! the two vects.
173 SparseIntVect<IndexType> &operator|=(const SparseIntVect<IndexType> &other) {
174 if (other.d_length != d_length) {
175 throw ValueErrorException("SparseIntVect size mismatch");
176 }
177
178 typename StorageType::iterator iter = d_data.begin();
179 typename StorageType::const_iterator oIter = other.d_data.begin();
180 while (iter != d_data.end()) {
181 // we're relying on the fact that the maps are sorted:
182 while (oIter != other.d_data.end() && oIter->first < iter->first) {
183 d_data[oIter->first] = oIter->second;
184 ++oIter;
185 }
186 if (oIter != other.d_data.end() && oIter->first == iter->first) {
187 // found it:
188 if (oIter->second > iter->second) {
189 iter->second = oIter->second;
190 }
191 ++oIter;
192 }
193 ++iter;
194 }
195 // finish up the other vect:
196 while (oIter != other.d_data.end()) {
197 d_data[oIter->first] = oIter->second;
198 ++oIter;
199 }
200 return *this;
201 };
202 const SparseIntVect<IndexType> operator|(
203 const SparseIntVect<IndexType> &other) const {
204 SparseIntVect<IndexType> res(*this);
205 return res |= other;
206 }
207
208 SparseIntVect<IndexType> &operator+=(const SparseIntVect<IndexType> &other) {
209 if (other.d_length != d_length) {
210 throw ValueErrorException("SparseIntVect size mismatch");
211 }
212 typename StorageType::iterator iter = d_data.begin();
213 typename StorageType::const_iterator oIter = other.d_data.begin();
214 while (oIter != other.d_data.end()) {
215 while (iter != d_data.end() && iter->first < oIter->first) {
216 ++iter;
217 }
218 if (iter != d_data.end() && oIter->first == iter->first) {
219 // found it:
220 iter->second += oIter->second;
221 if (!iter->second) {
222 typename StorageType::iterator tIter = iter;
223 ++tIter;
224 d_data.erase(iter);
225 iter = tIter;
226 } else {
227 ++iter;
228 }
229 } else {
230 d_data[oIter->first] = oIter->second;
231 }
232 ++oIter;
233 }
234 return *this;
235 };
236 const SparseIntVect<IndexType> operator+(
237 const SparseIntVect<IndexType> &other) const {
238 SparseIntVect<IndexType> res(*this);
239 return res += other;
240 }
241
242 SparseIntVect<IndexType> &operator-=(const SparseIntVect<IndexType> &other) {
243 if (other.d_length != d_length) {
244 throw ValueErrorException("SparseIntVect size mismatch");
245 }
246 typename StorageType::iterator iter = d_data.begin();
247 typename StorageType::const_iterator oIter = other.d_data.begin();
248 while (oIter != other.d_data.end()) {
249 while (iter != d_data.end() && iter->first < oIter->first) {
250 ++iter;
251 }
252 if (iter != d_data.end() && oIter->first == iter->first) {
253 // found it:
254 iter->second -= oIter->second;
255 if (!iter->second) {
256 typename StorageType::iterator tIter = iter;
257 ++tIter;
258 d_data.erase(iter);
259 iter = tIter;
260 } else {
261 ++iter;
262 }
263 } else {
264 d_data[oIter->first] = -oIter->second;
265 }
266 ++oIter;
267 }
268 return *this;
269 };
270 const SparseIntVect<IndexType> operator-(
271 const SparseIntVect<IndexType> &other) const {
272 SparseIntVect<IndexType> res(*this);
273 return res -= other;
274 }
275 SparseIntVect<IndexType> &operator*=(int v) {
276 typename StorageType::iterator iter = d_data.begin();
277 while (iter != d_data.end()) {
278 iter->second *= v;
279 ++iter;
280 }
281 return *this;
282 };
283 SparseIntVect<IndexType> &operator*(int v) {
284 SparseIntVect<IndexType> res(*this);
285 return res *= v;
286 };
287 SparseIntVect<IndexType> &operator/=(int v) {
288 typename StorageType::iterator iter = d_data.begin();
289 while (iter != d_data.end()) {
290 iter->second /= v;
291 ++iter;
292 }
293 return *this;
294 };
295 SparseIntVect<IndexType> &operator/(int v) {
296 SparseIntVect<IndexType> res(*this);
297 return res /= v;
298 };
299 SparseIntVect<IndexType> &operator+=(int v) {
300 typename StorageType::iterator iter = d_data.begin();
301 while (iter != d_data.end()) {
302 iter->second += v;
303 ++iter;
304 }
305 return *this;
306 };
307 SparseIntVect<IndexType> &operator+(int v) {
308 SparseIntVect<IndexType> res(*this);
309 return res += v;
310 };
311 SparseIntVect<IndexType> &operator-=(int v) {
312 typename StorageType::iterator iter = d_data.begin();
313 while (iter != d_data.end()) {
314 iter->second -= v;
315 ++iter;
316 }
317 return *this;
318 };
319 SparseIntVect<IndexType> &operator-(int v) {
320 SparseIntVect<IndexType> res(*this);
321 return res -= v;
322 };
323
324 bool operator==(const SparseIntVect<IndexType> &v2) const {
325 if (d_length != v2.d_length) {
326 return false;
327 }
328 return d_data == v2.d_data;
329 }
330 bool operator!=(const SparseIntVect<IndexType> &v2) const {
331 return !(*this == v2);
332 }
333
334 //! returns a binary string representation (pickle)
toString()335 std::string toString() const {
336 std::stringstream ss(std::ios_base::binary | std::ios_base::out |
337 std::ios_base::in);
338 std::uint32_t tInt;
339 tInt = ci_SPARSEINTVECT_VERSION;
340 streamWrite(ss, tInt);
341 tInt = sizeof(IndexType);
342 streamWrite(ss, tInt);
343 streamWrite(ss, d_length);
344 IndexType nEntries = d_data.size();
345 streamWrite(ss, nEntries);
346
347 typename StorageType::const_iterator iter = d_data.begin();
348 while (iter != d_data.end()) {
349 streamWrite(ss, iter->first);
350 std::int32_t tInt = iter->second;
351 streamWrite(ss, tInt);
352 ++iter;
353 }
354 return ss.str();
355 };
356
fromString(const std::string & txt)357 void fromString(const std::string &txt) {
358 initFromText(txt.c_str(), txt.length());
359 }
360
361 private:
362 IndexType d_length;
363 StorageType d_data;
364
initFromText(const char * pkl,const unsigned int len)365 void initFromText(const char *pkl, const unsigned int len) {
366 d_data.clear();
367 std::stringstream ss(std::ios_base::binary | std::ios_base::out |
368 std::ios_base::in);
369 ss.write(pkl, len);
370
371 std::uint32_t vers;
372 streamRead(ss, vers);
373 if (vers == 0x0001) {
374 std::uint32_t tInt;
375 streamRead(ss, tInt);
376 if (tInt > sizeof(IndexType)) {
377 throw ValueErrorException(
378 "IndexType cannot accommodate index size in SparseIntVect pickle");
379 }
380 switch (tInt) {
381 case sizeof(char):
382 readVals<unsigned char>(ss);
383 break;
384 case sizeof(std::int32_t):
385 readVals<std::uint32_t>(ss);
386 break;
387 case sizeof(boost::int64_t):
388 readVals<boost::uint64_t>(ss);
389 break;
390 default:
391 throw ValueErrorException("unreadable format");
392 }
393 } else {
394 throw ValueErrorException("bad version in SparseIntVect pickle");
395 }
396 };
397 template <typename T>
readVals(std::stringstream & ss)398 void readVals(std::stringstream &ss) {
399 PRECONDITION(sizeof(T) <= sizeof(IndexType), "invalid size");
400 T tVal;
401 streamRead(ss, tVal);
402 d_length = tVal;
403 T nEntries;
404 streamRead(ss, nEntries);
405 for (T i = 0; i < nEntries; ++i) {
406 streamRead(ss, tVal);
407 std::int32_t val;
408 streamRead(ss, val);
409 d_data[tVal] = val;
410 }
411 }
412 };
413
414 template <typename IndexType, typename SequenceType>
updateFromSequence(SparseIntVect<IndexType> & vect,const SequenceType & seq)415 void updateFromSequence(SparseIntVect<IndexType> &vect,
416 const SequenceType &seq) {
417 typename SequenceType::const_iterator seqIt;
418 for (seqIt = seq.begin(); seqIt != seq.end(); ++seqIt) {
419 // EFF: probably not the most efficient approach
420 IndexType idx = *seqIt;
421 vect.setVal(idx, vect.getVal(idx) + 1);
422 }
423 }
424
425 namespace {
426 template <typename IndexType>
calcVectParams(const SparseIntVect<IndexType> & v1,const SparseIntVect<IndexType> & v2,double & v1Sum,double & v2Sum,double & andSum)427 void calcVectParams(const SparseIntVect<IndexType> &v1,
428 const SparseIntVect<IndexType> &v2, double &v1Sum,
429 double &v2Sum, double &andSum) {
430 if (v1.getLength() != v2.getLength()) {
431 throw ValueErrorException("SparseIntVect size mismatch");
432 }
433 v1Sum = v2Sum = andSum = 0.0;
434 // we're doing : (v1&v2).getTotalVal(), but w/o generating
435 // the other vector:
436 typename SparseIntVect<IndexType>::StorageType::const_iterator iter1, iter2;
437 iter1 = v1.getNonzeroElements().begin();
438 if (iter1 != v1.getNonzeroElements().end()) v1Sum += abs(iter1->second);
439 iter2 = v2.getNonzeroElements().begin();
440 if (iter2 != v2.getNonzeroElements().end()) v2Sum += abs(iter2->second);
441 while (iter1 != v1.getNonzeroElements().end()) {
442 while (iter2 != v2.getNonzeroElements().end() &&
443 iter2->first < iter1->first) {
444 ++iter2;
445 if (iter2 != v2.getNonzeroElements().end()) v2Sum += abs(iter2->second);
446 }
447 if (iter2 != v2.getNonzeroElements().end()) {
448 if (iter2->first == iter1->first) {
449 if (abs(iter2->second) < abs(iter1->second)) {
450 andSum += abs(iter2->second);
451 } else {
452 andSum += abs(iter1->second);
453 }
454 ++iter2;
455 if (iter2 != v2.getNonzeroElements().end()) v2Sum += abs(iter2->second);
456 }
457 ++iter1;
458 if (iter1 != v1.getNonzeroElements().end()) v1Sum += abs(iter1->second);
459 } else {
460 break;
461 }
462 }
463 if (iter1 != v1.getNonzeroElements().end()) {
464 ++iter1;
465 while (iter1 != v1.getNonzeroElements().end()) {
466 v1Sum += abs(iter1->second);
467 ++iter1;
468 }
469 }
470 if (iter2 != v2.getNonzeroElements().end()) {
471 ++iter2;
472 while (iter2 != v2.getNonzeroElements().end()) {
473 v2Sum += abs(iter2->second);
474 ++iter2;
475 }
476 }
477 }
478 } // namespace
479
480 template <typename IndexType>
481 double DiceSimilarity(const SparseIntVect<IndexType> &v1,
482 const SparseIntVect<IndexType> &v2,
483 bool returnDistance = false, double bounds = 0.0) {
484 if (v1.getLength() != v2.getLength()) {
485 throw ValueErrorException("SparseIntVect size mismatch");
486 }
487 double v1Sum = 0.0;
488 double v2Sum = 0.0;
489 if (!returnDistance && bounds > 0.0) {
490 v1Sum = v1.getTotalVal(true);
491 v2Sum = v2.getTotalVal(true);
492 double denom = v1Sum + v2Sum;
493 if (fabs(denom) < 1e-6) {
494 // no need to worry about returnDistance here
495 return 0.0;
496 }
497 double minV = v1Sum < v2Sum ? v1Sum : v2Sum;
498 if (2. * minV / denom < bounds) {
499 return 0.0;
500 }
501 v1Sum = 0.0;
502 v2Sum = 0.0;
503 }
504
505 double numer = 0.0;
506
507 calcVectParams(v1, v2, v1Sum, v2Sum, numer);
508
509 double denom = v1Sum + v2Sum;
510 double sim;
511 if (fabs(denom) < 1e-6) {
512 sim = 0.0;
513 } else {
514 sim = 2. * numer / denom;
515 }
516 if (returnDistance) sim = 1. - sim;
517 // std::cerr<<" "<<v1Sum<<" "<<v2Sum<<" " << numer << " " << sim <<std::endl;
518 return sim;
519 }
520
521 template <typename IndexType>
522 double TverskySimilarity(const SparseIntVect<IndexType> &v1,
523 const SparseIntVect<IndexType> &v2, double a, double b,
524 bool returnDistance = false, double bounds = 0.0) {
525 RDUNUSED_PARAM(bounds);
526 if (v1.getLength() != v2.getLength()) {
527 throw ValueErrorException("SparseIntVect size mismatch");
528 }
529 double v1Sum = 0.0;
530 double v2Sum = 0.0;
531 double andSum = 0.0;
532
533 calcVectParams(v1, v2, v1Sum, v2Sum, andSum);
534
535 double denom = a * v1Sum + b * v2Sum + (1 - a - b) * andSum;
536 double sim;
537
538 if (fabs(denom) < 1e-6) {
539 sim = 0.0;
540 } else {
541 sim = andSum / denom;
542 }
543 if (returnDistance) sim = 1. - sim;
544 // std::cerr<<" "<<v1Sum<<" "<<v2Sum<<" " << numer << " " << sim <<std::endl;
545 return sim;
546 }
547
548 template <typename IndexType>
549 double TanimotoSimilarity(const SparseIntVect<IndexType> &v1,
550 const SparseIntVect<IndexType> &v2,
551 bool returnDistance = false, double bounds = 0.0) {
552 return TverskySimilarity(v1, v2, 1.0, 1.0, returnDistance, bounds);
553 }
554 } // namespace RDKit
555
556 #endif
557