1 // $Id$
2 //
3 //  Copyright (C) 2007-2008 Greg Landrum
4 //
5 //  @@ All Rights Reserved @@
6 //  This file is part of the RDKit.
7 //  The contents are covered by the terms of the BSD license
8 //  which is included in the file license.txt, found at the root
9 //  of the RDKit source tree.
10 //
11 #include <RDGeneral/export.h>
12 #ifndef __RD_SPARSE_INT_VECT_20070921__
13 #define __RD_SPARSE_INT_VECT_20070921__
14 
15 #include <map>
16 #include <string>
17 #include <RDGeneral/Invariant.h>
18 #include <sstream>
19 #include <RDGeneral/Exceptions.h>
20 #include <RDGeneral/StreamOps.h>
21 #include <cstdint>
22 
23 const int ci_SPARSEINTVECT_VERSION =
24     0x0001;  //!< version number to use in pickles
25 namespace RDKit {
26 //! a class for efficiently storing sparse vectors of ints
27 template <typename IndexType>
28 class SparseIntVect {
29  public:
30   typedef std::map<IndexType, int> StorageType;
31 
SparseIntVect()32   SparseIntVect() : d_length(0){};
33 
34   //! initialize with a particular length
SparseIntVect(IndexType length)35   SparseIntVect(IndexType length) : d_length(length){};
36 
37   //! Copy constructor
SparseIntVect(const SparseIntVect<IndexType> & other)38   SparseIntVect(const SparseIntVect<IndexType> &other) {
39     d_length = other.d_length;
40     d_data.clear();
41     d_data.insert(other.d_data.begin(), other.d_data.end());
42   }
43 
44   //! constructor from a pickle
SparseIntVect(const std::string & pkl)45   SparseIntVect(const std::string &pkl) {
46     initFromText(pkl.c_str(), pkl.size());
47   };
48   //! constructor from a pickle
SparseIntVect(const char * pkl,const unsigned int len)49   SparseIntVect(const char *pkl, const unsigned int len) {
50     initFromText(pkl, len);
51   };
52 
53   SparseIntVect &operator=(const SparseIntVect<IndexType> &other) {
54     if (this == &other) {
55       return *this;
56     }
57     d_length = other.d_length;
58     d_data.clear();
59     d_data.insert(other.d_data.begin(), other.d_data.end());
60     return *this;
61   }
62 
63   //! destructor (doesn't need to do anything)
~SparseIntVect()64   ~SparseIntVect() {}
65 
66 #ifdef __clang__
67 #pragma clang diagnostic push
68 #pragma clang diagnostic ignored "-Wtautological-compare"
69 #elif (defined(__GNUC__) || defined(__GNUG__)) && \
70     (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 1))
71 #if (__GNUC__ > 4 || __GNUC_MINOR__ > 5)
72 #pragma GCC diagnostic push
73 #endif
74 #pragma GCC diagnostic ignored "-Wtype-limits"
75 #endif
76   //! return the value at an index
getVal(IndexType idx)77   int getVal(IndexType idx) const {
78     if (idx < 0 || idx >= d_length) {
79       throw IndexErrorException(static_cast<int>(idx));
80     }
81     int res = 0;
82     typename StorageType::const_iterator iter = d_data.find(idx);
83     if (iter != d_data.end()) {
84       res = iter->second;
85     }
86     return res;
87   };
88 
89   //! set the value at an index
setVal(IndexType idx,int val)90   void setVal(IndexType idx, int val) {
91     if (idx < 0 || idx >= d_length) {
92       throw IndexErrorException(static_cast<int>(idx));
93     }
94     if (val != 0) {
95       d_data[idx] = val;
96     } else {
97       d_data.erase(idx);
98     }
99   };
100 #ifdef __clang__
101 #pragma clang diagnostic pop
102 #elif (defined(__GNUC__) || defined(__GNUG__)) && \
103     (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 5))
104 #pragma GCC diagnostic pop
105 #endif
106   //! support indexing using []
107   int operator[](IndexType idx) const { return getVal(idx); };
108 
109   //! returns the length
getLength()110   IndexType getLength() const { return d_length; };
111 
112   //! returns the sum of all the elements in the vect
113   //! the doAbs argument toggles summing the absolute values of the elements
114   int getTotalVal(bool doAbs = false) const {
115     int res = 0;
116     typename StorageType::const_iterator iter;
117     for (iter = d_data.begin(); iter != d_data.end(); ++iter) {
118       if (!doAbs)
119         res += iter->second;
120       else
121         res += abs(iter->second);
122     }
123     return res;
124   };
125   //! returns the length
size()126   unsigned int size() const { return getLength(); };
127 
128   //! returns our nonzero elements as a map(IndexType->int)
getNonzeroElements()129   const StorageType &getNonzeroElements() const { return d_data; }
130 
131   //! this is a "fuzzy" intesection, the final value
132   //! of each element is equal to the minimum from
133   //! the two vects.
134   SparseIntVect<IndexType> &operator&=(const SparseIntVect<IndexType> &other) {
135     if (other.d_length != d_length) {
136       throw ValueErrorException("SparseIntVect size mismatch");
137     }
138 
139     typename StorageType::iterator iter = d_data.begin();
140     typename StorageType::const_iterator oIter = other.d_data.begin();
141     while (iter != d_data.end()) {
142       // we're relying on the fact that the maps are sorted:
143       while (oIter != other.d_data.end() && oIter->first < iter->first) {
144         ++oIter;
145       }
146       if (oIter != other.d_data.end() && oIter->first == iter->first) {
147         // found it:
148         if (oIter->second < iter->second) {
149           iter->second = oIter->second;
150         }
151         ++oIter;
152         ++iter;
153       } else {
154         // not there; our value is zero, which means
155         // we should remove this value:
156         typename StorageType::iterator tmpIter = iter;
157         ++tmpIter;
158         d_data.erase(iter);
159         iter = tmpIter;
160       }
161     }
162     return *this;
163   };
164   const SparseIntVect<IndexType> operator&(
165       const SparseIntVect<IndexType> &other) const {
166     SparseIntVect<IndexType> res(*this);
167     return res &= other;
168   }
169 
170   //! this is a "fuzzy" union, the final value
171   //! of each element is equal to the maximum from
172   //! the two vects.
173   SparseIntVect<IndexType> &operator|=(const SparseIntVect<IndexType> &other) {
174     if (other.d_length != d_length) {
175       throw ValueErrorException("SparseIntVect size mismatch");
176     }
177 
178     typename StorageType::iterator iter = d_data.begin();
179     typename StorageType::const_iterator oIter = other.d_data.begin();
180     while (iter != d_data.end()) {
181       // we're relying on the fact that the maps are sorted:
182       while (oIter != other.d_data.end() && oIter->first < iter->first) {
183         d_data[oIter->first] = oIter->second;
184         ++oIter;
185       }
186       if (oIter != other.d_data.end() && oIter->first == iter->first) {
187         // found it:
188         if (oIter->second > iter->second) {
189           iter->second = oIter->second;
190         }
191         ++oIter;
192       }
193       ++iter;
194     }
195     // finish up the other vect:
196     while (oIter != other.d_data.end()) {
197       d_data[oIter->first] = oIter->second;
198       ++oIter;
199     }
200     return *this;
201   };
202   const SparseIntVect<IndexType> operator|(
203       const SparseIntVect<IndexType> &other) const {
204     SparseIntVect<IndexType> res(*this);
205     return res |= other;
206   }
207 
208   SparseIntVect<IndexType> &operator+=(const SparseIntVect<IndexType> &other) {
209     if (other.d_length != d_length) {
210       throw ValueErrorException("SparseIntVect size mismatch");
211     }
212     typename StorageType::iterator iter = d_data.begin();
213     typename StorageType::const_iterator oIter = other.d_data.begin();
214     while (oIter != other.d_data.end()) {
215       while (iter != d_data.end() && iter->first < oIter->first) {
216         ++iter;
217       }
218       if (iter != d_data.end() && oIter->first == iter->first) {
219         // found it:
220         iter->second += oIter->second;
221         if (!iter->second) {
222           typename StorageType::iterator tIter = iter;
223           ++tIter;
224           d_data.erase(iter);
225           iter = tIter;
226         } else {
227           ++iter;
228         }
229       } else {
230         d_data[oIter->first] = oIter->second;
231       }
232       ++oIter;
233     }
234     return *this;
235   };
236   const SparseIntVect<IndexType> operator+(
237       const SparseIntVect<IndexType> &other) const {
238     SparseIntVect<IndexType> res(*this);
239     return res += other;
240   }
241 
242   SparseIntVect<IndexType> &operator-=(const SparseIntVect<IndexType> &other) {
243     if (other.d_length != d_length) {
244       throw ValueErrorException("SparseIntVect size mismatch");
245     }
246     typename StorageType::iterator iter = d_data.begin();
247     typename StorageType::const_iterator oIter = other.d_data.begin();
248     while (oIter != other.d_data.end()) {
249       while (iter != d_data.end() && iter->first < oIter->first) {
250         ++iter;
251       }
252       if (iter != d_data.end() && oIter->first == iter->first) {
253         // found it:
254         iter->second -= oIter->second;
255         if (!iter->second) {
256           typename StorageType::iterator tIter = iter;
257           ++tIter;
258           d_data.erase(iter);
259           iter = tIter;
260         } else {
261           ++iter;
262         }
263       } else {
264         d_data[oIter->first] = -oIter->second;
265       }
266       ++oIter;
267     }
268     return *this;
269   };
270   const SparseIntVect<IndexType> operator-(
271       const SparseIntVect<IndexType> &other) const {
272     SparseIntVect<IndexType> res(*this);
273     return res -= other;
274   }
275   SparseIntVect<IndexType> &operator*=(int v) {
276     typename StorageType::iterator iter = d_data.begin();
277     while (iter != d_data.end()) {
278       iter->second *= v;
279       ++iter;
280     }
281     return *this;
282   };
283   SparseIntVect<IndexType> &operator*(int v) {
284     SparseIntVect<IndexType> res(*this);
285     return res *= v;
286   };
287   SparseIntVect<IndexType> &operator/=(int v) {
288     typename StorageType::iterator iter = d_data.begin();
289     while (iter != d_data.end()) {
290       iter->second /= v;
291       ++iter;
292     }
293     return *this;
294   };
295   SparseIntVect<IndexType> &operator/(int v) {
296     SparseIntVect<IndexType> res(*this);
297     return res /= v;
298   };
299   SparseIntVect<IndexType> &operator+=(int v) {
300     typename StorageType::iterator iter = d_data.begin();
301     while (iter != d_data.end()) {
302       iter->second += v;
303       ++iter;
304     }
305     return *this;
306   };
307   SparseIntVect<IndexType> &operator+(int v) {
308     SparseIntVect<IndexType> res(*this);
309     return res += v;
310   };
311   SparseIntVect<IndexType> &operator-=(int v) {
312     typename StorageType::iterator iter = d_data.begin();
313     while (iter != d_data.end()) {
314       iter->second -= v;
315       ++iter;
316     }
317     return *this;
318   };
319   SparseIntVect<IndexType> &operator-(int v) {
320     SparseIntVect<IndexType> res(*this);
321     return res -= v;
322   };
323 
324   bool operator==(const SparseIntVect<IndexType> &v2) const {
325     if (d_length != v2.d_length) {
326       return false;
327     }
328     return d_data == v2.d_data;
329   }
330   bool operator!=(const SparseIntVect<IndexType> &v2) const {
331     return !(*this == v2);
332   }
333 
334   //! returns a binary string representation (pickle)
toString()335   std::string toString() const {
336     std::stringstream ss(std::ios_base::binary | std::ios_base::out |
337                          std::ios_base::in);
338     std::uint32_t tInt;
339     tInt = ci_SPARSEINTVECT_VERSION;
340     streamWrite(ss, tInt);
341     tInt = sizeof(IndexType);
342     streamWrite(ss, tInt);
343     streamWrite(ss, d_length);
344     IndexType nEntries = d_data.size();
345     streamWrite(ss, nEntries);
346 
347     typename StorageType::const_iterator iter = d_data.begin();
348     while (iter != d_data.end()) {
349       streamWrite(ss, iter->first);
350       std::int32_t tInt = iter->second;
351       streamWrite(ss, tInt);
352       ++iter;
353     }
354     return ss.str();
355   };
356 
fromString(const std::string & txt)357   void fromString(const std::string &txt) {
358     initFromText(txt.c_str(), txt.length());
359   }
360 
361  private:
362   IndexType d_length;
363   StorageType d_data;
364 
initFromText(const char * pkl,const unsigned int len)365   void initFromText(const char *pkl, const unsigned int len) {
366     d_data.clear();
367     std::stringstream ss(std::ios_base::binary | std::ios_base::out |
368                          std::ios_base::in);
369     ss.write(pkl, len);
370 
371     std::uint32_t vers;
372     streamRead(ss, vers);
373     if (vers == 0x0001) {
374       std::uint32_t tInt;
375       streamRead(ss, tInt);
376       if (tInt > sizeof(IndexType)) {
377         throw ValueErrorException(
378             "IndexType cannot accommodate index size in SparseIntVect pickle");
379       }
380       switch (tInt) {
381         case sizeof(char):
382           readVals<unsigned char>(ss);
383           break;
384         case sizeof(std::int32_t):
385           readVals<std::uint32_t>(ss);
386           break;
387         case sizeof(boost::int64_t):
388           readVals<boost::uint64_t>(ss);
389           break;
390         default:
391           throw ValueErrorException("unreadable format");
392       }
393     } else {
394       throw ValueErrorException("bad version in SparseIntVect pickle");
395     }
396   };
397   template <typename T>
readVals(std::stringstream & ss)398   void readVals(std::stringstream &ss) {
399     PRECONDITION(sizeof(T) <= sizeof(IndexType), "invalid size");
400     T tVal;
401     streamRead(ss, tVal);
402     d_length = tVal;
403     T nEntries;
404     streamRead(ss, nEntries);
405     for (T i = 0; i < nEntries; ++i) {
406       streamRead(ss, tVal);
407       std::int32_t val;
408       streamRead(ss, val);
409       d_data[tVal] = val;
410     }
411   }
412 };
413 
414 template <typename IndexType, typename SequenceType>
updateFromSequence(SparseIntVect<IndexType> & vect,const SequenceType & seq)415 void updateFromSequence(SparseIntVect<IndexType> &vect,
416                         const SequenceType &seq) {
417   typename SequenceType::const_iterator seqIt;
418   for (seqIt = seq.begin(); seqIt != seq.end(); ++seqIt) {
419     // EFF: probably not the most efficient approach
420     IndexType idx = *seqIt;
421     vect.setVal(idx, vect.getVal(idx) + 1);
422   }
423 }
424 
425 namespace {
426 template <typename IndexType>
calcVectParams(const SparseIntVect<IndexType> & v1,const SparseIntVect<IndexType> & v2,double & v1Sum,double & v2Sum,double & andSum)427 void calcVectParams(const SparseIntVect<IndexType> &v1,
428                     const SparseIntVect<IndexType> &v2, double &v1Sum,
429                     double &v2Sum, double &andSum) {
430   if (v1.getLength() != v2.getLength()) {
431     throw ValueErrorException("SparseIntVect size mismatch");
432   }
433   v1Sum = v2Sum = andSum = 0.0;
434   // we're doing : (v1&v2).getTotalVal(), but w/o generating
435   // the other vector:
436   typename SparseIntVect<IndexType>::StorageType::const_iterator iter1, iter2;
437   iter1 = v1.getNonzeroElements().begin();
438   if (iter1 != v1.getNonzeroElements().end()) v1Sum += abs(iter1->second);
439   iter2 = v2.getNonzeroElements().begin();
440   if (iter2 != v2.getNonzeroElements().end()) v2Sum += abs(iter2->second);
441   while (iter1 != v1.getNonzeroElements().end()) {
442     while (iter2 != v2.getNonzeroElements().end() &&
443            iter2->first < iter1->first) {
444       ++iter2;
445       if (iter2 != v2.getNonzeroElements().end()) v2Sum += abs(iter2->second);
446     }
447     if (iter2 != v2.getNonzeroElements().end()) {
448       if (iter2->first == iter1->first) {
449         if (abs(iter2->second) < abs(iter1->second)) {
450           andSum += abs(iter2->second);
451         } else {
452           andSum += abs(iter1->second);
453         }
454         ++iter2;
455         if (iter2 != v2.getNonzeroElements().end()) v2Sum += abs(iter2->second);
456       }
457       ++iter1;
458       if (iter1 != v1.getNonzeroElements().end()) v1Sum += abs(iter1->second);
459     } else {
460       break;
461     }
462   }
463   if (iter1 != v1.getNonzeroElements().end()) {
464     ++iter1;
465     while (iter1 != v1.getNonzeroElements().end()) {
466       v1Sum += abs(iter1->second);
467       ++iter1;
468     }
469   }
470   if (iter2 != v2.getNonzeroElements().end()) {
471     ++iter2;
472     while (iter2 != v2.getNonzeroElements().end()) {
473       v2Sum += abs(iter2->second);
474       ++iter2;
475     }
476   }
477 }
478 }  // namespace
479 
480 template <typename IndexType>
481 double DiceSimilarity(const SparseIntVect<IndexType> &v1,
482                       const SparseIntVect<IndexType> &v2,
483                       bool returnDistance = false, double bounds = 0.0) {
484   if (v1.getLength() != v2.getLength()) {
485     throw ValueErrorException("SparseIntVect size mismatch");
486   }
487   double v1Sum = 0.0;
488   double v2Sum = 0.0;
489   if (!returnDistance && bounds > 0.0) {
490     v1Sum = v1.getTotalVal(true);
491     v2Sum = v2.getTotalVal(true);
492     double denom = v1Sum + v2Sum;
493     if (fabs(denom) < 1e-6) {
494       // no need to worry about returnDistance here
495       return 0.0;
496     }
497     double minV = v1Sum < v2Sum ? v1Sum : v2Sum;
498     if (2. * minV / denom < bounds) {
499       return 0.0;
500     }
501     v1Sum = 0.0;
502     v2Sum = 0.0;
503   }
504 
505   double numer = 0.0;
506 
507   calcVectParams(v1, v2, v1Sum, v2Sum, numer);
508 
509   double denom = v1Sum + v2Sum;
510   double sim;
511   if (fabs(denom) < 1e-6) {
512     sim = 0.0;
513   } else {
514     sim = 2. * numer / denom;
515   }
516   if (returnDistance) sim = 1. - sim;
517   // std::cerr<<" "<<v1Sum<<" "<<v2Sum<<" " << numer << " " << sim <<std::endl;
518   return sim;
519 }
520 
521 template <typename IndexType>
522 double TverskySimilarity(const SparseIntVect<IndexType> &v1,
523                          const SparseIntVect<IndexType> &v2, double a, double b,
524                          bool returnDistance = false, double bounds = 0.0) {
525   RDUNUSED_PARAM(bounds);
526   if (v1.getLength() != v2.getLength()) {
527     throw ValueErrorException("SparseIntVect size mismatch");
528   }
529   double v1Sum = 0.0;
530   double v2Sum = 0.0;
531   double andSum = 0.0;
532 
533   calcVectParams(v1, v2, v1Sum, v2Sum, andSum);
534 
535   double denom = a * v1Sum + b * v2Sum + (1 - a - b) * andSum;
536   double sim;
537 
538   if (fabs(denom) < 1e-6) {
539     sim = 0.0;
540   } else {
541     sim = andSum / denom;
542   }
543   if (returnDistance) sim = 1. - sim;
544   // std::cerr<<" "<<v1Sum<<" "<<v2Sum<<" " << numer << " " << sim <<std::endl;
545   return sim;
546 }
547 
548 template <typename IndexType>
549 double TanimotoSimilarity(const SparseIntVect<IndexType> &v1,
550                           const SparseIntVect<IndexType> &v2,
551                           bool returnDistance = false, double bounds = 0.0) {
552   return TverskySimilarity(v1, v2, 1.0, 1.0, returnDistance, bounds);
553 }
554 }  // namespace RDKit
555 
556 #endif
557