RDKit
Open-source cheminformatics and machine learning.
SparseIntVect.h
Go to the documentation of this file.
1 // $Id$
2 //
3 // Copyright (C) 2007-2008 Greg Landrum
4 //
5 // @@ All Rights Reserved @@
6 // This file is part of the RDKit.
7 // The contents are covered by the terms of the BSD license
8 // which is included in the file license.txt, found at the root
9 // of the RDKit source tree.
10 //
11 #include <RDGeneral/export.h>
12 #ifndef __RD_SPARSE_INT_VECT_20070921__
13 #define __RD_SPARSE_INT_VECT_20070921__
14 
15 #include <map>
16 #include <string>
17 #include <RDGeneral/Invariant.h>
18 #include <sstream>
19 #include <RDGeneral/Exceptions.h>
20 #include <RDGeneral/StreamOps.h>
21 #include <cstdint>
22 
24  0x0001; //!< version number to use in pickles
25 namespace RDKit {
26 //! a class for efficiently storing sparse vectors of ints
27 template <typename IndexType>
29  public:
30  typedef std::map<IndexType, int> StorageType;
31 
32  SparseIntVect() : d_length(0) {}
33 
34  //! initialize with a particular length
35  SparseIntVect(IndexType length) : d_length(length) {}
36 
37  //! Copy constructor
39  d_length = other.d_length;
40  d_data.clear();
41  d_data.insert(other.d_data.begin(), other.d_data.end());
42  }
43 
44  //! constructor from a pickle
45  SparseIntVect(const std::string &pkl) {
46  initFromText(pkl.c_str(), pkl.size());
47  }
48  //! constructor from a pickle
49  SparseIntVect(const char *pkl, const unsigned int len) {
50  initFromText(pkl, len);
51  }
52 
54  if (this == &other) {
55  return *this;
56  }
57  d_length = other.d_length;
58  d_data.clear();
59  d_data.insert(other.d_data.begin(), other.d_data.end());
60  return *this;
61  }
62 
63  //! destructor (doesn't need to do anything)
64  ~SparseIntVect() = default;
65 
66 #ifdef __clang__
67 #pragma clang diagnostic push
68 #pragma clang diagnostic ignored "-Wtautological-compare"
69 #elif (defined(__GNUC__) || defined(__GNUG__)) && \
70  (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 1))
71 #if (__GNUC__ > 4 || __GNUC_MINOR__ > 5)
72 #pragma GCC diagnostic push
73 #endif
74 #pragma GCC diagnostic ignored "-Wtype-limits"
75 #endif
76  //! return the value at an index
77  int getVal(IndexType idx) const {
78  if (idx < 0 || idx >= d_length) {
79  throw IndexErrorException(static_cast<int>(idx));
80  }
81  int res = 0;
82  typename StorageType::const_iterator iter = d_data.find(idx);
83  if (iter != d_data.end()) {
84  res = iter->second;
85  }
86  return res;
87  }
88 
89  //! set the value at an index
90  void setVal(IndexType idx, int val) {
91  if (idx < 0 || idx >= d_length) {
92  throw IndexErrorException(static_cast<int>(idx));
93  }
94  if (val != 0) {
95  d_data[idx] = val;
96  } else {
97  d_data.erase(idx);
98  }
99  }
100 #ifdef __clang__
101 #pragma clang diagnostic pop
102 #elif (defined(__GNUC__) || defined(__GNUG__)) && \
103  (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 5))
104 #pragma GCC diagnostic pop
105 #endif
106  //! support indexing using []
107  int operator[](IndexType idx) const { return getVal(idx); }
108 
109  //! returns the length
110  IndexType getLength() const { return d_length; }
111 
112  //! returns the sum of all the elements in the vect
113  //! the doAbs argument toggles summing the absolute values of the elements
114  int getTotalVal(bool doAbs = false) const {
115  int res = 0;
116  typename StorageType::const_iterator iter;
117  for (iter = d_data.begin(); iter != d_data.end(); ++iter) {
118  if (!doAbs)
119  res += iter->second;
120  else
121  res += abs(iter->second);
122  }
123  return res;
124  }
125  //! returns the length
126  unsigned int size() const { return getLength(); }
127 
128  //! returns our nonzero elements as a map(IndexType->int)
129  const StorageType &getNonzeroElements() const { return d_data; }
130 
131  //! this is a "fuzzy" intesection, the final value
132  //! of each element is equal to the minimum from
133  //! the two vects.
135  if (other.d_length != d_length) {
136  throw ValueErrorException("SparseIntVect size mismatch");
137  }
138 
139  typename StorageType::iterator iter = d_data.begin();
140  typename StorageType::const_iterator oIter = other.d_data.begin();
141  while (iter != d_data.end()) {
142  // we're relying on the fact that the maps are sorted:
143  while (oIter != other.d_data.end() && oIter->first < iter->first) {
144  ++oIter;
145  }
146  if (oIter != other.d_data.end() && oIter->first == iter->first) {
147  // found it:
148  if (oIter->second < iter->second) {
149  iter->second = oIter->second;
150  }
151  ++oIter;
152  ++iter;
153  } else {
154  // not there; our value is zero, which means
155  // we should remove this value:
156  typename StorageType::iterator tmpIter = iter;
157  ++tmpIter;
158  d_data.erase(iter);
159  iter = tmpIter;
160  }
161  }
162  return *this;
163  }
165  const SparseIntVect<IndexType> &other) const {
166  SparseIntVect<IndexType> res(*this);
167  return res &= other;
168  }
169 
170  //! this is a "fuzzy" union, the final value
171  //! of each element is equal to the maximum from
172  //! the two vects.
174  if (other.d_length != d_length) {
175  throw ValueErrorException("SparseIntVect size mismatch");
176  }
177 
178  typename StorageType::iterator iter = d_data.begin();
179  typename StorageType::const_iterator oIter = other.d_data.begin();
180  while (iter != d_data.end()) {
181  // we're relying on the fact that the maps are sorted:
182  while (oIter != other.d_data.end() && oIter->first < iter->first) {
183  d_data[oIter->first] = oIter->second;
184  ++oIter;
185  }
186  if (oIter != other.d_data.end() && oIter->first == iter->first) {
187  // found it:
188  if (oIter->second > iter->second) {
189  iter->second = oIter->second;
190  }
191  ++oIter;
192  }
193  ++iter;
194  }
195  // finish up the other vect:
196  while (oIter != other.d_data.end()) {
197  d_data[oIter->first] = oIter->second;
198  ++oIter;
199  }
200  return *this;
201  }
203  const SparseIntVect<IndexType> &other) const {
204  SparseIntVect<IndexType> res(*this);
205  return res |= other;
206  }
207 
209  if (other.d_length != d_length) {
210  throw ValueErrorException("SparseIntVect size mismatch");
211  }
212  typename StorageType::iterator iter = d_data.begin();
213  typename StorageType::const_iterator oIter = other.d_data.begin();
214  while (oIter != other.d_data.end()) {
215  while (iter != d_data.end() && iter->first < oIter->first) {
216  ++iter;
217  }
218  if (iter != d_data.end() && oIter->first == iter->first) {
219  // found it:
220  iter->second += oIter->second;
221  if (!iter->second) {
222  typename StorageType::iterator tIter = iter;
223  ++tIter;
224  d_data.erase(iter);
225  iter = tIter;
226  } else {
227  ++iter;
228  }
229  } else {
230  d_data[oIter->first] = oIter->second;
231  }
232  ++oIter;
233  }
234  return *this;
235  }
237  const SparseIntVect<IndexType> &other) const {
238  SparseIntVect<IndexType> res(*this);
239  return res += other;
240  }
241 
243  if (other.d_length != d_length) {
244  throw ValueErrorException("SparseIntVect size mismatch");
245  }
246  typename StorageType::iterator iter = d_data.begin();
247  typename StorageType::const_iterator oIter = other.d_data.begin();
248  while (oIter != other.d_data.end()) {
249  while (iter != d_data.end() && iter->first < oIter->first) {
250  ++iter;
251  }
252  if (iter != d_data.end() && oIter->first == iter->first) {
253  // found it:
254  iter->second -= oIter->second;
255  if (!iter->second) {
256  typename StorageType::iterator tIter = iter;
257  ++tIter;
258  d_data.erase(iter);
259  iter = tIter;
260  } else {
261  ++iter;
262  }
263  } else {
264  d_data[oIter->first] = -oIter->second;
265  }
266  ++oIter;
267  }
268  return *this;
269  }
271  const SparseIntVect<IndexType> &other) const {
272  SparseIntVect<IndexType> res(*this);
273  return res -= other;
274  }
276  typename StorageType::iterator iter = d_data.begin();
277  while (iter != d_data.end()) {
278  iter->second *= v;
279  ++iter;
280  }
281  return *this;
282  }
284  SparseIntVect<IndexType> res(*this);
285  return res *= v;
286  }
288  typename StorageType::iterator iter = d_data.begin();
289  while (iter != d_data.end()) {
290  iter->second /= v;
291  ++iter;
292  }
293  return *this;
294  }
296  SparseIntVect<IndexType> res(*this);
297  return res /= v;
298  }
300  typename StorageType::iterator iter = d_data.begin();
301  while (iter != d_data.end()) {
302  iter->second += v;
303  ++iter;
304  }
305  return *this;
306  }
308  SparseIntVect<IndexType> res(*this);
309  return res += v;
310  }
312  typename StorageType::iterator iter = d_data.begin();
313  while (iter != d_data.end()) {
314  iter->second -= v;
315  ++iter;
316  }
317  return *this;
318  }
320  SparseIntVect<IndexType> res(*this);
321  return res -= v;
322  }
323 
324  bool operator==(const SparseIntVect<IndexType> &v2) const {
325  if (d_length != v2.d_length) {
326  return false;
327  }
328  return d_data == v2.d_data;
329  }
330  bool operator!=(const SparseIntVect<IndexType> &v2) const {
331  return !(*this == v2);
332  }
333 
334  //! returns a binary string representation (pickle)
335  std::string toString() const {
336  std::stringstream ss(std::ios_base::binary | std::ios_base::out |
337  std::ios_base::in);
338  std::uint32_t tInt;
340  streamWrite(ss, tInt);
341  tInt = sizeof(IndexType);
342  streamWrite(ss, tInt);
343  streamWrite(ss, d_length);
344  IndexType nEntries = d_data.size();
345  streamWrite(ss, nEntries);
346 
347  typename StorageType::const_iterator iter = d_data.begin();
348  while (iter != d_data.end()) {
349  streamWrite(ss, iter->first);
350  std::int32_t tInt = iter->second;
351  streamWrite(ss, tInt);
352  ++iter;
353  }
354  return ss.str();
355  }
356 
357  void fromString(const std::string &txt) {
358  initFromText(txt.c_str(), txt.length());
359  }
360 
361  private:
362  IndexType d_length;
363  StorageType d_data;
364 
365  void initFromText(const char *pkl, const unsigned int len) {
366  d_data.clear();
367  std::stringstream ss(std::ios_base::binary | std::ios_base::out |
368  std::ios_base::in);
369  ss.write(pkl, len);
370 
371  std::uint32_t vers;
372  streamRead(ss, vers);
373  if (vers == 0x0001) {
374  std::uint32_t tInt;
375  streamRead(ss, tInt);
376  if (tInt > sizeof(IndexType)) {
377  throw ValueErrorException(
378  "IndexType cannot accommodate index size in SparseIntVect pickle");
379  }
380  switch (tInt) {
381  case sizeof(char):
382  readVals<unsigned char>(ss);
383  break;
384  case sizeof(std::int32_t):
385  readVals<std::uint32_t>(ss);
386  break;
387  case sizeof(boost::int64_t):
388  readVals<boost::uint64_t>(ss);
389  break;
390  default:
391  throw ValueErrorException("unreadable format");
392  }
393  } else {
394  throw ValueErrorException("bad version in SparseIntVect pickle");
395  }
396  }
397  template <typename T>
398  void readVals(std::stringstream &ss) {
399  PRECONDITION(sizeof(T) <= sizeof(IndexType), "invalid size");
400  T tVal;
401  streamRead(ss, tVal);
402  d_length = tVal;
403  T nEntries;
404  streamRead(ss, nEntries);
405  for (T i = 0; i < nEntries; ++i) {
406  streamRead(ss, tVal);
407  std::int32_t val;
408  streamRead(ss, val);
409  d_data[tVal] = val;
410  }
411  }
412 };
413 
414 template <typename IndexType, typename SequenceType>
416  const SequenceType &seq) {
417  typename SequenceType::const_iterator seqIt;
418  for (seqIt = seq.begin(); seqIt != seq.end(); ++seqIt) {
419  // EFF: probably not the most efficient approach
420  IndexType idx = *seqIt;
421  vect.setVal(idx, vect.getVal(idx) + 1);
422  }
423 }
424 
425 namespace {
426 template <typename IndexType>
427 void calcVectParams(const SparseIntVect<IndexType> &v1,
428  const SparseIntVect<IndexType> &v2, double &v1Sum,
429  double &v2Sum, double &andSum) {
430  if (v1.getLength() != v2.getLength()) {
431  throw ValueErrorException("SparseIntVect size mismatch");
432  }
433  v1Sum = v2Sum = andSum = 0.0;
434  // we're doing : (v1&v2).getTotalVal(), but w/o generating
435  // the other vector:
436  typename SparseIntVect<IndexType>::StorageType::const_iterator iter1, iter2;
437  iter1 = v1.getNonzeroElements().begin();
438  if (iter1 != v1.getNonzeroElements().end()) {
439  v1Sum += abs(iter1->second);
440  }
441  iter2 = v2.getNonzeroElements().begin();
442  if (iter2 != v2.getNonzeroElements().end()) {
443  v2Sum += abs(iter2->second);
444  }
445  while (iter1 != v1.getNonzeroElements().end()) {
446  while (iter2 != v2.getNonzeroElements().end() &&
447  iter2->first < iter1->first) {
448  ++iter2;
449  if (iter2 != v2.getNonzeroElements().end()) {
450  v2Sum += abs(iter2->second);
451  }
452  }
453  if (iter2 != v2.getNonzeroElements().end()) {
454  if (iter2->first == iter1->first) {
455  if (abs(iter2->second) < abs(iter1->second)) {
456  andSum += abs(iter2->second);
457  } else {
458  andSum += abs(iter1->second);
459  }
460  ++iter2;
461  if (iter2 != v2.getNonzeroElements().end()) {
462  v2Sum += abs(iter2->second);
463  }
464  }
465  ++iter1;
466  if (iter1 != v1.getNonzeroElements().end()) {
467  v1Sum += abs(iter1->second);
468  }
469  } else {
470  break;
471  }
472  }
473  if (iter1 != v1.getNonzeroElements().end()) {
474  ++iter1;
475  while (iter1 != v1.getNonzeroElements().end()) {
476  v1Sum += abs(iter1->second);
477  ++iter1;
478  }
479  }
480  if (iter2 != v2.getNonzeroElements().end()) {
481  ++iter2;
482  while (iter2 != v2.getNonzeroElements().end()) {
483  v2Sum += abs(iter2->second);
484  ++iter2;
485  }
486  }
487 }
488 } // namespace
489 
490 template <typename IndexType>
492  const SparseIntVect<IndexType> &v2,
493  bool returnDistance = false, double bounds = 0.0) {
494  if (v1.getLength() != v2.getLength()) {
495  throw ValueErrorException("SparseIntVect size mismatch");
496  }
497  double v1Sum = 0.0;
498  double v2Sum = 0.0;
499  if (!returnDistance && bounds > 0.0) {
500  v1Sum = v1.getTotalVal(true);
501  v2Sum = v2.getTotalVal(true);
502  double denom = v1Sum + v2Sum;
503  if (fabs(denom) < 1e-6) {
504  // no need to worry about returnDistance here
505  return 0.0;
506  }
507  double minV = v1Sum < v2Sum ? v1Sum : v2Sum;
508  if (2. * minV / denom < bounds) {
509  return 0.0;
510  }
511  v1Sum = 0.0;
512  v2Sum = 0.0;
513  }
514 
515  double numer = 0.0;
516 
517  calcVectParams(v1, v2, v1Sum, v2Sum, numer);
518 
519  double denom = v1Sum + v2Sum;
520  double sim;
521  if (fabs(denom) < 1e-6) {
522  sim = 0.0;
523  } else {
524  sim = 2. * numer / denom;
525  }
526  if (returnDistance) sim = 1. - sim;
527  // std::cerr<<" "<<v1Sum<<" "<<v2Sum<<" " << numer << " " << sim <<std::endl;
528  return sim;
529 }
530 
531 template <typename IndexType>
533  const SparseIntVect<IndexType> &v2, double a, double b,
534  bool returnDistance = false, double bounds = 0.0) {
535  RDUNUSED_PARAM(bounds);
536  if (v1.getLength() != v2.getLength()) {
537  throw ValueErrorException("SparseIntVect size mismatch");
538  }
539  double v1Sum = 0.0;
540  double v2Sum = 0.0;
541  double andSum = 0.0;
542 
543  calcVectParams(v1, v2, v1Sum, v2Sum, andSum);
544 
545  double denom = a * v1Sum + b * v2Sum + (1 - a - b) * andSum;
546  double sim;
547 
548  if (fabs(denom) < 1e-6) {
549  sim = 0.0;
550  } else {
551  sim = andSum / denom;
552  }
553  if (returnDistance) sim = 1. - sim;
554  // std::cerr<<" "<<v1Sum<<" "<<v2Sum<<" " << numer << " " << sim <<std::endl;
555  return sim;
556 }
557 
558 template <typename IndexType>
560  const SparseIntVect<IndexType> &v2,
561  bool returnDistance = false, double bounds = 0.0) {
562  return TverskySimilarity(v1, v2, 1.0, 1.0, returnDistance, bounds);
563 }
564 } // namespace RDKit
565 
566 #endif
#define RDUNUSED_PARAM(x)
Definition: Invariant.h:196
#define PRECONDITION(expr, mess)
Definition: Invariant.h:109
const int ci_SPARSEINTVECT_VERSION
version number to use in pickles
Definition: SparseIntVect.h:23
Class to allow us to throw an IndexError from C++ and have it make it back to Python.
Definition: Exceptions.h:20
a class for efficiently storing sparse vectors of ints
Definition: SparseIntVect.h:28
SparseIntVect< IndexType > & operator+=(int v)
SparseIntVect< IndexType > & operator/(int v)
SparseIntVect(IndexType length)
initialize with a particular length
Definition: SparseIntVect.h:35
unsigned int size() const
returns the length
const SparseIntVect< IndexType > operator+(const SparseIntVect< IndexType > &other) const
SparseIntVect< IndexType > & operator*(int v)
SparseIntVect< IndexType > & operator+=(const SparseIntVect< IndexType > &other)
bool operator==(const SparseIntVect< IndexType > &v2) const
SparseIntVect(const SparseIntVect< IndexType > &other)
Copy constructor.
Definition: SparseIntVect.h:38
~SparseIntVect()=default
destructor (doesn't need to do anything)
SparseIntVect< IndexType > & operator|=(const SparseIntVect< IndexType > &other)
const SparseIntVect< IndexType > operator-(const SparseIntVect< IndexType > &other) const
const SparseIntVect< IndexType > operator|(const SparseIntVect< IndexType > &other) const
SparseIntVect< IndexType > & operator/=(int v)
const SparseIntVect< IndexType > operator&(const SparseIntVect< IndexType > &other) const
SparseIntVect(const char *pkl, const unsigned int len)
constructor from a pickle
Definition: SparseIntVect.h:49
int operator[](IndexType idx) const
support indexing using []
void fromString(const std::string &txt)
SparseIntVect< IndexType > & operator*=(int v)
void setVal(IndexType idx, int val)
set the value at an index
Definition: SparseIntVect.h:90
SparseIntVect< IndexType > & operator&=(const SparseIntVect< IndexType > &other)
SparseIntVect & operator=(const SparseIntVect< IndexType > &other)
Definition: SparseIntVect.h:53
std::string toString() const
returns a binary string representation (pickle)
int getTotalVal(bool doAbs=false) const
SparseIntVect< IndexType > & operator-(int v)
std::map< IndexType, int > StorageType
Definition: SparseIntVect.h:30
SparseIntVect< IndexType > & operator-=(const SparseIntVect< IndexType > &other)
bool operator!=(const SparseIntVect< IndexType > &v2) const
SparseIntVect< IndexType > & operator-=(int v)
SparseIntVect(const std::string &pkl)
constructor from a pickle
Definition: SparseIntVect.h:45
SparseIntVect< IndexType > & operator+(int v)
IndexType getLength() const
returns the length
int getVal(IndexType idx) const
return the value at an index
Definition: SparseIntVect.h:77
const StorageType & getNonzeroElements() const
returns our nonzero elements as a map(IndexType->int)
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition: Exceptions.h:40
Std stuff.
Definition: Abbreviations.h:18
double TverskySimilarity(const SparseIntVect< IndexType > &v1, const SparseIntVect< IndexType > &v2, double a, double b, bool returnDistance=false, double bounds=0.0)
void updateFromSequence(SparseIntVect< IndexType > &vect, const SequenceType &seq)
double TanimotoSimilarity(const SparseIntVect< IndexType > &v1, const SparseIntVect< IndexType > &v2, bool returnDistance=false, double bounds=0.0)
double DiceSimilarity(const SparseIntVect< IndexType > &v1, const SparseIntVect< IndexType > &v2, bool returnDistance=false, double bounds=0.0)
void streamRead(std::istream &ss, T &loc)
does a binary read of an object from a stream
Definition: StreamOps.h:274
void streamWrite(std::ostream &ss, const T &val)
does a binary write of an object to a stream
Definition: StreamOps.h:254