RDKit
Open-source cheminformatics and machine learning.
FPBReader.h
Go to the documentation of this file.
1 //
2 // Copyright (c) 2016 Greg Landrum
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #include <RDGeneral/export.h>
11 #ifndef RD_FPBREADER_H_DEC2015
12 #define RD_FPBREADER_H_DEC2015
13 /*! \file FPBReader.h
14 
15  \brief contains a simple class for reading and searching FPB files
16 
17  \b Note that this functionality is experimental and the API may change
18  in future releases.
19 */
20 
21 #include <iostream>
22 #include <fstream>
23 #include <sstream>
24 #include <string>
27 
28 #include <cstdint>
29 #include <boost/shared_ptr.hpp>
30 #include <boost/shared_array.hpp>
31 
32 namespace RDKit {
33 namespace detail {
34 struct FPBReader_impl;
35 }
36 
37 //! class for reading and searching FPB files
38 /*!
39  basic usage:
40  \code
41  FPBReader reader("foo.fpb");
42  reader.init();
43  boost::shared_ptr<ExplicitBitVect> ebv = reader.getFP(95);
44  std::vector<std::pair<double, unsigned int> > nbrs =
45  reader.getTanimotoNeighbors(*ebv.get(), 0.70);
46  \endcode
47 
48  \b Note: this functionality is experimental and the API may change
49  in future releases.
50 
51  <b>Note on thread safety</b>
52  Operations that involve reading from the FPB file are not thread safe.
53  This means that the \c init() method is not thread safe and none of the
54  search operations are thread safe when an \c FPBReader is initialized in
55  \c lazyRead mode.
56 
57 */
59  public:
60  FPBReader() {}
61 
62  //! ctor for reading from a named file
63  /*!
64  \param fname the name of the file to read
65  \param lazyRead if set to \c false all fingerprints from the file will be read
66  into memory when \c init() is called.
67  */
68  FPBReader(const char *fname, bool lazyRead = false) {
69  _initFromFilename(fname, lazyRead);
70  }
71  //! \overload
72  FPBReader(const std::string &fname, bool lazyRead = false) {
73  _initFromFilename(fname.c_str(), lazyRead);
74  }
75  //! ctor for reading from an open istream
76  /*!
77  \param inStream the stream to read from
78  \param takeOwnership if set, we will take over ownership of the stream pointer
79  \param lazyRead if set to \c false all fingerprints from the file will be read
80  into memory when \c init() is called.
81 
82  Some additional notes:
83  - if \c lazyRead is set, \c inStream must support the \c seekg() and \c
84  tellg() operations.
85 
86  */
87  FPBReader(std::istream *inStream, bool takeOwnership = true,
88  bool lazyRead = false)
89  : dp_istrm(inStream),
90  dp_impl(nullptr),
91  df_owner(takeOwnership),
92  df_init(false),
93  df_lazyRead(lazyRead) {}
95  destroy();
96  if (df_owner) delete dp_istrm;
97  dp_istrm = nullptr;
98  df_init = false;
99  }
100 
101  //! Read the data from the file and initialize internal data structures
102  /*!
103  This must be called before most of the other methods of this class.
104 
105  Some notes:
106  \li if \c lazyRead is not set, all fingerprints will be read into memory. This
107  can require substantial amounts of memory for large files.
108  \li For large files, this can take a long time.
109  \li If \c lazyRead and \c takeOwnership are both \c false it is safe to close
110  and delete inStream after calling \c init()
111  */
112  void init();
113  //! cleanup
114  /*!
115  Cleans up whatever memory was allocated during init()
116  */
117  void cleanup() {
118  if (!df_init) return;
119  destroy();
120  df_init = false;
121  }
122  //! returns the requested fingerprint as an \c ExplicitBitVect
123  boost::shared_ptr<ExplicitBitVect> getFP(unsigned int idx) const;
124  //! returns the requested fingerprint as an array of bytes
125  boost::shared_array<std::uint8_t> getBytes(unsigned int idx) const;
126 
127  //! returns the id of the requested fingerprint
128  std::string getId(unsigned int idx) const;
129  //! returns the fingerprint and id of the requested fingerprint
130  std::pair<boost::shared_ptr<ExplicitBitVect>, std::string> operator[](
131  unsigned int idx) const {
132  return std::make_pair(getFP(idx), getId(idx));
133  }
134 
135  //! returns beginning and end indices of fingerprints having on-bit counts
136  //! within the range (including end points)
137  std::pair<unsigned int, unsigned int> getFPIdsInCountRange(
138  unsigned int minCount, unsigned int maxCount);
139 
140  //! returns the number of fingerprints
141  unsigned int length() const;
142  //! returns the number of bits in our fingerprints
143  unsigned int nBits() const;
144 
145  //! returns the tanimoto similarity between the specified fingerprint and the
146  //! provided fingerprint
147  double getTanimoto(unsigned int idx, const std::uint8_t *bv) const;
148  //! \overload
149  double getTanimoto(unsigned int idx,
150  boost::shared_array<std::uint8_t> bv) const {
151  return getTanimoto(idx, bv.get());
152  }
153  //! \overload
154  double getTanimoto(unsigned int idx, const ExplicitBitVect &ebv) const;
155 
156  //! returns tanimoto neighbors that are within a similarity threshold
157  /*!
158  The result vector of (similarity,index) pairs is sorted in order
159  of decreasing similarity
160 
161  \param bv the query fingerprint
162  \param threshold the minimum similarity to return
163  \param usePopcountScreen if this is true (the default) the popcount of the
164  neighbors will be used to reduce the number of calculations that need
165  to be done
166 
167  */
168  std::vector<std::pair<double, unsigned int>> getTanimotoNeighbors(
169  const std::uint8_t *bv, double threshold = 0.7,
170  bool usePopcountScreen = true) const;
171  //! \overload
172  std::vector<std::pair<double, unsigned int>> getTanimotoNeighbors(
173  boost::shared_array<std::uint8_t> bv, double threshold = 0.7,
174  bool usePopcountScreen = true) const {
175  return getTanimotoNeighbors(bv.get(), threshold, usePopcountScreen);
176  }
177  //! \overload
178  std::vector<std::pair<double, unsigned int>> getTanimotoNeighbors(
179  const ExplicitBitVect &ebv, double threshold = 0.7,
180  bool usePopcountScreen = true) const;
181 
182  //! returns the Tversky similarity between the specified fingerprint and the
183  //! provided fingerprint
184  /*!
185 
186  \param idx the fingerprint to compare to
187  \param bv the query fingerprint
188  \param ca the Tversky a coefficient
188  \param cb the Tversky b coefficient
190 
191  */
192  double getTversky(unsigned int idx, const std::uint8_t *bv, double ca,
193  double cb) const;
194  //! \overload
195  double getTversky(unsigned int idx, boost::shared_array<std::uint8_t> bv,
196  double ca, double cb) const {
197  return getTversky(idx, bv.get(), ca, cb);
198  }
199  //! \overload
200  double getTversky(unsigned int idx, const ExplicitBitVect &ebv, double ca,
201  double cb) const;
202 
203  //! returns Tversky neighbors that are within a similarity threshold
204  /*!
205  The result vector of (similarity,index) pairs is sorted in order
206  of decreasing similarity
207 
208  \param bv the query fingerprint
209  \param ca the Tversky a coefficient
210  \param cb the Tversky b coefficient
211  \param threshold the minimum similarity to return
212  \param usePopcountScreen if this is true (the default) the popcount of the
213  neighbors will be used to reduce the number of calculations that need
214  to be done
215 
216  */
217  std::vector<std::pair<double, unsigned int>> getTverskyNeighbors(
218  const std::uint8_t *bv, double ca, double cb, double threshold = 0.7,
219  bool usePopcountScreen = true) const;
220  //! \overload
221  std::vector<std::pair<double, unsigned int>> getTverskyNeighbors(
222  boost::shared_array<std::uint8_t> bv, double ca, double cb,
223  double threshold = 0.7, bool usePopcountScreen = true) const {
224  return getTverskyNeighbors(bv.get(), ca, cb, threshold, usePopcountScreen);
225  }
226  //! \overload
227  std::vector<std::pair<double, unsigned int>> getTverskyNeighbors(
228  const ExplicitBitVect &ebv, double ca, double cb, double threshold = 0.7,
229  bool usePopcountScreen = true) const;
230 
231  //! returns indices of all fingerprints that completely contain this one
232  /*! (i.e. where all the bits set in the query are also set in the db
233  molecule)
234  */
235  std::vector<unsigned int> getContainingNeighbors(
236  const std::uint8_t *bv) const;
237  //! \overload
238  std::vector<unsigned int> getContainingNeighbors(
239  boost::shared_array<std::uint8_t> bv) const {
240  return getContainingNeighbors(bv.get());
241  }
242  //! \overload
243  std::vector<unsigned int> getContainingNeighbors(
244  const ExplicitBitVect &ebv) const;
245 
246  private:
247  std::istream *dp_istrm{nullptr};
248  detail::FPBReader_impl *dp_impl{nullptr}; // implementation details
249  bool df_owner{false};
250  bool df_init{false};
251  bool df_lazyRead{false};
252 
253  // disable automatic copy constructors and assignment operators
254  // for this class and its subclasses. They will likely be
255  // carrying around stream pointers and copying those is a recipe
256  // for disaster.
257  FPBReader(const FPBReader &);
258  FPBReader &operator=(const FPBReader &);
259  void destroy();
260  void _initFromFilename(const char *fname, bool lazyRead) {
261  std::istream *tmpStream = static_cast<std::istream *>(
262  new std::ifstream(fname, std::ios_base::binary));
263  if (!(*tmpStream) || (tmpStream->bad())) {
264  std::ostringstream errout;
265  errout << "Bad input file " << fname;
266  delete tmpStream;
267  throw BadFileException(errout.str());
268  }
269  dp_istrm = tmpStream;
270  dp_impl = nullptr;
271  df_owner = true;
272  df_init = false;
273  df_lazyRead = lazyRead;
274  }
275 };
276 } // namespace RDKit
277 #endif
a class for bit vectors that are densely occupied
class for reading and searching FPB files
Definition: FPBReader.h:58
void cleanup()
cleanup
Definition: FPBReader.h:117
double getTversky(unsigned int idx, const std::uint8_t *bv, double ca, double cb) const
std::vector< std::pair< double, unsigned int > > getTanimotoNeighbors(const ExplicitBitVect &ebv, double threshold=0.7, bool usePopcountScreen=true) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
std::pair< unsigned int, unsigned int > getFPIdsInCountRange(unsigned int minCount, unsigned int maxCount)
unsigned int length() const
returns the number of fingerprints
double getTanimoto(unsigned int idx, const std::uint8_t *bv) const
boost::shared_ptr< ExplicitBitVect > getFP(unsigned int idx) const
returns the requested fingerprint as an ExplicitBitVect
boost::shared_array< std::uint8_t > getBytes(unsigned int idx) const
returns the requested fingerprint as an array of bytes
double getTanimoto(unsigned int idx, boost::shared_array< std::uint8_t > bv) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: FPBReader.h:149
double getTversky(unsigned int idx, boost::shared_array< std::uint8_t > bv, double ca, double cb) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: FPBReader.h:195
std::vector< std::pair< double, unsigned int > > getTverskyNeighbors(const ExplicitBitVect &ebv, double ca, double cb, double threshold=0.7, bool usePopcountScreen=true) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
double getTversky(unsigned int idx, const ExplicitBitVect &ebv, double ca, double cb) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
FPBReader(std::istream *inStream, bool takeOwnership=true, bool lazyRead=false)
ctor for reading from an open istream
Definition: FPBReader.h:87
std::vector< unsigned int > getContainingNeighbors(const ExplicitBitVect &ebv) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
FPBReader(const char *fname, bool lazyRead=false)
ctor for reading from a named file
Definition: FPBReader.h:68
FPBReader(const std::string &fname, bool lazyRead=false)
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: FPBReader.h:72
std::vector< unsigned int > getContainingNeighbors(boost::shared_array< std::uint8_t > bv) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: FPBReader.h:238
std::vector< std::pair< double, unsigned int > > getTverskyNeighbors(const std::uint8_t *bv, double ca, double cb, double threshold=0.7, bool usePopcountScreen=true) const
returns Tversky neighbors that are within a similarity threshold
unsigned int nBits() const
returns the number of bits in our fingerprints
std::vector< unsigned int > getContainingNeighbors(const std::uint8_t *bv) const
returns indices of all fingerprints that completely contain this one
double getTanimoto(unsigned int idx, const ExplicitBitVect &ebv) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
std::string getId(unsigned int idx) const
returns the id of the requested fingerprint
std::pair< boost::shared_ptr< ExplicitBitVect >, std::string > operator[](unsigned int idx) const
returns the fingerprint and id of the requested fingerprint
Definition: FPBReader.h:130
std::vector< std::pair< double, unsigned int > > getTanimotoNeighbors(boost::shared_array< std::uint8_t > bv, double threshold=0.7, bool usePopcountScreen=true) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: FPBReader.h:172
void init()
Read the data from the file and initialize internal data structures.
std::vector< std::pair< double, unsigned int > > getTanimotoNeighbors(const std::uint8_t *bv, double threshold=0.7, bool usePopcountScreen=true) const
returns tanimoto neighbors that are within a similarity threshold
std::vector< std::pair< double, unsigned int > > getTverskyNeighbors(boost::shared_array< std::uint8_t > bv, double ca, double cb, double threshold=0.7, bool usePopcountScreen=true) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: FPBReader.h:221
#define RDKIT_DATASTRUCTS_EXPORT
Definition: export.h:81
Std stuff.
Definition: Abbreviations.h:18
RDKIT_FINGERPRINTS_EXPORT ExplicitBitVect * getFP(const ROMol &mol, FPType fPType)