RDKit
Open-source cheminformatics and machine learning.
MolStandardize/Tautomer.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2018-2021 Susan H. Leung and other RDKit contributors
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #include <RDGeneral/export.h>
11 #ifndef RD_TAUTOMER_H
12 #define RD_TAUTOMER_H
13 
14 #include <boost/function.hpp>
15 #include <string>
16 #include <utility>
17 #include <iterator>
18 #include <Catalogs/Catalog.h>
23 #include <boost/dynamic_bitset.hpp>
24 
25 namespace RDKit {
26 class ROMol;
27 class RWMol;
28 
29 namespace MolStandardize {
30 
31 typedef RDCatalog::HierarchCatalog<TautomerCatalogEntry, TautomerCatalogParams,
32  int>
34 
35 namespace TautomerScoringFunctions {
36 const std::string tautomerScoringVersion = "1.0.0";
37 
41 
42 inline int scoreTautomer(const ROMol &mol) {
43  return scoreRings(mol) + scoreSubstructs(mol) + scoreHeteroHs(mol);
44 }
45 } // namespace TautomerScoringFunctions
46 
48  Completed = 0,
51  Canceled
52 };
53 
54 class Tautomer {
55  friend class TautomerEnumerator;
56 
57  public:
58  Tautomer() : d_numModifiedAtoms(0), d_numModifiedBonds(0), d_done(false) {}
59  Tautomer(ROMOL_SPTR t, ROMOL_SPTR k, size_t a = 0, size_t b = 0)
60  : tautomer(std::move(t)),
61  kekulized(std::move(k)),
62  d_numModifiedAtoms(a),
63  d_numModifiedBonds(b),
64  d_done(false) {}
67 
68  private:
69  size_t d_numModifiedAtoms;
70  size_t d_numModifiedBonds;
71  bool d_done;
72 };
73 
74 typedef std::map<std::string, Tautomer> SmilesTautomerMap;
75 typedef std::pair<std::string, Tautomer> SmilesTautomerPair;
76 
77 //! Contains results of tautomer enumeration
79  friend class TautomerEnumerator;
80 
81  public:
83  public:
85  typedef std::ptrdiff_t difference_type;
86  typedef const ROMol *pointer;
87  typedef const ROMOL_SPTR &reference;
88  typedef std::bidirectional_iterator_tag iterator_category;
89 
90  explicit const_iterator(const SmilesTautomerMap::const_iterator &it)
91  : d_it(it) {}
92  reference operator*() const { return d_it->second.tautomer; }
93  pointer operator->() const { return d_it->second.tautomer.get(); }
94  bool operator==(const const_iterator &other) const {
95  return (d_it == other.d_it);
96  }
97  bool operator!=(const const_iterator &other) const {
98  return !(*this == other);
99  }
101  const_iterator copy(d_it);
102  operator++();
103  return copy;
104  }
106  ++d_it;
107  return *this;
108  }
110  const_iterator copy(d_it);
111  operator--();
112  return copy;
113  }
115  --d_it;
116  return *this;
117  }
118 
119  private:
120  SmilesTautomerMap::const_iterator d_it;
121  };
124  : d_tautomers(other.d_tautomers),
125  d_status(other.d_status),
126  d_modifiedAtoms(other.d_modifiedAtoms),
127  d_modifiedBonds(other.d_modifiedBonds) {
128  fillTautomersItVec();
129  }
130  const const_iterator begin() const {
131  return const_iterator(d_tautomers.begin());
132  }
133  const const_iterator end() const { return const_iterator(d_tautomers.end()); }
134  size_t size() const { return d_tautomers.size(); }
135  bool empty() const { return d_tautomers.empty(); }
136  const ROMOL_SPTR &at(size_t pos) const {
137  PRECONDITION(pos < d_tautomers.size(), "index out of bounds");
138  return d_tautomersItVec.at(pos)->second.tautomer;
139  }
140  const ROMOL_SPTR &operator[](size_t pos) const { return at(pos); }
141  const boost::dynamic_bitset<> &modifiedAtoms() const {
142  return d_modifiedAtoms;
143  }
144  const boost::dynamic_bitset<> &modifiedBonds() const {
145  return d_modifiedBonds;
146  }
147  TautomerEnumeratorStatus status() const { return d_status; }
148  std::vector<ROMOL_SPTR> tautomers() const {
149  std::vector<ROMOL_SPTR> tautomerVec;
150  tautomerVec.reserve(d_tautomers.size());
151  std::transform(
152  d_tautomers.begin(), d_tautomers.end(), std::back_inserter(tautomerVec),
153  [](const SmilesTautomerPair &t) { return t.second.tautomer; });
154  return tautomerVec;
155  }
156  std::vector<ROMOL_SPTR> operator()() const { return tautomers(); }
157  std::vector<std::string> smiles() const {
158  std::vector<std::string> smilesVec;
159  smilesVec.reserve(d_tautomers.size());
160  std::transform(d_tautomers.begin(), d_tautomers.end(),
161  std::back_inserter(smilesVec),
162  [](const SmilesTautomerPair &t) { return t.first; });
163  return smilesVec;
164  }
165  const SmilesTautomerMap &smilesTautomerMap() const { return d_tautomers; }
166 
167  private:
168  void fillTautomersItVec() {
169  for (auto it = d_tautomers.begin(); it != d_tautomers.end(); ++it) {
170  d_tautomersItVec.push_back(it);
171  }
172  }
173  // the enumerated tautomers
174  SmilesTautomerMap d_tautomers;
175  // internal; vector of iterators into map items to enable random
176  // access to map items by index
177  std::vector<SmilesTautomerMap::const_iterator> d_tautomersItVec;
178  // status of the enumeration: did it complete? did it hit a limit?
179  // was it canceled?
180  TautomerEnumeratorStatus d_status;
181  // bit vector: flags atoms modified by the transforms
182  boost::dynamic_bitset<> d_modifiedAtoms;
183  // bit vector: flags bonds modified by the transforms
184  boost::dynamic_bitset<> d_modifiedBonds;
185 };
186 
188  public:
191  virtual bool operator()(const ROMol &, const TautomerEnumeratorResult &) = 0;
192 };
193 
195  public:
197  : dp_catalog(tautCat),
198  d_maxTautomers(1000),
199  d_maxTransforms(1000),
200  d_removeSp3Stereo(true),
201  d_removeBondStereo(true),
202  d_removeIsotopicHs(true),
203  d_reassignStereo(true) {}
206  : dp_catalog(other.dp_catalog),
207  d_callback(other.d_callback.get()),
208  d_maxTautomers(other.d_maxTautomers),
209  d_maxTransforms(other.d_maxTransforms),
210  d_removeSp3Stereo(other.d_removeSp3Stereo),
211  d_removeBondStereo(other.d_removeBondStereo),
212  d_removeIsotopicHs(other.d_removeIsotopicHs),
213  d_reassignStereo(other.d_reassignStereo) {}
215  if (this == &other) return *this;
216  dp_catalog = other.dp_catalog;
217  d_callback.reset(other.d_callback.get());
218  d_maxTautomers = other.d_maxTautomers;
219  d_maxTransforms = other.d_maxTransforms;
220  d_removeSp3Stereo = other.d_removeSp3Stereo;
221  d_removeBondStereo = other.d_removeBondStereo;
222  d_removeIsotopicHs = other.d_removeIsotopicHs;
223  d_reassignStereo = other.d_reassignStereo;
224  return *this;
225  }
226  //! \param maxTautomers maximum number of tautomers to be generated
227  void setMaxTautomers(unsigned int maxTautomers) {
228  d_maxTautomers = maxTautomers;
229  }
230  //! \return maximum number of tautomers to be generated
231  unsigned int getMaxTautomers() { return d_maxTautomers; }
232  /*! \param maxTransforms maximum number of transformations to be applied
233  this limit is usually hit earlier than the maxTautomers limit
234  and leads to a more linear scaling of CPU time with increasing
235  number of tautomeric centers (see Sitzmann et al.)
236  */
237  void setMaxTransforms(unsigned int maxTransforms) {
238  d_maxTransforms = maxTransforms;
239  }
240  //! \return maximum number of transformations to be applied
241  unsigned int getMaxTransforms() { return d_maxTransforms; }
242  /*! \param removeSp3Stereo; if set to true, stereochemistry information
243  will be removed from sp3 atoms involved in tautomerism.
244  This means that S-aminoacids will lose their stereochemistry after going
245  through tautomer enumeration because of the amido-imidol tautomerism.
246  This defaults to true in RDKit, false in the workflow described
247  by Sitzmann et al.
248  */
249  void setRemoveSp3Stereo(bool removeSp3Stereo) {
250  d_removeSp3Stereo = removeSp3Stereo;
251  }
252  /*! \return whether stereochemistry information will be removed from
253  sp3 atoms involved in tautomerism
254  */
255  bool getRemoveSp3Stereo() { return d_removeSp3Stereo; }
256  /*! \param removeBondStereo; if set to true, stereochemistry information
257  will be removed from double bonds involved in tautomerism.
258  This means that enols will lose their E/Z stereochemistry after going
259  through tautomer enumeration because of the keto-enolic tautomerism.
260  This defaults to true in RDKit and also in the workflow described
261  by Sitzmann et al.
262  */
263  void setRemoveBondStereo(bool removeBondStereo) {
264  d_removeBondStereo = removeBondStereo;
265  }
266  /*! \return whether stereochemistry information will be removed from
267  double bonds involved in tautomerism
268  */
269  bool getRemoveBondStereo() { return d_removeBondStereo; }
270  /*! \param removeIsotopicHs; if set to true, isotopic Hs
271  will be removed from centers involved in tautomerism.
272  */
273  void setRemoveIsotopicHs(bool removeIsotopicHs) {
274  d_removeIsotopicHs = removeIsotopicHs;
275  }
276  /*! \return whether isotpoic Hs will be removed from
277  centers involved in tautomerism
278  */
279  bool getRemoveIsotopicHs() { return d_removeIsotopicHs; }
280  /*! \param reassignStereo; if set to true, assignStereochemistry
281  will be called on each tautomer generated by the enumerate() method.
282  This defaults to true.
283  */
284  void setReassignStereo(bool reassignStereo) {
285  d_reassignStereo = reassignStereo;
286  }
287  /*! \return whether assignStereochemistry will be called on each
288  tautomer generated by the enumerate() method
289  */
290  bool getReassignStereo() { return d_reassignStereo; }
291  /*! set this to an instance of a class derived from
292  TautomerEnumeratorCallback where operator() is overridden.
293  DO NOT delete the instance as ownership of the pointer is transferred
294  to the TautomerEnumerator
295  */
297  d_callback.reset(callback);
298  }
299  /*! \return pointer to an instance of a class derived from
300  TautomerEnumeratorCallback.
301  DO NOT delete the instance as ownership of the pointer is transferred
302  to the TautomerEnumerator
303  */
304  TautomerEnumeratorCallback *getCallback() const { return d_callback.get(); }
305 
306  //! returns a \c TautomerEnumeratorResult structure for the input molecule
307  /*!
308  The enumeration rules are inspired by the publication:
309  M. Sitzmann et al., “Tautomerism in Large Databases.”, JCAMD 24:521 (2010)
310  https://doi.org/10.1007/s10822-010-9346-4
311 
312  \param mol: the molecule to be enumerated
313 
314  Note: the definitions used here are that the atoms modified during
315  tautomerization are the atoms at the beginning and end of each tautomer
316  transform (the H "donor" and H "acceptor" in the transform) and the bonds
317  modified during transformation are any bonds whose order is changed during
318  the tautomer transform (these are the bonds between the "donor" and the
319  "acceptor")
320 
321  */
323 
324  //! Deprecated, please use the form returning a \c TautomerEnumeratorResult
325  //! instead
326  [[deprecated(
327  "please use the form returning a TautomerEnumeratorResult "
328  "instead")]] std::vector<ROMOL_SPTR>
329  enumerate(const ROMol &mol, boost::dynamic_bitset<> *modifiedAtoms,
330  boost::dynamic_bitset<> *modifiedBonds = nullptr) const;
331 
332  //! returns the canonical tautomer from a \c TautomerEnumeratorResult
334  boost::function<int(const ROMol &mol)> scoreFunc =
336 
337  //! returns the canonical tautomer from an iterable of possible tautomers
338  /// When Iterable is TautomerEnumeratorResult we use the other non-templated
339  /// overload for efficiency (TautomerEnumeratorResult already has SMILES so no
340  /// need to recompute them)
341  template <class Iterable,
342  typename std::enable_if<
343  !std::is_same<Iterable, TautomerEnumeratorResult>::value,
344  int>::type = 0>
345  ROMol *pickCanonical(const Iterable &tautomers,
346  boost::function<int(const ROMol &mol)> scoreFunc =
348  ROMOL_SPTR bestMol;
349  if (tautomers.size() == 1) {
350  bestMol = *tautomers.begin();
351  } else {
352  // Calculate score for each tautomer
353  int bestScore = std::numeric_limits<int>::min();
354  std::string bestSmiles = "";
355  for (const auto &t : tautomers) {
356  auto score = scoreFunc(*t);
357 #ifdef VERBOSE_ENUMERATION
358  std::cerr << " " << MolToSmiles(*t) << " " << score << std::endl;
359 #endif
360  if (score > bestScore) {
361  bestScore = score;
362  bestSmiles = MolToSmiles(*t);
363  bestMol = t;
364  } else if (score == bestScore) {
365  auto smiles = MolToSmiles(*t);
366  if (smiles < bestSmiles) {
367  bestSmiles = smiles;
368  bestMol = t;
369  }
370  }
371  }
372  }
373  ROMol *res = new ROMol(*bestMol);
374  static const bool cleanIt = true;
375  static const bool force = true;
376  MolOps::assignStereochemistry(*res, cleanIt, force);
377 
378  return res;
379  }
380 
381  //! returns the canonical tautomer for a molecule
382  /*!
383  Note that the canonical tautomer is very likely not the most stable tautomer
384  for any given conditions. The default scoring rules are designed to produce
385  "reasonable" tautomers, but the primary concern is that the results are
386  canonical: you always get the same canonical tautomer for a molecule
387  regardless of what the input tautomer or atom ordering were.
388 
389  The default scoring scheme is inspired by the publication:
390  M. Sitzmann et al., “Tautomerism in Large Databases.”, JCAMD 24:521 (2010)
391  https://doi.org/10.1007/s10822-010-9346-4
392 
393  */
394  ROMol *canonicalize(const ROMol &mol,
395  boost::function<int(const ROMol &mol)> scoreFunc =
397 
398  private:
399  bool setTautomerStereoAndIsoHs(const ROMol &mol, ROMol &taut,
400  const TautomerEnumeratorResult &res) const;
401  std::shared_ptr<TautomerCatalog> dp_catalog;
402  std::unique_ptr<TautomerEnumeratorCallback> d_callback;
403  unsigned int d_maxTautomers;
404  unsigned int d_maxTransforms;
405  bool d_removeSp3Stereo;
406  bool d_removeBondStereo;
407  bool d_removeIsotopicHs;
408  bool d_reassignStereo;
409 }; // TautomerEnumerator class
410 
411 // caller owns the pointer
413  const CleanupParameters &params) {
414  return new TautomerEnumerator(params);
415 }
416 
417 } // namespace MolStandardize
418 } // namespace RDKit
419 
420 #endif
#define PRECONDITION(expr, mess)
Definition: Invariant.h:109
A Catalog with a hierarchical structure.
Definition: Catalog.h:135
virtual bool operator()(const ROMol &, const TautomerEnumeratorResult &)=0
Contains results of tautomer enumeration.
const ROMOL_SPTR & operator[](size_t pos) const
const boost::dynamic_bitset & modifiedBonds() const
const boost::dynamic_bitset & modifiedAtoms() const
TautomerEnumeratorResult(const TautomerEnumeratorResult &other)
ROMol * canonicalize(const ROMol &mol, boost::function< int(const ROMol &mol)> scoreFunc=TautomerScoringFunctions::scoreTautomer) const
returns the canonical tautomer for a molecule
TautomerEnumerator(const CleanupParameters &params=CleanupParameters())
ROMol * pickCanonical(const TautomerEnumeratorResult &tautRes, boost::function< int(const ROMol &mol)> scoreFunc=TautomerScoringFunctions::scoreTautomer) const
returns the canonical tautomer from a TautomerEnumeratorResult
TautomerEnumerator & operator=(const TautomerEnumerator &other)
TautomerEnumeratorResult enumerate(const ROMol &mol) const
returns a TautomerEnumeratorResult structure for the input molecule
TautomerEnumeratorCallback * getCallback() const
std::vector< ROMOL_SPTR > enumerate(const ROMol &mol, boost::dynamic_bitset<> *modifiedAtoms, boost::dynamic_bitset<> *modifiedBonds=nullptr) const
void setCallback(TautomerEnumeratorCallback *callback)
TautomerEnumerator(const TautomerEnumerator &other)
ROMol * pickCanonical(const Iterable &tautomers, boost::function< int(const ROMol &mol)> scoreFunc=TautomerScoringFunctions::scoreTautomer) const
void setMaxTransforms(unsigned int maxTransforms)
void setMaxTautomers(unsigned int maxTautomers)
Tautomer(ROMOL_SPTR t, ROMOL_SPTR k, size_t a=0, size_t b=0)
#define RDKIT_MOLSTANDARDIZE_EXPORT
Definition: export.h:305
RDKIT_GRAPHMOL_EXPORT void assignStereochemistry(ROMol &mol, bool cleanIt=false, bool force=false, bool flagPossibleStereoCenters=false)
Assign stereochemistry tags to atoms (i.e. R/S) and bonds (i.e. Z/E)
RDKIT_MOLSTANDARDIZE_EXPORT int scoreHeteroHs(const ROMol &mol)
RDKIT_MOLSTANDARDIZE_EXPORT int scoreSubstructs(const ROMol &mol)
RDKIT_MOLSTANDARDIZE_EXPORT int scoreRings(const ROMol &mol)
std::map< std::string, Tautomer > SmilesTautomerMap
TautomerEnumerator * tautomerEnumeratorFromParams(const CleanupParameters &params)
RDCatalog::HierarchCatalog< TautomerCatalogEntry, TautomerCatalogParams, int > TautomerCatalog
std::pair< std::string, Tautomer > SmilesTautomerPair
Std stuff.
Definition: Abbreviations.h:18
RDKIT_SMILESPARSE_EXPORT std::string MolToSmiles(const ROMol &mol, const SmilesWriteParams &params)
returns canonical SMILES for a molecule
boost::shared_ptr< ROMol > ROMOL_SPTR