1 //
2 //  Copyright (C) 2002-2021 greg landrum, Rational Discovery LLC
3 //
4 //   @@ All Rights Reserved @@
5 //  This file is part of the RDKit.
6 //  The contents are covered by the terms of the BSD license
7 //  which is included in the file license.txt, found at the root
8 //  of the RDKit source tree.
9 //
10 #include <RDGeneral/export.h>
11 #ifndef RD_MOLSUPPLIER_H
12 #define RD_MOLSUPPLIER_H
13 
14 #include <RDGeneral/types.h>
15 
16 #include <string>
17 #include <list>
18 #include <memory>
19 #include <vector>
20 #include <iostream>
21 #include <fstream>
22 #include <GraphMol/ROMol.h>
23 #include <RDGeneral/BadFileException.h>
24 
25 #ifdef RDK_BUILD_MAEPARSER_SUPPORT
// Forward declarations of the maeparser types used by MaeMolSupplier. Only
// smart pointers to these types are held in this header, so the full
// definitions are not required here.
namespace schrodinger {
namespace mae {
class Block;
class Reader;
}  // namespace mae
}  // namespace schrodinger
32 #endif  // RDK_BUILD_MAEPARSER_SUPPORT
33 
34 namespace RDKit {
35 RDKIT_FILEPARSERS_EXPORT std::string strip(const std::string &orig);
36 
/*!
//
//  Here are a couple of ways one can interact with MolSuppliers:
//
//  1) Lazy (ForwardIterator):
//     while(!supplier.atEnd()){
//       ROMol *mol = supplier.next();
//       if(mol){
//           do something;
//       }
//     }
//  2) Random Access (only for suppliers that provide length() and
//     operator[], e.g. SDMolSupplier, SmilesMolSupplier, TDTMolSupplier):
//     for(unsigned int i=0;i<supplier.length();i++){
//       ROMol *mol = supplier[i];
//       if(mol){
//           do something;
//       }
//     }
//
//
*/
58 class RDKIT_FILEPARSERS_EXPORT MolSupplier {
59   // this is an abstract base class to supply molecules one at a time
60  public:
MolSupplier()61   MolSupplier(){};
~MolSupplier()62   virtual ~MolSupplier(){};
63   virtual void init() = 0;
64   virtual void reset() = 0;
65   virtual bool atEnd() = 0;
66   virtual ROMol *next() = 0;
67 
close()68   virtual void close() {
69     if (df_owner) {
70       delete dp_inStream;
71       df_owner = false;
72     }
73     dp_inStream = nullptr;
74   }
75 
76  private:
77   // disable automatic copy constructors and assignment operators
78   // for this class and its subclasses.  They will likely be
79   // carrying around stream pointers and copying those is a recipe
80   // for disaster.
81   MolSupplier(const MolSupplier &);
82   MolSupplier &operator=(const MolSupplier &);
83 
84  protected:
85   // stream to read the molecules from:
86   std::istream *dp_inStream = nullptr;
87   // do we own dp_inStream?
88   bool df_owner = false;
89   // opens a stream for reading and verifies that it can be read from.
90   // if not it throws an exception
91   // the caller owns the resulting stream
openAndCheckStream(const std::string & filename)92   std::istream *openAndCheckStream(const std::string &filename) {
93     // FIX: this binary mode of opening file is here because of a bug in
94     // VC++ 6.0
95     // the function "tellg" does not work correctly if we do not open it this
96     // way
97     //   Jan 2009: Confirmed that this is still the case in visual studio 2008
98     std::ifstream *strm =
99         new std::ifstream(filename.c_str(), std::ios_base::binary);
100     if ((!(*strm)) || strm->bad()) {
101       std::ostringstream errout;
102       errout << "Bad input file " << filename;
103       delete strm;
104       throw BadFileException(errout.str());
105     }
106 
107     strm->peek();
108     if (strm->bad() || strm->eof()) {
109       std::ostringstream errout;
110       errout << "Invalid input file " << filename;
111       delete strm;
112       throw BadFileException(errout.str());
113     }
114     return static_cast<std::istream *>(strm);
115   }
116 };
117 
118 // \brief a supplier from an SD file that only reads forward:
119 class RDKIT_FILEPARSERS_EXPORT ForwardSDMolSupplier : public MolSupplier {
120   /*************************************************************************
121    * A lazy mol supplier from a SD file.
122    *  - When new molecules are read using "next" their positions in the file are
123    *noted.
124    ***********************************************************************************/
125  public:
ForwardSDMolSupplier()126   ForwardSDMolSupplier() { init(); };
127 
128   explicit ForwardSDMolSupplier(std::istream *inStream,
129                                 bool takeOwnership = true, bool sanitize = true,
130                                 bool removeHs = true,
131                                 bool strictParsing = false);
132 
~ForwardSDMolSupplier()133   virtual ~ForwardSDMolSupplier() { close(); };
134 
135   virtual void init();
136   virtual void reset();
137   virtual ROMol *next();
138   virtual bool atEnd();
139 
setProcessPropertyLists(bool val)140   void setProcessPropertyLists(bool val) { df_processPropertyLists = val; }
getProcessPropertyLists()141   bool getProcessPropertyLists() const { return df_processPropertyLists; }
142 
getEOFHitOnRead()143   bool getEOFHitOnRead() const { return df_eofHitOnRead; }
144 
145  protected:
146   virtual void checkForEnd();
147   ROMol *_next();
148   virtual void readMolProps(ROMol *);
149   bool df_end = false;
150   int d_line = 0;  // line number we are currently on
151   bool df_sanitize = true, df_removeHs = true, df_strictParsing = true;
152   bool df_processPropertyLists = true;
153   bool df_eofHitOnRead = false;
154 };
155 
156 // \brief a lazy supplier from an SD file
157 class RDKIT_FILEPARSERS_EXPORT SDMolSupplier : public ForwardSDMolSupplier {
158   /*************************************************************************
159    * A lazy mol supplier from a SD file.
160    *  - When new molecules are read using "next" their positions in the file are
161    *noted.
162    *  - A call to the "length" will automatically parse the entire file and
163    *cache all the mol
164    *    block positions
165    *  - [] operator is used to access a molecule at "idx", calling next
166    *following this will result
167    *    in the next molecule after "idx"
168    ***********************************************************************************/
169 
170  public:
SDMolSupplier()171   SDMolSupplier() { init(); };
172 
173   /*!
174    *   \param fileName - the name of the SD file
175    *   \param sanitize - if true sanitize the molecule before returning it
176    *   \param removeHs - if true remove Hs from the molecule before returning it
177    *                     (triggers sanitization)
178    *   \param strictParsing - if set to false, the parser is more lax about
179    * correctness
180    *                          of the contents.
181    */
182   explicit SDMolSupplier(const std::string &fileName, bool sanitize = true,
183                          bool removeHs = true, bool strictParsing = true);
184 
185   explicit SDMolSupplier(std::istream *inStream, bool takeOwnership = true,
186                          bool sanitize = true, bool removeHs = true,
187                          bool strictParsing = true);
188 
~SDMolSupplier()189   virtual ~SDMolSupplier() { close(); };
190   void init();
191   void reset();
192   ROMol *next();
193   bool atEnd();
194   void moveTo(unsigned int idx);
195   ROMol *operator[](unsigned int idx);
196   /*! \brief returns the text block for a particular item
197    *
198    *  \param idx - which item to return
199    */
200   std::string getItemText(unsigned int idx);
201   unsigned int length();
202   void setData(const std::string &text, bool sanitize = true,
203                bool removeHs = true);
204   void setData(const std::string &text, bool sanitize, bool removeHs,
205                bool strictParsing);
206 
207   /*! Resets our internal state and sets the indices of molecules in the stream.
208    *  The client should be *very* careful about calling this method, as it's
209    *trivial
210    *  to end up with a completely useless supplier.
211    *
212    *   \param locs - the vector of stream positions.
213    *
214    *  Note that this can be used not only to make reading selected molecules
215    *from a
216    *  large SD file much faster, but it can also allow subsetting an SD file or
217    *  rearranging the order of the molecules.
218    */
219   void setStreamIndices(const std::vector<std::streampos> &locs);
220 
221  private:
222   void checkForEnd();
223   void setDataCommon(const std::string &text, bool sanitize, bool removeHs);
224   int d_len = 0;   // total number of mol blocks in the file (initialized to -1)
225   int d_last = 0;  // the molecule we are ready to read
226   std::vector<std::streampos> d_molpos;
227 };
228 
229 //! lazy file parser for Smiles tables
230 class RDKIT_FILEPARSERS_EXPORT SmilesMolSupplier : public MolSupplier {
231   /**************************************************************************
232    * Lazy file parser for Smiles table file, similar to the lazy SD
233    * file parser above
234    * - As an when new molecules are read using "next" their
235    *    positions in the file are noted.
236    *  - A call to the "length" will autamatically parse the entire
237    *    file and cache all the mol block positions
238    *  - [] operator is used to access a molecule at "idx", calling
239    *    next following this will result in the next molecule after
240    *    "idx"
241    ***************************************************************************/
242  public:
243   /*!
244    *   \param fileName - the name of smiles table file
245    *   \param delimiter - delimiting characters between records on a each
246    *     line NOTE that this is not a string, the tokenizer looks for
247    *     the individual characters in delimiter, not the full string
248    *     itself.  So the default delimiter: " \t", means " " or "\t".
249    *   \param smilesColumn - column number for the SMILES string (defaults
250    *     to the first column)
251    *   \param nameColumn - column number for the molecule name (defaults to
252    *     the second column) If set to -1 we assume that no name is
253    *     available for the molecule and the name is defaulted to the
254    *     smiles string
255    *   \param titleLine - if true, the first line is assumed to list the
256    *     names of properties in order separated by 'delimiter'. It is
257    *     also assume that the 'SMILES' column and the 'name' column
258    *     are not specified here if false - no title line is assumed
259    *     and the properties are recorded as the "columnX" where "X" is
260    *     the column number
261    *   \param sanitize - if true sanitize the molecule before returning it
262    */
263   explicit SmilesMolSupplier(const std::string &fileName,
264                              const std::string &delimiter = " \t",
265                              int smilesColumn = 0, int nameColumn = 1,
266                              bool titleLine = true, bool sanitize = true);
267   SmilesMolSupplier();
268   explicit SmilesMolSupplier(std::istream *inStream, bool takeOwnership = true,
269                              const std::string &delimiter = " \t",
270                              int smilesColumn = 0, int nameColumn = 1,
271                              bool titleLine = true, bool sanitize = true);
272 
~SmilesMolSupplier()273   virtual ~SmilesMolSupplier() { close(); };
274   void setData(const std::string &text, const std::string &delimiter = " ",
275                int smilesColumn = 0, int nameColumn = 1, bool titleLine = true,
276                bool sanitize = true);
277   void init();
278   void reset();
279   ROMol *next();
280   bool atEnd();
281   void moveTo(unsigned int idx);
282   ROMol *operator[](unsigned int idx);
283   /*! \brief returns the text block for a particular item
284    *
285    *  \param idx - which item to return
286    */
287   std::string getItemText(unsigned int idx);
288   unsigned int length();
289 
290  private:
291   ROMol *processLine(std::string inLine);
292   void processTitleLine();
293   std::string nextLine();
294   long int skipComments();
295   void checkForEnd();
296 
297   bool df_end = false;  // have we reached the end of the file?
298   int d_len = 0;        // total number of smiles in the file
299   int d_next = 0;       // the  molecule we are ready to read
300   int d_line = 0;       // line number we are currently on
301   std::vector<std::streampos>
302       d_molpos;  // vector of positions in the file for molecules
303   std::vector<int> d_lineNums;
304   std::string d_delim;      // the delimiter string
305   bool df_sanitize = true;  // sanitize molecules before returning them?
306   STR_VECT d_props;         // vector of property names
307   bool df_title = true;     // do we have a title line?
308   int d_smi = 0;            // column id for the smile string
309   int d_name = 1;           // column id for the name
310 };
311 
312 //! lazy file parser for TDT files
313 class RDKIT_FILEPARSERS_EXPORT TDTMolSupplier : public MolSupplier {
314   /**************************************************************************
315    * Lazy file parser for TDT files, similar to the lazy SD
316    * file parser above
317    * - As an when new molecules are read using "next" their
318    *    positions in the file are noted.
319    *  - A call to the "length" will autamatically parse the entire
320    *    file and cache all the mol block positions
321    *  - [] operator is used to access a molecule at "idx", calling
322    *    next following this will result in the next molecule after
323    *    "idx"
324    ***************************************************************************/
325  public:
326   /*!
327    *   \param fileName - the name of the TDT file
328    *   \param nameRecord - property name for the molecule name.
329    *     If empty (the default), the name defaults to be empty
330    *   \param confId2D - if >=0 and 2D coordinates are provided, the 2D
331    *                   structure (depiction) in the input will be read into the
332    *                   corresponding conformer id.
333    *   \param confId3D - if >=0 and 3D coordinates are provided, the 3D
334    *                   structure (depiction) in the input will be read into the
335    *                   corresponding conformer id.
336    *   \param sanitize - if true sanitize the molecule before returning it
337    */
338   explicit TDTMolSupplier(const std::string &fileName,
339                           const std::string &nameRecord = "", int confId2D = -1,
340                           int confId3D = 0, bool sanitize = true);
341   explicit TDTMolSupplier(std::istream *inStream, bool takeOwnership = true,
342                           const std::string &nameRecord = "", int confId2D = -1,
343                           int confId3D = 0, bool sanitize = true);
344   TDTMolSupplier();
~TDTMolSupplier()345   virtual ~TDTMolSupplier() { close(); };
346   void setData(const std::string &text, const std::string &nameRecord = "",
347                int confId2D = -1, int confId3D = 0, bool sanitize = true);
348   void init();
349   void reset();
350   ROMol *next();
351   bool atEnd();
352   void moveTo(unsigned int idx);
353   ROMol *operator[](unsigned int idx);
354   /*! \brief returns the text block for a particular item
355    *
356    *  \param idx - which item to return
357    */
358   std::string getItemText(unsigned int idx);
359   unsigned int length();
360 
361  private:
362   bool advanceToNextRecord();
363   void checkForEnd();
364   ROMol *parseMol(std::string inLine);
365 
366   bool df_end = false;  // have we reached the end of the file?
367   int d_len = 0;        // total number of mols in the file
368   int d_last = 0;       // the molecule we are ready to read
369   int d_line = 0;       // line number we are currently on
370   int d_confId2D = -1;  // id to use for 2D conformers
371   int d_confId3D = 0;   // id to use for 3D conformers
372   std::vector<std::streampos>
373       d_molpos;             // vector of positions in the file for molecules
374   bool df_sanitize = true;  // sanitize molecules before returning them?
375   std::string d_nameProp =
376       "";  // local storage for the property providing mol names
377 };
378 
379 //! lazy file parser for PDB files
380 class RDKIT_FILEPARSERS_EXPORT PDBMolSupplier : public MolSupplier {
381  public:
382   explicit PDBMolSupplier(std::istream *inStream, bool takeOwnership = true,
383                           bool sanitize = true, bool removeHs = true,
384                           unsigned int flavor = 0,
385                           bool proximityBonding = true);
386   explicit PDBMolSupplier(const std::string &fname, bool sanitize = true,
387                           bool removeHs = true, unsigned int flavor = 0,
388                           bool proximityBonding = true);
389 
~PDBMolSupplier()390   virtual ~PDBMolSupplier() { close(); };
391 
392   virtual void init();
393   virtual void reset();
394   virtual ROMol *next();
395   virtual bool atEnd();
396 
397  protected:
398   bool df_sanitize, df_removeHs, df_proximityBonding;
399   unsigned int d_flavor;
400 };
401 #ifdef RDK_BUILD_MAEPARSER_SUPPORT
402 //! lazy file parser for MAE files
403 class RDKIT_FILEPARSERS_EXPORT MaeMolSupplier : public MolSupplier {
404   /**
405    * Due to maeparser's shared_ptr<istream> Reader interface, MaeMolSupplier
406    * always requires taking ownership of the istream ptr, as the shared ptr will
407    * always clear it upon destruction.
408    */
409 
410  public:
MaeMolSupplier()411   MaeMolSupplier() { init(); };
412 
413   explicit MaeMolSupplier(std::shared_ptr<std::istream> inStream,
414                           bool sanitize = true, bool removeHs = true);
415 
416   explicit MaeMolSupplier(std::istream *inStream, bool takeOwnership = true,
417                           bool sanitize = true, bool removeHs = true);
418 
419   explicit MaeMolSupplier(const std::string &fname, bool sanitize = true,
420                           bool removeHs = true);
421 
~MaeMolSupplier()422   virtual ~MaeMolSupplier(){};
423 
424   virtual void init();
425   virtual void reset();
426   virtual ROMol *next();
427   virtual bool atEnd();
428 
close()429   virtual void close() { dp_sInStream.reset(); }
430 
431  private:
432   void moveToNextBlock();
433 
434  protected:
435   bool df_sanitize, df_removeHs;
436   std::shared_ptr<schrodinger::mae::Reader> d_reader;
437   std::shared_ptr<schrodinger::mae::Block> d_next_struct;
438   std::shared_ptr<std::istream> dp_sInStream;
439   std::string d_stored_exc;
440 };
441 #endif  // RDK_BUILD_MAEPARSER_SUPPORT
442 }  // namespace RDKit
443 
444 #endif
445