1 // 2 // Copyright (C) 2002-2021 greg landrum, Rational Discovery LLC 3 // 4 // @@ All Rights Reserved @@ 5 // This file is part of the RDKit. 6 // The contents are covered by the terms of the BSD license 7 // which is included in the file license.txt, found at the root 8 // of the RDKit source tree. 9 // 10 #include <RDGeneral/export.h> 11 #ifndef RD_MOLSUPPLIER_H 12 #define RD_MOLSUPPLIER_H 13 14 #include <RDGeneral/types.h> 15 16 #include <string> 17 #include <list> 18 #include <memory> 19 #include <vector> 20 #include <iostream> 21 #include <fstream> 22 #include <GraphMol/ROMol.h> 23 #include <RDGeneral/BadFileException.h> 24 25 #ifdef RDK_BUILD_MAEPARSER_SUPPORT 26 namespace schrodinger { 27 namespace mae { 28 class Reader; 29 class Block; 30 } // namespace mae 31 } // namespace schrodinger 32 #endif // RDK_BUILD_MAEPARSER_SUPPORT 33 34 namespace RDKit { 35 RDKIT_FILEPARSERS_EXPORT std::string strip(const std::string &orig); 36 37 /*! 38 // 39 // Here are a couple of ways one can interact with MolSuppliers: 40 // 41 // 1) Lazy (ForwardIterator): 42 // while(!supplier.atEnd()){ 43 // ROMol *mol = supplier.next(); 44 // if(mol){ 45 // do something; 46 // } 47 // } 48 // 2) Random Access: 49 // for(int i=0;i<supplier.length();i++){ 50 // ROMol *mol = supplier[i]; 51 // if(mol){ 52 // do something; 53 // } 54 // } 55 // 56 // 57 */ 58 class RDKIT_FILEPARSERS_EXPORT MolSupplier { 59 // this is an abstract base class to supply molecules one at a time 60 public: MolSupplier()61 MolSupplier(){}; ~MolSupplier()62 virtual ~MolSupplier(){}; 63 virtual void init() = 0; 64 virtual void reset() = 0; 65 virtual bool atEnd() = 0; 66 virtual ROMol *next() = 0; 67 close()68 virtual void close() { 69 if (df_owner) { 70 delete dp_inStream; 71 df_owner = false; 72 } 73 dp_inStream = nullptr; 74 } 75 76 private: 77 // disable automatic copy constructors and assignment operators 78 // for this class and its subclasses. They will likely be 79 // carrying around stream pointers and copying those is a recipe 80 // for disaster. 81 MolSupplier(const MolSupplier &); 82 MolSupplier &operator=(const MolSupplier &); 83 84 protected: 85 // stream to read the molecules from: 86 std::istream *dp_inStream = nullptr; 87 // do we own dp_inStream? 88 bool df_owner = false; 89 // opens a stream for reading and verifies that it can be read from. 90 // if not it throws an exception 91 // the caller owns the resulting stream openAndCheckStream(const std::string & filename)92 std::istream *openAndCheckStream(const std::string &filename) { 93 // FIX: this binary mode of opening file is here because of a bug in 94 // VC++ 6.0 95 // the function "tellg" does not work correctly if we do not open it this 96 // way 97 // Jan 2009: Confirmed that this is still the case in visual studio 2008 98 std::ifstream *strm = 99 new std::ifstream(filename.c_str(), std::ios_base::binary); 100 if ((!(*strm)) || strm->bad()) { 101 std::ostringstream errout; 102 errout << "Bad input file " << filename; 103 delete strm; 104 throw BadFileException(errout.str()); 105 } 106 107 strm->peek(); 108 if (strm->bad() || strm->eof()) { 109 std::ostringstream errout; 110 errout << "Invalid input file " << filename; 111 delete strm; 112 throw BadFileException(errout.str()); 113 } 114 return static_cast<std::istream *>(strm); 115 } 116 }; 117 118 // \brief a supplier from an SD file that only reads forward: 119 class RDKIT_FILEPARSERS_EXPORT ForwardSDMolSupplier : public MolSupplier { 120 /************************************************************************* 121 * A lazy mol supplier from a SD file. 122 * - When new molecules are read using "next" their positions in the file are 123 *noted. 124 ***********************************************************************************/ 125 public: ForwardSDMolSupplier()126 ForwardSDMolSupplier() { init(); }; 127 128 explicit ForwardSDMolSupplier(std::istream *inStream, 129 bool takeOwnership = true, bool sanitize = true, 130 bool removeHs = true, 131 bool strictParsing = false); 132 ~ForwardSDMolSupplier()133 virtual ~ForwardSDMolSupplier() { close(); }; 134 135 virtual void init(); 136 virtual void reset(); 137 virtual ROMol *next(); 138 virtual bool atEnd(); 139 setProcessPropertyLists(bool val)140 void setProcessPropertyLists(bool val) { df_processPropertyLists = val; } getProcessPropertyLists()141 bool getProcessPropertyLists() const { return df_processPropertyLists; } 142 getEOFHitOnRead()143 bool getEOFHitOnRead() const { return df_eofHitOnRead; } 144 145 protected: 146 virtual void checkForEnd(); 147 ROMol *_next(); 148 virtual void readMolProps(ROMol *); 149 bool df_end = false; 150 int d_line = 0; // line number we are currently on 151 bool df_sanitize = true, df_removeHs = true, df_strictParsing = true; 152 bool df_processPropertyLists = true; 153 bool df_eofHitOnRead = false; 154 }; 155 156 // \brief a lazy supplier from an SD file 157 class RDKIT_FILEPARSERS_EXPORT SDMolSupplier : public ForwardSDMolSupplier { 158 /************************************************************************* 159 * A lazy mol supplier from a SD file. 160 * - When new molecules are read using "next" their positions in the file are 161 *noted. 162 * - A call to the "length" will automatically parse the entire file and 163 *cache all the mol 164 * block positions 165 * - [] operator is used to access a molecule at "idx", calling next 166 *following this will result 167 * in the next molecule after "idx" 168 ***********************************************************************************/ 169 170 public: SDMolSupplier()171 SDMolSupplier() { init(); }; 172 173 /*! 174 * \param fileName - the name of the SD file 175 * \param sanitize - if true sanitize the molecule before returning it 176 * \param removeHs - if true remove Hs from the molecule before returning it 177 * (triggers sanitization) 178 * \param strictParsing - if set to false, the parser is more lax about 179 * correctness 180 * of the contents. 181 */ 182 explicit SDMolSupplier(const std::string &fileName, bool sanitize = true, 183 bool removeHs = true, bool strictParsing = true); 184 185 explicit SDMolSupplier(std::istream *inStream, bool takeOwnership = true, 186 bool sanitize = true, bool removeHs = true, 187 bool strictParsing = true); 188 ~SDMolSupplier()189 virtual ~SDMolSupplier() { close(); }; 190 void init(); 191 void reset(); 192 ROMol *next(); 193 bool atEnd(); 194 void moveTo(unsigned int idx); 195 ROMol *operator[](unsigned int idx); 196 /*! \brief returns the text block for a particular item 197 * 198 * \param idx - which item to return 199 */ 200 std::string getItemText(unsigned int idx); 201 unsigned int length(); 202 void setData(const std::string &text, bool sanitize = true, 203 bool removeHs = true); 204 void setData(const std::string &text, bool sanitize, bool removeHs, 205 bool strictParsing); 206 207 /*! Resets our internal state and sets the indices of molecules in the stream. 208 * The client should be *very* careful about calling this method, as it's 209 *trivial 210 * to end up with a completely useless supplier. 211 * 212 * \param locs - the vector of stream positions. 213 * 214 * Note that this can be used not only to make reading selected molecules 215 *from a 216 * large SD file much faster, but it can also allow subsetting an SD file or 217 * rearranging the order of the molecules. 218 */ 219 void setStreamIndices(const std::vector<std::streampos> &locs); 220 221 private: 222 void checkForEnd(); 223 void setDataCommon(const std::string &text, bool sanitize, bool removeHs); 224 int d_len = 0; // total number of mol blocks in the file (initialized to -1) 225 int d_last = 0; // the molecule we are ready to read 226 std::vector<std::streampos> d_molpos; 227 }; 228 229 //! lazy file parser for Smiles tables 230 class RDKIT_FILEPARSERS_EXPORT SmilesMolSupplier : public MolSupplier { 231 /************************************************************************** 232 * Lazy file parser for Smiles table file, similar to the lazy SD 233 * file parser above 234 * - As an when new molecules are read using "next" their 235 * positions in the file are noted. 236 * - A call to the "length" will autamatically parse the entire 237 * file and cache all the mol block positions 238 * - [] operator is used to access a molecule at "idx", calling 239 * next following this will result in the next molecule after 240 * "idx" 241 ***************************************************************************/ 242 public: 243 /*! 244 * \param fileName - the name of smiles table file 245 * \param delimiter - delimiting characters between records on a each 246 * line NOTE that this is not a string, the tokenizer looks for 247 * the individual characters in delimiter, not the full string 248 * itself. So the default delimiter: " \t", means " " or "\t". 249 * \param smilesColumn - column number for the SMILES string (defaults 250 * to the first column) 251 * \param nameColumn - column number for the molecule name (defaults to 252 * the second column) If set to -1 we assume that no name is 253 * available for the molecule and the name is defaulted to the 254 * smiles string 255 * \param titleLine - if true, the first line is assumed to list the 256 * names of properties in order separated by 'delimiter'. It is 257 * also assume that the 'SMILES' column and the 'name' column 258 * are not specified here if false - no title line is assumed 259 * and the properties are recorded as the "columnX" where "X" is 260 * the column number 261 * \param sanitize - if true sanitize the molecule before returning it 262 */ 263 explicit SmilesMolSupplier(const std::string &fileName, 264 const std::string &delimiter = " \t", 265 int smilesColumn = 0, int nameColumn = 1, 266 bool titleLine = true, bool sanitize = true); 267 SmilesMolSupplier(); 268 explicit SmilesMolSupplier(std::istream *inStream, bool takeOwnership = true, 269 const std::string &delimiter = " \t", 270 int smilesColumn = 0, int nameColumn = 1, 271 bool titleLine = true, bool sanitize = true); 272 ~SmilesMolSupplier()273 virtual ~SmilesMolSupplier() { close(); }; 274 void setData(const std::string &text, const std::string &delimiter = " ", 275 int smilesColumn = 0, int nameColumn = 1, bool titleLine = true, 276 bool sanitize = true); 277 void init(); 278 void reset(); 279 ROMol *next(); 280 bool atEnd(); 281 void moveTo(unsigned int idx); 282 ROMol *operator[](unsigned int idx); 283 /*! \brief returns the text block for a particular item 284 * 285 * \param idx - which item to return 286 */ 287 std::string getItemText(unsigned int idx); 288 unsigned int length(); 289 290 private: 291 ROMol *processLine(std::string inLine); 292 void processTitleLine(); 293 std::string nextLine(); 294 long int skipComments(); 295 void checkForEnd(); 296 297 bool df_end = false; // have we reached the end of the file? 298 int d_len = 0; // total number of smiles in the file 299 int d_next = 0; // the molecule we are ready to read 300 int d_line = 0; // line number we are currently on 301 std::vector<std::streampos> 302 d_molpos; // vector of positions in the file for molecules 303 std::vector<int> d_lineNums; 304 std::string d_delim; // the delimiter string 305 bool df_sanitize = true; // sanitize molecules before returning them? 306 STR_VECT d_props; // vector of property names 307 bool df_title = true; // do we have a title line? 308 int d_smi = 0; // column id for the smile string 309 int d_name = 1; // column id for the name 310 }; 311 312 //! lazy file parser for TDT files 313 class RDKIT_FILEPARSERS_EXPORT TDTMolSupplier : public MolSupplier { 314 /************************************************************************** 315 * Lazy file parser for TDT files, similar to the lazy SD 316 * file parser above 317 * - As an when new molecules are read using "next" their 318 * positions in the file are noted. 319 * - A call to the "length" will autamatically parse the entire 320 * file and cache all the mol block positions 321 * - [] operator is used to access a molecule at "idx", calling 322 * next following this will result in the next molecule after 323 * "idx" 324 ***************************************************************************/ 325 public: 326 /*! 327 * \param fileName - the name of the TDT file 328 * \param nameRecord - property name for the molecule name. 329 * If empty (the default), the name defaults to be empty 330 * \param confId2D - if >=0 and 2D coordinates are provided, the 2D 331 * structure (depiction) in the input will be read into the 332 * corresponding conformer id. 333 * \param confId3D - if >=0 and 3D coordinates are provided, the 3D 334 * structure (depiction) in the input will be read into the 335 * corresponding conformer id. 336 * \param sanitize - if true sanitize the molecule before returning it 337 */ 338 explicit TDTMolSupplier(const std::string &fileName, 339 const std::string &nameRecord = "", int confId2D = -1, 340 int confId3D = 0, bool sanitize = true); 341 explicit TDTMolSupplier(std::istream *inStream, bool takeOwnership = true, 342 const std::string &nameRecord = "", int confId2D = -1, 343 int confId3D = 0, bool sanitize = true); 344 TDTMolSupplier(); ~TDTMolSupplier()345 virtual ~TDTMolSupplier() { close(); }; 346 void setData(const std::string &text, const std::string &nameRecord = "", 347 int confId2D = -1, int confId3D = 0, bool sanitize = true); 348 void init(); 349 void reset(); 350 ROMol *next(); 351 bool atEnd(); 352 void moveTo(unsigned int idx); 353 ROMol *operator[](unsigned int idx); 354 /*! \brief returns the text block for a particular item 355 * 356 * \param idx - which item to return 357 */ 358 std::string getItemText(unsigned int idx); 359 unsigned int length(); 360 361 private: 362 bool advanceToNextRecord(); 363 void checkForEnd(); 364 ROMol *parseMol(std::string inLine); 365 366 bool df_end = false; // have we reached the end of the file? 367 int d_len = 0; // total number of mols in the file 368 int d_last = 0; // the molecule we are ready to read 369 int d_line = 0; // line number we are currently on 370 int d_confId2D = -1; // id to use for 2D conformers 371 int d_confId3D = 0; // id to use for 3D conformers 372 std::vector<std::streampos> 373 d_molpos; // vector of positions in the file for molecules 374 bool df_sanitize = true; // sanitize molecules before returning them? 375 std::string d_nameProp = 376 ""; // local storage for the property providing mol names 377 }; 378 379 //! lazy file parser for PDB files 380 class RDKIT_FILEPARSERS_EXPORT PDBMolSupplier : public MolSupplier { 381 public: 382 explicit PDBMolSupplier(std::istream *inStream, bool takeOwnership = true, 383 bool sanitize = true, bool removeHs = true, 384 unsigned int flavor = 0, 385 bool proximityBonding = true); 386 explicit PDBMolSupplier(const std::string &fname, bool sanitize = true, 387 bool removeHs = true, unsigned int flavor = 0, 388 bool proximityBonding = true); 389 ~PDBMolSupplier()390 virtual ~PDBMolSupplier() { close(); }; 391 392 virtual void init(); 393 virtual void reset(); 394 virtual ROMol *next(); 395 virtual bool atEnd(); 396 397 protected: 398 bool df_sanitize, df_removeHs, df_proximityBonding; 399 unsigned int d_flavor; 400 }; 401 #ifdef RDK_BUILD_MAEPARSER_SUPPORT 402 //! lazy file parser for MAE files 403 class RDKIT_FILEPARSERS_EXPORT MaeMolSupplier : public MolSupplier { 404 /** 405 * Due to maeparser's shared_ptr<istream> Reader interface, MaeMolSupplier 406 * always requires taking ownership of the istream ptr, as the shared ptr will 407 * always clear it upon destruction. 408 */ 409 410 public: MaeMolSupplier()411 MaeMolSupplier() { init(); }; 412 413 explicit MaeMolSupplier(std::shared_ptr<std::istream> inStream, 414 bool sanitize = true, bool removeHs = true); 415 416 explicit MaeMolSupplier(std::istream *inStream, bool takeOwnership = true, 417 bool sanitize = true, bool removeHs = true); 418 419 explicit MaeMolSupplier(const std::string &fname, bool sanitize = true, 420 bool removeHs = true); 421 ~MaeMolSupplier()422 virtual ~MaeMolSupplier(){}; 423 424 virtual void init(); 425 virtual void reset(); 426 virtual ROMol *next(); 427 virtual bool atEnd(); 428 close()429 virtual void close() { dp_sInStream.reset(); } 430 431 private: 432 void moveToNextBlock(); 433 434 protected: 435 bool df_sanitize, df_removeHs; 436 std::shared_ptr<schrodinger::mae::Reader> d_reader; 437 std::shared_ptr<schrodinger::mae::Block> d_next_struct; 438 std::shared_ptr<std::istream> dp_sInStream; 439 std::string d_stored_exc; 440 }; 441 #endif // RDK_BUILD_MAEPARSER_SUPPORT 442 } // namespace RDKit 443 444 #endif 445