1 //
2 //  Copyright (C) 2005-2020 Greg Landrum and Rational Discovery LLC
3 //
4 //   @@ All Rights Reserved @@
5 //  This file is part of the RDKit.
6 //  The contents are covered by the terms of the BSD license
7 //  which is included in the file license.txt, found at the root
8 //  of the RDKit source tree.
9 //
10 #include <RDGeneral/BoostStartInclude.h>
11 #include <boost/tokenizer.hpp>
12 #include <boost/algorithm/string.hpp>
13 #include <boost/lexical_cast.hpp>
14 #include <RDGeneral/BoostEndInclude.h>
15 
16 #include <RDGeneral/BadFileException.h>
17 #include <RDGeneral/FileParseException.h>
18 #include <RDGeneral/RDLog.h>
19 #include "MolSupplier.h"
20 #include "FileParsers.h"
21 #include <GraphMol/SmilesParse/SmilesParse.h>
22 #include <RDGeneral/LocaleSwitcher.h>
23 
#include <fstream>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
28 
29 namespace RDKit {
30 namespace TDTParseUtils {
31 typedef boost::tokenizer<boost::escaped_list_separator<char>> CommaTokenizer;
32 
33 /*
34  * if inStream is valid, we'll allow the numbers to be broken across multiple
35  * lines.
36  *
37  * This will throw a boost::bad_lexical_cast exception if it hits a bogus number
38  *
39  */
40 template <typename T>
ParseNumberList(std::string inLine,std::vector<T> & res,std::istream * inStream=nullptr)41 void ParseNumberList(std::string inLine, std::vector<T> &res,
42                      std::istream *inStream = nullptr) {
43   bool foundEnd = false;
44   while (!foundEnd) {
45     CommaTokenizer commaTok(inLine);
46     for (CommaTokenizer::const_iterator commaTokIt = commaTok.begin();
47          commaTokIt != commaTok.end(); commaTokIt++) {
48       std::string number = *commaTokIt;
49       bool atEnd = number.find(";>") != std::string::npos;
50       boost::trim_if(number, boost::is_any_of(" \r\n\t;>"));
51       if (number != "" && !atEnd) {
52         res.push_back(boost::lexical_cast<T>(number));
53       } else if (atEnd) {
54         // that's it, we're done:
55         foundEnd = true;
56         break;
57       }
58     }
59     if (foundEnd || !inStream || inStream->eof()) {
60       break;
61     } else {
62       std::getline(*inStream, inLine);
63     }
64   }
65   if (!foundEnd) {
66     throw FileParseException("no end tag found for numeric list");
67   }
68 }
69 
70 }  // end of namespace TDTParseUtils
71 
TDTMolSupplier()72 TDTMolSupplier::TDTMolSupplier() { init(); }
73 
TDTMolSupplier(const std::string & fileName,const std::string & nameRecord,int confId2D,int confId3D,bool sanitize)74 TDTMolSupplier::TDTMolSupplier(const std::string &fileName,
75                                const std::string &nameRecord, int confId2D,
76                                int confId3D, bool sanitize) {
77   init();
78   d_confId2D = confId2D;
79   d_confId3D = confId3D;
80   d_nameProp = nameRecord;
81   dp_inStream = openAndCheckStream(fileName);
82   df_owner = true;
83 
84   this->advanceToNextRecord();
85   d_molpos.push_back(dp_inStream->tellg());
86   df_sanitize = sanitize;
87   this->checkForEnd();
88 }
89 
TDTMolSupplier(std::istream * inStream,bool takeOwnership,const std::string & nameRecord,int confId2D,int confId3D,bool sanitize)90 TDTMolSupplier::TDTMolSupplier(std::istream *inStream, bool takeOwnership,
91                                const std::string &nameRecord, int confId2D,
92                                int confId3D, bool sanitize) {
93   CHECK_INVARIANT(inStream, "bad instream");
94   CHECK_INVARIANT(!(inStream->eof()), "early EOF");
95   init();
96   dp_inStream = inStream;
97   df_owner = takeOwnership;
98   d_confId2D = confId2D;
99   d_confId3D = confId3D;
100   d_nameProp = nameRecord;
101   this->advanceToNextRecord();
102   d_molpos.push_back(dp_inStream->tellg());
103   df_sanitize = sanitize;
104   this->checkForEnd();
105 }
106 
init()107 void TDTMolSupplier::init() {
108   dp_inStream = nullptr;
109   df_owner = false;
110   df_end = false;
111   d_len = -1;
112   d_last = 0;
113   d_line = 0;
114 }
115 
setData(const std::string & text,const std::string & nameRecord,int confId2D,int confId3D,bool sanitize)116 void TDTMolSupplier::setData(const std::string &text,
117                              const std::string &nameRecord, int confId2D,
118                              int confId3D, bool sanitize) {
119   if (dp_inStream && df_owner) {
120     delete dp_inStream;
121   }
122   init();
123   d_confId2D = confId2D;
124   d_confId3D = confId3D;
125   d_nameProp = nameRecord;
126   std::istream *tmpStream = nullptr;
127   tmpStream = static_cast<std::istream *>(
128       new std::istringstream(text, std::ios_base::binary));
129   dp_inStream = tmpStream;
130   df_owner = true;
131   this->advanceToNextRecord();
132   d_molpos.push_back(dp_inStream->tellg());
133   df_sanitize = sanitize;
134   this->checkForEnd();
135   POSTCONDITION(dp_inStream, "bad instream");
136 }
137 
advanceToNextRecord()138 bool TDTMolSupplier::advanceToNextRecord() {
139   PRECONDITION(dp_inStream, "no stream");
140   std::streampos pos;
141   bool res = false;
142   while (1) {
143     if (dp_inStream->eof() || dp_inStream->bad()) {
144       return false;
145     }
146     pos = dp_inStream->tellg();
147     std::string inL;
148     std::getline(*dp_inStream, inL);
149     if (inL.find("$SMI<") == 0) {
150       res = true;
151       break;
152     }
153   }
154   dp_inStream->clear();
155   dp_inStream->seekg(pos);
156   return res;
157 }
158 
checkForEnd()159 void TDTMolSupplier::checkForEnd() {
160   PRECONDITION(dp_inStream, "no stream");
161   if (dp_inStream->eof() || dp_inStream->bad()) {
162     df_end = true;
163     // the -1 here is because by the time we get here we've already pushed on
164     // the
165     // position of the next line:
166     d_len = d_molpos.size() - 1;
167     return;
168   }
169 
170   // we are not at the end of file, but check for blank lines:
171   std::string tempStr;
172   std::getline(*dp_inStream, tempStr);
173 
174   boost::trim_left_if(tempStr, boost::is_any_of(std::string(" \t\r\n")));
175 
176   if (tempStr.length() == 0) {
177     df_end = true;
178     // the -1 here is because by the time we get here we've already pushed on
179     // the
180     // position of the next line:
181     d_len = d_molpos.size() - 1;
182   }
183   return;
184 }
185 
reset()186 void TDTMolSupplier::reset() {
187   PRECONDITION(dp_inStream, "no stream");
188   dp_inStream->clear();
189 
190   dp_inStream->seekg(0, std::ios::beg);
191   df_end = false;
192   d_last = 0;
193   d_line = 0;
194 }
195 
parseMol(std::string inLine)196 ROMol *TDTMolSupplier::parseMol(std::string inLine) {
197   PRECONDITION(dp_inStream, "no stream");
198   Utils::LocaleSwitcher ls;
199   std::size_t startP = inLine.find("<");
200   std::size_t endP = inLine.find_last_of(">");
201   std::string smiles = inLine.substr(startP + 1, endP - startP - 1);
202   ROMol *res = SmilesToMol(smiles, 0, df_sanitize);
203 
204   if (res && res->getNumAtoms() > 0) {
205     // -----------
206     //   Process the properties:
207     d_line++;
208     std::getline(*dp_inStream, inLine);
209     while (!dp_inStream->eof() && !dp_inStream->fail() &&
210            inLine.find("|") != 0) {
211       endP = inLine.find("<");
212       std::string propName = inLine.substr(0, endP);
213       boost::trim_if(propName, boost::is_any_of(" \t"));
214       startP = endP + 1;
215 
216       if (propName == common_properties::TWOD && d_confId2D >= 0) {
217         std::string rest = inLine.substr(startP, inLine.size() - startP);
218         std::vector<double> coords;
219         TDTParseUtils::ParseNumberList(rest, coords, dp_inStream);
220         auto *conf = new Conformer(res->getNumAtoms());
221         conf->setId(d_confId2D);
222         conf->set3D(false);
223         for (unsigned int atIdx = 0; atIdx < res->getNumAtoms(); atIdx++) {
224           if (2 * atIdx + 1 < coords.size()) {
225             conf->setAtomPos(
226                 atIdx,
227                 RDGeom::Point3D(coords[2 * atIdx], coords[2 * atIdx + 1], 0.0));
228           } else {
229             // we're going to let this slide... but maybe we should do something
230             // else?
231           }
232         }
233         res->addConformer(conf, false);
234       } else if (propName == "3D" && d_confId3D >= 0) {
235         std::string rest = inLine.substr(startP, inLine.size() - startP);
236         std::vector<double> coords;
237         TDTParseUtils::ParseNumberList(rest, coords, dp_inStream);
238         auto *conf = new Conformer(res->getNumAtoms());
239         conf->setId(d_confId3D);
240         conf->set3D(true);
241         for (unsigned int atIdx = 0; atIdx < res->getNumAtoms(); atIdx++) {
242           if (3 * atIdx + 2 < coords.size()) {
243             conf->setAtomPos(
244                 atIdx, RDGeom::Point3D(coords[3 * atIdx], coords[3 * atIdx + 1],
245                                        coords[3 * atIdx + 2]));
246           } else {
247             // we're going to let this slide... but maybe we should do something
248             // else?
249           }
250         }
251         res->addConformer(conf, false);
252       } else {
253         endP = inLine.find_last_of(">");
254         if (endP == std::string::npos) {
255           std::ostringstream errout;
256           errout << "no end tag found for property" << propName;
257           throw FileParseException(errout.str());
258         } else {
259           std::string propVal = inLine.substr(startP, endP - startP);
260           res->setProp(propName, propVal);
261           if (propName == d_nameProp) {
262             res->setProp(common_properties::_Name, propVal);
263           }
264         }
265       }
266       std::getline(*dp_inStream, inLine);
267     }
268   }
269 
270   return res;
271 }
272 
next()273 ROMol *TDTMolSupplier::next() {
274   PRECONDITION(dp_inStream, "no stream");
275   // set the stream to the appropriate position
276   dp_inStream->seekg(d_molpos[d_last]);
277 
278   std::string tempStr;
279   ROMol *res = nullptr;
280   // finally if we reached the end of the file set end to be true
281   if (dp_inStream->eof()) {
282     // FIX: we should probably be throwing an exception here
283     df_end = true;
284     d_len = d_molpos.size();
285     return res;
286   }
287 
288   // start by finding the $SMI element (we're assuming that this starts the
289   // block)
290   std::string tempp;
291   d_line++;
292   std::getline(*dp_inStream, tempp);
293   while (tempp.find("$SMI<") != 0 && !dp_inStream->eof() &&
294          !dp_inStream->fail()) {
295     d_line++;
296     std::getline(*dp_inStream, tempp);
297   }
298   if (tempp.find("$SMI<") == 0) {
299     try {
300       res = parseMol(tempp);
301     } catch (MolSanitizeException &se) {
302       // We couldn't sanitize a molecule we got - write out an error message and
303       // move to
304       BOOST_LOG(rdErrorLog)
305           << "ERROR: Could not sanitize molecule ending on line " << d_line
306           << std::endl;
307       BOOST_LOG(rdErrorLog) << "ERROR: " << se.what() << "\n";
308       while (!dp_inStream->eof() && !dp_inStream->fail() &&
309              tempStr.find("|") != 0) {
310         d_line++;
311         std::getline(*dp_inStream, tempStr);
312       }
313     }
314   }
315   d_last++;
316   if (d_last >= static_cast<int>(d_molpos.size())) {
317     d_molpos.push_back(dp_inStream->tellg());
318   }
319   this->checkForEnd();
320   return res;
321 }
322 
getItemText(unsigned int idx)323 std::string TDTMolSupplier::getItemText(unsigned int idx) {
324   PRECONDITION(dp_inStream, "no stream");
325   unsigned int holder = d_last;
326   moveTo(idx);
327   std::streampos begP = d_molpos[idx];
328   bool endHolder = df_end;
329   std::streampos endP;
330   try {
331     moveTo(idx + 1);
332     endP = d_molpos[idx + 1];
333   } catch (FileParseException &) {
334     dp_inStream->clear();
335     dp_inStream->seekg(0, std::ios_base::end);
336     endP = dp_inStream->tellg();
337   }
338   d_last = holder;
339   df_end = endHolder;
340   auto *buff = new char[endP - begP];
341   dp_inStream->seekg(begP);
342   dp_inStream->read(buff, endP - begP);
343   std::string res(buff, endP - begP);
344   delete[] buff;
345   return res;
346 }
347 
moveTo(unsigned int idx)348 void TDTMolSupplier::moveTo(unsigned int idx) {
349   PRECONDITION(dp_inStream, "no stream");
350 
351   // dp_inStream->seekg() is called for all idx values
352   // and earlier calls to next() may have put the stream into a bad state
353   dp_inStream->clear();
354 
355   // move until we hit the desired idx
356   if (idx < d_molpos.size()) {
357     dp_inStream->seekg(d_molpos[idx]);
358     d_last = idx;
359   } else {
360     std::string tempStr;
361     d_last = d_molpos.size() - 1;
362     dp_inStream->seekg(d_molpos.back());
363     while (d_last < static_cast<int>(idx) && !dp_inStream->eof() &&
364            !dp_inStream->fail()) {
365       d_line++;
366       std::getline(*dp_inStream, tempStr);
367 
368       if (tempStr.find("|") == 0) {
369         d_molpos.push_back(dp_inStream->tellg());
370         d_last++;
371       }
372     }
373     // if we reached end of file without reaching "idx" we have an index error
374     if (dp_inStream->eof()) {
375       d_len = d_molpos.size();
376       std::ostringstream errout;
377       errout << "ERROR: Index error (idx = " << idx << ") : "
378              << " we do no have enough molecule blocks";
379       throw FileParseException(errout.str());
380     }
381   }
382 }
383 
operator [](unsigned int idx)384 ROMol *TDTMolSupplier::operator[](unsigned int idx) {
385   PRECONDITION(dp_inStream, "no stream");
386   // get the molecule with index idx
387   moveTo(idx);
388   return next();
389 }
390 
length()391 unsigned int TDTMolSupplier::length() {
392   PRECONDITION(dp_inStream, "no stream");
393   // return the number of mol blocks in the sdfile
394   if (d_len > 0) {
395     return d_len;
396   } else {
397     std::string tempStr;
398     d_len = d_molpos.size();
399     dp_inStream->seekg(d_molpos.back());
400     std::string inL;
401     std::getline(*dp_inStream, inL);
402     while (this->advanceToNextRecord()) {
403       d_molpos.push_back(dp_inStream->tellg());
404       d_len++;
405       std::getline(*dp_inStream, inL);
406     }
407     // now remember to set the stream to the last position we want to read
408     dp_inStream->clear();
409     dp_inStream->seekg(d_molpos[d_last]);
410     return d_len;
411   }
412 }
413 
atEnd()414 bool TDTMolSupplier::atEnd() {
415   PRECONDITION(dp_inStream, "no stream");
416   return df_end;
417 }
418 }  // namespace RDKit
419