1 //
2 // Copyright (C) 2005-2020 Greg Landrum and Rational Discovery LLC
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #include <RDGeneral/BoostStartInclude.h>
11 #include <boost/tokenizer.hpp>
12 #include <boost/algorithm/string.hpp>
13 #include <boost/lexical_cast.hpp>
14 #include <RDGeneral/BoostEndInclude.h>
15
16 #include <RDGeneral/BadFileException.h>
17 #include <RDGeneral/FileParseException.h>
18 #include <RDGeneral/RDLog.h>
19 #include "MolSupplier.h"
20 #include "FileParsers.h"
21 #include <GraphMol/SmilesParse/SmilesParse.h>
22 #include <RDGeneral/LocaleSwitcher.h>
23
#include <fstream>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
28
29 namespace RDKit {
30 namespace TDTParseUtils {
31 typedef boost::tokenizer<boost::escaped_list_separator<char>> CommaTokenizer;
32
33 /*
34 * if inStream is valid, we'll allow the numbers to be broken across multiple
35 * lines.
36 *
37 * This will throw a boost::bad_lexical_cast exception if it hits a bogus number
38 *
39 */
40 template <typename T>
ParseNumberList(std::string inLine,std::vector<T> & res,std::istream * inStream=nullptr)41 void ParseNumberList(std::string inLine, std::vector<T> &res,
42 std::istream *inStream = nullptr) {
43 bool foundEnd = false;
44 while (!foundEnd) {
45 CommaTokenizer commaTok(inLine);
46 for (CommaTokenizer::const_iterator commaTokIt = commaTok.begin();
47 commaTokIt != commaTok.end(); commaTokIt++) {
48 std::string number = *commaTokIt;
49 bool atEnd = number.find(";>") != std::string::npos;
50 boost::trim_if(number, boost::is_any_of(" \r\n\t;>"));
51 if (number != "" && !atEnd) {
52 res.push_back(boost::lexical_cast<T>(number));
53 } else if (atEnd) {
54 // that's it, we're done:
55 foundEnd = true;
56 break;
57 }
58 }
59 if (foundEnd || !inStream || inStream->eof()) {
60 break;
61 } else {
62 std::getline(*inStream, inLine);
63 }
64 }
65 if (!foundEnd) {
66 throw FileParseException("no end tag found for numeric list");
67 }
68 }
69
70 } // end of namespace TDTParseUtils
71
TDTMolSupplier()72 TDTMolSupplier::TDTMolSupplier() { init(); }
73
TDTMolSupplier(const std::string & fileName,const std::string & nameRecord,int confId2D,int confId3D,bool sanitize)74 TDTMolSupplier::TDTMolSupplier(const std::string &fileName,
75 const std::string &nameRecord, int confId2D,
76 int confId3D, bool sanitize) {
77 init();
78 d_confId2D = confId2D;
79 d_confId3D = confId3D;
80 d_nameProp = nameRecord;
81 dp_inStream = openAndCheckStream(fileName);
82 df_owner = true;
83
84 this->advanceToNextRecord();
85 d_molpos.push_back(dp_inStream->tellg());
86 df_sanitize = sanitize;
87 this->checkForEnd();
88 }
89
TDTMolSupplier(std::istream * inStream,bool takeOwnership,const std::string & nameRecord,int confId2D,int confId3D,bool sanitize)90 TDTMolSupplier::TDTMolSupplier(std::istream *inStream, bool takeOwnership,
91 const std::string &nameRecord, int confId2D,
92 int confId3D, bool sanitize) {
93 CHECK_INVARIANT(inStream, "bad instream");
94 CHECK_INVARIANT(!(inStream->eof()), "early EOF");
95 init();
96 dp_inStream = inStream;
97 df_owner = takeOwnership;
98 d_confId2D = confId2D;
99 d_confId3D = confId3D;
100 d_nameProp = nameRecord;
101 this->advanceToNextRecord();
102 d_molpos.push_back(dp_inStream->tellg());
103 df_sanitize = sanitize;
104 this->checkForEnd();
105 }
106
init()107 void TDTMolSupplier::init() {
108 dp_inStream = nullptr;
109 df_owner = false;
110 df_end = false;
111 d_len = -1;
112 d_last = 0;
113 d_line = 0;
114 }
115
setData(const std::string & text,const std::string & nameRecord,int confId2D,int confId3D,bool sanitize)116 void TDTMolSupplier::setData(const std::string &text,
117 const std::string &nameRecord, int confId2D,
118 int confId3D, bool sanitize) {
119 if (dp_inStream && df_owner) {
120 delete dp_inStream;
121 }
122 init();
123 d_confId2D = confId2D;
124 d_confId3D = confId3D;
125 d_nameProp = nameRecord;
126 std::istream *tmpStream = nullptr;
127 tmpStream = static_cast<std::istream *>(
128 new std::istringstream(text, std::ios_base::binary));
129 dp_inStream = tmpStream;
130 df_owner = true;
131 this->advanceToNextRecord();
132 d_molpos.push_back(dp_inStream->tellg());
133 df_sanitize = sanitize;
134 this->checkForEnd();
135 POSTCONDITION(dp_inStream, "bad instream");
136 }
137
advanceToNextRecord()138 bool TDTMolSupplier::advanceToNextRecord() {
139 PRECONDITION(dp_inStream, "no stream");
140 std::streampos pos;
141 bool res = false;
142 while (1) {
143 if (dp_inStream->eof() || dp_inStream->bad()) {
144 return false;
145 }
146 pos = dp_inStream->tellg();
147 std::string inL;
148 std::getline(*dp_inStream, inL);
149 if (inL.find("$SMI<") == 0) {
150 res = true;
151 break;
152 }
153 }
154 dp_inStream->clear();
155 dp_inStream->seekg(pos);
156 return res;
157 }
158
checkForEnd()159 void TDTMolSupplier::checkForEnd() {
160 PRECONDITION(dp_inStream, "no stream");
161 if (dp_inStream->eof() || dp_inStream->bad()) {
162 df_end = true;
163 // the -1 here is because by the time we get here we've already pushed on
164 // the
165 // position of the next line:
166 d_len = d_molpos.size() - 1;
167 return;
168 }
169
170 // we are not at the end of file, but check for blank lines:
171 std::string tempStr;
172 std::getline(*dp_inStream, tempStr);
173
174 boost::trim_left_if(tempStr, boost::is_any_of(std::string(" \t\r\n")));
175
176 if (tempStr.length() == 0) {
177 df_end = true;
178 // the -1 here is because by the time we get here we've already pushed on
179 // the
180 // position of the next line:
181 d_len = d_molpos.size() - 1;
182 }
183 return;
184 }
185
reset()186 void TDTMolSupplier::reset() {
187 PRECONDITION(dp_inStream, "no stream");
188 dp_inStream->clear();
189
190 dp_inStream->seekg(0, std::ios::beg);
191 df_end = false;
192 d_last = 0;
193 d_line = 0;
194 }
195
parseMol(std::string inLine)196 ROMol *TDTMolSupplier::parseMol(std::string inLine) {
197 PRECONDITION(dp_inStream, "no stream");
198 Utils::LocaleSwitcher ls;
199 std::size_t startP = inLine.find("<");
200 std::size_t endP = inLine.find_last_of(">");
201 std::string smiles = inLine.substr(startP + 1, endP - startP - 1);
202 ROMol *res = SmilesToMol(smiles, 0, df_sanitize);
203
204 if (res && res->getNumAtoms() > 0) {
205 // -----------
206 // Process the properties:
207 d_line++;
208 std::getline(*dp_inStream, inLine);
209 while (!dp_inStream->eof() && !dp_inStream->fail() &&
210 inLine.find("|") != 0) {
211 endP = inLine.find("<");
212 std::string propName = inLine.substr(0, endP);
213 boost::trim_if(propName, boost::is_any_of(" \t"));
214 startP = endP + 1;
215
216 if (propName == common_properties::TWOD && d_confId2D >= 0) {
217 std::string rest = inLine.substr(startP, inLine.size() - startP);
218 std::vector<double> coords;
219 TDTParseUtils::ParseNumberList(rest, coords, dp_inStream);
220 auto *conf = new Conformer(res->getNumAtoms());
221 conf->setId(d_confId2D);
222 conf->set3D(false);
223 for (unsigned int atIdx = 0; atIdx < res->getNumAtoms(); atIdx++) {
224 if (2 * atIdx + 1 < coords.size()) {
225 conf->setAtomPos(
226 atIdx,
227 RDGeom::Point3D(coords[2 * atIdx], coords[2 * atIdx + 1], 0.0));
228 } else {
229 // we're going to let this slide... but maybe we should do something
230 // else?
231 }
232 }
233 res->addConformer(conf, false);
234 } else if (propName == "3D" && d_confId3D >= 0) {
235 std::string rest = inLine.substr(startP, inLine.size() - startP);
236 std::vector<double> coords;
237 TDTParseUtils::ParseNumberList(rest, coords, dp_inStream);
238 auto *conf = new Conformer(res->getNumAtoms());
239 conf->setId(d_confId3D);
240 conf->set3D(true);
241 for (unsigned int atIdx = 0; atIdx < res->getNumAtoms(); atIdx++) {
242 if (3 * atIdx + 2 < coords.size()) {
243 conf->setAtomPos(
244 atIdx, RDGeom::Point3D(coords[3 * atIdx], coords[3 * atIdx + 1],
245 coords[3 * atIdx + 2]));
246 } else {
247 // we're going to let this slide... but maybe we should do something
248 // else?
249 }
250 }
251 res->addConformer(conf, false);
252 } else {
253 endP = inLine.find_last_of(">");
254 if (endP == std::string::npos) {
255 std::ostringstream errout;
256 errout << "no end tag found for property" << propName;
257 throw FileParseException(errout.str());
258 } else {
259 std::string propVal = inLine.substr(startP, endP - startP);
260 res->setProp(propName, propVal);
261 if (propName == d_nameProp) {
262 res->setProp(common_properties::_Name, propVal);
263 }
264 }
265 }
266 std::getline(*dp_inStream, inLine);
267 }
268 }
269
270 return res;
271 }
272
next()273 ROMol *TDTMolSupplier::next() {
274 PRECONDITION(dp_inStream, "no stream");
275 // set the stream to the appropriate position
276 dp_inStream->seekg(d_molpos[d_last]);
277
278 std::string tempStr;
279 ROMol *res = nullptr;
280 // finally if we reached the end of the file set end to be true
281 if (dp_inStream->eof()) {
282 // FIX: we should probably be throwing an exception here
283 df_end = true;
284 d_len = d_molpos.size();
285 return res;
286 }
287
288 // start by finding the $SMI element (we're assuming that this starts the
289 // block)
290 std::string tempp;
291 d_line++;
292 std::getline(*dp_inStream, tempp);
293 while (tempp.find("$SMI<") != 0 && !dp_inStream->eof() &&
294 !dp_inStream->fail()) {
295 d_line++;
296 std::getline(*dp_inStream, tempp);
297 }
298 if (tempp.find("$SMI<") == 0) {
299 try {
300 res = parseMol(tempp);
301 } catch (MolSanitizeException &se) {
302 // We couldn't sanitize a molecule we got - write out an error message and
303 // move to
304 BOOST_LOG(rdErrorLog)
305 << "ERROR: Could not sanitize molecule ending on line " << d_line
306 << std::endl;
307 BOOST_LOG(rdErrorLog) << "ERROR: " << se.what() << "\n";
308 while (!dp_inStream->eof() && !dp_inStream->fail() &&
309 tempStr.find("|") != 0) {
310 d_line++;
311 std::getline(*dp_inStream, tempStr);
312 }
313 }
314 }
315 d_last++;
316 if (d_last >= static_cast<int>(d_molpos.size())) {
317 d_molpos.push_back(dp_inStream->tellg());
318 }
319 this->checkForEnd();
320 return res;
321 }
322
getItemText(unsigned int idx)323 std::string TDTMolSupplier::getItemText(unsigned int idx) {
324 PRECONDITION(dp_inStream, "no stream");
325 unsigned int holder = d_last;
326 moveTo(idx);
327 std::streampos begP = d_molpos[idx];
328 bool endHolder = df_end;
329 std::streampos endP;
330 try {
331 moveTo(idx + 1);
332 endP = d_molpos[idx + 1];
333 } catch (FileParseException &) {
334 dp_inStream->clear();
335 dp_inStream->seekg(0, std::ios_base::end);
336 endP = dp_inStream->tellg();
337 }
338 d_last = holder;
339 df_end = endHolder;
340 auto *buff = new char[endP - begP];
341 dp_inStream->seekg(begP);
342 dp_inStream->read(buff, endP - begP);
343 std::string res(buff, endP - begP);
344 delete[] buff;
345 return res;
346 }
347
moveTo(unsigned int idx)348 void TDTMolSupplier::moveTo(unsigned int idx) {
349 PRECONDITION(dp_inStream, "no stream");
350
351 // dp_inStream->seekg() is called for all idx values
352 // and earlier calls to next() may have put the stream into a bad state
353 dp_inStream->clear();
354
355 // move until we hit the desired idx
356 if (idx < d_molpos.size()) {
357 dp_inStream->seekg(d_molpos[idx]);
358 d_last = idx;
359 } else {
360 std::string tempStr;
361 d_last = d_molpos.size() - 1;
362 dp_inStream->seekg(d_molpos.back());
363 while (d_last < static_cast<int>(idx) && !dp_inStream->eof() &&
364 !dp_inStream->fail()) {
365 d_line++;
366 std::getline(*dp_inStream, tempStr);
367
368 if (tempStr.find("|") == 0) {
369 d_molpos.push_back(dp_inStream->tellg());
370 d_last++;
371 }
372 }
373 // if we reached end of file without reaching "idx" we have an index error
374 if (dp_inStream->eof()) {
375 d_len = d_molpos.size();
376 std::ostringstream errout;
377 errout << "ERROR: Index error (idx = " << idx << ") : "
378 << " we do no have enough molecule blocks";
379 throw FileParseException(errout.str());
380 }
381 }
382 }
383
operator [](unsigned int idx)384 ROMol *TDTMolSupplier::operator[](unsigned int idx) {
385 PRECONDITION(dp_inStream, "no stream");
386 // get the molecule with index idx
387 moveTo(idx);
388 return next();
389 }
390
length()391 unsigned int TDTMolSupplier::length() {
392 PRECONDITION(dp_inStream, "no stream");
393 // return the number of mol blocks in the sdfile
394 if (d_len > 0) {
395 return d_len;
396 } else {
397 std::string tempStr;
398 d_len = d_molpos.size();
399 dp_inStream->seekg(d_molpos.back());
400 std::string inL;
401 std::getline(*dp_inStream, inL);
402 while (this->advanceToNextRecord()) {
403 d_molpos.push_back(dp_inStream->tellg());
404 d_len++;
405 std::getline(*dp_inStream, inL);
406 }
407 // now remember to set the stream to the last position we want to read
408 dp_inStream->clear();
409 dp_inStream->seekg(d_molpos[d_last]);
410 return d_len;
411 }
412 }
413
atEnd()414 bool TDTMolSupplier::atEnd() {
415 PRECONDITION(dp_inStream, "no stream");
416 return df_end;
417 }
418 } // namespace RDKit
419