1 //
2 // Copyright (C) 2018 Pat Lorton
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #include <iostream>
11 #include <fstream>
12 #include <map>
13
14 #include <RDGeneral/BadFileException.h>
15 #include <RDGeneral/FileParseException.h>
16 #include <GraphMol/MolInterchange/details.h>
17 #include <GraphMol/MolOps.h>
18 #include <GraphMol/MonomerInfo.h>
19 #include <GraphMol/RWMol.h>
20 #include <GraphMol/FileParsers/MolSupplier.h>
21 #include <GraphMol/FileParsers/FileParserUtils.h>
22
23 #include <boost/tokenizer.hpp>
24
25 #include <maeparser/MaeConstants.hpp>
26 #include <maeparser/Reader.hpp>
27
28 using namespace schrodinger;
29 using RDKit::MolInterchange::bolookup;
30
31 namespace RDKit {
32
33 namespace {
34
35 const std::string PDB_ATOM_NAME = "s_m_pdb_atom_name";
36 const std::string PDB_RESIDUE_NAME = "s_m_pdb_residue_name";
37 const std::string PDB_CHAIN_NAME = "s_m_chain_name";
38 const std::string PDB_INSERTION_CODE = "s_m_insertion_code";
39 const std::string PDB_RESIDUE_NUMBER = "i_m_residue_number";
40 const std::string PDB_OCCUPANCY = "r_m_pdb_occupancy";
41 const std::string PDB_TFACTOR = "r_m_pdb_tfactor";
42
43 class PDBInfo {
44 public:
PDBInfo(const mae::IndexedBlock & atom_block)45 PDBInfo(const mae::IndexedBlock &atom_block) {
46 try {
47 m_atom_name = atom_block.getStringProperty(PDB_ATOM_NAME);
48 } catch (std::out_of_range &) {
49 }
50
51 try {
52 m_residue_name = atom_block.getStringProperty(PDB_RESIDUE_NAME);
53 } catch (std::out_of_range &) {
54 }
55
56 try {
57 m_chain_id = atom_block.getStringProperty(PDB_CHAIN_NAME);
58 } catch (std::out_of_range &) {
59 }
60
61 try {
62 m_insertion_code = atom_block.getStringProperty(PDB_INSERTION_CODE);
63 } catch (std::out_of_range &) {
64 }
65
66 try {
67 m_resnum = atom_block.getIntProperty(PDB_RESIDUE_NUMBER);
68 } catch (std::out_of_range &) {
69 }
70
71 try {
72 m_occupancy = atom_block.getRealProperty(PDB_OCCUPANCY);
73 } catch (std::out_of_range &) {
74 }
75
76 try {
77 m_tempfac = atom_block.getRealProperty(PDB_TFACTOR);
78 } catch (std::out_of_range &) {
79 }
80 }
81
addPDBData(Atom * atom,size_t atom_num)82 void addPDBData(Atom *atom, size_t atom_num) {
83 if (!m_atom_name || !m_atom_name->isDefined(atom_num)) {
84 return; // Need a PDB atom name to populate info
85 }
86 AtomPDBResidueInfo *rd_info =
87 new AtomPDBResidueInfo(m_atom_name->at(atom_num));
88
89 atom->setMonomerInfo(rd_info);
90
91 if (m_residue_name && m_residue_name->isDefined(atom_num)) {
92 rd_info->setResidueName(m_residue_name->at(atom_num));
93 }
94
95 if (m_chain_id && m_chain_id->isDefined(atom_num)) {
96 rd_info->setChainId(m_chain_id->at(atom_num));
97 }
98
99 if (m_insertion_code && m_insertion_code->isDefined(atom_num)) {
100 rd_info->setInsertionCode(m_insertion_code->at(atom_num));
101 }
102
103 if (m_resnum && m_resnum->isDefined(atom_num)) {
104 rd_info->setResidueNumber(m_resnum->at(atom_num));
105 }
106
107 if (m_occupancy && m_occupancy->isDefined(atom_num)) {
108 rd_info->setOccupancy(m_occupancy->at(atom_num));
109 }
110
111 if (m_tempfac && m_tempfac->isDefined(atom_num)) {
112 rd_info->setTempFactor(m_tempfac->at(atom_num));
113 }
114 }
115
116 private:
117 std::shared_ptr<mae::IndexedStringProperty> m_atom_name;
118 std::shared_ptr<mae::IndexedStringProperty> m_residue_name;
119 std::shared_ptr<mae::IndexedStringProperty> m_chain_id;
120 std::shared_ptr<mae::IndexedStringProperty> m_insertion_code;
121
122 std::shared_ptr<mae::IndexedIntProperty> m_resnum;
123
124 std::shared_ptr<mae::IndexedRealProperty> m_occupancy;
125 std::shared_ptr<mae::IndexedRealProperty> m_tempfac;
126 };
127
streamIsGoodOrExhausted(std::istream * stream)128 bool streamIsGoodOrExhausted(std::istream *stream) {
129 PRECONDITION(stream, "bad stream");
130 return stream->good() || (stream->eof() && stream->fail() && !stream->bad());
131 }
132
parseChiralityLabel(RWMol & mol,const std::string & stereo_prop)133 void parseChiralityLabel(RWMol &mol, const std::string &stereo_prop) {
134 boost::char_separator<char> sep{"_"};
135 boost::tokenizer<boost::char_separator<char>> tokenizer{stereo_prop, sep};
136
137 auto tItr = tokenizer.begin();
138
139 const int chiral_idx = FileParserUtils::toInt(*tItr) - 1;
140 Atom *chiral_atom = mol.getAtomWithIdx(chiral_idx);
141 CHECK_INVARIANT(chiral_atom != nullptr, "bad prop value");
142
143 unsigned nSwaps = 2;
144 const char rotation_direction = (++tItr)->back();
145 switch (rotation_direction) {
146 case 'R': // R, ANR
147 nSwaps = 0;
148 break;
149 case 'S': // S, ANS
150 nSwaps = 1;
151 break;
152 case '?': // Undefined
153 return;
154 default:
155 break;
156 }
157 CHECK_INVARIANT(nSwaps < 2, "bad prop value");
158
159 INT_LIST bond_indexes;
160 for (++tItr; tItr != tokenizer.end(); ++tItr) {
161 const int nbr_idx = FileParserUtils::toInt(*tItr) - 1;
162 const Bond *bnd = mol.getBondBetweenAtoms(chiral_idx, nbr_idx);
163 CHECK_INVARIANT(bnd, "bad chiral bond");
164 bond_indexes.push_back(bnd->getIdx());
165 }
166 CHECK_INVARIANT(bond_indexes.size() == chiral_atom->getDegree(),
167 "bad prop value");
168
169 nSwaps += chiral_atom->getPerturbationOrder(bond_indexes);
170 switch (nSwaps % 2) {
171 case 0:
172 chiral_atom->setChiralTag(Atom::CHI_TETRAHEDRAL_CW);
173 break;
174 case 1:
175 chiral_atom->setChiralTag(Atom::CHI_TETRAHEDRAL_CCW);
176 break;
177 }
178 }
179
parseStereoBondLabel(RWMol & mol,const std::string & stereo_prop)180 void parseStereoBondLabel(RWMol &mol, const std::string &stereo_prop) {
181 boost::char_separator<char> sep{"_"};
182 boost::tokenizer<boost::char_separator<char>> tokenizer{stereo_prop, sep};
183
184 Bond::BondStereo type = Bond::STEREONONE;
185 std::vector<int> atom_indexes;
186 for (const auto &t : tokenizer) {
187 if (t == "E") {
188 type = Bond::STEREOTRANS;
189 } else if (t == "Z") {
190 type = Bond::STEREOCIS;
191 } else {
192 // Atom indexes are 0-based in RDKit, and 1-based in Mae.
193 atom_indexes.push_back(FileParserUtils::toInt(t) - 1);
194 }
195 }
196 CHECK_INVARIANT(type != Bond::STEREONONE, "bad prop value");
197
198 // We currently don't support allenes or allene-likes
199 if (atom_indexes.size() != 4) {
200 return;
201 }
202
203 auto *bond = mol.getBondBetweenAtoms(atom_indexes[1], atom_indexes[2]);
204 CHECK_INVARIANT(bond, "bad stereo bond");
205 CHECK_INVARIANT(bond->getBondType() == Bond::DOUBLE, "bad stereo bond");
206
207 bond->setStereoAtoms(atom_indexes[0], atom_indexes[3]);
208 bond->setStereo(type);
209 }
210
211 //! Copy over the structure properties, including stereochemistry.
set_mol_properties(RWMol & mol,const mae::Block & ct_block)212 void set_mol_properties(RWMol &mol, const mae::Block &ct_block) {
213 for (const auto &prop : ct_block.getProperties<std::string>()) {
214 if (prop.first == mae::CT_TITLE) {
215 mol.setProp(common_properties::_Name, prop.second);
216 } else if (prop.first.find(mae::CT_CHIRALITY_PROP_PREFIX) == 0 ||
217 prop.first.find(mae::CT_PSEUDOCHIRALITY_PROP_PREFIX) == 0) {
218 parseChiralityLabel(mol, prop.second);
219 } else if (prop.first.find(mae::CT_EZ_PROP_PREFIX) == 0) {
220 parseStereoBondLabel(mol, prop.second);
221 } else {
222 mol.setProp(prop.first, prop.second);
223 }
224 }
225 for (const auto &prop : ct_block.getProperties<double>()) {
226 mol.setProp(prop.first, prop.second);
227 }
228 for (const auto &prop : ct_block.getProperties<int>()) {
229 mol.setProp(prop.first, prop.second);
230 }
231 for (const auto &prop : ct_block.getProperties<mae::BoolProperty>()) {
232 mol.setProp(prop.first, static_cast<bool>(prop.second));
233 }
234 }
235
236 //! Set atom properties. Some of these have already been parsed to construct the
237 //! Atom object, and should be skipped. Also, atom properties may be undefined
238 //! for an atom, and should also be skipped for that atom.
set_atom_properties(Atom & atom,const mae::IndexedBlock & atom_block,size_t i)239 void set_atom_properties(Atom &atom, const mae::IndexedBlock &atom_block,
240 size_t i) {
241 for (const auto &prop : atom_block.getProperties<std::string>()) {
242 if (prop.first == PDB_ATOM_NAME || prop.first == PDB_RESIDUE_NAME ||
243 prop.first == PDB_CHAIN_NAME || prop.first == PDB_INSERTION_CODE) {
244 // PDB information is parsed separately.
245 continue;
246 } else if (!prop.second->isDefined(i)) {
247 continue;
248 }
249
250 atom.setProp(prop.first, prop.second->at(i));
251 }
252
253 for (const auto &prop : atom_block.getProperties<double>()) {
254 if (prop.first == mae::ATOM_X_COORD || prop.first == mae::ATOM_Y_COORD ||
255 prop.first == mae::ATOM_Z_COORD) {
256 // Coordinates are used in defining a conformation, and should not be
257 // set on the atom.
258 continue;
259 } else if (prop.first == PDB_OCCUPANCY || prop.first == PDB_TFACTOR) {
260 // PDB information is parsed separately.
261 continue;
262 } else if (!prop.second->isDefined(i)) {
263 continue;
264 }
265
266 atom.setProp(prop.first, prop.second->at(i));
267 }
268 for (const auto &prop : atom_block.getProperties<int>()) {
269 if (prop.first == mae::ATOM_ATOMIC_NUM) {
270 // Atomic number was already used in the creation of the atom
271 continue;
272 } else if (prop.first == PDB_RESIDUE_NUMBER) {
273 // PDB information is parsed separately.
274 continue;
275 } else if (!prop.second->isDefined(i)) {
276 continue;
277 }
278
279 if (prop.first == mae::ATOM_FORMAL_CHARGE) {
280 // Formal charge has a specific setter
281 atom.setFormalCharge(prop.second->at(i));
282 } else {
283 atom.setProp(prop.first, prop.second->at(i));
284 }
285 }
286 for (const auto &prop : atom_block.getProperties<mae::BoolProperty>()) {
287 if (!prop.second->isDefined(i)) {
288 continue;
289 }
290
291 atom.setProp(prop.first, static_cast<bool>(prop.second->at(i)));
292 }
293 }
294
addAtoms(const mae::IndexedBlock & atom_block,RWMol & mol)295 void addAtoms(const mae::IndexedBlock &atom_block, RWMol &mol) {
296 // All atoms are guaranteed to have these three field names:
297 const auto atomic_numbers = atom_block.getIntProperty(mae::ATOM_ATOMIC_NUM);
298 const auto xs = atom_block.getRealProperty(mae::ATOM_X_COORD);
299 const auto ys = atom_block.getRealProperty(mae::ATOM_Y_COORD);
300 const auto zs = atom_block.getRealProperty(mae::ATOM_Z_COORD);
301
302 // atomic numbers, and x, y, and z coordinates
303 const auto size = atomic_numbers->size();
304 auto conf = new RDKit::Conformer(size);
305 conf->set3D(true);
306 conf->setId(0);
307
308 PDBInfo pdb_info(atom_block);
309
310 for (size_t i = 0; i < size; ++i) {
311 Atom *atom = new Atom(atomic_numbers->at(i));
312 mol.addAtom(atom, true, true);
313
314 pdb_info.addPDBData(atom, i);
315 set_atom_properties(*atom, atom_block, i);
316
317 RDGeom::Point3D pos;
318 pos.x = xs->at(i);
319 pos.y = ys->at(i);
320 pos.z = zs->at(i);
321 conf->setAtomPos(i, pos);
322 }
323 mol.addConformer(conf, false);
324 }
325
addBonds(const mae::IndexedBlock & bond_block,RWMol & mol)326 void addBonds(const mae::IndexedBlock &bond_block, RWMol &mol) {
327 // All bonds are guaranteed to have these three field names:
328 const auto from_atoms = bond_block.getIntProperty(mae::BOND_ATOM_1);
329 const auto to_atoms = bond_block.getIntProperty(mae::BOND_ATOM_2);
330 const auto orders = bond_block.getIntProperty(mae::BOND_ORDER);
331 const auto size = from_atoms->size();
332
333 for (size_t i = 0; i < size; ++i) {
334 // Maestro atoms are 1 indexed!
335 const auto from_atom = from_atoms->at(i) - 1;
336 const auto to_atom = to_atoms->at(i) - 1;
337 const auto order = bolookup.find(orders->at(i))->second;
338 if (from_atom > to_atom) {
339 continue; // Maestro files may double-list some bonds
340 }
341
342 auto bond = new Bond(order);
343 bond->setOwningMol(mol);
344 bond->setBeginAtomIdx(from_atom);
345 bond->setEndAtomIdx(to_atom);
346 mol.addBond(bond, true);
347 }
348 }
349
build_mol(RWMol & mol,mae::Block & structure_block,bool sanitize,bool removeHs)350 void build_mol(RWMol &mol, mae::Block &structure_block, bool sanitize,
351 bool removeHs) {
352 const auto &atom_block = structure_block.getIndexedBlock(mae::ATOM_BLOCK);
353 addAtoms(*atom_block, mol);
354
355 const auto &bond_block = structure_block.getIndexedBlock(mae::BOND_BLOCK);
356 addBonds(*bond_block, mol);
357
358 // These properties need to be set last, as stereochemistry is defined here,
359 // and it requires atoms and bonds to be available.
360 set_mol_properties(mol, structure_block);
361
362 if (sanitize) {
363 if (removeHs) {
364 MolOps::removeHs(mol, false, false);
365 } else {
366 MolOps::sanitizeMol(mol);
367 }
368 } else {
369 // we need some properties for the chiral setup
370 mol.updatePropertyCache(false);
371 }
372
373 // If there are 3D coordinates, try to read more chiralities from them, but do
374 // not override the ones that were read from properties
375 bool replaceExistingTags = false;
376 if (mol.getNumConformers() && mol.getConformer().is3D()) {
377 MolOps::assignChiralTypesFrom3D(mol, -1, replaceExistingTags);
378 }
379
380 // Find more stereo bonds, assign labels, but don't replace the existing ones
381 MolOps::detectBondStereochemistry(mol, replaceExistingTags);
382 MolOps::assignStereochemistry(mol, replaceExistingTags);
383 }
384
385 } // namespace
386
MaeMolSupplier(std::shared_ptr<std::istream> inStream,bool sanitize,bool removeHs)387 MaeMolSupplier::MaeMolSupplier(std::shared_ptr<std::istream> inStream,
388 bool sanitize, bool removeHs) {
389 PRECONDITION(inStream, "bad stream");
390 dp_sInStream = inStream;
391 dp_inStream = inStream.get();
392 df_owner = true;
393 df_sanitize = sanitize;
394 df_removeHs = removeHs;
395
396 d_reader.reset(new mae::Reader(dp_sInStream));
397 CHECK_INVARIANT(streamIsGoodOrExhausted(dp_inStream), "bad instream");
398
399 try {
400 d_next_struct = d_reader->next(mae::CT_BLOCK);
401 } catch (const mae::read_exception &e) {
402 throw FileParseException(e.what());
403 }
404 }
405
MaeMolSupplier(std::istream * inStream,bool takeOwnership,bool sanitize,bool removeHs)406 MaeMolSupplier::MaeMolSupplier(std::istream *inStream, bool takeOwnership,
407 bool sanitize, bool removeHs) {
408 PRECONDITION(inStream, "bad stream");
409 PRECONDITION(takeOwnership, "takeOwnership is required for MaeMolSupplier");
410 dp_inStream = inStream;
411 dp_sInStream.reset(dp_inStream);
412 df_owner = takeOwnership; // always true
413 df_sanitize = sanitize;
414 df_removeHs = removeHs;
415
416 d_reader.reset(new mae::Reader(dp_sInStream));
417 CHECK_INVARIANT(streamIsGoodOrExhausted(dp_inStream), "bad instream");
418
419 try {
420 d_next_struct = d_reader->next(mae::CT_BLOCK);
421 } catch (const mae::read_exception &e) {
422 throw FileParseException(e.what());
423 }
424 }
425
MaeMolSupplier(const std::string & fileName,bool sanitize,bool removeHs)426 MaeMolSupplier::MaeMolSupplier(const std::string &fileName, bool sanitize,
427 bool removeHs) {
428 df_owner = true;
429 dp_inStream = openAndCheckStream(fileName);
430 dp_sInStream.reset(dp_inStream);
431 df_sanitize = sanitize;
432 df_removeHs = removeHs;
433
434 d_reader.reset(new mae::Reader(dp_sInStream));
435 CHECK_INVARIANT(streamIsGoodOrExhausted(dp_inStream), "bad instream");
436
437 try {
438 d_next_struct = d_reader->next(mae::CT_BLOCK);
439 } catch (const mae::read_exception &e) {
440 throw FileParseException(e.what());
441 }
442 }
443
init()444 void MaeMolSupplier::init() {}
reset()445 void MaeMolSupplier::reset() {}
446
next()447 ROMol *MaeMolSupplier::next() {
448 PRECONDITION(dp_sInStream != nullptr, "no stream");
449 if (!d_stored_exc.empty()) {
450 throw FileParseException(d_stored_exc);
451 } else if (atEnd()) {
452 throw FileParseException("All structures read from Maestro file");
453 }
454
455 auto mol = new RWMol;
456
457 try {
458 build_mol(*mol, *d_next_struct, df_sanitize, df_removeHs);
459 } catch (...) {
460 delete mol;
461 moveToNextBlock();
462 throw;
463 }
464
465 moveToNextBlock();
466
467 return static_cast<ROMol *>(mol);
468 }
469
moveToNextBlock()470 void MaeMolSupplier::moveToNextBlock() {
471 try {
472 d_next_struct = d_reader->next(mae::CT_BLOCK);
473 } catch (const mae::read_exception &e) {
474 d_stored_exc = e.what();
475 }
476 }
477
atEnd()478 bool MaeMolSupplier::atEnd() { return d_next_struct == nullptr; }
479 } // namespace RDKit