1 //
2 //  Copyright (C) 2018 Pat Lorton
3 //
4 //   @@ All Rights Reserved @@
5 //  This file is part of the RDKit.
6 //  The contents are covered by the terms of the BSD license
7 //  which is included in the file license.txt, found at the root
8 //  of the RDKit source tree.
9 //
10 #include <iostream>
11 #include <fstream>
12 #include <map>
13 
14 #include <RDGeneral/BadFileException.h>
15 #include <RDGeneral/FileParseException.h>
16 #include <GraphMol/MolInterchange/details.h>
17 #include <GraphMol/MolOps.h>
18 #include <GraphMol/MonomerInfo.h>
19 #include <GraphMol/RWMol.h>
20 #include <GraphMol/FileParsers/MolSupplier.h>
21 #include <GraphMol/FileParsers/FileParserUtils.h>
22 
23 #include <boost/tokenizer.hpp>
24 
25 #include <maeparser/MaeConstants.hpp>
26 #include <maeparser/Reader.hpp>
27 
28 using namespace schrodinger;
29 using RDKit::MolInterchange::bolookup;
30 
31 namespace RDKit {
32 
33 namespace {
34 
35 const std::string PDB_ATOM_NAME = "s_m_pdb_atom_name";
36 const std::string PDB_RESIDUE_NAME = "s_m_pdb_residue_name";
37 const std::string PDB_CHAIN_NAME = "s_m_chain_name";
38 const std::string PDB_INSERTION_CODE = "s_m_insertion_code";
39 const std::string PDB_RESIDUE_NUMBER = "i_m_residue_number";
40 const std::string PDB_OCCUPANCY = "r_m_pdb_occupancy";
41 const std::string PDB_TFACTOR = "r_m_pdb_tfactor";
42 
43 class PDBInfo {
44  public:
PDBInfo(const mae::IndexedBlock & atom_block)45   PDBInfo(const mae::IndexedBlock &atom_block) {
46     try {
47       m_atom_name = atom_block.getStringProperty(PDB_ATOM_NAME);
48     } catch (std::out_of_range &) {
49     }
50 
51     try {
52       m_residue_name = atom_block.getStringProperty(PDB_RESIDUE_NAME);
53     } catch (std::out_of_range &) {
54     }
55 
56     try {
57       m_chain_id = atom_block.getStringProperty(PDB_CHAIN_NAME);
58     } catch (std::out_of_range &) {
59     }
60 
61     try {
62       m_insertion_code = atom_block.getStringProperty(PDB_INSERTION_CODE);
63     } catch (std::out_of_range &) {
64     }
65 
66     try {
67       m_resnum = atom_block.getIntProperty(PDB_RESIDUE_NUMBER);
68     } catch (std::out_of_range &) {
69     }
70 
71     try {
72       m_occupancy = atom_block.getRealProperty(PDB_OCCUPANCY);
73     } catch (std::out_of_range &) {
74     }
75 
76     try {
77       m_tempfac = atom_block.getRealProperty(PDB_TFACTOR);
78     } catch (std::out_of_range &) {
79     }
80   }
81 
addPDBData(Atom * atom,size_t atom_num)82   void addPDBData(Atom *atom, size_t atom_num) {
83     if (!m_atom_name || !m_atom_name->isDefined(atom_num)) {
84       return;  // Need a PDB atom name to populate info
85     }
86     AtomPDBResidueInfo *rd_info =
87         new AtomPDBResidueInfo(m_atom_name->at(atom_num));
88 
89     atom->setMonomerInfo(rd_info);
90 
91     if (m_residue_name && m_residue_name->isDefined(atom_num)) {
92       rd_info->setResidueName(m_residue_name->at(atom_num));
93     }
94 
95     if (m_chain_id && m_chain_id->isDefined(atom_num)) {
96       rd_info->setChainId(m_chain_id->at(atom_num));
97     }
98 
99     if (m_insertion_code && m_insertion_code->isDefined(atom_num)) {
100       rd_info->setInsertionCode(m_insertion_code->at(atom_num));
101     }
102 
103     if (m_resnum && m_resnum->isDefined(atom_num)) {
104       rd_info->setResidueNumber(m_resnum->at(atom_num));
105     }
106 
107     if (m_occupancy && m_occupancy->isDefined(atom_num)) {
108       rd_info->setOccupancy(m_occupancy->at(atom_num));
109     }
110 
111     if (m_tempfac && m_tempfac->isDefined(atom_num)) {
112       rd_info->setTempFactor(m_tempfac->at(atom_num));
113     }
114   }
115 
116  private:
117   std::shared_ptr<mae::IndexedStringProperty> m_atom_name;
118   std::shared_ptr<mae::IndexedStringProperty> m_residue_name;
119   std::shared_ptr<mae::IndexedStringProperty> m_chain_id;
120   std::shared_ptr<mae::IndexedStringProperty> m_insertion_code;
121 
122   std::shared_ptr<mae::IndexedIntProperty> m_resnum;
123 
124   std::shared_ptr<mae::IndexedRealProperty> m_occupancy;
125   std::shared_ptr<mae::IndexedRealProperty> m_tempfac;
126 };
127 
streamIsGoodOrExhausted(std::istream * stream)128 bool streamIsGoodOrExhausted(std::istream *stream) {
129   PRECONDITION(stream, "bad stream");
130   return stream->good() || (stream->eof() && stream->fail() && !stream->bad());
131 }
132 
parseChiralityLabel(RWMol & mol,const std::string & stereo_prop)133 void parseChiralityLabel(RWMol &mol, const std::string &stereo_prop) {
134   boost::char_separator<char> sep{"_"};
135   boost::tokenizer<boost::char_separator<char>> tokenizer{stereo_prop, sep};
136 
137   auto tItr = tokenizer.begin();
138 
139   const int chiral_idx = FileParserUtils::toInt(*tItr) - 1;
140   Atom *chiral_atom = mol.getAtomWithIdx(chiral_idx);
141   CHECK_INVARIANT(chiral_atom != nullptr, "bad prop value");
142 
143   unsigned nSwaps = 2;
144   const char rotation_direction = (++tItr)->back();
145   switch (rotation_direction) {
146     case 'R':  // R, ANR
147       nSwaps = 0;
148       break;
149     case 'S':  // S, ANS
150       nSwaps = 1;
151       break;
152     case '?':  // Undefined
153       return;
154     default:
155       break;
156   }
157   CHECK_INVARIANT(nSwaps < 2, "bad prop value");
158 
159   INT_LIST bond_indexes;
160   for (++tItr; tItr != tokenizer.end(); ++tItr) {
161     const int nbr_idx = FileParserUtils::toInt(*tItr) - 1;
162     const Bond *bnd = mol.getBondBetweenAtoms(chiral_idx, nbr_idx);
163     CHECK_INVARIANT(bnd, "bad chiral bond");
164     bond_indexes.push_back(bnd->getIdx());
165   }
166   CHECK_INVARIANT(bond_indexes.size() == chiral_atom->getDegree(),
167                   "bad prop value");
168 
169   nSwaps += chiral_atom->getPerturbationOrder(bond_indexes);
170   switch (nSwaps % 2) {
171     case 0:
172       chiral_atom->setChiralTag(Atom::CHI_TETRAHEDRAL_CW);
173       break;
174     case 1:
175       chiral_atom->setChiralTag(Atom::CHI_TETRAHEDRAL_CCW);
176       break;
177   }
178 }
179 
parseStereoBondLabel(RWMol & mol,const std::string & stereo_prop)180 void parseStereoBondLabel(RWMol &mol, const std::string &stereo_prop) {
181   boost::char_separator<char> sep{"_"};
182   boost::tokenizer<boost::char_separator<char>> tokenizer{stereo_prop, sep};
183 
184   Bond::BondStereo type = Bond::STEREONONE;
185   std::vector<int> atom_indexes;
186   for (const auto &t : tokenizer) {
187     if (t == "E") {
188       type = Bond::STEREOTRANS;
189     } else if (t == "Z") {
190       type = Bond::STEREOCIS;
191     } else {
192       // Atom indexes are 0-based in RDKit, and 1-based in Mae.
193       atom_indexes.push_back(FileParserUtils::toInt(t) - 1);
194     }
195   }
196   CHECK_INVARIANT(type != Bond::STEREONONE, "bad prop value");
197 
198   // We currently don't support allenes or allene-likes
199   if (atom_indexes.size() != 4) {
200     return;
201   }
202 
203   auto *bond = mol.getBondBetweenAtoms(atom_indexes[1], atom_indexes[2]);
204   CHECK_INVARIANT(bond, "bad stereo bond");
205   CHECK_INVARIANT(bond->getBondType() == Bond::DOUBLE, "bad stereo bond");
206 
207   bond->setStereoAtoms(atom_indexes[0], atom_indexes[3]);
208   bond->setStereo(type);
209 }
210 
211 //! Copy over the structure properties, including stereochemistry.
set_mol_properties(RWMol & mol,const mae::Block & ct_block)212 void set_mol_properties(RWMol &mol, const mae::Block &ct_block) {
213   for (const auto &prop : ct_block.getProperties<std::string>()) {
214     if (prop.first == mae::CT_TITLE) {
215       mol.setProp(common_properties::_Name, prop.second);
216     } else if (prop.first.find(mae::CT_CHIRALITY_PROP_PREFIX) == 0 ||
217                prop.first.find(mae::CT_PSEUDOCHIRALITY_PROP_PREFIX) == 0) {
218       parseChiralityLabel(mol, prop.second);
219     } else if (prop.first.find(mae::CT_EZ_PROP_PREFIX) == 0) {
220       parseStereoBondLabel(mol, prop.second);
221     } else {
222       mol.setProp(prop.first, prop.second);
223     }
224   }
225   for (const auto &prop : ct_block.getProperties<double>()) {
226     mol.setProp(prop.first, prop.second);
227   }
228   for (const auto &prop : ct_block.getProperties<int>()) {
229     mol.setProp(prop.first, prop.second);
230   }
231   for (const auto &prop : ct_block.getProperties<mae::BoolProperty>()) {
232     mol.setProp(prop.first, static_cast<bool>(prop.second));
233   }
234 }
235 
236 //! Set atom properties. Some of these have already been parsed to construct the
237 //! Atom object, and should be skipped. Also, atom properties may be undefined
238 //! for an atom, and should also be skipped for that atom.
set_atom_properties(Atom & atom,const mae::IndexedBlock & atom_block,size_t i)239 void set_atom_properties(Atom &atom, const mae::IndexedBlock &atom_block,
240                          size_t i) {
241   for (const auto &prop : atom_block.getProperties<std::string>()) {
242     if (prop.first == PDB_ATOM_NAME || prop.first == PDB_RESIDUE_NAME ||
243         prop.first == PDB_CHAIN_NAME || prop.first == PDB_INSERTION_CODE) {
244       // PDB information is parsed separately.
245       continue;
246     } else if (!prop.second->isDefined(i)) {
247       continue;
248     }
249 
250     atom.setProp(prop.first, prop.second->at(i));
251   }
252 
253   for (const auto &prop : atom_block.getProperties<double>()) {
254     if (prop.first == mae::ATOM_X_COORD || prop.first == mae::ATOM_Y_COORD ||
255         prop.first == mae::ATOM_Z_COORD) {
256       // Coordinates are used in defining a conformation, and should not be
257       // set on the atom.
258       continue;
259     } else if (prop.first == PDB_OCCUPANCY || prop.first == PDB_TFACTOR) {
260       // PDB information is parsed separately.
261       continue;
262     } else if (!prop.second->isDefined(i)) {
263       continue;
264     }
265 
266     atom.setProp(prop.first, prop.second->at(i));
267   }
268   for (const auto &prop : atom_block.getProperties<int>()) {
269     if (prop.first == mae::ATOM_ATOMIC_NUM) {
270       // Atomic number was already used in the creation of the atom
271       continue;
272     } else if (prop.first == PDB_RESIDUE_NUMBER) {
273       // PDB information is parsed separately.
274       continue;
275     } else if (!prop.second->isDefined(i)) {
276       continue;
277     }
278 
279     if (prop.first == mae::ATOM_FORMAL_CHARGE) {
280       // Formal charge has a specific setter
281       atom.setFormalCharge(prop.second->at(i));
282     } else {
283       atom.setProp(prop.first, prop.second->at(i));
284     }
285   }
286   for (const auto &prop : atom_block.getProperties<mae::BoolProperty>()) {
287     if (!prop.second->isDefined(i)) {
288       continue;
289     }
290 
291     atom.setProp(prop.first, static_cast<bool>(prop.second->at(i)));
292   }
293 }
294 
addAtoms(const mae::IndexedBlock & atom_block,RWMol & mol)295 void addAtoms(const mae::IndexedBlock &atom_block, RWMol &mol) {
296   // All atoms are guaranteed to have these three field names:
297   const auto atomic_numbers = atom_block.getIntProperty(mae::ATOM_ATOMIC_NUM);
298   const auto xs = atom_block.getRealProperty(mae::ATOM_X_COORD);
299   const auto ys = atom_block.getRealProperty(mae::ATOM_Y_COORD);
300   const auto zs = atom_block.getRealProperty(mae::ATOM_Z_COORD);
301 
302   // atomic numbers, and x, y, and z coordinates
303   const auto size = atomic_numbers->size();
304   auto conf = new RDKit::Conformer(size);
305   conf->set3D(true);
306   conf->setId(0);
307 
308   PDBInfo pdb_info(atom_block);
309 
310   for (size_t i = 0; i < size; ++i) {
311     Atom *atom = new Atom(atomic_numbers->at(i));
312     mol.addAtom(atom, true, true);
313 
314     pdb_info.addPDBData(atom, i);
315     set_atom_properties(*atom, atom_block, i);
316 
317     RDGeom::Point3D pos;
318     pos.x = xs->at(i);
319     pos.y = ys->at(i);
320     pos.z = zs->at(i);
321     conf->setAtomPos(i, pos);
322   }
323   mol.addConformer(conf, false);
324 }
325 
addBonds(const mae::IndexedBlock & bond_block,RWMol & mol)326 void addBonds(const mae::IndexedBlock &bond_block, RWMol &mol) {
327   // All bonds are guaranteed to have these three field names:
328   const auto from_atoms = bond_block.getIntProperty(mae::BOND_ATOM_1);
329   const auto to_atoms = bond_block.getIntProperty(mae::BOND_ATOM_2);
330   const auto orders = bond_block.getIntProperty(mae::BOND_ORDER);
331   const auto size = from_atoms->size();
332 
333   for (size_t i = 0; i < size; ++i) {
334     // Maestro atoms are 1 indexed!
335     const auto from_atom = from_atoms->at(i) - 1;
336     const auto to_atom = to_atoms->at(i) - 1;
337     const auto order = bolookup.find(orders->at(i))->second;
338     if (from_atom > to_atom) {
339       continue;  // Maestro files may double-list some bonds
340     }
341 
342     auto bond = new Bond(order);
343     bond->setOwningMol(mol);
344     bond->setBeginAtomIdx(from_atom);
345     bond->setEndAtomIdx(to_atom);
346     mol.addBond(bond, true);
347   }
348 }
349 
build_mol(RWMol & mol,mae::Block & structure_block,bool sanitize,bool removeHs)350 void build_mol(RWMol &mol, mae::Block &structure_block, bool sanitize,
351                bool removeHs) {
352   const auto &atom_block = structure_block.getIndexedBlock(mae::ATOM_BLOCK);
353   addAtoms(*atom_block, mol);
354 
355   const auto &bond_block = structure_block.getIndexedBlock(mae::BOND_BLOCK);
356   addBonds(*bond_block, mol);
357 
358   // These properties need to be set last, as stereochemistry is defined here,
359   // and it requires atoms and bonds to be available.
360   set_mol_properties(mol, structure_block);
361 
362   if (sanitize) {
363     if (removeHs) {
364       MolOps::removeHs(mol, false, false);
365     } else {
366       MolOps::sanitizeMol(mol);
367     }
368   } else {
369     // we need some properties for the chiral setup
370     mol.updatePropertyCache(false);
371   }
372 
373   // If there are 3D coordinates, try to read more chiralities from them, but do
374   // not override the ones that were read from properties
375   bool replaceExistingTags = false;
376   if (mol.getNumConformers() && mol.getConformer().is3D()) {
377     MolOps::assignChiralTypesFrom3D(mol, -1, replaceExistingTags);
378   }
379 
380   // Find more stereo bonds, assign labels, but don't replace the existing ones
381   MolOps::detectBondStereochemistry(mol, replaceExistingTags);
382   MolOps::assignStereochemistry(mol, replaceExistingTags);
383 }
384 
385 }  // namespace
386 
MaeMolSupplier(std::shared_ptr<std::istream> inStream,bool sanitize,bool removeHs)387 MaeMolSupplier::MaeMolSupplier(std::shared_ptr<std::istream> inStream,
388                                bool sanitize, bool removeHs) {
389   PRECONDITION(inStream, "bad stream");
390   dp_sInStream = inStream;
391   dp_inStream = inStream.get();
392   df_owner = true;
393   df_sanitize = sanitize;
394   df_removeHs = removeHs;
395 
396   d_reader.reset(new mae::Reader(dp_sInStream));
397   CHECK_INVARIANT(streamIsGoodOrExhausted(dp_inStream), "bad instream");
398 
399   try {
400     d_next_struct = d_reader->next(mae::CT_BLOCK);
401   } catch (const mae::read_exception &e) {
402     throw FileParseException(e.what());
403   }
404 }
405 
MaeMolSupplier(std::istream * inStream,bool takeOwnership,bool sanitize,bool removeHs)406 MaeMolSupplier::MaeMolSupplier(std::istream *inStream, bool takeOwnership,
407                                bool sanitize, bool removeHs) {
408   PRECONDITION(inStream, "bad stream");
409   PRECONDITION(takeOwnership, "takeOwnership is required for MaeMolSupplier");
410   dp_inStream = inStream;
411   dp_sInStream.reset(dp_inStream);
412   df_owner = takeOwnership;  // always true
413   df_sanitize = sanitize;
414   df_removeHs = removeHs;
415 
416   d_reader.reset(new mae::Reader(dp_sInStream));
417   CHECK_INVARIANT(streamIsGoodOrExhausted(dp_inStream), "bad instream");
418 
419   try {
420     d_next_struct = d_reader->next(mae::CT_BLOCK);
421   } catch (const mae::read_exception &e) {
422     throw FileParseException(e.what());
423   }
424 }
425 
MaeMolSupplier(const std::string & fileName,bool sanitize,bool removeHs)426 MaeMolSupplier::MaeMolSupplier(const std::string &fileName, bool sanitize,
427                                bool removeHs) {
428   df_owner = true;
429   dp_inStream = openAndCheckStream(fileName);
430   dp_sInStream.reset(dp_inStream);
431   df_sanitize = sanitize;
432   df_removeHs = removeHs;
433 
434   d_reader.reset(new mae::Reader(dp_sInStream));
435   CHECK_INVARIANT(streamIsGoodOrExhausted(dp_inStream), "bad instream");
436 
437   try {
438     d_next_struct = d_reader->next(mae::CT_BLOCK);
439   } catch (const mae::read_exception &e) {
440     throw FileParseException(e.what());
441   }
442 }
443 
init()444 void MaeMolSupplier::init() {}
reset()445 void MaeMolSupplier::reset() {}
446 
next()447 ROMol *MaeMolSupplier::next() {
448   PRECONDITION(dp_sInStream != nullptr, "no stream");
449   if (!d_stored_exc.empty()) {
450     throw FileParseException(d_stored_exc);
451   } else if (atEnd()) {
452     throw FileParseException("All structures read from Maestro file");
453   }
454 
455   auto mol = new RWMol;
456 
457   try {
458     build_mol(*mol, *d_next_struct, df_sanitize, df_removeHs);
459   } catch (...) {
460     delete mol;
461     moveToNextBlock();
462     throw;
463   }
464 
465   moveToNextBlock();
466 
467   return static_cast<ROMol *>(mol);
468 }
469 
moveToNextBlock()470 void MaeMolSupplier::moveToNextBlock() {
471   try {
472     d_next_struct = d_reader->next(mae::CT_BLOCK);
473   } catch (const mae::read_exception &e) {
474     d_stored_exc = e.what();
475   }
476 }
477 
atEnd()478 bool MaeMolSupplier::atEnd() { return d_next_struct == nullptr; }
479 }  // namespace RDKit