1 //
2 //
3 //  Copyright (C) 2018 Greg Landrum and T5 Informatics GmbH
4 //
5 //   @@ All Rights Reserved @@
6 //  This file is part of the RDKit.
7 //  The contents are covered by the terms of the BSD license
8 //  which is included in the file license.txt, found at the root
9 //  of the RDKit source tree.
10 //
11 
12 #include "catch.hpp"
13 
14 #include <GraphMol/RDKitBase.h>
15 #include <GraphMol/SubstanceGroup.h>
16 #include <GraphMol/Chirality.h>
17 #include <GraphMol/SmilesParse/SmilesParse.h>
18 #include "GraphMol/FileParsers/FileParsers.h"
19 
20 using namespace RDKit;
21 
22 /* Auxiliary functions */
testIdxVector(const std::vector<unsigned int> & groupVector,const std::vector<unsigned int> & reference)23 void testIdxVector(const std::vector<unsigned int> &groupVector,
24                    const std::vector<unsigned int> &reference) {
25   size_t vecSize = reference.size();
26   CHECK(groupVector.size() == vecSize);
27 
28   auto sgItr = groupVector.begin();
29   for (auto refItr = reference.begin(); refItr != reference.end();
30        ++sgItr, ++refItr) {
31     CHECK(1 + *sgItr == *refItr);
32   }
33 }
34 
testBrackets(const std::vector<SubstanceGroup::Bracket> & brackets,const std::vector<std::array<std::array<double,3>,3>> & reference)35 void testBrackets(
36     const std::vector<SubstanceGroup::Bracket> &brackets,
37     const std::vector<std::array<std::array<double, 3>, 3>> &reference) {
38   CHECK(brackets.size() == 2);
39   for (int i = 0; i < 2; ++i) {
40     for (int j = 0; j < 3; ++j) {
41       for (int k = 0; k < 3; ++k) {
42         CHECK(std::abs(brackets[i][j][k] - reference[i][j][k]) < 1.e-6);
43       }
44     }
45   }
46 }
47 
buildSampleMolecule()48 RWMol buildSampleMolecule() {
49   // This builds a RDKit::RWMol with all implemented SubstanceGroup features in
50   // order to test them. SubstanceGroups and features probably do not make any
51   // sense.
52 
53   //// Initialize Molecule ////
54   RWMol mol;
55 
56   // Add some atoms and bonds
57   for (unsigned i = 0; i < 6; ++i) {
58     mol.addAtom(new Atom(6), false, true);
59 
60     if (i > 0) {
61       mol.addBond(i - 1, i, Bond::SINGLE);
62     }
63   }
64 
65   //// First SubstanceGroup ////
66   {
67     SubstanceGroup sg(&mol, "MUL");
68 
69     sg.setProp("SUBTYPE", "BLO");
70     sg.setProp("MULT", "n");
71     sg.setProp("CONNECT", "HH");
72 
73     // Add some atoms and bonds
74     for (unsigned i = 0; i < 3; ++i) {
75       sg.addAtomWithIdx(i);
76       sg.addParentAtomWithIdx(i);
77       sg.addBondWithIdx(i);  // add 2 CBONDs + 1 XBOND
78     }
79 
80     sg.setProp("COMPNO", 7u);
81     sg.setProp("ESTATE", "E");
82 
83     SubstanceGroup::Bracket bracket1 = {{RDGeom::Point3D(1., 3., 0.),
84                                          RDGeom::Point3D(5., 7., 0.),
85                                          RDGeom::Point3D(0., 0., 0.)}};
86     sg.addBracket(bracket1);
87 
88     SubstanceGroup::Bracket bracket2 = {{RDGeom::Point3D(2., 4., 0.),
89                                          RDGeom::Point3D(6., 8., 0.),
90                                          RDGeom::Point3D(0., 0., 0.)}};
91     sg.addBracket(bracket2);
92 
93     // Vector should not be parsed (not a SUP group)
94     sg.addCState(2, RDGeom::Point3D());
95 
96     sg.setProp("CLASS", "TEST CLASS");
97 
98     sg.addAttachPoint(0, 0, "XX");
99 
100     sg.setProp("BRKTYP", "PAREN");
101 
102     addSubstanceGroup(mol, sg);
103   }
104   //// Second SubstanceGroup ////
105   {
106     SubstanceGroup sg(&mol, "SUP");
107 
108     // Add some atoms and bonds
109     for (unsigned i = 3; i < 6; ++i) {
110       sg.addAtomWithIdx(i);
111       sg.addParentAtomWithIdx(i);
112       sg.addBondWithIdx(i - 1);  // add 1 XBOND + 2 CBONDs
113     }
114 
115     sg.setProp("LABEL", "TEST LABEL");
116 
117     // V2000 has only x and y coords; z value restricted to 0.
118     RDGeom::Point3D vector(3., 4., 0.);
119     sg.addCState(2, vector);  // Vector should be parsed now!
120 
121     sg.addAttachPoint(3, -1, "YY");
122 
123     addSubstanceGroup(mol, sg);
124   }
125   //// Third SubstanceGroup ////
126   {
127     SubstanceGroup sg(&mol, "DAT");
128 
129     sg.setProp("FIELDNAME", "SAMPLE FIELD NAME");  // 30 char max
130     // Field Type is ignored in V3000
131     sg.setProp("FIELDINFO", "SAMPLE FIELD INFO");  // 20 char max
132     sg.setProp("QUERYTYPE", "PQ");                 // 2 char max
133     sg.setProp("QUERYOP", "SAMPLE QUERY OP");      // 15 char max (rest of line)
134 
135     // This should be properly formatted, but format is not checked
136     sg.setProp("FIELDDISP", "SAMPLE FIELD DISP");
137 
138     STR_VECT dataFields = {"SAMPLE DATA FIELD 1", "SAMPLE DATA FIELD 2",
139                            "SAMPLE DATA FIELD 3"};
140     sg.setProp("DATAFIELDS", dataFields);
141 
142     addSubstanceGroup(mol, sg);
143   }
144 
145   // Set a parent with higher index
146   const auto &sgroups = getSubstanceGroups(mol);
147   sgroups.at(0).setProp<unsigned int>("PARENT", 3);
148 
149   return mol;
150 }
151 
152 /* End Auxiliary functions */
153 
154 TEST_CASE("Basic Sgroup creation", "[Sgroups]") {
155   // Create two SubstanceGroups and add them to a molecule
156   RWMol mol;
157 
158   {
159     SubstanceGroup sg0(&mol, "DAT");
160     SubstanceGroup sg1(&mol, "SUP");
161     addSubstanceGroup(mol, sg0);
162     addSubstanceGroup(mol, sg1);
163   }
164 
165   const auto &sgroups = getSubstanceGroups(mol);
166   REQUIRE(sgroups.size() == 2);
167   CHECK(sgroups.at(0).getProp<std::string>("TYPE") == "DAT");
168   CHECK(sgroups.at(1).getProp<std::string>("TYPE") == "SUP");
169 }
170 
171 TEST_CASE("Build and test sample molecule", "[Sgroups]") {
172   auto mol = buildSampleMolecule();
173 
174   const auto &sgroups = getSubstanceGroups(mol);
175   CHECK(sgroups.size() == 3);
176 
177   SECTION("first sgroup") {
178     const auto &sg = sgroups.at(0);
179     CHECK(sg.getProp<std::string>("TYPE") == "MUL");
180 
181     CHECK(sg.getProp<std::string>("SUBTYPE") == "BLO");
182     CHECK(sg.getProp<std::string>("MULT") == "n");
183     CHECK(sg.getProp<std::string>("CONNECT") == "HH");
184 
185     std::vector<unsigned int> atoms_reference = {1, 2, 3};
186     auto atoms = sg.getAtoms();
187     testIdxVector(atoms, atoms_reference);
188 
189     std::vector<unsigned int> patoms_reference = {1, 2, 3};
190     testIdxVector(sg.getParentAtoms(), patoms_reference);
191 
192     std::vector<unsigned int> bonds_reference = {1, 2, 3};
193     auto bonds = sg.getBonds();
194 
195     // bonds are not sorted in V3000; sort them here
196     std::sort(bonds.begin(), bonds.end());
197 
198     testIdxVector(bonds, bonds_reference);
199 
200     CHECK(sg.getBondType(bonds[0]) == SubstanceGroup::BondType::CBOND);
201     CHECK(sg.getBondType(bonds[1]) == SubstanceGroup::BondType::CBOND);
202     CHECK(sg.getBondType(bonds[2]) == SubstanceGroup::BondType::XBOND);
203 
204     CHECK(sg.getProp<unsigned int>("COMPNO") == 7u);
205     CHECK(sg.getProp<std::string>("ESTATE") == "E");
206 
207     std::vector<std::array<std::array<double, 3>, 3>> brackets_reference = {
208         {{{{1., 3., 0.}}, {{5., 7., 0.}}, {{0., 0., 0.}}}},
209         {{{{2., 4., 0.}}, {{6., 8., 0.}}, {{0., 0., 0.}}}},
210     };
211     testBrackets(sg.getBrackets(), brackets_reference);
212 
213     auto cstates = sg.getCStates();
214     CHECK(cstates.size() == 1);
215     CHECK(cstates[0].bondIdx == bonds[2]);
216     CHECK(cstates[0].vector.x == 0.);
217     CHECK(cstates[0].vector.y == 0.);
218     CHECK(cstates[0].vector.z == 0.);
219 
220     CHECK(sg.getProp<std::string>("CLASS") == "TEST CLASS");
221 
222     auto ap = sg.getAttachPoints();
223     CHECK(ap.size() == 1);
224     CHECK(ap[0].aIdx == atoms[0]);
225     CHECK(ap[0].lvIdx == static_cast<int>(atoms[0]));
226     CHECK(ap[0].id == "XX");
227 
228     CHECK(sg.getProp<std::string>("BRKTYP") == "PAREN");
229 
230     CHECK(sg.getProp<unsigned int>("PARENT") == 3u);
231   }
232 
233   SECTION("second sgroup") {
234     const auto &sg = sgroups.at(1);
235     CHECK(sg.getProp<std::string>("TYPE") == "SUP");
236 
237     std::vector<unsigned int> atoms_reference = {4, 5, 6};
238     auto atoms = sg.getAtoms();
239     testIdxVector(atoms, atoms_reference);
240 
241     std::vector<unsigned int> patoms_reference = {4, 5, 6};
242     testIdxVector(sg.getParentAtoms(), patoms_reference);
243 
244     std::vector<unsigned int> bonds_reference = {3, 4, 5};
245     auto bonds = sg.getBonds();
246 
247     // bonds are not sorted in V3000; sort them here
248     std::sort(bonds.begin(), bonds.end());
249 
250     testIdxVector(bonds, bonds_reference);
251     CHECK(sg.getBondType(bonds[0]) == SubstanceGroup::BondType::XBOND);
252     CHECK(sg.getBondType(bonds[1]) == SubstanceGroup::BondType::CBOND);
253     CHECK(sg.getBondType(bonds[2]) == SubstanceGroup::BondType::CBOND);
254 
255     CHECK(sg.getProp<std::string>("LABEL") == "TEST LABEL");
256 
257     auto cstates = sg.getCStates();
258     CHECK(cstates.size() == 1);
259     CHECK(cstates[0].bondIdx == bonds[0]);
260     CHECK(cstates[0].vector.x == 3.);
261     CHECK(cstates[0].vector.y == 4.);
262     CHECK(cstates[0].vector.z == 0.);
263 
264     auto ap = sg.getAttachPoints();
265     CHECK(ap.size() == 1);
266     CHECK(ap[0].aIdx == atoms[0]);
267     CHECK(ap[0].lvIdx == -1);
268     CHECK(ap[0].id == "YY");
269   }
270 
271   SECTION("third sgroup") {
272     const auto &sg = sgroups.at(2);
273     CHECK(sg.getProp<std::string>("TYPE") == "DAT");
274 
275     CHECK(sg.getProp<std::string>("FIELDNAME") == "SAMPLE FIELD NAME");
276     CHECK(sg.getProp<std::string>("FIELDINFO") == "SAMPLE FIELD INFO");
277     CHECK(sg.getProp<std::string>("QUERYTYPE") == "PQ");
278     CHECK(sg.getProp<std::string>("QUERYOP") == "SAMPLE QUERY OP");
279 
280     CHECK(sg.getProp<std::string>("FIELDDISP") == "SAMPLE FIELD DISP");
281 
282     auto dataFields = sg.getProp<STR_VECT>("DATAFIELDS");
283     CHECK(dataFields.size() == 3);
284     CHECK(dataFields[0] == "SAMPLE DATA FIELD 1");
285     CHECK(dataFields[1] == "SAMPLE DATA FIELD 2");
286     CHECK(dataFields[2] == "SAMPLE DATA FIELD 3");
287   }
288 }
289 
290 TEST_CASE("Removing sgroups", "[Sgroups]") {
291   SECTION("basics") {
292     auto m1 = R"CTAB(
293   Mrv2014 07312005252D
294 
295   0  0  0     0  0            999 V3000
296 M  V30 BEGIN CTAB
297 M  V30 COUNTS 7 6 3 0 0
298 M  V30 BEGIN ATOM
299 M  V30 1 * -12.75 11.5 0 0
300 M  V30 2 O -11.4163 12.27 0 0
301 M  V30 3 C -10.0826 11.5 0 0
302 M  V30 4 C -8.749 12.27 0 0
303 M  V30 5 O -10.0826 9.96 0 0
304 M  V30 6 N -7.4153 11.5 0 0
305 M  V30 7 C -6.0816 12.27 0 0
306 M  V30 END ATOM
307 M  V30 BEGIN BOND
308 M  V30 1 1 1 2
309 M  V30 2 1 2 3
310 M  V30 3 1 3 4
311 M  V30 4 2 3 5
312 M  V30 5 1 4 6
313 M  V30 6 1 6 7
314 M  V30 END BOND
315 M  V30 BEGIN SGROUP
316 M  V30 1 SRU 0 ATOMS=(3 2 3 5) XBONDS=(2 1 3) BRKXYZ=(9 -9.9955 12.6173 0 -
317 M  V30 -9.0715 11.0169 0 0 0 0) BRKXYZ=(9 -11.5035 11.1527 0 -12.4275 12.7531 -
318 M  V30 0 0 0 0) CONNECT=HT LABEL=n
319 M  V30 2 DAT 0 ATOMS=(1 6) FIELDNAME=foo_data -
320 M  V30 FIELDDISP="   -7.4153   11.5000    DAU   ALL  0       0" -
321 M  V30 MRV_FIELDDISP=0 FIELDDATA=bar
322 M  V30 3 DAT 0 ATOMS=(1 7) FIELDNAME=bar_data -
323 M  V30 FIELDDISP="   -6.0816   12.2700    DAU   ALL  0       0" -
324 M  V30 MRV_FIELDDISP=0 FIELDDATA=baz
325 M  V30 END SGROUP
326 M  V30 END CTAB
327 M  END
328 )CTAB"_ctab;
329     REQUIRE(m1);
330     auto &sgs = getSubstanceGroups(*m1);
331     CHECK(sgs.size() == 3);
332     sgs.erase(++sgs.begin());
333     CHECK(sgs.size() == 2);
334     CHECK(getSubstanceGroups(*m1).size() == 2);
335     auto molb = MolToV3KMolBlock(*m1);
336     CHECK(molb.find("foo_data") == std::string::npos);
337     CHECK(molb.find("M  V30 2 DAT 0 ATOMS=(1 7) FIELDNAME=bar_data") !=
338           std::string::npos);
339   }
340 }
341 
342 TEST_CASE(
343     "Github #3315: SubstanceGroups should not be written with quotes around "
344     "missing fields",
345     "[Sgroups][bug]") {
346   SECTION("basics") {
347     auto m1 = R"CTAB(
348   Mrv2014 07312012022D
349 
350   2  1  0  0  0  0            999 V2000
351     1.4295    0.1449    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
352     1.4266   -0.6801    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
353   2  1  1  0  0  0  0
354 M  STY  1   1 DAT
355 M  SAL   1  1   1
356 M  SDT   1 test sgroup
357 M  SDD   1     0.5348   -0.3403    DRU   ALL  0       0
358 M  SED   1 sgroupval
359 M  END
360 )CTAB"_ctab;
361     REQUIRE(m1);
362     auto molb = MolToV3KMolBlock(*m1);
363     CHECK(molb.find("FIELDINFO") == std::string::npos);
364     CHECK(molb.find("QUERYTYPE") == std::string::npos);
365     CHECK(molb.find("QUERYOP") == std::string::npos);
366     CHECK(molb.find("FIELDNAME") != std::string::npos);
367   }
368 }
369 
370 TEST_CASE("Allow brackets, cstates, and attachment points to be removed",
371           "[Sgroups]") {
372   auto m1 = R"CTAB(example
373  -ISIS-  10171405052D
374 
375   0  0  0     0  0            999 V3000
376 M  V30 BEGIN CTAB
377 M  V30 COUNTS 14 15 1 0 0
378 M  V30 BEGIN ATOM
379 M  V30 1 C 6.4292 -1.1916 0 0 CFG=3
380 M  V30 2 C 7.0125 -0.6042 0 0
381 M  V30 3 N 6.4292 -0.0250999 0 0
382 M  V30 4 C 5.8416 -0.6042 0 0
383 M  V30 5 C 5.8416 -1.7708 0 0
384 M  V30 6 N 6.4292 -2.3584 0 0 CFG=3
385 M  V30 7 C 7.0125 -1.7708 0 0
386 M  V30 8 O 5.7166 -3.5875 0 0
387 M  V30 9 C 5.7166 -4.4125 0 0 CFG=3
388 M  V30 10 C 4.8875 -4.4125 0 0
389 M  V30 11 C 6.5376 -4.4166 0 0
390 M  V30 12 C 5.7166 -5.2376 0 0
391 M  V30 13 C 6.4292 -3.175 0 0
392 M  V30 14 O 7.1375 -3.5875 0 0
393 M  V30 END ATOM
394 M  V30 BEGIN BOND
395 M  V30 1 1 1 2
396 M  V30 2 1 2 3
397 M  V30 3 1 3 4
398 M  V30 4 1 4 1
399 M  V30 5 1 1 5
400 M  V30 6 1 5 6
401 M  V30 7 1 6 7
402 M  V30 8 1 7 1
403 M  V30 9 1 6 13
404 M  V30 10 1 8 9
405 M  V30 11 1 9 10
406 M  V30 12 1 9 11
407 M  V30 13 1 9 12
408 M  V30 14 2 13 14
409 M  V30 15 1 8 13
410 M  V30 END BOND
411 M  V30 BEGIN SGROUP
412 M  V30 1 SUP 0 ATOMS=(7 8 9 10 11 12 13 14) XBONDS=(1 9) BRKXYZ=(9 6.24 -2.9 0 -
413 M  V30 6.24 -2.9 0 0 0 0) CSTATE=(4 9 0 0.82 0) LABEL=Boc SAP=(3 13 6 1)
414 M  V30 END SGROUP
415 M  V30 END CTAB
416 M  END)CTAB"_ctab;
417 
418   REQUIRE(m1);
419 
420   SECTION("brackets") {
421     REQUIRE(m1);
422     auto &sgs = getSubstanceGroups(*m1);
423     REQUIRE(sgs.size() == 1);
424     CHECK(sgs[0].getBrackets().size() == 1);
425     sgs[0].clearBrackets();
426     CHECK(sgs[0].getBrackets().size() == 0);
427   }
428   SECTION("cstates") {
429     auto &sgs = getSubstanceGroups(*m1);
430     REQUIRE(sgs.size() == 1);
431     CHECK(sgs[0].getCStates().size() == 1);
432     sgs[0].clearCStates();
433     CHECK(sgs[0].getCStates().size() == 0);
434   }
435   SECTION("attachpts") {
436     auto &sgs = getSubstanceGroups(*m1);
437     REQUIRE(sgs.size() == 1);
438     CHECK(sgs[0].getAttachPoints().size() == 1);
439     sgs[0].clearAttachPoints();
440     CHECK(sgs[0].getAttachPoints().size() == 0);
441   }
442 }
443 
444 TEST_CASE(
445     "GitHub Issue #4434: v2000 SGroups do not generate an \"index\" property",
446     "[Sgroups][bug]") {
447   SECTION("basics") {
448     auto v2000_mol = R"CTAB(
449   MJ211300
450 
451   2  1  0  0  0  0  0  0  0  0999 V2000
452     1.4295    0.1449    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
453     1.4266   -0.6801    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
454   2  1  1  0  0  0  0
455 M  STY  2   1 DAT   2 DAT
456 M  SAL   1  1   1
457 M  SDT   1 test sgroup 1
458 M  SDD   1     0.0000    0.0000    DRU   ALL  0       0
459 M  SED   1 sgroupval1
460 M  SAL   2  1   2
461 M  SDT   2 test sgroup 2
462 M  SDD   2     0.0000    0.0000    DRU   ALL  0       0
463 M  SED   2 sgroupval2
464 M  END
465 )CTAB"_ctab;
466     auto v3000_mol = R"CTAB(
467   Mrv2113 08202115042D
468 
469   0  0  0     0  0            999 V3000
470 M  V30 BEGIN CTAB
471 M  V30 COUNTS 2 1 2 0 0
472 M  V30 BEGIN ATOM
473 M  V30 1 C 2.6684 0.2705 0 0
474 M  V30 2 C 2.663 -1.2695 0 0
475 M  V30 END ATOM
476 M  V30 BEGIN BOND
477 M  V30 1 1 2 1
478 M  V30 END BOND
479 M  V30 BEGIN SGROUP
480 M  V30 1 DAT 0 ATOMS=(1 1) FIELDNAME="test sgroup 1" -
481 M  V30 FIELDDISP="    0.0000    0.0000    DRU   ALL  0       0" -
482 M  V30 MRV_FIELDDISP=0 FIELDDATA=sgroupval1
483 M  V30 2 DAT 0 ATOMS=(1 2) FIELDNAME="test sgroup 2" -
484 M  V30 FIELDDISP="    0.0000    0.0000    DRU   ALL  0       0" -
485 M  V30 MRV_FIELDDISP=0 FIELDDATA=sgroupval2
486 M  V30 END SGROUP
487 M  V30 END CTAB
488 M  END
489 )CTAB"_ctab;
490 
491     REQUIRE(v2000_mol);
492     REQUIRE(v3000_mol);
493 
494     unsigned int index;
495     unsigned int count = 0;
496 
497     for (const auto &sg : getSubstanceGroups(*v2000_mol)) {
498       ++count;
499       TEST_ASSERT(sg.getPropIfPresent("index", index));
500       TEST_ASSERT(index == count);
501     }
502 
503     count = 0;
504     for (const auto &sg : getSubstanceGroups(*v3000_mol)) {
505       ++count;
506       TEST_ASSERT(sg.getPropIfPresent("index", index));
507       TEST_ASSERT(index == count);
508     }
509   }
510 }