package uk.ac.cam.ch.wwmm.opsin;

import static uk.ac.cam.ch.wwmm.opsin.XmlDeclarations.*;

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import uk.ac.cam.ch.wwmm.opsin.Bond.SMILES_BOND_DIRECTION;
import uk.ac.cam.ch.wwmm.opsin.BondStereo.BondStereoValue;

/** A builder for fragments specified as SMILES. A slightly custom SMILES dialect is used.
 * It includes all common features of SMILES and a few useful extensions:
 * | is used within a square bracketed element to directly set valency e.g. [P|5]. This is the same as using the lambda convention
 * sb/te are allowed (aromatic antimony/tellurium):
 * H? e.g. [SeH?] is used to indicate that the atom should use the default valency. It is equivalent to not using square brackets for organic atoms
 *
 * Allowed:
 * Organic elements B,C,N,O,P,S,F,Cl,Br,I (square brackets not required)
 * Aromatic elements c,n,o,p,s (square brackets not required) si,as,se,sb,te (square brackets required) Note that the inclusion of si/sb/te is an unofficial extension
 * =, # for bond orders
 * . for disconnection
 * (, ) for branching
 * [, ] for placing inorganic elements within and specifying charge. Allowed: [Al+3] or [Al+++] (the sign must precede the digits)
 * 0123456789 - ring closures
 * %10 %99 - more ring closures (%100 is ring closure %10 and 0 as in normal SMILES)
 * / and \ to set double bond stereochemistry to cis/trans
 * {@code @} and {@code @@} to set tetrahedral stereochemistry as in SMILES.
 * Hx where x is a digit is used to sort of set the hydrogen. In actuality the valency of the atom is derived and a valency hint added to the atom
 * This valency hint is the minimum valency that atom may be in. H? as an extension gives you the lowest acceptable valency.
 * |3 |5 etc. can be used to set the valency of an atom e.g. [Se|2]
 *
 * Also, a -, = or # at the start of the string indicates that the group attaches to its parent group via a single, double or triple bond.
 *
 * A -,=,# on the end indicates that in the absence of locants, other groups attach to
 * *it* via the atom at the end of the string, not at the start of the string with -,=,# meaning single,double or triple bond
 * This behaviour is overridden for certain suffixes to give different meanings to the atom the -,=,# is referring to
 *
 * @author ptc24
 * @author dl387
 *
 */
class SMILESFragmentBuilder {

    /**A "struct" to hold information on the parsing stack
     *
     * @author ptc24
     *
     */
    private static class StackFrame {
        /**The Atom currently under consideration.*/
        Atom atom;

        /**The order of the bond about to be formed.*/
        int bondOrder;

        /**Whether the bond is a \ or / bond for use in determining cis/trans.*/
        SMILES_BOND_DIRECTION slash = null;

        /**The index of a dummy atom in the atom's stereochemistry atomrefs4*/
        Integer indexOfDummyAtom = null;

        /**Creates a stack frame with given parameters.
         *
         * @param a An atom or null
         * @param bondOrderVal The value for bondOrder.
         */
        StackFrame(Atom a, int bondOrderVal) {
            atom = a;
            bondOrder = bondOrderVal;
        }

        /**Creates a copy of an existing StackFrame.
         * Only the atom and bond order are copied; slash and indexOfDummyAtom start out null.
         *
         * @param sf The stackframe to copy.
         */
        StackFrame(StackFrame sf) {
            atom = sf.atom;
            bondOrder = sf.bondOrder;
        }
    }

    /**Ring opening dummy atom, used as a placeholder in stereochemistry atomrefs4*/
    private static final Atom ringOpeningDummyAtom = new Atom(ChemEl.R);

    /**Organic Atoms.*/
    private static final Set<String> organicAtoms = new HashSet<String>();
    /**Aromatic Atoms.*/
    private static final Set<String> aromaticAtoms = new HashSet<String>();

    static {
        organicAtoms.add("B");
        organicAtoms.add("C");
        organicAtoms.add("N");
        organicAtoms.add("O");
        organicAtoms.add("P");
        organicAtoms.add("S");
        organicAtoms.add("F");
        organicAtoms.add("Cl");
        organicAtoms.add("Br");
        organicAtoms.add("I");

        aromaticAtoms.add("c");
        aromaticAtoms.add("n");
        aromaticAtoms.add("o");
        aromaticAtoms.add("p");
        aromaticAtoms.add("s");
        aromaticAtoms.add("si");
        aromaticAtoms.add("as");
        aromaticAtoms.add("se");
        aromaticAtoms.add("sb");
        aromaticAtoms.add("te");
    }

    private final IDManager idManager;

    SMILESFragmentBuilder(IDManager idManager) {
        this.idManager = idManager;
    }

    /**
     * Holds the state of a single parse: the branch stack, the currently open ring closures
     * and the cursor {@code i} into the SMILES string.
     */
    private class ParserInstance {
        private final Deque<StackFrame> stack = new ArrayDeque<StackFrame>();
        private final Map<String, StackFrame> ringClosures = new HashMap<String, StackFrame>();

        private final String smiles;
        private final int endOfSmiles;
        private final Fragment fragment;

        /**Index of the character of {@link #smiles} currently being processed.*/
        private int i = 0;

        ParserInstance(String smiles, Fragment fragment) {
            this.smiles = smiles;
            this.endOfSmiles = smiles.length();
            this.fragment = fragment;
        }

        /**
         * Parses the whole SMILES string, adding atoms and bonds to the fragment.
         * @throws StructureBuildingException If the SMILES is malformed or uses an unsupported feature
         */
        void parseSmiles() throws StructureBuildingException {
            stack.add(new StackFrame(null, 1));
            for (; i < endOfSmiles; i++) {
                char ch = smiles.charAt(i);
                switch (ch) {
                case '(':
                    stack.add(new StackFrame(stack.getLast()));
                    break;
                case ')':
                    //the base frame must never be popped: every later step reads stack.getLast()
                    if (stack.size() <= 1) {
                        throw new StructureBuildingException(") in unexpected position: no matching ( was found");
                    }
                    stack.removeLast();
                    break;
                case '-':
                    stack.getLast().bondOrder = 1;
                    break;
                case '=':
                    if (stack.getLast().bondOrder != 1) {
                        throw new StructureBuildingException("= in unexpected position: bond order already defined!");
                    }
                    stack.getLast().bondOrder = 2;
                    break;
                case '#':
                    if (stack.getLast().bondOrder != 1) {
                        throw new StructureBuildingException("# in unexpected position: bond order already defined!");
                    }
                    stack.getLast().bondOrder = 3;
                    break;
                case '/':
                    if (stack.getLast().slash != null) {
                        throw new StructureBuildingException("/ in unexpected position: bond configuration already defined!");
                    }
                    stack.getLast().slash = SMILES_BOND_DIRECTION.RSLASH;
                    break;
                case '\\':
                    if (stack.getLast().slash != null) {
                        throw new StructureBuildingException("\\ in unexpected position: bond configuration already defined!");
                    }
                    stack.getLast().slash = SMILES_BOND_DIRECTION.LSLASH;
                    break;
                case '.':
                    stack.getLast().atom = null;
                    break;
                case '*':
                    processOrganicAtom(ch);
                    break;
                case '[':
                    processBracketedAtom();
                    break;
                case '%':
                    processRingOpeningOrClosure(ch);
                    break;
                default:
                    if (is_A_to_Z(ch) || is_a_to_z(ch)) {
                        processOrganicAtom(ch);
                    }
                    else if (is_0_to_9(ch)) {
                        processRingOpeningOrClosure(ch);
                    }
                    else {
                        throw new StructureBuildingException(ch + " is in an unexpected position. Check this is not a mistake and that this feature of SMILES is supported by OPSIN's SMILES parser");
                    }
                    break;
                }
            }
            if (!ringClosures.isEmpty()) {
                throw new StructureBuildingException("Unmatched ring opening");
            }
        }

        /**
         * An organic atom e.g. 'C', 'Cl', 'c' etc.
         * @param ch The character at the current index: a letter or '*'
         * @throws StructureBuildingException If the letter(s) are not an allowed unbracketed element
         */
        private void processOrganicAtom(char ch) throws StructureBuildingException {
            String elementType = String.valueOf(ch);
            boolean spareValency = false;
            if (is_A_to_Z(ch)) {//normal atoms
                if (i + 1 < endOfSmiles && is_a_to_z(smiles.charAt(i + 1)) && organicAtoms.contains(smiles.substring(i, i + 2))) {
                    elementType = smiles.substring(i, i + 2);
                    i++;
                }
                else if (!organicAtoms.contains(elementType)) {
                    throw new StructureBuildingException(elementType + " is not an organic Element. If it is actually an element it should be in square brackets");
                }
            }
            else if (is_a_to_z(ch)) {//aromatic atoms
                if (!aromaticAtoms.contains(elementType)) {
                    throw new StructureBuildingException(elementType + " is not an aromatic Element. If it is actually an element it should not be in lower case");
                }
                elementType = String.valueOf((char) (ch - 32));//ASCII lower case -> upper case
                spareValency = true;
            }
            else if (ch == '*') {
                elementType = "R";
            }
            Atom atom = createAtom(elementType, fragment);
            atom.setSpareValency(spareValency);
            //NOTE(review): createAtom has already called addAtom on this fragment; the repeated call is
            //retained from the original code - presumably addAtom is idempotent, confirm before removing
            fragment.addAtom(atom);
            bondToInscopeAtom(atom);
        }

        /**
         * square brackets- contain non-organic atoms or where required to set properties such as charge/chirality etc.
         * e.g. [Na+]
         * On entry the index is at the '['; on exit it is at the matching ']'.
         * @throws StructureBuildingException If the contents of the square brackets cannot be interpreted
         */
        private void processBracketedAtom() throws StructureBuildingException {
            i++;
            int indexOfRightSquareBracket = smiles.indexOf(']', i);
            if (indexOfRightSquareBracket == -1) {
                throw new StructureBuildingException("[ without matching \"]\"");
            }
            // isotope (the loop cannot run past the ']' as that is not a digit)
            StringBuilder isotope = new StringBuilder();
            while (is_0_to_9(smiles.charAt(i))) {
                isotope.append(smiles.charAt(i));
                i++;
            }

            char ch;
            if (i < indexOfRightSquareBracket) {
                ch = smiles.charAt(i);
                i++;
            }
            else {
                throw new StructureBuildingException("No element found in square brackets");
            }
            // elementType
            String elementType = String.valueOf(ch);
            boolean spareValency = false;
            if (is_A_to_Z(ch)) {//normal atoms
                if (is_a_to_z(smiles.charAt(i))) {
                    elementType += smiles.charAt(i);
                    i++;
                }
            }
            else if (is_a_to_z(ch)) {//aromatic atoms
                if (is_a_to_z(smiles.charAt(i))) {
                    if (aromaticAtoms.contains(elementType + smiles.charAt(i))) {
                        elementType = String.valueOf((char) (ch - 32)) + smiles.charAt(i);
                        i++;
                    }
                    else {
                        throw new StructureBuildingException(elementType + smiles.charAt(i) + " is not an aromatic Element. If it is actually an element it should not be in lower case");
                    }
                }
                else {
                    if (!aromaticAtoms.contains(elementType)) {
                        throw new StructureBuildingException(elementType + " is not an aromatic Element.");
                    }
                    elementType = String.valueOf((char) (ch - 32));
                }
                spareValency = true;
            }
            else if (elementType.equals("*")) {
                elementType = "R";
            }
            else {
                throw new StructureBuildingException(elementType + " is not a valid element type!");
            }
            Atom atom = createAtom(elementType, fragment);
            atom.setSpareValency(spareValency);
            if (isotope.length() > 0) {
                atom.setIsotope(parseNonNegativeInt(isotope.toString(), "isotope"));
            }
            //NOTE(review): see processOrganicAtom - createAtom has already added the atom to the fragment
            fragment.addAtom(atom);
            Atom previousAtom = bondToInscopeAtom(atom);//needed for setting atomParity elements up

            Integer hydrogenCount = 0;//null means "use the default valency" (the H? extension)
            int charge = 0;
            boolean chiralitySet = false;
            for (; i < indexOfRightSquareBracket; i++) {
                ch = smiles.charAt(i);
                if (ch == '@') {// chirality-sets atom parity
                    if (chiralitySet) {
                        throw new StructureBuildingException("Atom parity appeared to be specified twice for an atom in a square bracket!");
                    }
                    processTetrahedralStereochemistry(atom, previousAtom);
                    chiralitySet = true;
                }
                else if (ch == 'H') {// hydrogenCount
                    if (hydrogenCount == null || hydrogenCount != 0) {
                        throw new StructureBuildingException("Hydrogen count appeared to be specified twice for an atom in a square bracket!");
                    }
                    if (smiles.charAt(i + 1) == '?') {
                        //extension to allow standard valency (as determined by the group in the periodic table) to dictate hydrogens
                        i++;
                        hydrogenCount = null;
                    }
                    else {
                        String hydrogenCountString = consumeFollowingDigits();
                        if (hydrogenCountString.length() == 0) {
                            hydrogenCount = 1;
                        }
                        else {
                            hydrogenCount = parseNonNegativeInt(hydrogenCountString, "hydrogen count");
                        }
                        if (atom.hasSpareValency()) {
                            if ((!elementType.equals("C") && !elementType.equals("Si")) || hydrogenCount >= 2) {
                                fragment.addIndicatedHydrogen(atom);
                            }
                        }
                    }
                }
                else if (ch == '+' || ch == '-') {// formalCharge
                    if (charge != 0) {
                        throw new StructureBuildingException("Charge appeared to be specified twice for an atom in a square bracket!");
                    }
                    charge = (ch == '+') ? 1 : -1;
                    int changeCharge = 1;
                    String changeChargeStr = consumeFollowingDigits();//e.g. [C+2]
                    if (changeChargeStr.length() == 0) {
                        while (i + 1 < indexOfRightSquareBracket) {//e.g. [C++]
                            ch = smiles.charAt(i + 1);
                            if (ch == '+') {
                                if (charge != 1) {
                                    throw new StructureBuildingException("Atom has both positive and negative charges specified!");//e.g. [C+-]
                                }
                            }
                            else if (ch == '-') {
                                if (charge != -1) {
                                    throw new StructureBuildingException("Atom has both negative and positive charges specified!");
                                }
                            }
                            else {
                                break;
                            }
                            changeCharge++;
                            i++;
                        }
                    }
                    else {
                        changeCharge = parseNonNegativeInt(changeChargeStr, "charge");
                    }
                    atom.setCharge(charge * changeCharge);
                }
                else if (ch == '|') {
                    //an empty digit string e.g. [P|] is reported as a StructureBuildingException by parseNonNegativeInt
                    atom.setLambdaConventionValency(parseNonNegativeInt(consumeFollowingDigits(), "lambda convention valency"));
                }
                else {
                    throw new StructureBuildingException("Unexpected character found in square bracket");
                }
            }
            atom.setProperty(Atom.SMILES_HYDROGEN_COUNT, hydrogenCount);
        }

        /**
         * Bonds the given newly created atom to the atom of the current stack frame (if there is one)
         * using the frame's pending bond order and slash, appends it to that atom's atom parity (if any)
         * and then makes the new atom the frame's current atom with the bond order reset to single.
         * @param atom The newly created atom
         * @return The atom that was previously in scope, or null if there was none
         * @throws StructureBuildingException If the previous atom's atom parity already involves 4 atoms
         */
        private Atom bondToInscopeAtom(Atom atom) throws StructureBuildingException {
            StackFrame currentFrame = stack.getLast();
            Atom previousAtom = currentFrame.atom;
            if (previousAtom != null) {
                Bond b = createBond(previousAtom, atom, currentFrame.bondOrder);
                if (currentFrame.slash != null) {
                    b.setSmilesStereochemistry(currentFrame.slash);
                    currentFrame.slash = null;
                }
                if (previousAtom.getAtomParity() != null) {
                    addAtomToAtomParity(previousAtom.getAtomParity(), atom);
                }
            }
            currentFrame.atom = atom;
            currentFrame.bondOrder = 1;
            return previousAtom;
        }

        /**
         * Consumes the run of digits that starts immediately after the current index.
         * The index is left on the last digit consumed (or unchanged if there were no digits).
         * @return The digits consumed, possibly the empty string
         */
        private String consumeFollowingDigits() {
            StringBuilder digits = new StringBuilder();
            while (i + 1 < endOfSmiles && is_0_to_9(smiles.charAt(i + 1))) {
                digits.append(smiles.charAt(i + 1));
                i++;
            }
            return digits.toString();
        }

        /**
         * Adds an atomParity element to the given atom using the information at the current index
         * @param atom The atom inside the square brackets
         * @param previousAtom The atom that preceded it in the SMILES, or null
         */
        private void processTetrahedralStereochemistry(Atom atom, Atom previousAtom) {
            boolean chiralityClockwise = false;
            if (smiles.charAt(i + 1) == '@') {
                chiralityClockwise = true;
                i++;
            }
            AtomParity atomParity = new AtomParity(new Atom[4], chiralityClockwise ? 1 : -1);
            Atom[] atomRefs4 = atomParity.getAtomRefs4();
            int index = 0;
            if (previousAtom != null) {
                atomRefs4[index] = previousAtom;
                index++;
            }
            if (smiles.charAt(i + 1) == 'H') {
                atomRefs4[index] = AtomParity.hydrogen;
                //this character will also be checked by the hydrogen count check, hence don't increment i
            }
            atom.setAtomParity(atomParity);
        }

        /**
         * Process ring openings and closings e.g. the two 1s in c1ccccc1
         * @param ch The character at the current index: a digit or '%'
         * @throws StructureBuildingException If the ring closure is malformed or there is no atom in scope
         */
        private void processRingOpeningOrClosure(char ch) throws StructureBuildingException {
            String closure = String.valueOf(ch);
            if (ch == '%') {
                if (i + 2 < endOfSmiles && is_0_to_9(smiles.charAt(i + 1)) && is_0_to_9(smiles.charAt(i + 2))) {
                    closure = smiles.substring(i + 1, i + 3);
                    i += 2;
                }
                else {
                    throw new StructureBuildingException("A ring opening indice after a % must be two digits long");
                }
            }
            if (ringClosures.containsKey(closure)) {
                if (getInscopeAtom() == null) {//e.g. C1.1 - there is nothing to bond the ring opening atom to
                    throw new StructureBuildingException("A ring closure has appeared without an atom to close the ring to!");
                }
                processRingClosure(closure);
            } else {
                if (getInscopeAtom() == null) {
                    throw new StructureBuildingException("A ring opening has appeared before any atom!");
                }
                processRingOpening(closure);
            }
        }

        /**
         * Records a ring opening: a copy of the current frame (atom, pending bond order and slash)
         * is stored under the closure label until the matching closure is found.
         * @param closure The ring closure label
         * @throws StructureBuildingException If the atom's atom parity already involves 4 atoms
         */
        private void processRingOpening(String closure) throws StructureBuildingException {
            StackFrame currentFrame = stack.getLast();
            StackFrame sf = new StackFrame(currentFrame);
            if (currentFrame.slash != null) {
                sf.slash = currentFrame.slash;
                currentFrame.slash = null;
            }
            AtomParity atomParity = sf.atom.getAtomParity();
            if (atomParity != null) {//replace ringclosureX with actual reference to id when it is known
                sf.indexOfDummyAtom = addAtomToAtomParity(atomParity, ringOpeningDummyAtom);
            }
            ringClosures.put(closure, sf);
            currentFrame.bondOrder = 1;
        }

        /**
         * Closes a ring: bonds the atom stored at the ring opening to the atom currently in scope.
         * @param closure The ring closure label, which must be present in ringClosures
         * @throws StructureBuildingException If the bond order or stereochemistry given at the opening and closure conflict
         */
        private void processRingClosure(String closure) throws StructureBuildingException {
            StackFrame sf = ringClosures.remove(closure);
            StackFrame currentFrame = stack.getLast();
            int bondOrder = 1;
            if (sf.bondOrder > 1) {
                if (currentFrame.bondOrder > 1 && sf.bondOrder != currentFrame.bondOrder) {
                    throw new StructureBuildingException("ring closure has two different bond orders specified!");
                }
                bondOrder = sf.bondOrder;
            } else if (currentFrame.bondOrder > 1) {
                bondOrder = currentFrame.bondOrder;
            }
            Bond b;
            if (currentFrame.slash != null) {
                //stereochemistry specified on ring closure
                //special case e.g. CC1=C/F.O\1 Bond is done from the O to the C due to the presence of the \
                b = createBond(currentFrame.atom, sf.atom, bondOrder);
                b.setSmilesStereochemistry(currentFrame.slash);
                if (sf.slash != null && sf.slash.equals(currentFrame.slash)) {//specified twice check for contradiction
                    throw new StructureBuildingException("Contradictory double bond stereoconfiguration");
                }
                currentFrame.slash = null;
            }
            else {
                b = createBond(sf.atom, currentFrame.atom, bondOrder);
                if (sf.slash != null) {
                    //stereochemistry specified on ring opening
                    b.setSmilesStereochemistry(sf.slash);
                }
            }

            AtomParity currentAtomParity = currentFrame.atom.getAtomParity();
            if (currentAtomParity != null) {
                addAtomToAtomParity(currentAtomParity, sf.atom);
            }

            AtomParity closureAtomParity = sf.atom.getAtomParity();
            if (closureAtomParity != null) {//replace dummy atom with actual atom e.g. N[C@@H]1C.F1 where the 1 initially holds a dummy atom before being replaced with the F atom
                Atom[] atomRefs4 = closureAtomParity.getAtomRefs4();
                if (sf.indexOfDummyAtom == null) {
                    throw new RuntimeException("OPSIN Bug: Index of dummy atom representing ring closure atom not set");
                }
                atomRefs4[sf.indexOfDummyAtom] = currentFrame.atom;
            }
            currentFrame.bondOrder = 1;
        }

        /**
         * Adds an atom at the first null position in the atomParity's atomRefs4
         * @param atomParity The atom parity to add to
         * @param atom The atom to add
         * @return Returns the index of the atom in the atomParity's atomRefs4
         * @throws StructureBuildingException If all positions of atomRefs4 are already occupied
         */
        private int addAtomToAtomParity(AtomParity atomParity, Atom atom) throws StructureBuildingException {
            Atom[] atomRefs4 = atomParity.getAtomRefs4();
            //deliberately not called "i" so as not to shadow the parser's cursor field
            for (int index = 0; index < atomRefs4.length; index++) {
                if (atomRefs4[index] == null) {
                    atomRefs4[index] = atom;
                    return index;
                }
            }
            throw new StructureBuildingException("Tetrahedral stereocentre specified in SMILES appears to involve more than 4 atoms");
        }

        /**
         * For non-empty SMILES will return the atom at the top of the stack i.e. the one that will be bonded to next if the SMILES continued
         * (only valid during execution of and after {@link ParserInstance#parseSmiles()} has been called)
         * @return The atom of the last stack frame, which may be null
         */
        Atom getInscopeAtom() {
            return stack.getLast().atom;
        }
    }

    /**
     * Build a Fragment based on a SMILES string.
     * The type/subType of the Fragment are the empty String
     * The fragment has no locants
     *
     * @param smiles The SMILES string to build from.
     * @return The built fragment.
     * @throws StructureBuildingException
     */
    Fragment build(String smiles) throws StructureBuildingException {
        return build(smiles, "", NONE_LABELS_VAL);
    }

    /**
     * Build a Fragment based on a SMILES string.
     * @param smiles The SMILES string to build from.
     * @param type The type of the fragment retrieved when calling {@link Fragment#getType()}
     * @param labelMapping A string indicating which locants to assign to each atom. Can be a slash delimited list, "numeric", "fusedRing" or "none"/""
     * @return The built fragment.
     * @throws StructureBuildingException
     */
    Fragment build(String smiles, String type, String labelMapping) throws StructureBuildingException {
        return build(smiles, new Fragment(type), labelMapping);
    }

    /**
     * Build a Fragment based on a SMILES string.
     * @param smiles The SMILES string to build from.
     * @param tokenEl The corresponding tokenEl
     * @param labelMapping A string indicating which locants to assign to each atom. Can be a slash delimited list, "numeric", "fusedRing" or "none"/""
     * @return Fragment The built fragment.
     * @throws StructureBuildingException
     */
    Fragment build(String smiles, Element tokenEl, String labelMapping) throws StructureBuildingException {
        if (tokenEl == null) {
            throw new IllegalArgumentException("tokenEl is null. FragmentManager's DUMMY_TOKEN should be used instead");
        }
        return build(smiles, new Fragment(tokenEl), labelMapping);
    }

    /**
     * Parses the SMILES into the given fragment, then applies labelling, stereochemistry,
     * out atoms (from a leading/trailing -,=,#) and valency information.
     * @param smiles The SMILES string to build from; the empty string returns the fragment untouched
     * @param fragment The fragment to populate
     * @param labelMapping A string indicating which locants to assign to each atom
     * @return The fragment that was passed in
     * @throws StructureBuildingException If the SMILES or the label mapping is invalid
     */
    private Fragment build(String smiles, Fragment fragment, String labelMapping) throws StructureBuildingException {
        if (smiles == null) {
            throw new IllegalArgumentException("SMILES specified is null");
        }
        if (labelMapping == null) {
            throw new IllegalArgumentException("labelMapping is null use \"none\" if you do not want any numbering or \"numeric\" if you would like default numbering");
        }
        if (smiles.length() == 0) {
            return fragment;
        }
        int firstIndex = 0;
        int lastIndex = smiles.length();
        char firstCharacter = smiles.charAt(0);
        int leadingBondOrder = bondSymbolToOrder(firstCharacter);
        if (leadingBondOrder != 0) {//used by OPSIN to specify the valency with which this fragment connects
            firstIndex++;
        }
        char lastCharacter = smiles.charAt(lastIndex - 1);
        int trailingBondOrder = bondSymbolToOrder(lastCharacter);
        if (trailingBondOrder != 0) {//used by OPSIN to specify the valency with which this fragment connects and to indicate it connects via the last atom in the SMILES
            lastIndex--;
        }
        if (firstIndex > lastIndex) {//a lone "-", "=" or "#" is counted as both the first and the last character
            throw new StructureBuildingException("SMILES consists solely of a bond symbol: " + smiles);
        }
        ParserInstance instance = new ParserInstance(smiles.substring(firstIndex, lastIndex), fragment);
        instance.parseSmiles();

        List<Atom> atomList = fragment.getAtomList();
        processLabelling(labelMapping, atomList);

        verifyAndTakeIntoAccountLonePairsInAtomParities(atomList);
        addBondStereoElements(fragment);

        if (leadingBondOrder != 0) {
            fragment.addOutAtom(fragment.getFirstAtom(), leadingBondOrder, true);
        }

        if (trailingBondOrder != 0) {
            Atom lastAtom = instance.getInscopeAtom();//note that in something like C(=O)- this would be the carbon not the oxygen
            fragment.addOutAtom(lastAtom, trailingBondOrder, true);
        }

        for (Atom atom : atomList) {
            if (atom.getProperty(Atom.SMILES_HYDROGEN_COUNT) != null && atom.getLambdaConventionValency() == null) {
                setupAtomValency(atom);
            }
        }
        CycleDetector.assignWhetherAtomsAreInCycles(fragment);
        return fragment;
    }

    /**
     * Maps the bond symbols OPSIN allows at the very start/end of a SMILES string to a bond order.
     * @param ch The character to interpret
     * @return 1 for '-', 2 for '=', 3 for '#', or 0 if the character is not one of these
     */
    private static int bondSymbolToOrder(char ch) {
        switch (ch) {
        case '-':
            return 1;
        case '=':
            return 2;
        case '#':
            return 3;
        default:
            return 0;
        }
    }

    /**
     * Assigns locants to the atoms as directed by the label mapping.
     * @param labelMapping "none"/"" for no locants, "numeric", "fusedRing" or a slash delimited list with one (comma delimited) entry per atom
     * @param atomList The atoms of the fragment in the order they appeared in the SMILES
     * @throws StructureBuildingException If a slash delimited list does not have one entry per atom
     */
    private void processLabelling(String labelMapping, List<Atom> atomList) throws StructureBuildingException {
        if (labelMapping.equals(NONE_LABELS_VAL) || labelMapping.length() == 0) {
            return;
        }
        if (labelMapping.equals(NUMERIC_LABELS_VAL)) {
            int atomNumber = 1;
            for (Atom atom : atomList) {
                atom.addLocant(Integer.toString(atomNumber++));
            }
        }
        else if (labelMapping.equals(FUSEDRING_LABELS_VAL)) {//fragment is a fusedring with atoms in the correct order for fused ring numbering
            //this will do stuff like changing labels from 1,2,3,4,5,6,7,8,9,10->1,2,3,4,4a,5,6,7,8,8a
            FragmentTools.relabelLocantsAsFusedRingSystem(atomList);
        }
        else {
            String[] labelMap = labelMapping.split("/", -1);//place slash delimited labels into an array
            int numOfAtoms = atomList.size();
            if (labelMap.length != numOfAtoms) {
                throw new StructureBuildingException("Group numbering has been invalidly defined in resource file: labels: " + labelMap.length + ", atoms: " + numOfAtoms);
            }
            for (int i = 0; i < numOfAtoms; i++) {
                String[] labels = labelMap[i].split(",");
                for (String label : labels) {
                    if (label.length() > 0) {
                        atomList.get(i).addLocant(label);
                    }
                }
            }
        }
    }

    /**
     * Checks that every atom parity has 4 atom references. The one permitted exception is a
     * N, S or Se centre with exactly one missing reference and no hydrogen, in which case the
     * atom itself is inserted into atomRefs4 to stand in for the lone pair.
     * @param atomList The atoms of the fragment in the order they appeared in the SMILES
     * @throws StructureBuildingException If an atom parity is incomplete and is not the lone pair special case
     */
    private void verifyAndTakeIntoAccountLonePairsInAtomParities(List<Atom> atomList) throws StructureBuildingException {
        for (Atom atom : atomList) {
            AtomParity atomParity = atom.getAtomParity();
            if (atomParity == null) {
                continue;
            }
            Atom[] atomRefs4 = atomParity.getAtomRefs4();
            int nullAtoms = 0;
            int hydrogen = 0;
            for (Atom atomRefs4Atom : atomRefs4) {
                if (atomRefs4Atom == null) {
                    nullAtoms++;
                }
                else if (atomRefs4Atom.equals(AtomParity.hydrogen)) {
                    hydrogen++;
                }
            }
            if (nullAtoms == 0) {
                continue;
            }
            if (nullAtoms == 1 && hydrogen == 0 &&
                    (atom.getElement() == ChemEl.N || atom.getElement() == ChemEl.S || atom.getElement() == ChemEl.Se)) {//special case where lone pair is part of the tetrahedron
                if (atomList.indexOf(atomRefs4[0]) < atomList.indexOf(atom)) {//is there an atom in the SMILES in front of the stereocentre?
                    atomRefs4[3] = atomRefs4[2];
                    atomRefs4[2] = atomRefs4[1];
                    atomRefs4[1] = atom;
                }
                else {
                    atomRefs4[3] = atomRefs4[2];
                    atomRefs4[2] = atomRefs4[1];
                    atomRefs4[1] = atomRefs4[0];
                    atomRefs4[0] = atom;
                }
            }
            else {
                throw new StructureBuildingException("SMILES is malformed. Tetrahedral stereochemistry defined on a non tetrahedral centre");
            }
        }
    }

    /**
     * Converts the / and \ annotations on the bonds around each double bond into a cis/trans
     * bond stereo element on the double bond, then clears the annotations from every bond.
     * @param currentFrag The fragment whose bonds are to be processed
     * @throws StructureBuildingException If a double bond's configuration is specified contradictorily
     */
    private void addBondStereoElements(Fragment currentFrag) throws StructureBuildingException {
        Set<Bond> bonds = currentFrag.getBondSet();
        for (Bond centralBond : bonds) {//identify cases of E/Z stereochemistry and add appropriate bondstereo tags
            if (centralBond.getOrder() != 2) {
                continue;
            }
            List<Bond> fromAtomBonds = centralBond.getFromAtom().getBonds();
            for (Bond preceedingBond : fromAtomBonds) {
                if (preceedingBond.getSmilesStereochemistry() == null) {
                    continue;
                }
                List<Bond> toAtomBonds = centralBond.getToAtom().getBonds();
                for (Bond followingBond : toAtomBonds) {
                    if (followingBond.getSmilesStereochemistry() == null) {
                        continue;
                    }
                    //now found a double bond surrounded by two bonds with slashs
                    boolean upFirst;
                    boolean upSecond;
                    Atom atom2 = centralBond.getFromAtom();
                    Atom atom3 = centralBond.getToAtom();
                    Atom atom1 = preceedingBond.getOtherAtom(atom2);
                    Atom atom4 = followingBond.getOtherAtom(atom3);
                    if (preceedingBond.getSmilesStereochemistry() == SMILES_BOND_DIRECTION.LSLASH) {
                        upFirst = preceedingBond.getToAtom() == atom2;//in normally constructed SMILES this will be the case but you could write C(/F)=C/F instead of F\C=C/F
                    }
                    else if (preceedingBond.getSmilesStereochemistry() == SMILES_BOND_DIRECTION.RSLASH) {
                        upFirst = preceedingBond.getToAtom() != atom2;
                    }
                    else {
                        throw new StructureBuildingException(preceedingBond.getSmilesStereochemistry() + " is not a slash!");
                    }

                    if (followingBond.getSmilesStereochemistry() == SMILES_BOND_DIRECTION.LSLASH) {
                        upSecond = followingBond.getFromAtom() != atom3;
                    }
                    else if (followingBond.getSmilesStereochemistry() == SMILES_BOND_DIRECTION.RSLASH) {
                        upSecond = followingBond.getFromAtom() == atom3;
                    }
                    else {
                        throw new StructureBuildingException(followingBond.getSmilesStereochemistry() + " is not a slash!");
                    }
                    BondStereoValue cisTrans = upFirst == upSecond ? BondStereoValue.CIS : BondStereoValue.TRANS;
                    if (centralBond.getBondStereo() != null) {
                        //double bond has redundant specification e.g. C/C=C\\1/NC1 hence need to check it is consistent
                        Atom[] atomRefs4 = centralBond.getBondStereo().getAtomRefs4();
                        if (atomRefs4[0].equals(atom1) || atomRefs4[3].equals(atom4)) {
                            //one end is shared with the recorded specification, so the other substituent
                            //must give the opposite cis/trans value to be consistent
                            if (centralBond.getBondStereo().getBondStereoValue().equals(cisTrans)) {
                                throw new StructureBuildingException("Contradictory double bond stereoconfiguration");
                            }
                        }
                        else {
                            if (!centralBond.getBondStereo().getBondStereoValue().equals(cisTrans)) {
                                throw new StructureBuildingException("Contradictory double bond stereoconfiguration");
                            }
                        }
                    }
                    else {
                        Atom[] atomRefs4 = new Atom[4];
                        atomRefs4[0] = atom1;
                        atomRefs4[1] = atom2;
                        atomRefs4[2] = atom3;
                        atomRefs4[3] = atom4;
                        centralBond.setBondStereoElement(atomRefs4, cisTrans);
                    }
                }
            }
        }
        for (Bond bond : bonds) {
            bond.setSmilesStereochemistry(null);
        }
    }

    /**
     * Utilises the atom's hydrogen count as set by the SMILES as well as incoming valency to determine the atom's valency
     * If the atom is charged whether protons have been added or removed will also need to be determined
     * @param atom An atom whose SMILES_HYDROGEN_COUNT property is non-null
     * @throws StructureBuildingException If the atom is marked as aromatic but is not expected to be
     */
    private void setupAtomValency(Atom atom) throws StructureBuildingException {
        int hydrogenCount = atom.getProperty(Atom.SMILES_HYDROGEN_COUNT);
        int incomingValency = atom.getIncomingValency() + hydrogenCount + atom.getOutValency();
        int charge = atom.getCharge();
        int absoluteCharge = Math.abs(charge);
        ChemEl chemEl = atom.getElement();
        if (atom.hasSpareValency()) {
            Integer hwValency = ValencyChecker.getHWValency(chemEl);
            if (hwValency == null || absoluteCharge > 1) {
                throw new StructureBuildingException(chemEl + " is not expected to be aromatic!");
            }
            if (absoluteCharge != 0) {
                Integer[] possibleVal = ValencyChecker.getPossibleValencies(chemEl, charge);
                if (possibleVal != null && possibleVal.length > 0) {
                    hwValency = possibleVal[0];
                }
                else {
                    throw new StructureBuildingException(chemEl + " with charge " + charge + " is not expected to be aromatic!");
                }
            }
            if (incomingValency < hwValency) {
                incomingValency++;
            }
        }
        Integer defaultVal = ValencyChecker.getDefaultValency(chemEl);
        if (defaultVal != null) {//s or p block element
            if (defaultVal.intValue() != incomingValency || charge != 0) {
                if (Math.abs(incomingValency - defaultVal) == absoluteCharge) {
                    atom.setProtonsExplicitlyAddedOrRemoved(incomingValency - defaultVal);
                }
                else {
                    Integer[] unchargedStableValencies = ValencyChecker.getPossibleValencies(chemEl, 0);
                    boolean hasPlausibleValency = false;
                    if (unchargedStableValencies != null) {//getPossibleValencies is null checked above, so guard here too
                        for (Integer unchargedStableValency : unchargedStableValencies) {
                            if (Math.abs(incomingValency - unchargedStableValency) == Math.abs(charge)) {
                                atom.setProtonsExplicitlyAddedOrRemoved(incomingValency - unchargedStableValency);
                                //we strictly set the valency if a charge is specified but are more loose about things if uncharged e.g. allow penta substituted phosphine
                                if (charge != 0) {
                                    atom.setLambdaConventionValency(unchargedStableValency);
                                }
                                else {
                                    atom.setMinimumValency(incomingValency);
                                }
                                hasPlausibleValency = true;
                                break;
                            }
                        }
                    }
                    if (!hasPlausibleValency) {//could be something like [Sn] which would be expected to be attached to later
                        atom.setMinimumValency(incomingValency);
                    }
                }
            }
        }
        else {
            if (hydrogenCount > 0) {//make hydrogen explicit
                Fragment frag = atom.getFrag();
                for (int i = 0; i < hydrogenCount; i++) {
                    Atom hydrogen = createAtom(ChemEl.H, frag);
                    createBond(atom, hydrogen, 1);
                }
            }
        }
    }

    /**
     * Parses a run of digits read from the SMILES, reporting failure as a StructureBuildingException
     * rather than letting an unchecked NumberFormatException escape.
     * @param digits The digits to parse; the empty string or a value too large for an int is an error
     * @param description What the number represents, used in the error message
     * @return The parsed value
     * @throws StructureBuildingException If the digits cannot be parsed as an int
     */
    private static int parseNonNegativeInt(String digits, String description) throws StructureBuildingException {
        try {
            return Integer.parseInt(digits);
        }
        catch (NumberFormatException e) {
            //only the String constructor of StructureBuildingException is used in this file, hence the cause is not chained
            throw new StructureBuildingException("Invalid " + description + " in SMILES: \"" + digits + "\"");
        }
    }

    /**
     * Create a new Atom of the given element belonging to the given fragment
     * @param elementSymbol The name of a ChemEl constant e.g. "C" or "Si"
     * @param frag
     * @return Atom
     * @throws StructureBuildingException If the symbol does not correspond to a ChemEl
     */
    private Atom createAtom(String elementSymbol, Fragment frag) throws StructureBuildingException {
        ChemEl chemEl;
        try {
            chemEl = ChemEl.valueOf(elementSymbol);
        }
        catch (IllegalArgumentException e) {//e.g. [Xx]
            throw new StructureBuildingException(elementSymbol + " is not a recognised element symbol");
        }
        return createAtom(chemEl, frag);
    }

    /**
     * Create a new Atom of the given element belonging to the given fragment
     * The atom is given the next id from the idManager and is added to the fragment
     * @param chemEl
     * @param frag
     * @return Atom
     */
    private Atom createAtom(ChemEl chemEl, Fragment frag) {
        Atom a = new Atom(idManager.getNextID(), chemEl, frag);
        frag.addAtom(a);
        return a;
    }

    /**
     * Create a new bond between two atoms.
     * The bond is associated with these atoms and with the fragment of fromAtom.
     * @param fromAtom
     * @param toAtom
     * @param bondOrder
     * @return Bond
     */
    private Bond createBond(Atom fromAtom, Atom toAtom, int bondOrder) {
        Bond b = new Bond(fromAtom, toAtom, bondOrder);
        fromAtom.addBond(b);
        toAtom.addBond(b);
        fromAtom.getFrag().addBond(b);
        return b;
    }

    private static boolean is_A_to_Z(char ch) {
        return ch >= 'A' && ch <= 'Z';
    }

    private static boolean is_a_to_z(char ch) {
        return ch >= 'a' && ch <= 'z';
    }

    private static boolean is_0_to_9(char ch) {
        return ch >= '0' && ch <= '9';
    }

}