1 package uk.ac.cam.ch.wwmm.opsin;
2 
3 import static uk.ac.cam.ch.wwmm.opsin.XmlDeclarations.*;
4 
5 import java.util.ArrayDeque;
6 import java.util.Deque;
7 import java.util.HashMap;
8 import java.util.HashSet;
9 import java.util.List;
10 import java.util.Map;
11 import java.util.Set;
12 
13 import uk.ac.cam.ch.wwmm.opsin.Bond.SMILES_BOND_DIRECTION;
14 import uk.ac.cam.ch.wwmm.opsin.BondStereo.BondStereoValue;
15 
16 /** A builder for fragments specified as SMILES. A slightly custom SMILES dialect is used.
17  * It includes all common features of SMILES and a few useful extensions:
18  * | is used within a square bracketed element to directly set valency e.g. [P|5]. This is the same as using the lambda convention
19  * sb/te are allowed (aromatic antimony/tellurium):
20  * H? e.g. [SeH?] is used to indicate that the atom should use the default valency. It is equivalent to not using square brackets for organic atoms
21  *
22  * Allowed:
23  * Organic elements B,C,N,O,P,S,F,Cl,Br,I (square brackets not required)
 * Aromatic elements c,n,o,p,s (square brackets not required) si,as,se,sb,te (square brackets required) Note that the inclusion of si/sb/te is an unofficial extension
25  * =, # for bond orders
26  * . for disconnection
27  * (, ) for branching
28  * [, ] for placing inorganic elements within and specifying charge. Allowed: [Al3+] or [Al+++]
 * 0123456789 - ring closures
30  * %10 %99 - more ring closures (%100 is ring closure %10 and 0 as in normal SMILES)
31  * / and \ to set double bond stereochemistry to cis/trans
32  * @ and @@ to set tetrahedral stereochemistry as in SMILES.
33  * Hx where x is a digit is used to sort of set the hydrogen. In actuality the valency of the atom is derived and a valency hint added to the atom
34  * This valency hint is the minimum valency that atom may be in. H? as an extension gives you the lowest acceptable valency.
35  * |3 |5 etc. can be used to set the valency of an atom e.g.  [Se|2]
36  *
37  * Also, an = or # at the start of the string indicates that the group attaches to its parent group via a double or triple bond.
38  *
39  * A -,=,# on the end indicates that in the absence of locants, other groups attach to
40  * *it* via the atom at the end of the string, not at the start of the string with -,=,# meaning single,double or triple bond
41  * This behaviour is overridden for certain suffixes to give different meanings to the atom the -,=,# is referring to
42  *
43  * @author ptc24
44  * @author dl387
45  *
46  */
47 class SMILESFragmentBuilder {
48 
49 	/**A "struct" to hold information on the parsing stack
50 	 *
51 	 * @author ptc24
52 	 *
53 	 */
54 	private static class StackFrame {
55 		/**The Atom currently under consideration.*/
56 		Atom atom;
57 
58 		/**The order of the bond about to be formed.*/
59 		int bondOrder;
60 
61 		/**Whether the bond is a \ or / bond for use in determining cis/trans.*/
62 		SMILES_BOND_DIRECTION slash = null;
63 
64 		/**The index of a dummy atom in the atom's stereochemistry atomrefs4*/
65 		Integer indexOfDummyAtom = null;
66 
67 		/**Creates a stack frame with given parameters.
68 		 *
69 		 * @param a An atom or null
70 		 * @param bondOrderVal The value for bondOrder.
71 		 */
StackFrame(Atom a, int bondOrderVal)72 		StackFrame(Atom a, int bondOrderVal) {
73 			atom = a;
74 			bondOrder = bondOrderVal;
75 		}
76 
77 		/**Creates a copy of an existing StackFrame.
78 		 *
79 		 * @param sf The stackframe to copy.
80 		 */
StackFrame(StackFrame sf)81 		StackFrame(StackFrame sf) {
82 			atom = sf.atom;
83 			bondOrder = sf.bondOrder;
84 		}
85 	}
86 
87 	/**Ring opening dummy atom, used as a placeholder in stereochemistry atomrefs4*/
88 	private static final Atom ringOpeningDummyAtom = new Atom(ChemEl.R);
89 
90 	/**Organic Atoms.*/
91 	private static final Set<String> organicAtoms = new HashSet<String>();
92 	/**Aromatic Atoms.*/
93 	private static final Set<String> aromaticAtoms = new HashSet<String>();
94 
95 	static {
96 		organicAtoms.add("B");
97 		organicAtoms.add("C");
98 		organicAtoms.add("N");
99 		organicAtoms.add("O");
100 		organicAtoms.add("P");
101 		organicAtoms.add("S");
102 		organicAtoms.add("F");
103 		organicAtoms.add("Cl");
104 		organicAtoms.add("Br");
105 		organicAtoms.add("I");
106 
107 		aromaticAtoms.add("c");
108 		aromaticAtoms.add("n");
109 		aromaticAtoms.add("o");
110 		aromaticAtoms.add("p");
111 		aromaticAtoms.add("s");
112 		aromaticAtoms.add("si");
113 		aromaticAtoms.add("as");
114 		aromaticAtoms.add("se");
115 		aromaticAtoms.add("sb");
116 		aromaticAtoms.add("te");
117 	}
118 
119 	private final IDManager idManager;
120 
SMILESFragmentBuilder(IDManager idManager)121 	SMILESFragmentBuilder(IDManager idManager) {
122 		this.idManager = idManager;
123 	}
124 
125 	private class ParserInstance {
126 		private final Deque<StackFrame> stack = new ArrayDeque<StackFrame>();
127 		private final Map<String, StackFrame> ringClosures = new HashMap<String, StackFrame>();
128 
129 		private final String smiles;
130 		private final int endOfSmiles;
131 		private final Fragment fragment;
132 
133 		private int i = 0;
134 
ParserInstance(String smiles, Fragment fragment)135 		public ParserInstance(String smiles, Fragment fragment) {
136 			this.smiles = smiles;
137 			this.endOfSmiles = smiles.length();
138 			this.fragment = fragment;
139 		}
140 
parseSmiles()141 		void parseSmiles() throws StructureBuildingException {
142 			stack.add(new StackFrame(null, 1));
143 			for (; i < endOfSmiles; i++) {
144 				char ch = smiles.charAt(i);
145 				switch (ch) {
146 				case '(':
147 					stack.add(new StackFrame(stack.getLast()));
148 					break;
149 				case ')':
150 					stack.removeLast();
151 					break;
152 				case '-':
153 					stack.getLast().bondOrder = 1;
154 					break;
155 				case '=':
156 					if (stack.getLast().bondOrder != 1){
157 						throw new StructureBuildingException("= in unexpected position: bond order already defined!");
158 					}
159 					stack.getLast().bondOrder = 2;
160 					break;
161 				case '#':
162 					if (stack.getLast().bondOrder != 1){
163 						throw new StructureBuildingException("# in unexpected position: bond order already defined!");
164 					}
165 					stack.getLast().bondOrder = 3;
166 					break;
167 				case '/':
168 					if (stack.getLast().slash != null){
169 						throw new StructureBuildingException("/ in unexpected position: bond configuration already defined!");
170 					}
171 					stack.getLast().slash = SMILES_BOND_DIRECTION.RSLASH;
172 					break;
173 				case '\\':
174 					if (stack.getLast().slash != null){
175 						throw new StructureBuildingException("\\ in unexpected position: bond configuration already defined!");
176 					}
177 					stack.getLast().slash = SMILES_BOND_DIRECTION.LSLASH;
178 					break;
179 				case '.':
180 					stack.getLast().atom = null;
181 					break;
182 				case 'a':
183 				case 'b':
184 				case 'c':
185 				case 'd':
186 				case 'e':
187 				case 'f':
188 				case 'g':
189 				case 'h':
190 				case 'i':
191 				case 'j':
192 				case 'k':
193 				case 'l':
194 				case 'm':
195 				case 'n':
196 				case 'o':
197 				case 'p':
198 				case 'q':
199 				case 'r':
200 				case 's':
201 				case 't':
202 				case 'u':
203 				case 'v':
204 				case 'w':
205 				case 'x':
206 				case 'y':
207 				case 'z':
208 				case 'A':
209 				case 'B':
210 				case 'C':
211 				case 'D':
212 				case 'E':
213 				case 'F':
214 				case 'G':
215 				case 'H':
216 				case 'I':
217 				case 'J':
218 				case 'K':
219 				case 'L':
220 				case 'M':
221 				case 'N':
222 				case 'O':
223 				case 'P':
224 				case 'Q':
225 				case 'R':
226 				case 'S':
227 				case 'T':
228 				case 'U':
229 				case 'V':
230 				case 'W':
231 				case 'X':
232 				case 'Y':
233 				case 'Z':
234 				case '*':
235 					processOrganicAtom(ch);
236 					break;
237 				case '[':
238 					processBracketedAtom();
239 					break;
240 				case '0':
241 				case '1':
242 				case '2':
243 				case '3':
244 				case '4':
245 				case '5':
246 				case '6':
247 				case '7':
248 				case '8':
249 				case '9':
250 				case '%':
251 					processRingOpeningOrClosure(ch);
252 					break;
253 				default:
254 					throw new StructureBuildingException(ch + " is in an unexpected position. Check this is not a mistake and that this feature of SMILES is supported by OPSIN's SMILES parser");
255 				}
256 			}
257 			if (!ringClosures.isEmpty()){
258 				throw new StructureBuildingException("Unmatched ring opening");
259 			}
260 		}
261 
262 		/**
263 		 * An organic atom e.g. 'C', 'Cl', 'c' etc.
264 		 * @param ch
265 		 * @throws StructureBuildingException
266 		 */
processOrganicAtom(char ch)267 		private void processOrganicAtom(char ch) throws StructureBuildingException {
268 			String elementType = String.valueOf(ch);
269 			boolean spareValency = false;
270 			if(is_A_to_Z(ch)) {//normal atoms
271 				if(i + 1 < endOfSmiles && is_a_to_z(smiles.charAt(i + 1)) && organicAtoms.contains(smiles.substring(i, i + 2))) {
272 					elementType = smiles.substring(i, i + 2);
273 					i++;
274 				}
275 				else if (!organicAtoms.contains(elementType)){
276 					throw new StructureBuildingException(elementType + " is not an organic Element. If it is actually an element it should be in square brackets");
277 				}
278 			}
279 			else if(is_a_to_z(ch)) {//aromatic atoms
280 				if (!aromaticAtoms.contains(elementType)){
281 					throw new StructureBuildingException(elementType + " is not an aromatic Element. If it is actually an element it should not be in lower case");
282 				}
283 				elementType = String.valueOf((char)(ch - 32));
284 				spareValency = true;
285 			}
286 			else if (ch == '*') {
287 				elementType = "R";
288 			}
289 			Atom atom = createAtom(elementType, fragment);
290 			atom.setSpareValency(spareValency);
291 			fragment.addAtom(atom);
292 
293 			StackFrame currentFrame = stack.getLast();
294 			if(currentFrame.atom != null) {
295 				Bond b = createBond(currentFrame.atom, atom, currentFrame.bondOrder);
296 				if (currentFrame.slash != null){
297 					b.setSmilesStereochemistry(currentFrame.slash);
298 					currentFrame.slash = null;
299 				}
300 				if (currentFrame.atom.getAtomParity() != null){
301 					addAtomToAtomParity(currentFrame.atom.getAtomParity(), atom);
302 				}
303 			}
304 			currentFrame.atom = atom;
305 			currentFrame.bondOrder = 1;
306 		}
307 
308 		/**
309 		 * square brackets- contain non-organic atoms or where required to set properties such as charge/chirality etc.
310 		 * e.g. [Na+]
311 		 * @throws StructureBuildingException
312 		 */
processBracketedAtom()313 		private void processBracketedAtom() throws StructureBuildingException {
314 			i++;
315 			int indexOfRightSquareBracket = smiles.indexOf(']', i);
316 			if (indexOfRightSquareBracket == -1) {
317 				throw new StructureBuildingException("[ without matching \"]\"");
318 			}
319 			// isotope
320 			String isotope = "";
321 			while(is_0_to_9(smiles.charAt(i))) {
322 				isotope += smiles.charAt(i);
323 				i++;
324 			}
325 
326 			char ch;
327 			if (i < indexOfRightSquareBracket){
328 				ch = smiles.charAt(i);
329 				i++;
330 			}
331 			else{
332 				throw new StructureBuildingException("No element found in square brackets");
333 			}
334 			// elementType
335 			String elementType = String.valueOf(ch);
336 			boolean spareValency = false;
337 			if(is_A_to_Z(ch)) {//normal atoms
338 				if(is_a_to_z(smiles.charAt(i))) {
339 					elementType += smiles.charAt(i);
340 					i++;
341 				}
342 			}
343 			else if(is_a_to_z(ch)) {//aromatic atoms
344 				if(is_a_to_z(smiles.charAt(i))) {
345 					if (aromaticAtoms.contains(elementType + smiles.charAt(i))){
346 						elementType = String.valueOf((char)(ch - 32)) + smiles.charAt(i);
347 						i++;
348 					}
349 					else{
350 						throw new StructureBuildingException(elementType + smiles.charAt(i) + " is not an aromatic Element. If it is actually an element it should not be in lower case");
351 					}
352 				}
353 				else{
354 					if (!aromaticAtoms.contains(elementType)){
355 						throw new StructureBuildingException(elementType + " is not an aromatic Element.");
356 					}
357 					elementType = String.valueOf((char)(ch - 32));
358 				}
359 				spareValency = true;
360 			}
361 			else if (elementType.equals("*")){
362 				elementType = "R";
363 			}
364 			else{
365 				throw new StructureBuildingException(elementType + " is not a valid element type!");
366 			}
367 			Atom atom = createAtom(elementType, fragment);
368 			atom.setSpareValency(spareValency);
369 			if (isotope.length() > 0){
370 				atom.setIsotope(Integer.parseInt(isotope));
371 			}
372 			fragment.addAtom(atom);
373 			StackFrame currentFrame = stack.getLast();
374 			if(currentFrame.atom != null) {
375 				Bond b = createBond(currentFrame.atom, atom, currentFrame.bondOrder);
376 				if (currentFrame.slash != null){
377 					b.setSmilesStereochemistry(currentFrame.slash);
378 					currentFrame.slash = null;
379 				}
380 				if (currentFrame.atom.getAtomParity() != null){
381 					addAtomToAtomParity(currentFrame.atom.getAtomParity(), atom);
382 				}
383 			}
384 			Atom previousAtom = currentFrame.atom;//needed for setting atomParity elements up
385 			currentFrame.atom = atom;
386 			currentFrame.bondOrder = 1;
387 
388 			Integer hydrogenCount = 0;
389 			int charge = 0;
390 			Boolean chiralitySet = false;
391 			for (; i < indexOfRightSquareBracket; i++) {
392 				ch = smiles.charAt(i);
393 				if(ch == '@') {// chirality-sets atom parity
394 					if (chiralitySet){
395 						throw new StructureBuildingException("Atom parity appeared to be specified twice for an atom in a square bracket!");
396 					}
397 					processTetrahedralStereochemistry(atom, previousAtom);
398 					chiralitySet = true;
399 				}
400 				else if (ch == 'H'){// hydrogenCount
401 					if (hydrogenCount == null || hydrogenCount != 0){
402 						throw new StructureBuildingException("Hydrogen count appeared to be specified twice for an atom in a square bracket!");
403 					}
404 					if (smiles.charAt(i + 1) == '?'){
405 						//extension to allow standard valency (as determined by the group in the periodic table) to dictate hydrogens
406 						i++;
407 						hydrogenCount = null;
408 					}
409 					else{
410 						String hydrogenCountString ="";
411 						while(is_0_to_9(smiles.charAt(i + 1))) {
412 							hydrogenCountString += smiles.charAt(i + 1);
413 							i++;
414 						}
415 						if (hydrogenCountString.length() == 0){
416 							hydrogenCount = 1;
417 						}
418 						else{
419 							hydrogenCount = Integer.parseInt(hydrogenCountString);
420 						}
421 						if (atom.hasSpareValency()) {
422 							if ((!elementType.equals("C") && !elementType.equals("Si")) || hydrogenCount >=2){
423 								fragment.addIndicatedHydrogen(atom);
424 							}
425 						}
426 					}
427 				}
428 				else if(ch == '+' || ch == '-') {// formalCharge
429 					if (charge != 0){
430 						throw new StructureBuildingException("Charge appeared to be specified twice for an atom in a square bracket!");
431 					}
432 					charge = (ch == '+') ? 1 : -1;
433 					String changeChargeStr = "";
434 					int changeCharge = 1;
435 					while(is_0_to_9(smiles.charAt(i + 1))) {//e.g. [C+2]
436 						changeChargeStr += smiles.charAt(i + 1);
437 						i++;
438 					}
439 					if (changeChargeStr.length() == 0){
440 						while(i + 1 < indexOfRightSquareBracket){//e.g. [C++]
441 							ch = smiles.charAt(i + 1);
442 							if (ch == '+'){
443 								if (charge != 1){
444 									throw new StructureBuildingException("Atom has both positive and negative charges specified!");//e.g. [C+-]
445 								}
446 							}
447 							else if (ch == '-'){
448 								if (charge != -1){
449 									throw new StructureBuildingException("Atom has both negative and positive charges specified!");
450 								}
451 							}
452 							else{
453 								break;
454 							}
455 							changeCharge++;
456 							i++;
457 						}
458 					}
459 					changeCharge = changeChargeStr.length() == 0 ? changeCharge : Integer.parseInt(changeChargeStr);
460 					atom.setCharge(charge * changeCharge);
461 				}
462 				else if(ch == '|') {
463 					StringBuilder lambda = new StringBuilder();
464 					while(i < endOfSmiles && is_0_to_9(smiles.charAt(i + 1))) {
465 						lambda.append(smiles.charAt(i + 1));
466 						i++;
467 					}
468 					atom.setLambdaConventionValency(Integer.parseInt(lambda.toString()));
469 				}
470 				else{
471 					throw new StructureBuildingException("Unexpected character found in square bracket");
472 				}
473 			}
474 			atom.setProperty(Atom.SMILES_HYDROGEN_COUNT, hydrogenCount);
475 		}
476 
477 		/**
478 		 * Adds an atomParity element to the given atom using the information at the current index
479 		 * @param atom
480 		 * @param previousAtom
481 		 */
processTetrahedralStereochemistry(Atom atom, Atom previousAtom)482 		private void processTetrahedralStereochemistry(Atom atom, Atom previousAtom){
483 			Boolean chiralityClockwise = false;
484 			if (smiles.charAt(i + 1) == '@'){
485 				chiralityClockwise = true;
486 				i++;
487 			}
488 			AtomParity atomParity;
489 			if (chiralityClockwise){
490 				atomParity = new AtomParity(new Atom[4], 1);
491 			}
492 			else{
493 				atomParity = new AtomParity(new Atom[4], -1);
494 			}
495 			Atom[] atomRefs4 = atomParity.getAtomRefs4();
496 			int index =0;
497 			if (previousAtom != null){
498 				atomRefs4[index] = previousAtom;
499 				index++;
500 			}
501 			if (smiles.charAt(i + 1) == 'H'){
502 				atomRefs4[index] = AtomParity.hydrogen;
503 				//this character will also be checked by the hydrogen count check, hence don't increment i
504 			}
505 			atom.setAtomParity(atomParity);
506 		}
507 
508 		/**
509 		 * Process ring openings and closings e.g. the two 1s in c1ccccc1
510 		 * @param ch
511 		 * @throws StructureBuildingException
512 		 */
processRingOpeningOrClosure(char ch)513 		private void processRingOpeningOrClosure(char ch) throws StructureBuildingException {
514 			String closure = String.valueOf(ch);
515 			if(ch == '%') {
516 				if (i + 2 < endOfSmiles && is_0_to_9(smiles.charAt(i + 1)) && is_0_to_9(smiles.charAt(i + 2))) {
517 					closure = smiles.substring(i + 1, i + 3);
518 					i +=2;
519 				}
520 				else{
521 					throw new StructureBuildingException("A ring opening indice after a % must be two digits long");
522 				}
523 			}
524 			if(ringClosures.containsKey(closure)) {
525 				processRingClosure(closure);
526 			} else {
527 				if (getInscopeAtom() == null){
528 					throw new StructureBuildingException("A ring opening has appeared before any atom!");
529 				}
530 				processRingOpening(closure);
531 			}
532 		}
533 
processRingOpening(String closure)534 		private void processRingOpening(String closure) throws StructureBuildingException {
535 			StackFrame currentFrame = stack.getLast();
536 			StackFrame sf = new StackFrame(currentFrame);
537 			if (currentFrame.slash != null){
538 				sf.slash = currentFrame.slash;
539 				currentFrame.slash = null;
540 			}
541 			AtomParity atomParity = sf.atom.getAtomParity();
542 			if (atomParity != null){//replace ringclosureX with actual reference to id when it is known
543 				sf.indexOfDummyAtom = addAtomToAtomParity(atomParity, ringOpeningDummyAtom);
544 			}
545 			ringClosures.put(closure, sf);
546 			currentFrame.bondOrder = 1;
547 		}
548 
processRingClosure(String closure)549 		private void processRingClosure(String closure) throws StructureBuildingException {
550 			StackFrame sf = ringClosures.remove(closure);
551 			StackFrame currentFrame = stack.getLast();
552 			int bondOrder = 1;
553 			if(sf.bondOrder > 1) {
554 				if(currentFrame.bondOrder > 1 && sf.bondOrder != currentFrame.bondOrder){
555 					throw new StructureBuildingException("ring closure has two different bond orders specified!");
556 				}
557 				bondOrder = sf.bondOrder;
558 			} else if(currentFrame.bondOrder > 1) {
559 				bondOrder = currentFrame.bondOrder;
560 			}
561 			Bond b;
562 			if (currentFrame.slash != null) {
563 				//stereochemistry specified on ring closure
564 				//special case e.g. CC1=C/F.O\1  Bond is done from the O to the the C due to the presence of the \
565 				b = createBond(currentFrame.atom, sf.atom, bondOrder);
566 				b.setSmilesStereochemistry(currentFrame.slash);
567 				if(sf.slash != null && sf.slash.equals(currentFrame.slash)) {//specified twice check for contradiction
568 					throw new StructureBuildingException("Contradictory double bond stereoconfiguration");
569 				}
570 				currentFrame.slash = null;
571 			}
572 			else {
573 				b = createBond(sf.atom, currentFrame.atom, bondOrder);
574 				if (sf.slash != null) {
575 					//stereochemistry specified on ring opening
576 					b.setSmilesStereochemistry(sf.slash);
577 				}
578 			}
579 
580 			AtomParity currentAtomParity = currentFrame.atom.getAtomParity();
581 			if (currentAtomParity != null) {
582 				addAtomToAtomParity(currentAtomParity, sf.atom);
583 			}
584 
585 			AtomParity closureAtomParity = sf.atom.getAtomParity();
586 			if (closureAtomParity != null) {//replace dummy atom with actual atom e.g. N[C@@H]1C.F1 where the 1 initially holds a dummy atom before being replaced with the F atom
587 				Atom[] atomRefs4 = closureAtomParity.getAtomRefs4();
588 				if (sf.indexOfDummyAtom == null) {
589 					throw new RuntimeException("OPSIN Bug: Index of dummy atom representing ring closure atom not set");
590 				}
591 				atomRefs4[sf.indexOfDummyAtom] = currentFrame.atom;
592 			}
593 			currentFrame.bondOrder = 1;
594 		}
595 
596 		/**
597 		 * Adds an atom at the first non-null position in the atomParity's atomRefs4
598 		 * @param atomParity
599 		 * @param atom
600 		 * @return Returns the index of the atom in the atomParity's atomRefs4
601 		 * @throws StructureBuildingException
602 		 */
addAtomToAtomParity(AtomParity atomParity, Atom atom)603 		private int addAtomToAtomParity(AtomParity atomParity, Atom atom) throws StructureBuildingException {
604 			Atom[] atomRefs4 = atomParity.getAtomRefs4();
605 			boolean setAtom = false;
606 			int i = 0;
607 			for (; i < atomRefs4.length; i++) {
608 				if (atomRefs4[i] == null){
609 					atomRefs4[i] = atom;
610 					setAtom = true;
611 					break;
612 				}
613 			}
614 			if (!setAtom){
615 				throw new StructureBuildingException("Tetrahedral stereocentre specified in SMILES appears to involve more than 4 atoms");
616 			}
617 			return i;
618 		}
619 
620 		/**
621 		 * For non-empty SMILES will return the atom at the top of the stack i.e. the one that will be bonded to next if the SMILES continued
622 		 * (only valid during execution of and after {@link ParserInstance#parseSmiles()} has been called)
623 		 * @return
624 		 */
getInscopeAtom()625 		Atom getInscopeAtom(){
626 			return stack.getLast().atom;
627 		}
628 	}
629 
630 	/**
631 	 * Build a Fragment based on a SMILES string.
632 	 * The type/subType of the Fragment are the empty String
633 	 * The fragment has no locants
634 	 *
635 	 * @param smiles The SMILES string to build from.
636 	 * @return The built fragment.
637 	 * @throws StructureBuildingException
638 	 */
build(String smiles)639 	Fragment build(String smiles) throws StructureBuildingException {
640 		return build(smiles, "", NONE_LABELS_VAL);
641 	}
642 
643 	/**
644 	 * Build a Fragment based on a SMILES string.
645 	 * @param smiles The SMILES string to build from.
646 	 * @param type The type of the fragment retrieved when calling {@link Fragment#getType()}
647 	 * @param labelMapping A string indicating which locants to assign to each atom. Can be a slash delimited list, "numeric", "fusedRing" or "none"/""
648 	 * @return
649 	 * @throws StructureBuildingException
650 	 */
build(String smiles, String type, String labelMapping)651 	Fragment build(String smiles, String type, String labelMapping) throws StructureBuildingException {
652 		return build(smiles, new Fragment(type), labelMapping);
653 	}
654 
655 	/**
656 	 * Build a Fragment based on a SMILES string.
657 	 * @param smiles The SMILES string to build from.
658 	 * @param tokenEl The corresponding tokenEl
659 	 * @param labelMapping A string indicating which locants to assign to each atom. Can be a slash delimited list, "numeric", "fusedRing" or "none"/""
660 	 * @return Fragment The built fragment.
661 	 * @throws StructureBuildingException
662 	 */
build(String smiles, Element tokenEl, String labelMapping)663 	Fragment build(String smiles, Element tokenEl, String labelMapping) throws StructureBuildingException {
664 		if (tokenEl == null){
665 			throw new IllegalArgumentException("tokenEl is null. FragmentManager's DUMMY_TOKEN should be used instead");
666 		}
667 		return build(smiles, new Fragment(tokenEl), labelMapping);
668 	}
669 
build(String smiles, Fragment fragment, String labelMapping)670 	private Fragment build(String smiles, Fragment fragment, String labelMapping) throws StructureBuildingException {
671 		if (smiles == null){
672 			throw new IllegalArgumentException("SMILES specified is null");
673 		}
674 		if (labelMapping == null){
675 			throw new IllegalArgumentException("labelMapping is null use \"none\" if you do not want any numbering or \"numeric\" if you would like default numbering");
676 		}
677 		if (smiles.length() == 0){
678 			return fragment;
679 		}
680 		int firstIndex = 0;
681 		int lastIndex = smiles.length();
682 		char firstCharacter =smiles.charAt(0);
683 		if(firstCharacter == '-' || firstCharacter == '=' || firstCharacter == '#') {//used by OPSIN to specify the valency with which this fragment connects
684 			firstIndex++;
685 		}
686 		char lastCharacter =smiles.charAt(lastIndex - 1);
687 		if(lastCharacter == '-' || lastCharacter == '=' || lastCharacter == '#') {//used by OPSIN to specify the valency with which this fragment connects and to indicate it connects via the last atom in the SMILES
688 			lastIndex--;
689 		}
690 		ParserInstance instance = new ParserInstance(smiles.substring(firstIndex, lastIndex), fragment);
691 		instance.parseSmiles();
692 
693 		List<Atom> atomList = fragment.getAtomList();
694 		processLabelling(labelMapping, atomList);
695 
696 		verifyAndTakeIntoAccountLonePairsInAtomParities(atomList);
697 		addBondStereoElements(fragment);
698 
699 		if(firstCharacter == '-'){
700 			fragment.addOutAtom(fragment.getFirstAtom(), 1, true);
701 		}
702 		else if(firstCharacter == '='){
703 			fragment.addOutAtom(fragment.getFirstAtom(), 2, true);
704 		}
705 		else if (firstCharacter == '#'){
706 			fragment.addOutAtom(fragment.getFirstAtom(), 3, true);
707 		}
708 
709 		if(lastCharacter == '-' || lastCharacter == '=' || lastCharacter == '#') {
710 			Atom lastAtom = instance.getInscopeAtom();//note that in something like C(=O)- this would be the carbon not the oxygen
711 			if (lastCharacter == '#'){
712 				fragment.addOutAtom(lastAtom, 3, true);
713 			}
714 			else if (lastCharacter == '='){
715 				fragment.addOutAtom(lastAtom, 2, true);
716 			}
717 			else{
718 				fragment.addOutAtom(lastAtom, 1, true);
719 			}
720 		}
721 
722 		for (Atom atom : atomList) {
723 			if (atom.getProperty(Atom.SMILES_HYDROGEN_COUNT) != null && atom.getLambdaConventionValency() == null){
724 				setupAtomValency(atom);
725 			}
726 		}
727 		CycleDetector.assignWhetherAtomsAreInCycles(fragment);
728 		return fragment;
729 	}
730 
processLabelling(String labelMapping, List<Atom> atomList)731 	private void processLabelling(String labelMapping, List<Atom> atomList) throws StructureBuildingException {
732 		if (labelMapping.equals(NONE_LABELS_VAL) || labelMapping.length() == 0) {
733 			return;
734 		}
735 		if (labelMapping.equals(NUMERIC_LABELS_VAL)) {
736 			int atomNumber = 1;
737 			for (Atom atom : atomList) {
738 				atom.addLocant(Integer.toString(atomNumber++));
739 			}
740 		}
741 		else if(labelMapping.equals(FUSEDRING_LABELS_VAL)) {//fragment is a fusedring with atoms in the correct order for fused ring numbering
742 			//this will do stuff like changing labels from 1,2,3,4,5,6,7,8,9,10->1,2,3,4,4a,5,6,7,8,8a
743 			FragmentTools.relabelLocantsAsFusedRingSystem(atomList);
744 		}
745 		else{
746 			String[] labelMap = labelMapping.split("/", -1);//place slash delimited labels into an array
747 			int numOfAtoms = atomList.size();
748 			if (labelMap.length != numOfAtoms){
749 				throw new StructureBuildingException("Group numbering has been invalidly defined in resource file: labels: " +labelMap.length + ", atoms: " + numOfAtoms );
750 			}
751 			for (int i = 0; i < numOfAtoms; i++) {
752 				String labels[] = labelMap[i].split(",");
753 				for (String label : labels) {
754 					if (label.length() > 0) {
755 						atomList.get(i).addLocant(label);
756 					}
757 				}
758 			}
759 		}
760 	}
761 
verifyAndTakeIntoAccountLonePairsInAtomParities(List<Atom> atomList)762 	private void verifyAndTakeIntoAccountLonePairsInAtomParities(List<Atom> atomList) throws StructureBuildingException {
763 		for (Atom atom : atomList) {
764 			AtomParity atomParity = atom.getAtomParity();
765 			if (atomParity != null){
766 				Atom[] atomRefs4 = atomParity.getAtomRefs4();
767 				int nullAtoms = 0;
768 				int hydrogen = 0;
769 				for (Atom atomRefs4Atom : atomRefs4) {
770 					if (atomRefs4Atom == null){
771 						nullAtoms++;
772 					}
773 					else if (atomRefs4Atom.equals(AtomParity.hydrogen)){
774 						hydrogen++;
775 					}
776 				}
777 				if (nullAtoms != 0){
778 					if (nullAtoms ==1 && hydrogen==0 &&
779 							(atom.getElement() == ChemEl.N || atom.getElement() == ChemEl.S || atom.getElement() == ChemEl.Se)){//special case where lone pair is part of the tetrahedron
780 						if (atomList.indexOf(atomRefs4[0]) < atomList.indexOf(atom)){//is there an atom in the SMILES in front of the stereocentre?
781 							atomRefs4[3] = atomRefs4[2];
782 							atomRefs4[2] = atomRefs4[1];
783 							atomRefs4[1] = atom;
784 						}
785 						else{
786 							atomRefs4[3] = atomRefs4[2];
787 							atomRefs4[2] = atomRefs4[1];
788 							atomRefs4[1] = atomRefs4[0];
789 							atomRefs4[0] = atom;
790 						}
791 					}
792 					else{
793 						throw new StructureBuildingException("SMILES is malformed. Tetrahedral stereochemistry defined on a non tetrahedral centre");
794 					}
795 				}
796 			}
797 		}
798 	}
799 
addBondStereoElements(Fragment currentFrag)800 	private void addBondStereoElements(Fragment currentFrag) throws StructureBuildingException {
801 		Set<Bond> bonds = currentFrag.getBondSet();
802 		for (Bond centralBond : bonds) {//identify cases of E/Z stereochemistry and add appropriate bondstereo tags
803 			if (centralBond.getOrder() == 2) {
804 				List<Bond> fromAtomBonds = centralBond.getFromAtom().getBonds();
805 				for (Bond preceedingBond : fromAtomBonds) {
806 					if (preceedingBond.getSmilesStereochemistry() != null) {
807 						List<Bond> toAtomBonds = centralBond.getToAtom().getBonds();
808 						for (Bond followingBond : toAtomBonds) {
809 							if (followingBond.getSmilesStereochemistry() != null) {//now found a double bond surrounded by two bonds with slashs
810 								boolean upFirst;
811 								boolean upSecond;
812 								Atom atom2 = centralBond.getFromAtom();
813 								Atom atom3 = centralBond.getToAtom();
814 								Atom atom1 = preceedingBond.getOtherAtom(atom2);
815 								Atom atom4 = followingBond.getOtherAtom(atom3);
816 								if (preceedingBond.getSmilesStereochemistry() == SMILES_BOND_DIRECTION.LSLASH) {
817 									upFirst = preceedingBond.getToAtom() == atom2;//in normally constructed SMILES this will be the case but you could write C(/F)=C/F instead of F\C=C/F
818 								}
819 								else if (preceedingBond.getSmilesStereochemistry() == SMILES_BOND_DIRECTION.RSLASH) {
820 									upFirst = preceedingBond.getToAtom() != atom2;
821 								}
822 								else{
823 									throw new StructureBuildingException(preceedingBond.getSmilesStereochemistry() + " is not a slash!");
824 								}
825 
826 								if (followingBond.getSmilesStereochemistry() == SMILES_BOND_DIRECTION.LSLASH) {
827 									upSecond = followingBond.getFromAtom() != atom3;
828 								}
829 								else if (followingBond.getSmilesStereochemistry() == SMILES_BOND_DIRECTION.RSLASH) {
830 									upSecond = followingBond.getFromAtom() == atom3;
831 								}
832 								else{
833 									throw new StructureBuildingException(followingBond.getSmilesStereochemistry() + " is not a slash!");
834 								}
835 								BondStereoValue cisTrans = upFirst == upSecond ? BondStereoValue.CIS : BondStereoValue.TRANS;
836 								if (centralBond.getBondStereo() != null) {
837 									//double bond has redundant specification e.g. C/C=C\\1/NC1 hence need to check it is consistent
838 									Atom[] atomRefs4 = centralBond.getBondStereo().getAtomRefs4();
839 									if (atomRefs4[0].equals(atom1) || atomRefs4[3].equals(atom4)) {
840 										if (centralBond.getBondStereo().getBondStereoValue().equals(cisTrans)){
841 											throw new StructureBuildingException("Contradictory double bond stereoconfiguration");
842 										}
843 									}
844 									else{
845 										if (!centralBond.getBondStereo().getBondStereoValue().equals(cisTrans)){
846 											throw new StructureBuildingException("Contradictory double bond stereoconfiguration");
847 										}
848 									}
849 								}
850 								else{
851 									Atom[] atomRefs4= new Atom[4];
852 									atomRefs4[0] = atom1;
853 									atomRefs4[1] = atom2;
854 									atomRefs4[2] = atom3;
855 									atomRefs4[3] = atom4;
856 									centralBond.setBondStereoElement(atomRefs4, cisTrans);
857 								}
858 							}
859 						}
860 					}
861 				}
862 			}
863 		}
864 		for (Bond bond : bonds) {
865 			bond.setSmilesStereochemistry(null);
866 		}
867 	}
868 
869 	/**
870 	 * Utilises the atom's hydrogen count as set by the SMILES as well as incoming valency to determine the atom's valency
871 	 * If the atom is charged whether protons have been added or removed will also need to be determined
872 	 * @param atom
873 	 * @throws StructureBuildingException
874 	 */
setupAtomValency(Atom atom)875 	private void setupAtomValency(Atom atom) throws StructureBuildingException {
876 		int hydrogenCount = atom.getProperty(Atom.SMILES_HYDROGEN_COUNT);
877 		int incomingValency = atom.getIncomingValency() + hydrogenCount +atom.getOutValency();
878 		int charge = atom.getCharge();
879 		int absoluteCharge =Math.abs(charge);
880 		ChemEl chemEl = atom.getElement();
881 		if (atom.hasSpareValency()) {
882 			Integer hwValency = ValencyChecker.getHWValency(chemEl);
883 			if (hwValency == null || absoluteCharge > 1) {
884 				throw new StructureBuildingException(chemEl +" is not expected to be aromatic!");
885 			}
886 			if (absoluteCharge != 0) {
887 				Integer[] possibleVal = ValencyChecker.getPossibleValencies(chemEl, charge);
888 				if (possibleVal != null && possibleVal.length > 0) {
889 					hwValency = possibleVal[0];
890 				}
891 				else {
892 					throw new StructureBuildingException(chemEl +" with charge " + charge + " is not expected to be aromatic!");
893 				}
894 			}
895 			if (incomingValency < hwValency){
896 				incomingValency++;
897 			}
898 		}
899 		Integer defaultVal = ValencyChecker.getDefaultValency(chemEl);
900 		if (defaultVal !=null){//s or p block element
901 			if (defaultVal != incomingValency || charge !=0) {
902 				if (Math.abs(incomingValency - defaultVal) == absoluteCharge) {
903 					atom.setProtonsExplicitlyAddedOrRemoved(incomingValency - defaultVal);
904 				}
905 				else{
906 					Integer[] unchargedStableValencies = ValencyChecker.getPossibleValencies(chemEl, 0);
907 					boolean hasPlausibleValency =false;
908 					for (Integer unchargedStableValency : unchargedStableValencies) {
909 						if (Math.abs(incomingValency - unchargedStableValency)==Math.abs(charge)){
910 							atom.setProtonsExplicitlyAddedOrRemoved(incomingValency - unchargedStableValency);
911 							//we strictly set the valency if a charge is specified but are more loose about things if uncharged e.g. allow penta substituted phosphine
912 							if (charge != 0) {
913 								atom.setLambdaConventionValency(unchargedStableValency);
914 							}
915 							else{
916 								atom.setMinimumValency(incomingValency);
917 							}
918 							hasPlausibleValency=true;
919 							break;
920 						}
921 					}
922 					if (!hasPlausibleValency){//could be something like [Sn] which would be expected to be attached to later
923 						atom.setMinimumValency(incomingValency);
924 					}
925 				}
926 			}
927 		}
928 		else{
929 			if (hydrogenCount > 0){//make hydrogen explicit
930 				Fragment frag =atom.getFrag();
931 				for (int i = 0; i < hydrogenCount; i++) {
932 					Atom hydrogen = createAtom(ChemEl.H, frag);
933 					createBond(atom, hydrogen, 1);
934 				}
935 			}
936 		}
937 	}
938 
939 
940 	/**
941 	 * Create a new Atom of the given element belonging to the given fragment
942 	 * @param elementSymbol
943 	 * @param frag
944 	 * @return Atom
945 	 */
createAtom(String elementSymbol, Fragment frag)946 	private Atom createAtom(String elementSymbol, Fragment frag) {
947 		return createAtom(ChemEl.valueOf(elementSymbol), frag);
948 	}
949 
950 	/**
951 	 * Create a new Atom of the given element belonging to the given fragment
952 	 * @param chemEl
953 	 * @param frag
954 	 * @return Atom
955 	 */
createAtom(ChemEl chemEl, Fragment frag)956 	private Atom createAtom(ChemEl chemEl, Fragment frag) {
957 		Atom a = new Atom(idManager.getNextID(), chemEl, frag);
958 		frag.addAtom(a);
959 		return a;
960 	}
961 
962 	/**
963 	 * Create a new bond between two atoms.
964 	 * The bond is associated with these atoms.
965 	 * @param fromAtom
966 	 * @param toAtom
967 	 * @param bondOrder
968 	 * @return Bond
969 	 */
createBond(Atom fromAtom, Atom toAtom, int bondOrder)970 	private Bond createBond(Atom fromAtom, Atom toAtom, int bondOrder) {
971 		Bond b = new Bond(fromAtom, toAtom, bondOrder);
972 		fromAtom.addBond(b);
973 		toAtom.addBond(b);
974 		fromAtom.getFrag().addBond(b);
975 		return b;
976 	}
977 
is_A_to_Z(char ch)978 	private boolean is_A_to_Z(char ch) {
979 		return ch >= 'A' && ch <= 'Z';
980 	}
981 
is_a_to_z(char ch)982 	private boolean is_a_to_z(char ch) {
983 		return ch >= 'a' && ch <= 'z';
984 	}
985 
is_0_to_9(char ch)986 	private boolean is_0_to_9(char ch){
987 		return ch >= '0' && ch <= '9';
988 	}
989 
990 }
991