1 package uk.ac.cam.ch.wwmm.opsin;
2 
3 import java.util.ArrayList;
4 import java.util.List;
5 import static uk.ac.cam.ch.wwmm.opsin.OpsinTools.*;
6 
7 /**
8  * Tools for dealing uniformly with unusually-formed words.
9  */
10 class WordTools {
11 	/**
12 	 * Splits cases where the parseTokensList describes a functionalTerm in addition to another mainGroup/substituent into two parseWords
13 	 * This occurs if the name is formally missing a space e.g. ethylthiocyanate.
14 	 * If multiple parses are present then it may be possible to disambiguate between them:
15 	 * 	parses with omitted spaces are discarded if a parse without omitted space is found
16 	 * 	parses with shorter functional terms are discarded e.g. ethylthiocyanate is [ethyl] [thiocyanate] not [ethylthio] [cyanate]
17 	 * @param parseTokensList
18 	 * @param chemicalName
19 	 * @return
20 	 */
splitIntoParseWords(List<ParseTokens> parseTokensList, String chemicalName)21 	static List<ParseWord> splitIntoParseWords(List<ParseTokens> parseTokensList, String chemicalName) {
22 		List<ParseTokens> wellFormedParseTokens = new ArrayList<ParseTokens>();//these are all in the same word as would be expected
23 		List<List<ParseTokens>> splitParseTokensForEachParseTokens = new ArrayList<List<ParseTokens>>();
24 		/*
25 		 * Each ParseTokens is split into the number of words it describes
26 		 * e.g. ethylchloride has one interpretation so splitParseTokensList will have one entry
27 		 * This entry will be formed of TWO parseTokens, one for the ethyl and one for the chloride
28 		 */
29 		int leastWordsInOmmittedSpaceParse = Integer.MAX_VALUE;//we want the least number of words i.e. less omitted spaces
30 		int longestFunctionalTermEncountered = 0;//we want the longest functional term
31 		for (ParseTokens parseTokens : parseTokensList) {
32 			List<Character> annotations = parseTokens.getAnnotations();
33 			List<List<Character>> chunkedAnnotations = chunkAnnotations(annotations);//chunked into mainGroup/substituent/functionalTerm
34 			if (containsOmittedSpace(chunkedAnnotations)){
35 				List<ParseTokens> omittedWordParseTokens = new ArrayList<ParseTokens>();
36 				List<String> tokens = parseTokens.getTokens();
37 				List<Character> newAnnotations = new ArrayList<Character>();
38 				List<String> newTokens = new ArrayList<String>();
39 				int currentFunctionalTermLength = 0;
40 				int annotPos = 0;
41 				for (List<Character> annotationList : chunkedAnnotations) {
42 					Character finalAnnotationInList = annotationList.get(annotationList.size() - 1);
43 					if (finalAnnotationInList.equals(END_OF_FUNCTIONALTERM) && newAnnotations.size() > 0) {
44 						//create a new parseTokens for the substituent/maingroup preceding the functional term
45 						//not necessary if the functional term is the first thing to be read e.g. in the case of poly
46 						omittedWordParseTokens.add(new ParseTokens(newTokens, newAnnotations));
47 						newAnnotations = new ArrayList<Character>();
48 						newTokens = new ArrayList<String>();
49 					}
50 					for (Character annotation : annotationList) {
51 						newAnnotations.add(annotation);
52 						newTokens.add(tokens.get(annotPos++));
53 					}
54 					if (finalAnnotationInList.equals(END_OF_FUNCTIONALTERM) || finalAnnotationInList.equals(END_OF_MAINGROUP) || annotPos == tokens.size()) {
55 						omittedWordParseTokens.add(new ParseTokens(newTokens, newAnnotations));
56 						if (finalAnnotationInList.equals(END_OF_FUNCTIONALTERM)){
57 							currentFunctionalTermLength = StringTools.stringListToString(newTokens, "").length();
58 						}
59 						newAnnotations = new ArrayList<Character>();
60 						newTokens = new ArrayList<String>();
61 					}
62 				}
63 				if (omittedWordParseTokens.size() <= leastWordsInOmmittedSpaceParse){
64 					if (omittedWordParseTokens.size() < leastWordsInOmmittedSpaceParse){
65 						splitParseTokensForEachParseTokens.clear();
66 						leastWordsInOmmittedSpaceParse = omittedWordParseTokens.size();
67 						longestFunctionalTermEncountered = 0;
68 					}
69 					if (currentFunctionalTermLength >=longestFunctionalTermEncountered){
70 						if (currentFunctionalTermLength > longestFunctionalTermEncountered){
71 							splitParseTokensForEachParseTokens.clear();
72 							longestFunctionalTermEncountered =currentFunctionalTermLength;
73 						}
74 						splitParseTokensForEachParseTokens.add(omittedWordParseTokens);
75 					}
76 				}
77 			} else {
78 				wellFormedParseTokens.add(parseTokens);
79 			}
80 		}
81 		List<ParseWord> parseWords = new ArrayList<ParseWord>();
82 		if (!wellFormedParseTokens.isEmpty()) {
83 			parseWords.add(new ParseWord(chemicalName, wellFormedParseTokens));
84 		} else {
85 			for (int i = 0; i < leastWordsInOmmittedSpaceParse; i++) {
86 				List<ParseTokens> parseTokensForWord = new ArrayList<ParseTokens>();
87 				for (List<ParseTokens> parseTokens : splitParseTokensForEachParseTokens) {
88 					if (!parseTokensForWord.contains(parseTokens.get(i))){//if only one word is ambiguous there is no need for the unambiguous word to have multiple identical interpretation
89 						parseTokensForWord.add(parseTokens.get(i));
90 					}
91 				}
92 				parseWords.add(new ParseWord(StringTools.stringListToString(parseTokensForWord.get(0).getTokens(), ""), parseTokensForWord));
93 			}
94 		}
95 		return parseWords;
96 	}
97 
containsOmittedSpace(List<List<Character>> chunkedAnnotations)98 	private static boolean containsOmittedSpace(List<List<Character>> chunkedAnnotations) {
99 		if (chunkedAnnotations.size() > 1){//there are multiple subsitutents/maingroup/functionalterms
100 			for (List<Character> annotationList : chunkedAnnotations) {
101 				if (annotationList.contains(END_OF_FUNCTIONALTERM)){
102 					return true;
103 				}
104 			}
105 		}
106 		return false;
107 	}
108 
109 	/**Groups the token annotations for a given word into substituent/s and/or a maingroup and/or functionalTerm by
110 	 * looking for the endOfSubstituent/endOfMainGroup/endOfFunctionalTerm annotations
111 	 *
112 	 * @param annots The annotations for a word.
113 	 * @return A List of lists of annotations, each list corresponds to a substituent/maingroup/functionalTerm
114 	 */
chunkAnnotations(List<Character> annots)115 	static List<List<Character>> chunkAnnotations(List<Character> annots) {
116 		List<List<Character>> chunkList = new ArrayList<List<Character>>();
117 		List<Character> currentTerm = new ArrayList<Character>();
118 		for (Character annot : annots) {
119 			currentTerm.add(annot);
120 			char ch = annot;
121 			if (ch == END_OF_SUBSTITUENT || ch == END_OF_MAINGROUP || ch == END_OF_FUNCTIONALTERM) {
122 				chunkList.add(currentTerm);
123 				currentTerm = new ArrayList<Character>();
124 			}
125 		}
126 		return chunkList;
127 	}
128 
129 	/**
130 	 * Works left to right removing spaces if there are too many opening brackets
131 	 * @param name
132 	 * @return
133 	 * @throws ParsingException If brackets are unbalanced and cannot be balanced by removing whitespace
134 	 */
removeWhiteSpaceIfBracketsAreUnbalanced(String name)135 	static String removeWhiteSpaceIfBracketsAreUnbalanced(String name) throws ParsingException {
136 		int bracketLevel = 0;
137 		int stringLength = name.length();
138 		for (int i = 0; i < stringLength; i++) {
139 			char c = name.charAt(i);
140 			if (c == '(' || c == '[' || c == '{') {
141 				bracketLevel++;
142 			} else if (c == ')' || c == ']' || c == '}') {
143 				bracketLevel--;
144 			} else if (c == ' ' && bracketLevel > 0) {//brackets unbalanced and a space has been encountered!
145 				name = name.substring(0, i) + name.substring(i + 1);
146 				stringLength = name.length();
147 				i--;
148 			}
149 		}
150 		if (bracketLevel > 0) {
151 			throw new ParsingException("Unmatched opening bracket found in :" + name);
152 		} else if (bracketLevel < 0) {
153 			throw new ParsingException("Unmatched closing bracket found in :" + name);
154 		}
155 		return name;
156 	}
157 }
158