1 package uk.ac.cam.ch.wwmm.opsin; 2 3 import java.util.ArrayList; 4 import java.util.List; 5 import static uk.ac.cam.ch.wwmm.opsin.OpsinTools.*; 6 7 /** 8 * Tools for dealing uniformly with unusually-formed words. 9 */ 10 class WordTools { 11 /** 12 * Splits cases where the parseTokensList describes a functionalTerm in addition to another mainGroup/substituent into two parseWords 13 * This occurs if the name is formally missing a space e.g. ethylthiocyanate. 14 * If multiple parses are present then it may be possible to disambiguate between them: 15 * parses with omitted spaces are discarded if a parse without omitted space is found 16 * parses with shorter functional terms are discarded e.g. ethylthiocyanate is [ethyl] [thiocyanate] not [ethylthio] [cyanate] 17 * @param parseTokensList 18 * @param chemicalName 19 * @return 20 */ splitIntoParseWords(List<ParseTokens> parseTokensList, String chemicalName)21 static List<ParseWord> splitIntoParseWords(List<ParseTokens> parseTokensList, String chemicalName) { 22 List<ParseTokens> wellFormedParseTokens = new ArrayList<ParseTokens>();//these are all in the same word as would be expected 23 List<List<ParseTokens>> splitParseTokensForEachParseTokens = new ArrayList<List<ParseTokens>>(); 24 /* 25 * Each ParseTokens is split into the number of words it describes 26 * e.g. ethylchloride has one interpretation so splitParseTokensList will have one entry 27 * This entry will be formed of TWO parseTokens, one for the ethyl and one for the chloride 28 */ 29 int leastWordsInOmmittedSpaceParse = Integer.MAX_VALUE;//we want the least number of words i.e. less omitted spaces 30 int longestFunctionalTermEncountered = 0;//we want the longest functional term 31 for (ParseTokens parseTokens : parseTokensList) { 32 List<Character> annotations = parseTokens.getAnnotations(); 33 List<List<Character>> chunkedAnnotations = chunkAnnotations(annotations);//chunked into mainGroup/substituent/functionalTerm 34 if (containsOmittedSpace(chunkedAnnotations)){ 35 List<ParseTokens> omittedWordParseTokens = new ArrayList<ParseTokens>(); 36 List<String> tokens = parseTokens.getTokens(); 37 List<Character> newAnnotations = new ArrayList<Character>(); 38 List<String> newTokens = new ArrayList<String>(); 39 int currentFunctionalTermLength = 0; 40 int annotPos = 0; 41 for (List<Character> annotationList : chunkedAnnotations) { 42 Character finalAnnotationInList = annotationList.get(annotationList.size() - 1); 43 if (finalAnnotationInList.equals(END_OF_FUNCTIONALTERM) && newAnnotations.size() > 0) { 44 //create a new parseTokens for the substituent/maingroup preceding the functional term 45 //not necessary if the functional term is the first thing to be read e.g. in the case of poly 46 omittedWordParseTokens.add(new ParseTokens(newTokens, newAnnotations)); 47 newAnnotations = new ArrayList<Character>(); 48 newTokens = new ArrayList<String>(); 49 } 50 for (Character annotation : annotationList) { 51 newAnnotations.add(annotation); 52 newTokens.add(tokens.get(annotPos++)); 53 } 54 if (finalAnnotationInList.equals(END_OF_FUNCTIONALTERM) || finalAnnotationInList.equals(END_OF_MAINGROUP) || annotPos == tokens.size()) { 55 omittedWordParseTokens.add(new ParseTokens(newTokens, newAnnotations)); 56 if (finalAnnotationInList.equals(END_OF_FUNCTIONALTERM)){ 57 currentFunctionalTermLength = StringTools.stringListToString(newTokens, "").length(); 58 } 59 newAnnotations = new ArrayList<Character>(); 60 newTokens = new ArrayList<String>(); 61 } 62 } 63 if (omittedWordParseTokens.size() <= leastWordsInOmmittedSpaceParse){ 64 if (omittedWordParseTokens.size() < leastWordsInOmmittedSpaceParse){ 65 splitParseTokensForEachParseTokens.clear(); 66 leastWordsInOmmittedSpaceParse = omittedWordParseTokens.size(); 67 longestFunctionalTermEncountered = 0; 68 } 69 if (currentFunctionalTermLength >=longestFunctionalTermEncountered){ 70 if (currentFunctionalTermLength > longestFunctionalTermEncountered){ 71 splitParseTokensForEachParseTokens.clear(); 72 longestFunctionalTermEncountered =currentFunctionalTermLength; 73 } 74 splitParseTokensForEachParseTokens.add(omittedWordParseTokens); 75 } 76 } 77 } else { 78 wellFormedParseTokens.add(parseTokens); 79 } 80 } 81 List<ParseWord> parseWords = new ArrayList<ParseWord>(); 82 if (!wellFormedParseTokens.isEmpty()) { 83 parseWords.add(new ParseWord(chemicalName, wellFormedParseTokens)); 84 } else { 85 for (int i = 0; i < leastWordsInOmmittedSpaceParse; i++) { 86 List<ParseTokens> parseTokensForWord = new ArrayList<ParseTokens>(); 87 for (List<ParseTokens> parseTokens : splitParseTokensForEachParseTokens) { 88 if (!parseTokensForWord.contains(parseTokens.get(i))){//if only one word is ambiguous there is no need for the unambiguous word to have multiple identical interpretation 89 parseTokensForWord.add(parseTokens.get(i)); 90 } 91 } 92 parseWords.add(new ParseWord(StringTools.stringListToString(parseTokensForWord.get(0).getTokens(), ""), parseTokensForWord)); 93 } 94 } 95 return parseWords; 96 } 97 containsOmittedSpace(List<List<Character>> chunkedAnnotations)98 private static boolean containsOmittedSpace(List<List<Character>> chunkedAnnotations) { 99 if (chunkedAnnotations.size() > 1){//there are multiple subsitutents/maingroup/functionalterms 100 for (List<Character> annotationList : chunkedAnnotations) { 101 if (annotationList.contains(END_OF_FUNCTIONALTERM)){ 102 return true; 103 } 104 } 105 } 106 return false; 107 } 108 109 /**Groups the token annotations for a given word into substituent/s and/or a maingroup and/or functionalTerm by 110 * looking for the endOfSubstituent/endOfMainGroup/endOfFunctionalTerm annotations 111 * 112 * @param annots The annotations for a word. 113 * @return A List of lists of annotations, each list corresponds to a substituent/maingroup/functionalTerm 114 */ chunkAnnotations(List<Character> annots)115 static List<List<Character>> chunkAnnotations(List<Character> annots) { 116 List<List<Character>> chunkList = new ArrayList<List<Character>>(); 117 List<Character> currentTerm = new ArrayList<Character>(); 118 for (Character annot : annots) { 119 currentTerm.add(annot); 120 char ch = annot; 121 if (ch == END_OF_SUBSTITUENT || ch == END_OF_MAINGROUP || ch == END_OF_FUNCTIONALTERM) { 122 chunkList.add(currentTerm); 123 currentTerm = new ArrayList<Character>(); 124 } 125 } 126 return chunkList; 127 } 128 129 /** 130 * Works left to right removing spaces if there are too many opening brackets 131 * @param name 132 * @return 133 * @throws ParsingException If brackets are unbalanced and cannot be balanced by removing whitespace 134 */ removeWhiteSpaceIfBracketsAreUnbalanced(String name)135 static String removeWhiteSpaceIfBracketsAreUnbalanced(String name) throws ParsingException { 136 int bracketLevel = 0; 137 int stringLength = name.length(); 138 for (int i = 0; i < stringLength; i++) { 139 char c = name.charAt(i); 140 if (c == '(' || c == '[' || c == '{') { 141 bracketLevel++; 142 } else if (c == ')' || c == ']' || c == '}') { 143 bracketLevel--; 144 } else if (c == ' ' && bracketLevel > 0) {//brackets unbalanced and a space has been encountered! 145 name = name.substring(0, i) + name.substring(i + 1); 146 stringLength = name.length(); 147 i--; 148 } 149 } 150 if (bracketLevel > 0) { 151 throw new ParsingException("Unmatched opening bracket found in :" + name); 152 } else if (bracketLevel < 0) { 153 throw new ParsingException("Unmatched closing bracket found in :" + name); 154 } 155 return name; 156 } 157 } 158