package uk.ac.cam.ch.wwmm.opsin;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**Static routines for string manipulation.
 *
 * @author ptc24
 * @author dl387
 *
 */
class StringTools {

    /**
     * Converts a list of strings into a single string delimited by the given separator.
     * The separator is only placed between elements; an empty list gives an empty string.
     *
     * @param list A list of strings.
     * @param separator The text to place between consecutive elements.
     * @return The corresponding string.
     */
    static String stringListToString(List<String> list, String separator) {
        StringBuilder sb = new StringBuilder();
        boolean isFirst = true;
        // Iterate rather than index so that non-random-access lists are not walked repeatedly
        for (String element : list) {
            if (!isFirst) {
                sb.append(separator);
            }
            sb.append(element);
            isFirst = false;
        }
        return sb.toString();
    }

    /**Produce repetitions of a string. Eg. HelloWorld * 2 = HelloWorldHelloWorld.
     * A count of zero or less gives an empty string.
     *
     * @param s The string to multiply.
     * @param n The number of times to multiply it.
     * @return The multiplied string.
     */
    static String multiplyString(String s, int n) {
        StringBuilder sb = new StringBuilder();
        for (int repetition = 0; repetition < n; repetition++) {
            sb.append(s);
        }
        return sb.toString();
    }

    /**Joins an array of strings into a single string.
     * The separator is only placed between elements; an empty array gives an empty string.
     *
     * @param stringArray The strings to join together.
     * @param separator The separator to use.
     * @return The resulting string.
     */
    static String arrayToString(String[] stringArray, String separator) {
        StringBuilder sb = new StringBuilder();
        boolean isFirst = true;
        for (String element : stringArray) {
            if (!isFirst) {
                sb.append(separator);
            }
            sb.append(element);
            isFirst = false;
        }
        return sb.toString();
    }

    /**Converts a unicode string into ASCII
     * e.g. converting Greek letters to their names (e.g. alpha).
     * Tab, line feed, vertical tab, form feed and carriage return each become a space,
     * a back tick becomes an apostrophe and a quotation mark becomes two apostrophes.
     * Any other character below U+0020 is dropped.
     * Unrecognised non-ASCII characters trigger an exception.
     *
     * @param s The string to convert
     * @return The converted string
     * @throws PreProcessingException If a non-ASCII character has no hard coded replacement
     */
    static String convertNonAsciiAndNormaliseRepresentation(String s) throws PreProcessingException {
        int length = s.length();
        StringBuilder sb = new StringBuilder(length);
        for (int i = 0; i < length; i++) {
            char c = s.charAt(i);
            switch (c) {
            case '\t':
            case '\n':
            case '\u000B'://vertical tab
            case '\f':
            case '\r':
                //normalise white space
                sb.append(" ");
                break;
            case '`':
                sb.append("'");//replace back ticks with apostrophe
                break;
            case '"':
                sb.append("''");//replace quotation mark with two primes
                break;
            default:
                if (c >= 128) {
                    //replace non ascii characters with hard coded ascii strings
                    sb.append(getReplacementForNonASCIIChar(c));
                }
                else if (c > 31) {//ignore control characters
                    sb.append(c);
                }
            }
        }
        return sb.toString();
    }

    /**
     * Looks up the hard coded ASCII replacement for a single non-ASCII character.
     * The replacement may be longer than one character or may be the empty string.
     *
     * @param c The character to replace
     * @return The ASCII replacement
     * @throws PreProcessingException If the character has no replacement defined here
     */
    private static String getReplacementForNonASCIIChar(char c) throws PreProcessingException {
        switch (c) {
        //greeks
        case '\u03b1': return "alpha";
        case '\u03b2': return "beta";
        case '\u03b3': return "gamma";
        case '\u03b4': return "delta";
        case '\u03b5': return "epsilon";
        case '\u03b6': return "zeta";
        case '\u03b7': return "eta";
        case '\u03b8': return "theta";
        case '\u03b9': return "iota";
        case '\u03ba': return "kappa";
        case '\u03bb': return "lambda";
        case '\u03bc': return "mu";
        case '\u03bd': return "nu";
        case '\u03be': return "xi";
        case '\u03bf': return "omicron";
        case '\u03c0': return "pi";
        case '\u03c1': return "rho";
        // NOTE(review): U+03C2 is the final form of sigma, yet it is mapped to "stigma";
        // left unchanged as downstream handling is not visible here - confirm whether "sigma" was intended
        case '\u03c2': return "stigma";
        case '\u03c3': return "sigma";
        case '\u03c4': return "tau";
        case '\u03c5': return "upsilon";
        case '\u03c6': return "phi";
        case '\u03c7': return "chi";
        case '\u03c8': return "psi";
        case '\u03c9': return "omega";

        //small capitals
        case '\u1D05': return "D";
        case '\u029F': return "L";

        case '\u00B1': return "+-";//plus minus symbol
        case '\u2213': return "-+";

        //right arrows
        case '\u2192':
        case '\u2794':
        case '\u2799':
        case '\u279C': return "->";

        //common ligatures
        case '\u00C6': return "AE";
        case '\u00E6': return "ae";
        case '\u0152': return "OE";
        case '\u0153': return "oe";
        case '\u0132': return "IJ";
        case '\u0133': return "ij";
        case '\u1D6B': return "ue";
        case '\uFB00': return "ff";
        case '\uFB01': return "fi";
        case '\uFB02': return "fl";
        case '\uFB03': return "ffi";
        case '\uFB04': return "ffl";
        case '\uFB06': return "st";

        //diacritics, grouped by the base letter they are reduced to
        case '\u00E0':
        case '\u00E1':
        case '\u00E2':
        case '\u00E3':
        case '\u00E4':
        case '\u00E5': return "a";
        case '\u00C0':
        case '\u00C1':
        case '\u00C2':
        case '\u00C3':
        case '\u00C4':
        case '\u00C5': return "A";
        case '\u00E7': return "c";
        case '\u00C7': return "C";
        case '\u00E8':
        case '\u00E9':
        case '\u00EA':
        case '\u00EB':
        case '\u0115':
        case '\u0117': return "e";
        case '\u00C8':
        case '\u00C9':
        case '\u00CA':
        case '\u00CB':
        case '\u0114':
        case '\u0116': return "E";
        case '\u00EC':
        case '\u00ED':
        case '\u00EE':
        case '\u00EF': return "i";
        case '\u00CC':
        case '\u00CD':
        case '\u00CE':
        case '\u00CF': return "I";
        case '\u00F2':
        case '\u00F3':
        case '\u00F4':
        case '\u00F5':
        case '\u00F6': return "o";
        case '\u00D2':
        case '\u00D3':
        case '\u00D4':
        case '\u00D5':
        case '\u00D6': return "O";
        case '\u00F9':
        case '\u00FA':
        case '\u00FB':
        case '\u00FC': return "u";
        case '\u00D9':
        case '\u00DA':
        case '\u00DB':
        case '\u00DC': return "U";
        case '\u00FD': return "y";
        case '\u00DD': return "Y";

        //superscripts
        case '\u2070': return "0";
        case '\u00B9': return "1";
        case '\u00B2': return "2";
        case '\u00B3': return "3";
        case '\u2074': return "4";
        case '\u2075': return "5";
        case '\u2076': return "6";
        case '\u2077': return "7";
        case '\u2078': return "8";
        case '\u2079': return "9";

        //subscripts
        case '\u2080': return "0";
        case '\u2081': return "1";
        case '\u2082': return "2";
        case '\u2083': return "3";
        case '\u2084': return "4";
        case '\u2085': return "5";
        case '\u2086': return "6";
        case '\u2087': return "7";
        case '\u2088': return "8";
        case '\u2089': return "9";

        //quotation marks and primes (map to apostrophe/s)
        case '\u2018': return "'";
        case '\u2019': return "'";
        case '\u201B': return "'";
        case '\u02BC': return "'";
        case '\u201C': return "''";
        case '\u201D': return "''";
        case '\u2032': return "'";//primes
        case '\u2033': return "''";
        case '\u2034': return "'''";
        case '\u2057': return "''''";
        case '\u02B9': return "'";//modifier primes
        case '\u02BA': return "''";
        case '\u2035': return "'";//back primes
        case '\u2036': return "''";
        case '\u2037': return "'''";
        case '\u00B4': return "'";//accents
        case '\u02CA': return "'";
        case '\u0301': return "'";
        case '\u02DD': return "''";
        case '\u030B': return "''";

        //dashes, hyphens and the minus sign
        case '\u2010':
        case '\u2011':
        case '\u2012':
        case '\u2013':
        case '\u2014':
        case '\u2015':
        case '\u2212': return "-";

        case '\u02DC'://small tilde
        case '\u223C'://tilde operator
        case '\u301C': return "~";//wave dash

        //full width punctuation
        case '\uff0c': return ",";
        case '\uFF1A': return ":";
        case '\uFF1B': return ";";
        case '\uFF08': return "(";
        case '\uFF09': return ")";
        case '\uFF3B': return "[";
        case '\uFF3D': return "]";
        case '\u3010': return "[";
        case '\u3011': return "]";
        case '\uFF5B': return "{";
        case '\uFF5D': return "}";

        case '\u00DF': return "beta";//similar glyph

        //different sized spaces
        case '\u2000':
        case '\u2001':
        case '\u2002':
        case '\u2003':
        case '\u2004':
        case '\u2005':
        case '\u2006':
        case '\u2008':
        case '\u2009':
        case '\u200A':
        case '\u205F':
        case '\u00A0'://Non-breaking spaces
        case '\u2007':
        case '\u202F':
        case '\u3000': return " ";//ideographic space

        //characters that are simply removed
        case '\u00AD'://soft hyphen
        case '\u200b'://zero width space
        case '\u200d'://zero width joiner
        case '\uFEFF': return "";//BOM-found at the start of some UTF files

        default: throw new PreProcessingException("Unrecognised unicode character: " + c);
        }
    }

    /**Converts a string array to an ArrayList.
     * The returned list is a new, modifiable copy; changes to it do not affect the array.
     *
     * @param array The array.
     * @return The ArrayList.
     */
    static List<String> arrayToList(String [] array) {
        return new ArrayList<String>(Arrays.asList(array));
    }

    /**
     * If a dash is the last character it is removed
     * @param locantText The text to inspect
     * @return The text without its final dash, or the text unchanged if it does not end with a dash
     */
    static String removeDashIfPresent(String locantText){
        if (!locantText.endsWith("-")) {
            return locantText;
        }
        return locantText.substring(0, locantText.length() - 1);
    }

    /**
     * Counts the number of primes at the end of a locant
     * @param locantText The text to inspect
     * @return The number of consecutive apostrophes that terminate the text
     */
    static int countTerminalPrimes(String locantText){
        int numberOfPrimes = 0;
        // The scan includes index 0 so that text made up only of primes is counted in full
        for (int i = locantText.length() - 1; i >= 0 && locantText.charAt(i) == '\''; i--) {
            numberOfPrimes++;
        }
        return numberOfPrimes;
    }

    /**
     * Tests if this string start with the specified prefix ignoring case.
     * @param str The string to test
     * @param prefix The prefix to look for
     * @return true if str begins with prefix when case is ignored
     */
    static boolean startsWithCaseInsensitive(String str, String prefix) {
        return str.regionMatches(true, 0, prefix, 0, prefix.length());
    }

    /**
     * Tests if this string ends with the specified suffix ignoring case.
     * @param str The string to test
     * @param suffix The suffix to look for
     * @return true if str finishes with suffix when case is ignored
     */
    static boolean endsWithCaseInsensitive(String str, String suffix) {
        int strOffset = str.length() - suffix.length();
        if (strOffset < 0) {
            // the suffix is longer than the string so cannot match
            return false;
        }
        return str.regionMatches(true, strOffset, suffix, 0, suffix.length());
    }

    /**
     * Lower cases a string (only converts A-Z to a-z)
     * @param str The string to lower case
     * @return A copy of str in which A-Z are replaced by a-z and all other characters are untouched
     */
    static String lowerCaseAsciiString(String str) {
        int length = str.length();
        StringBuilder sb = new StringBuilder(length);
        for (int i = 0; i < length; i++) {
            char c = str.charAt(i);
            if (c >= 'A' && c <= 'Z') {
                // shift by the fixed distance between the upper and lower case ASCII letters
                c = (char) (c + ('a' - 'A'));
            }
            sb.append(c);
        }
        return sb.toString();
    }
}