1 package uk.ac.cam.ch.wwmm.opsin;
2 
3 import java.util.ArrayList;
4 import java.util.Arrays;
5 import java.util.List;
6 
/**Static routines for string manipulation.
 *
 * @author ptc24
 * @author dl387
 *
 */
13 class StringTools {
14 
15 	/**
16 	 * Converts a list of strings into a single string delimited by the given separator
17 	 *
18 	 * @param list A list of strings.
19 	 * @param separator
20 	 * @return The corresponding string.
21 	 */
stringListToString(List<String> list, String separator)22 	static String stringListToString(List<String> list, String separator) {
23 		StringBuilder sb = new StringBuilder();
24 		int lastIndexOfList = list.size() - 1;
25 		for (int i = 0; i < lastIndexOfList; i++) {
26 			sb.append(list.get(i));
27 			sb.append(separator);
28 		}
29 		if (lastIndexOfList >= 0){
30 			sb.append(list.get(lastIndexOfList));
31 		}
32 		return sb.toString();
33 	}
34 
35 	/**Produce repetitions of a string. Eg. HelloWorld * 2 = HelloWorldHelloWorld.
36 	 *
37 	 * @param s The string to multiply.
38 	 * @param n The number of times to multiply it.
39 	 * @return The multiplied string.
40 	 */
multiplyString(String s, int n)41 	static String multiplyString(String s, int n) {
42 		StringBuilder sb = new StringBuilder();
43 		for (int i = 0; i < n; i++) {
44 			sb.append(s);
45 		}
46 		return sb.toString();
47 	}
48 
49 	/**Joins an array of strings into a single string.
50 	 *
51 	 * @param stringArray The strings to join together.
52 	 * @param separator The separator to use.
53 	 * @return The resulting string.
54 	 */
arrayToString(String[] stringArray, String separator)55 	static String arrayToString(String[] stringArray, String separator) {
56 		StringBuilder sb = new StringBuilder();
57 		int lastIndexOfArray = stringArray.length - 1;
58 		for(int i = 0; i < lastIndexOfArray; i++) {
59 			sb.append(stringArray[i]);
60 			sb.append(separator);
61 		}
62 		if (lastIndexOfArray >= 0){
63 			sb.append(stringArray[lastIndexOfArray]);
64 		}
65 		return sb.toString();
66 	}
67 
68 	/**Converts a unicode string into ASCII
69 	 * e.g. converting Greek letters to their names (e.g. alpha)
70 	 * Unrecognised non-ASCII characters trigger an exception
71 	 *
72 	 * @param s The string to convert
73 	 * @return The converted string
74 	 * @throws PreProcessingException
75 	 */
convertNonAsciiAndNormaliseRepresentation(String s)76 	static String convertNonAsciiAndNormaliseRepresentation(String s) throws PreProcessingException {
77 		StringBuilder sb = new StringBuilder(s.length());
78 		for (int i = 0, l = s.length(); i < l; i++) {
79 			char c = s.charAt(i);
80 			switch (c) {
81 			case '\t':
82 			case '\n':
83 			case '\u000B'://vertical tab
84 			case '\f':
85 			case '\r':
86 				//normalise white space
87 				sb.append(" ");
88 				break;
89 			case '`':
90 				sb.append("'");//replace back ticks with apostrophe
91 				break;
92 			case '"':
93 				sb.append("''");//replace quotation mark with two primes
94 				break;
95 			default:
96 				if(c >= 128) {
97 					sb.append(getReplacementForNonASCIIChar(c));//replace non ascii characters with hard coded ascii strings
98 				}
99 				else if (c > 31){//ignore control characters
100 					sb.append(c);
101 				}
102 			}
103 		}
104 		return sb.toString();
105 	}
106 
getReplacementForNonASCIIChar(char c)107     private static String getReplacementForNonASCIIChar(char c) throws PreProcessingException {
108         switch (c) {
109             case '\u03b1': return "alpha";//greeks
110             case '\u03b2': return "beta";
111             case '\u03b3': return "gamma";
112             case '\u03b4': return "delta";
113             case '\u03b5': return "epsilon";
114             case '\u03b6': return "zeta";
115             case '\u03b7': return "eta";
116             case '\u03b8': return "theta";
117             case '\u03b9': return "iota";
118             case '\u03ba': return "kappa";
119             case '\u03bb': return "lambda";
120             case '\u03bc': return "mu";
121             case '\u03bd': return "nu";
122             case '\u03be': return "xi";
123             case '\u03bf': return "omicron";
124             case '\u03c0': return "pi";
125             case '\u03c1': return "rho";
126             case '\u03c2': return "stigma";
127             case '\u03c3': return "sigma";
128             case '\u03c4': return "tau";
129             case '\u03c5': return "upsilon";
130             case '\u03c6': return "phi";
131             case '\u03c7': return "chi";
132             case '\u03c8': return "psi";
133             case '\u03c9': return "omega";
134 
135             case '\u1D05': return "D";//small capitals
136             case '\u029F': return "L";
137 
138             case '\u00B1': return "+-";//plus minus symbol
139             case '\u2213': return "-+";
140 
141             case '\u2192'://right arrows
142             case '\u2794':
143             case '\u2799':
144             case '\u279C': return "->";
145 
146             case '\u00C6': return "AE";//common ligatures
147             case '\u00E6': return "ae";
148             case '\u0152': return "OE";
149             case '\u0153': return "oe";
150             case '\u0132': return "IJ";
151             case '\u0133': return "ij";
152             case '\u1D6B': return "ue";
153             case '\uFB00': return "ff";
154             case '\uFB01': return "fi";
155             case '\uFB02': return "fl";
156             case '\uFB03': return "ffi";
157             case '\uFB04': return "ffl";
158             case '\uFB06': return "st";
159 
160             case '\u00E0': return "a";//diacritics
161             case '\u00C0': return "A";
162             case '\u00E1': return "a";
163             case '\u00C1': return "A";
164             case '\u00E2': return "a";
165             case '\u00C2': return "A";
166             case '\u00E3': return "a";
167             case '\u00C3': return "A";
168             case '\u00E4': return "a";
169             case '\u00C4': return "A";
170             case '\u00E5': return "a";
171             case '\u00C5': return "A";
172             case '\u00E7': return "c";
173             case '\u00C7': return "C";
174             case '\u00E8': return "e";
175             case '\u00C8': return "E";
176             case '\u00E9': return "e";
177             case '\u00C9': return "E";
178             case '\u00EA': return "e";
179             case '\u00CA': return "E";
180             case '\u00EB': return "e";
181             case '\u00CB': return "E";
182             case '\u00EC': return "i";
183             case '\u00CC': return "I";
184             case '\u00ED': return "i";
185             case '\u00CD': return "I";
186             case '\u00EE': return "i";
187             case '\u00CE': return "I";
188             case '\u00EF': return "i";
189             case '\u00CF': return "I";
190             case '\u00F2': return "o";
191             case '\u00D2': return "O";
192             case '\u00F3': return "o";
193             case '\u00D3': return "O";
194             case '\u00F4': return "o";
195             case '\u00D4': return "O";
196             case '\u00F5': return "o";
197             case '\u00D5': return "O";
198             case '\u00F6': return "o";
199             case '\u00D6': return "O";
200             case '\u00F9': return "u";
201             case '\u00D9': return "U";
202             case '\u00FA': return "u";
203             case '\u00DA': return "U";
204             case '\u00FB': return "u";
205             case '\u00DB': return "U";
206             case '\u00FC': return "u";
207             case '\u00DC': return "U";
208             case '\u00FD': return "y";
209             case '\u00DD': return "Y";
210 
211             case '\u0115': return "e";
212             case '\u0114': return "E";
213             case '\u0117': return "e";
214             case '\u0116': return "E";
215 
216             case '\u2070': return "0";//superscripts
217             case '\u00B9': return "1";
218             case '\u00B2': return "2";
219             case '\u00B3': return "3";
220             case '\u2074': return "4";
221             case '\u2075': return "5";
222             case '\u2076': return "6";
223             case '\u2077': return "7";
224             case '\u2078': return "8";
225             case '\u2079': return "9";
226 
227             case '\u2080': return "0";//subscripts
228             case '\u2081': return "1";
229             case '\u2082': return "2";
230             case '\u2083': return "3";
231             case '\u2084': return "4";
232             case '\u2085': return "5";
233             case '\u2086': return "6";
234             case '\u2087': return "7";
235             case '\u2088': return "8";
236             case '\u2089': return "9";
237 
238             case '\u2018': return "'";//quotation marks and primes (map to apostrophe/s)
239             case '\u2019': return "'";
240             case '\u201B': return "'";
241             case '\u02BC': return "'";
242             case '\u201C': return "''";
243             case '\u201D': return "''";
244             case '\u2032': return "'";//primes
245             case '\u2033': return "''";
246             case '\u2034': return "'''";
247             case '\u2057': return "''''";
248             case '\u02B9': return "'";//modifier primes
249             case '\u02BA': return "''";
250             case '\u2035': return "'";//back primes
251             case '\u2036': return "''";
252             case '\u2037': return "'''";
253             case '\u00B4': return "'";//accents
254             case '\u02CA': return "'";
255             case '\u0301': return "'";
256             case '\u02DD': return "''";
257             case '\u030B': return "''";
258 
259             case '\u2010'://dashes, hyphens and the minus sign
260             case '\u2011':
261             case '\u2012':
262             case '\u2013':
263             case '\u2014':
264             case '\u2015':
265             case '\u2212': return "-";
266 
267             case '\u02DC'://small tilde
268             case '\u223C'://tilde operator
269             case '\u301C': return "~";//wave dash
270 
271             case '\uff0c': return ",";//full width punctuation
272             case '\uFF1A': return ":";
273             case '\uFF1B': return ";";
274             case '\uFF08': return "(";
275             case '\uFF09': return ")";
276             case '\uFF3B': return "[";
277             case '\uFF3D': return "]";
278             case '\u3010': return "[";
279             case '\u3011': return "]";
280             case '\uFF5B': return "{";
281             case '\uFF5D': return "}";
282 
283             case '\u00DF': return "beta";//similar glyph
284 
285             case '\u2000'://different sized spaces
286             case '\u2001':
287             case '\u2002':
288             case '\u2003':
289             case '\u2004':
290             case '\u2005':
291             case '\u2006':
292             case '\u2008':
293             case '\u2009':
294             case '\u200A':
295             case '\u205F':
296             case '\u00A0'://Non-breaking spaces
297             case '\u2007':
298             case '\u202F':
299             case '\u3000': return " ";//ideographic space
300 
301             case '\u00AD'://soft hyphen
302             case '\u200b'://zero width space
303             case '\u200d'://zero width joiner
304             case '\uFEFF': return "";//BOM-found at the start of some UTF files
305 
306             default: throw new PreProcessingException("Unrecognised unicode character: " + c);
307         }
308     }
309 
310 	/**Converts a string array to an ArrayList.
311 	 *
312 	 * @param array The array.
313 	 * @return The ArrayList.
314 	 */
arrayToList(String [] array)315 	static List<String> arrayToList(String [] array) {
316 		List<String> list = new ArrayList<String>();
317         list.addAll(Arrays.asList(array));
318 		return list;
319 	}
320 
321 	/**
322 	 * If a dash is the last character it is removed
323 	 * @param locantText
324 	 * @return
325 	 */
removeDashIfPresent(String locantText)326 	static String removeDashIfPresent(String locantText){
327 		if(locantText.endsWith("-")) {
328 			locantText = locantText.substring(0, locantText.length() - 1);
329 		}
330 		return locantText;
331 	}
332 
333 	/**
334 	 * Counts the number of primes at the end of a locant
335 	 * @param locantText
336 	 * @return
337 	 */
countTerminalPrimes(String locantText)338 	static int countTerminalPrimes(String locantText){
339 		int numberOfPrimes = 0;
340 		for(int i = locantText.length() -1; i > 0; i--){
341 			if (locantText.charAt(i) == '\''){
342 				numberOfPrimes++;
343 			}
344 			else{
345 				break;
346 			}
347 		}
348 		return numberOfPrimes;
349 	}
350 
351 	/**
352 	 * Tests if this string start with the specified prefix ignoring case.
353 	 * @param str
354 	 * @param prefix
355 	 * @return
356 	 */
startsWithCaseInsensitive(String str, String prefix)357 	static boolean startsWithCaseInsensitive(String str, String prefix) {
358 		return str.regionMatches(true, 0, prefix, 0, prefix.length());
359 	}
360 
361 	/**
362 	 * Tests if this string ends with the specified suffix ignoring case.
363 	 * @param str
364 	 * @param suffix
365 	 * @return
366 	 */
endsWithCaseInsensitive(String str, String suffix)367 	static boolean endsWithCaseInsensitive(String str, String suffix) {
368 		if (suffix.length() > str.length()) {
369 			return false;
370 		}
371 		int strOffset = str.length() - suffix.length();
372 		return str.regionMatches(true, strOffset, suffix, 0, suffix.length());
373 	}
374 
375 	/**
376 	* Lower cases a string (only converts A-Z to a-z)
377 	* @param str
378 	*/
lowerCaseAsciiString(String str)379 	static String lowerCaseAsciiString(String str) {
380 		StringBuilder sb = new StringBuilder(str.length());
381 		for (int i = 0, l = str.length(); i < l; i++) {
382 			char c = str.charAt(i);
383 			if (c >= 'A' && c <= 'Z') {
384 				c = (char) (c + 32);
385 			}
386 			sb.append(c);
387 		}
388 		return sb.toString();
389 	}
390 }
391