1 // Copyright (c) 2001 Per M.A. Bothner and Brainfood Inc. 2 // This is free software; for terms and warranty disclaimer see ./COPYING. 3 4 package gnu.lists; 5 6 import gnu.text.Char; 7 import java.io.IOException; 8 9 /** Various static utility methods for general strings (CharSeqs). */ 10 11 public class Strings 12 { 13 /** Get character (code point) at a offset. 14 * @param index offset measured in 16-bit code units 15 */ characterAt(CharSequence cseq, int index)16 public static int characterAt(CharSequence cseq, int index) { 17 return characterAt(cseq, 0, cseq.length(), index); 18 } 19 /** Get character (code point) at a offset. 20 * @param index offset measured in 16-bit code units, 21 * from begining of cseq, not frm start 22 */ characterAt(CharSequence cseq, int start, int end, int index)23 public static int characterAt(CharSequence cseq, int start, int end, 24 int index) { 25 if (index < start || index >= end) 26 throw new IndexOutOfBoundsException(); 27 char ch1 = cseq.charAt(index); 28 if (ch1 >= 0xD800 && ch1 <= 0xDBFF) { 29 if (index + 1 < end) { 30 char ch2 = cseq.charAt(index+1); 31 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) 32 return ((ch1 - 0xD800) << 10) + (ch2 - 0xDC00) + 0x10000; 33 } 34 } else if (ch1 >= 0xDC00 && ch1 <= 0xDFFF) { 35 if (index > start) { 36 char ch0 = cseq.charAt(index-1); 37 if (ch0 >= 0xD800 && ch0 <= 0xDBFF) 38 return Char.IGNORABLE_CHAR; 39 } 40 } 41 return ch1; 42 } 43 /** Get index'th character (code point). 44 * @param index offset by code points 45 */ indexByCodePoints(CharSequence str, int index)46 public static int indexByCodePoints(CharSequence str, int index) { 47 if (str instanceof IString) 48 return ((IString) str).indexByCodePoints(index); 49 index = Character.offsetByCodePoints(str, 0, index); 50 return Character.codePointAt(str, index); 51 } 52 53 /** Like offsetByCodePoints, but optimize if an IString. 54 * @param offset number of code points beyond start index. 55 * @param cuStart start index in code units (Java chars) 56 * @param cpStart start index in Unicode code points 57 */ offsetByCodePoints(CharSequence str, int offset, int cuStart, int cpStart)58 public static int offsetByCodePoints(CharSequence str, int offset, 59 int cuStart, int cpStart) { 60 if (str instanceof IString) { 61 IString istr = (IString) str; 62 offset += cpStart; 63 if (offset < 0 || offset > istr.size()) 64 throw new IndexOutOfBoundsException(); 65 return istr.offsetByCodePoints(offset); 66 } 67 return Character.offsetByCodePoints(str, cuStart, offset); 68 } 69 sizeInCodePoints(CharSequence str)70 public static int sizeInCodePoints(CharSequence str) { 71 if (str instanceof IString) 72 return ((IString) str).lengthByCodePoints(); 73 int len = str.length(); 74 int nsurr = 0; 75 for (int i = 0; i < len; ) { 76 char ch = str.charAt(i++); 77 if (ch >= 0xD800 && ch <= 0xDBFF && i < len) { 78 int next = str.charAt(i); 79 if (next >= 0xDC00 && next <= 0xDFFF) { 80 i++; 81 nsurr++; 82 } 83 } 84 } 85 return len-nsurr; 86 } 87 88 /** Change every character to be uppercase. */ makeUpperCase(CharSeq str)89 public static void makeUpperCase(CharSeq str) 90 { 91 for (int i = str.length(); --i >= 0; ) 92 str.setCharAt(i, Character.toUpperCase(str.charAt(i))); 93 } 94 95 /** Change every character to be lowercase. */ makeLowerCase(CharSeq str)96 public static void makeLowerCase(CharSeq str) 97 { 98 for (int i = str.length(); --i >= 0; ) 99 str.setCharAt(i, Character.toLowerCase(str.charAt(i))); 100 } 101 102 /** Capitalize this string. 103 * Change first character of each word to titlecase, 104 * and change the other characters to lowercase. */ makeCapitalize(CharSeq str)105 public static void makeCapitalize(CharSeq str) 106 { 107 char prev = ' '; 108 int len = str.length(); 109 for (int i = 0; i < len; i++) 110 { 111 char ch = str.charAt(i); 112 if (! Character.isLetterOrDigit(prev)) 113 ch = Character.toTitleCase(ch); 114 else 115 ch = Character.toLowerCase(ch); 116 str.setCharAt(i, ch); 117 prev = ch; 118 } 119 } 120 toJson(CharSequence str)121 public static String toJson(CharSequence str) { 122 StringBuilder sbuf = new StringBuilder(); 123 printQuoted(str, sbuf, 3); 124 return sbuf.toString(); 125 } 126 printJson(CharSequence str, Appendable ps)127 public static void printJson(CharSequence str, Appendable ps) { 128 printQuoted(str, ps, 3); 129 } 130 131 /** Print a string with quotes and escapes. 132 * @param escapes The value 0 means only escape '"' and '\\'; 133 * the value 1 means escape standard escape characters like '\\b'; 134 * the value 2 means escape all non-ascii or control characters; 135 * the value 3 means follow the JSON standard. 136 */ printQuoted(CharSequence str, Appendable ps, int escapes)137 public static void printQuoted(CharSequence str, 138 Appendable ps, int escapes) { 139 int len = str.length(); 140 try { 141 ps.append('\"'); 142 for (int i = 0; i < len; i++) { 143 char ch = str.charAt(i); 144 if ((ch == '\\' || ch == '\"')) 145 ps.append('\\'); 146 else if (escapes > 0) { 147 // These escapes are R6RS: 148 if (ch == '\n') 149 { ps.append("\\n"); continue; } 150 else if (ch == '\r') 151 { ps.append("\\r"); continue; } 152 else if (ch == '\t') 153 { ps.append("\\t"); continue; } 154 else if (ch == '\007' && escapes < 3) 155 { ps.append("\\a"); continue; } 156 else if (ch == '\b') 157 { ps.append("\\b"); continue; } 158 else if (ch == '\013' && escapes < 3) 159 { ps.append("\\v"); continue; } 160 else if (ch == '\f') 161 { ps.append("\\f"); continue; } 162 else if (escapes >= 3 && (ch < ' ' || ch >= 127)) 163 { 164 ps.append("\\u"); 165 int d = ch; 166 for (int k = 12; k >= 0; k -= 4) { 167 ps.append(Character.forDigit((d >> k) & 15, 16)); 168 } 169 continue; 170 } 171 else if (ch < ' ' || (escapes > 1 && ch >= 127)) 172 { 173 ps.append("\\x"); 174 ps.append(Integer.toHexString(ch)); 175 ps.append(';'); 176 continue; 177 } 178 } 179 ps.append(ch); 180 } 181 ps.append('\"'); 182 } catch (IOException ex) { 183 throw new RuntimeException(ex); 184 } 185 } 186 copyInto(CharSequence src, int start, int end, CharSeq dst, int at)187 public static void copyInto(CharSequence src, int start, int end, 188 CharSeq dst, int at) { 189 int dstLen = dst.length(); 190 int srcLen = src.length(); 191 if (at < 0 || at > dstLen || start < 0 || end > srcLen || end < start 192 || dstLen - at < end - start) 193 throw new StringIndexOutOfBoundsException(); 194 if (at < start) { 195 int i = at; 196 int j = start; 197 for (; j < end; i++, j++) { 198 dst.setCharAt(i, src.charAt(j)); 199 } 200 } 201 else { 202 int i = at + end - start; 203 int j = end; 204 while (--j >= start) { 205 dst.setCharAt(--i, src.charAt(j)); 206 } 207 } 208 } 209 210 /** Make a read-only substring, generalized to arbitrary index sequences. 211 * The indexes are in terms of code points (character) offsets. 212 */ indirectIndexed(CharSequence base, IntSequence indexes)213 public static IString indirectIndexed(CharSequence base, 214 IntSequence indexes) { 215 if (indexes instanceof Range.IntRange) { 216 Range.IntRange range = (Range.IntRange) indexes; 217 if (range.getStepInt() == 1) { 218 int start = range.getStartInt(); 219 int end = base.length(); 220 if (start < 0 || start > end) 221 throw new IndexOutOfBoundsException(); 222 int size; 223 if (! range.isUnbounded()) { 224 size = range.size(); 225 if (start+size < 0 || start+size > end) 226 throw new IndexOutOfBoundsException(); 227 } else 228 size = end - start; 229 return IString.valueOf(base, start, size); 230 } 231 } 232 int len = indexes.size(); 233 StringBuilder sbuf = new StringBuilder(len); 234 for (int i = 0; i < len; i++) { 235 int ch = Strings.indexByCodePoints(base, indexes.getInt(i)); 236 if (ch >= 0x10000) { 237 sbuf.append((char) (((ch - 0x10000) >> 10) + 0xD800)); 238 ch = (ch & 0x3FF) + 0xDC00; 239 } 240 sbuf.append((char) ch); 241 } 242 return new IString(sbuf.toString()); 243 } 244 245 /** Make a read-only substring. 246 * The start and end are in terms of code unit (16-bit char). 247 */ substring(CharSequence base, int start, int end)248 public static CharSequence substring(CharSequence base, 249 int start, int end) { 250 if (base instanceof FString) { 251 FString fstr = (FString) base; 252 if (fstr.isVerySimple() || fstr.isSubRange()) 253 return (CharSequence) Sequences.copySimple(fstr, start, end, false); 254 } 255 if (base instanceof String) { 256 return ((String) base).substring(start, end); 257 } else { 258 int len = end - start; 259 StringBuilder sbuf = new StringBuilder(len); 260 if (base instanceof CharSeq) { 261 try { 262 ((CharSeq) base).writeTo(start, len, sbuf); 263 } catch (Throwable ex) { 264 throw new RuntimeException(ex); 265 } 266 } else { 267 for (int i = start; i < end; i++) 268 sbuf.append(base.charAt(i)); 269 } 270 return sbuf.toString(); 271 } 272 } 273 fromUtf8(byte[] bytes, int start, int length)274 public static String fromUtf8(byte[] bytes, int start, int length) { 275 /* #ifdef JAVA7 */ 276 return new String(bytes, start, length, java.nio.charset.StandardCharsets.UTF_8); 277 /* #else */ 278 // try { 279 // return new String(bytes, start, length, "UTF-8"); 280 // } catch (java.io.UnsupportedEncodingException ex) { 281 // throw new RuntimeException(ex); 282 // } 283 /* #endif */ 284 } 285 toUtf16(CharSequence str, int start, int end, boolean bigEndian, boolean writeBOM)286 public static byte[] toUtf16(CharSequence str, int start, int end, 287 boolean bigEndian, boolean writeBOM) { 288 int blen = 2*(end-start)+(writeBOM?2:0); 289 byte[] buf = new byte[blen]; 290 int hi = bigEndian ? 0 : 1; 291 int lo = bigEndian ? 1 : 0; 292 int i = start; 293 int j = 0; 294 while (j < blen) { 295 char ch; 296 if (writeBOM) { 297 ch = '\uFEFF'; 298 writeBOM = false; 299 } 300 else 301 ch = str.charAt(i++); 302 buf[j + lo] = (byte) ch; 303 buf[j + hi] = (byte) (ch >> 8); 304 j += 2; 305 } 306 return buf; 307 } 308 compareTo(CharSequence str1, CharSequence str2)309 public static int compareTo(CharSequence str1, CharSequence str2) { 310 int n1 = str1.length(); 311 int n2 = str2.length(); 312 int n = n1 > n2 ? n2 : n1; 313 for (int i = 0; i < n; i++) { 314 char c1 = str1.charAt(i); 315 char c2 = str2.charAt(i); 316 int d = c1 - c2; 317 if (d != 0) 318 return d; 319 } 320 return n1 - n2; 321 } 322 replicate(int from, int to, boolean suppliedTo, CharSequence string, int start, int end, boolean suppliedEnd)323 public static String replicate(int from, int to, boolean suppliedTo, 324 CharSequence string, 325 int start, int end, boolean suppliedEnd) { 326 int sstart = Strings.offsetByCodePoints(string, start, 0, 0); 327 if (end <= start || (suppliedTo && to < from)) { 328 if (end >= start && from == to) 329 return ""; 330 throw new StringIndexOutOfBoundsException(); 331 } 332 int slen = end - start; 333 // startOffset = modulo(from, slen) 334 int startOffset = from % slen; 335 if (startOffset < 0) startOffset += slen; 336 int ptr = Strings.offsetByCodePoints(string, startOffset, 337 sstart, start); 338 int send = ! suppliedEnd ? string.length() 339 : Strings.offsetByCodePoints(string, end-startOffset, ptr, startOffset); 340 StringBuilder buf = new StringBuilder(); 341 for (int i = from; 342 suppliedTo ? i < to : ptr < send; 343 i++) { 344 if (ptr == send) 345 ptr = sstart; 346 char ch = string.charAt(ptr); 347 ptr++; 348 buf.append(ch); 349 if (ch >= 0xD800 && ch <= 0xDBFF && ptr < send) { 350 char next = string.charAt(ptr); 351 if (next >= 0xDC00 && next <= 0xDFFF) { 352 ptr++; 353 buf.append(next); 354 } 355 } 356 } 357 return buf.toString(); 358 } 359 } 360