1 // Copyright (c) 2001  Per M.A. Bothner and Brainfood Inc.
2 // This is free software;  for terms and warranty disclaimer see ./COPYING.
3 
4 package gnu.lists;
5 
6 import gnu.text.Char;
7 import java.io.IOException;
8 
9 /** Various static utility methods for general strings (CharSeqs). */
10 
11 public class Strings
12 {
13     /** Get character (code point) at a offset.
14      * @param index offset measured in 16-bit code units
15      */
characterAt(CharSequence cseq, int index)16     public static int characterAt(CharSequence cseq, int index) {
17         return characterAt(cseq, 0, cseq.length(), index);
18     }
19     /** Get character (code point) at a offset.
20      * @param index offset measured in 16-bit code units,
21      * from begining of cseq, not frm start
22      */
characterAt(CharSequence cseq, int start, int end, int index)23     public static int characterAt(CharSequence cseq, int start, int end,
24                                   int index) {
25         if (index < start || index >= end)
26             throw new IndexOutOfBoundsException();
27         char ch1 = cseq.charAt(index);
28         if (ch1 >= 0xD800 && ch1 <= 0xDBFF) {
29             if (index + 1 < end) {
30                 char ch2 = cseq.charAt(index+1);
31                 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF)
32                     return ((ch1 - 0xD800) << 10) + (ch2 - 0xDC00) + 0x10000;
33             }
34         } else if (ch1 >= 0xDC00 && ch1 <= 0xDFFF) {
35             if (index > start) {
36                 char ch0 = cseq.charAt(index-1);
37                 if (ch0 >= 0xD800 && ch0 <= 0xDBFF)
38                     return Char.IGNORABLE_CHAR;
39             }
40         }
41         return ch1;
42     }
43     /** Get index'th character (code point).
44      * @param index offset by code points
45      */
indexByCodePoints(CharSequence str, int index)46     public static int indexByCodePoints(CharSequence str, int index) {
47         if (str instanceof IString)
48             return ((IString) str).indexByCodePoints(index);
49         index = Character.offsetByCodePoints(str, 0, index);
50         return Character.codePointAt(str, index);
51     }
52 
53     /** Like offsetByCodePoints, but optimize if an IString.
54      * @param offset number of code points beyond start index.
55      * @param cuStart start index in code units (Java chars)
56      * @param cpStart start index in Unicode code points
57      */
offsetByCodePoints(CharSequence str, int offset, int cuStart, int cpStart)58     public static int offsetByCodePoints(CharSequence str, int offset,
59                                          int cuStart, int cpStart) {
60         if (str instanceof IString) {
61             IString istr = (IString) str;
62             offset += cpStart;
63             if (offset < 0 || offset > istr.size())
64                 throw new IndexOutOfBoundsException();
65             return istr.offsetByCodePoints(offset);
66         }
67         return Character.offsetByCodePoints(str, cuStart, offset);
68     }
69 
sizeInCodePoints(CharSequence str)70     public static int sizeInCodePoints(CharSequence str) {
71         if (str instanceof IString)
72             return ((IString) str).lengthByCodePoints();
73         int len = str.length();
74         int nsurr = 0;
75         for (int i = 0; i < len;  ) {
76             char ch = str.charAt(i++);
77             if (ch >= 0xD800 && ch <= 0xDBFF && i < len) {
78                 int next = str.charAt(i);
79                 if (next >= 0xDC00 && next <= 0xDFFF) {
80                     i++;
81                     nsurr++;
82                 }
83             }
84         }
85         return len-nsurr;
86     }
87 
88   /** Change every character to be uppercase. */
makeUpperCase(CharSeq str)89   public static void makeUpperCase(CharSeq str)
90   {
91     for (int i = str.length();  --i >= 0; )
92       str.setCharAt(i, Character.toUpperCase(str.charAt(i)));
93   }
94 
95   /** Change every character to be lowercase. */
makeLowerCase(CharSeq str)96   public static void makeLowerCase(CharSeq str)
97   {
98     for (int i = str.length();  --i >= 0; )
99       str.setCharAt(i, Character.toLowerCase(str.charAt(i)));
100   }
101 
102   /** Capitalize this string.
103    * Change first character of each word to titlecase,
104    * and change the other characters to lowercase. */
makeCapitalize(CharSeq str)105   public static void makeCapitalize(CharSeq str)
106   {
107     char prev = ' ';
108     int len = str.length();
109     for (int i = 0;  i < len;  i++)
110       {
111 	char ch = str.charAt(i);
112 	if (! Character.isLetterOrDigit(prev))
113 	  ch = Character.toTitleCase(ch);
114         else
115           ch = Character.toLowerCase(ch);
116 	str.setCharAt(i, ch);
117 	prev = ch;
118       }
119   }
120 
toJson(CharSequence str)121     public static String toJson(CharSequence str) {
122         StringBuilder sbuf = new StringBuilder();
123         printQuoted(str, sbuf, 3);
124         return sbuf.toString();
125     }
126 
printJson(CharSequence str, Appendable ps)127     public static void printJson(CharSequence str, Appendable ps) {
128         printQuoted(str, ps, 3);
129     }
130 
131     /** Print a string with quotes and escapes.
132      * @param escapes The value 0 means only escape '"' and '\\';
133      *   the value 1 means escape standard escape characters like '\\b';
134      *   the value 2 means escape all non-ascii or control characters;
135      *   the value 3 means follow the JSON standard.
136      */
printQuoted(CharSequence str, Appendable ps, int escapes)137     public static void printQuoted(CharSequence str,
138                                    Appendable ps, int escapes) {
139         int len = str.length();
140         try {
141             ps.append('\"');
142             for (int i = 0;  i < len; i++) {
143                 char ch = str.charAt(i);
144                 if ((ch == '\\' || ch == '\"'))
145                     ps.append('\\');
146                 else if (escapes > 0) {
147                     // These escapes are R6RS:
148                     if (ch == '\n')
149                     { ps.append("\\n"); continue; }
150                     else if (ch == '\r')
151                     { ps.append("\\r"); continue; }
152                     else if (ch == '\t')
153                     { ps.append("\\t"); continue; }
154                     else if (ch == '\007' && escapes < 3)
155                     { ps.append("\\a"); continue; }
156                     else if (ch == '\b')
157                     { ps.append("\\b"); continue; }
158                     else if (ch == '\013' && escapes < 3)
159                     { ps.append("\\v"); continue; }
160                     else if (ch == '\f')
161                     { ps.append("\\f"); continue; }
162                     else if (escapes >= 3 && (ch < ' ' || ch >= 127))
163                     {
164                         ps.append("\\u");
165                         int d = ch;
166                         for (int k = 12; k >= 0; k -= 4) {
167                             ps.append(Character.forDigit((d >> k) & 15, 16));
168                         }
169                         continue;
170                     }
171                     else if (ch < ' ' || (escapes > 1 && ch >= 127))
172                     {
173                         ps.append("\\x");
174                         ps.append(Integer.toHexString(ch));
175                         ps.append(';');
176                         continue;
177                     }
178                 }
179                 ps.append(ch);
180             }
181             ps.append('\"');
182         } catch (IOException ex) {
183             throw new RuntimeException(ex);
184         }
185     }
186 
copyInto(CharSequence src, int start, int end, CharSeq dst, int at)187     public static void copyInto(CharSequence src, int start, int end,
188                                 CharSeq dst, int at) {
189         int dstLen = dst.length();
190         int srcLen = src.length();
191         if (at < 0 || at > dstLen || start < 0 || end > srcLen || end < start
192             || dstLen - at < end - start)
193             throw new StringIndexOutOfBoundsException();
194         if (at < start) {
195             int i = at;
196             int j = start;
197             for (; j < end; i++, j++) {
198                 dst.setCharAt(i, src.charAt(j));
199             }
200         }
201         else {
202             int i = at + end - start;
203             int j = end;
204             while (--j >= start) {
205                 dst.setCharAt(--i, src.charAt(j));
206             }
207         }
208     }
209 
210     /** Make a read-only substring, generalized to arbitrary index sequences.
211      * The indexes are in terms of code points (character) offsets.
212      */
indirectIndexed(CharSequence base, IntSequence indexes)213     public static IString indirectIndexed(CharSequence base,
214                                                IntSequence indexes) {
215         if (indexes instanceof Range.IntRange) {
216             Range.IntRange range = (Range.IntRange) indexes;
217             if (range.getStepInt() == 1) {
218                 int start = range.getStartInt();
219                 int end = base.length();
220                 if (start < 0 || start > end)
221                     throw new IndexOutOfBoundsException();
222                 int size;
223                 if (! range.isUnbounded()) {
224                     size = range.size();
225                     if (start+size < 0 || start+size > end)
226                         throw new IndexOutOfBoundsException();
227                 } else
228                     size = end - start;
229                 return IString.valueOf(base, start, size);
230             }
231         }
232         int len = indexes.size();
233         StringBuilder sbuf = new StringBuilder(len);
234         for (int i = 0; i < len; i++) {
235             int ch = Strings.indexByCodePoints(base, indexes.getInt(i));
236             if (ch >= 0x10000) {
237                 sbuf.append((char) (((ch - 0x10000) >> 10) + 0xD800));
238                 ch = (ch & 0x3FF) + 0xDC00;
239             }
240             sbuf.append((char) ch);
241         }
242         return new IString(sbuf.toString());
243     }
244 
245     /** Make a read-only substring.
246      * The start and end are in terms of code unit (16-bit char).
247      */
substring(CharSequence base, int start, int end)248     public static CharSequence substring(CharSequence base,
249                                          int start, int end) {
250         if (base instanceof FString) {
251             FString fstr = (FString) base;
252             if (fstr.isVerySimple() || fstr.isSubRange())
253                 return (CharSequence) Sequences.copySimple(fstr, start, end, false);
254         }
255         if (base instanceof String) {
256             return ((String) base).substring(start, end);
257         } else {
258             int len = end - start;
259             StringBuilder sbuf = new StringBuilder(len);
260             if (base instanceof CharSeq) {
261                 try {
262                     ((CharSeq) base).writeTo(start, len, sbuf);
263                 } catch (Throwable ex) {
264                     throw new RuntimeException(ex);
265                 }
266             } else {
267                 for (int i = start; i < end; i++)
268                     sbuf.append(base.charAt(i));
269             }
270             return sbuf.toString();
271         }
272     }
273 
    // NOTE(review): the #ifdef / #else / #endif comments inside this method
    // look like markers for a source preprocessor that switches between the
    // active statement and the commented-out fallback -- keep both the
    // markers and the fallback exactly as they are (TODO confirm).
    /** Decode part of a byte array as UTF-8 text.
     * @param bytes array holding the encoded text
     * @param start index of the first byte to decode
     * @param length number of bytes to decode
     * @return a new String made from those bytes using the UTF-8 charset
     */
    public static String fromUtf8(byte[] bytes, int start, int length) {
        /* #ifdef JAVA7 */
        return new String(bytes, start, length, java.nio.charset.StandardCharsets.UTF_8);
        /* #else */
        // try {
        //   return new String(bytes, start, length, "UTF-8");
        // } catch (java.io.UnsupportedEncodingException ex) {
        //     throw new RuntimeException(ex);
        // }
        /* #endif */
    }
285 
toUtf16(CharSequence str, int start, int end, boolean bigEndian, boolean writeBOM)286     public static byte[] toUtf16(CharSequence str, int start, int end,
287                                  boolean bigEndian, boolean writeBOM) {
288         int blen = 2*(end-start)+(writeBOM?2:0);
289         byte[] buf = new byte[blen];
290         int hi = bigEndian ? 0 : 1;
291         int lo = bigEndian ? 1 : 0;
292         int i = start;
293         int j = 0;
294         while (j < blen) {
295             char ch;
296             if (writeBOM) {
297                 ch = '\uFEFF';
298                 writeBOM = false;
299             }
300             else
301                 ch = str.charAt(i++);
302             buf[j + lo] = (byte) ch;
303             buf[j + hi] = (byte) (ch >> 8);
304             j += 2;
305         }
306         return buf;
307     }
308 
compareTo(CharSequence str1, CharSequence str2)309     public static int compareTo(CharSequence str1, CharSequence str2) {
310         int n1 = str1.length();
311         int n2 = str2.length();
312         int n = n1 > n2 ? n2 : n1;
313         for (int i = 0; i < n; i++) {
314             char c1 = str1.charAt(i);
315             char c2 = str2.charAt(i);
316             int d = c1 - c2;
317             if (d != 0)
318                 return d;
319         }
320         return n1 - n2;
321     }
322 
replicate(int from, int to, boolean suppliedTo, CharSequence string, int start, int end, boolean suppliedEnd)323     public static String replicate(int from, int to, boolean suppliedTo,
324                                     CharSequence string,
325                                     int start, int end, boolean suppliedEnd) {
326         int sstart = Strings.offsetByCodePoints(string, start, 0, 0);
327         if (end <= start || (suppliedTo && to < from)) {
328             if (end >= start && from == to)
329                 return "";
330             throw new StringIndexOutOfBoundsException();
331         }
332         int slen = end - start;
333         // startOffset = modulo(from, slen)
334         int startOffset = from % slen;
335         if (startOffset < 0) startOffset += slen;
336         int ptr = Strings.offsetByCodePoints(string, startOffset,
337                                              sstart, start);
338         int send = ! suppliedEnd ? string.length()
339             : Strings.offsetByCodePoints(string, end-startOffset, ptr, startOffset);
340         StringBuilder buf = new StringBuilder();
341         for (int i = from;
342              suppliedTo ? i < to : ptr < send;
343              i++) {
344             if (ptr == send)
345                 ptr = sstart;
346             char ch = string.charAt(ptr);
347             ptr++;
348             buf.append(ch);
349             if (ch >= 0xD800 && ch <= 0xDBFF && ptr < send) {
350                 char next = string.charAt(ptr);
351                 if (next >= 0xDC00 && next <= 0xDFFF) {
352                     ptr++;
353                     buf.append(next);
354                 }
355             }
356         }
357         return buf.toString();
358     }
359 }
360