1 /*
2  * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.  Oracle designates this
8  * particular file as subject to the "Classpath" exception as provided
9  * by Oracle in the LICENSE file that accompanied this code.
10  *
11  * This code is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14  * version 2 for more details (a copy is included in the LICENSE file that
15  * accompanied this code).
16  *
17  * You should have received a copy of the GNU General Public License version
18  * 2 along with this work; if not, write to the Free Software Foundation,
19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20  *
21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22  * or visit www.oracle.com if you need additional information or have any
23  * questions.
24  */
25 // (c) 2018 and later: Unicode, Inc. and others.
26 // License & terms of use: http://www.unicode.org/copyright.html#License
27 
28 // created: 2018may10 Markus W. Scherer
29 
30 package sun.text.normalizer;
31 
32 import java.util.Iterator;
33 import java.util.NoSuchElementException;
34 
/**
 * Abstract map from Unicode code points (U+0000..U+10FFFF) to integer values.
 * This does not implement java.util.Map.
 *
 * <p>Concrete subclasses supply {@link #get(int)} and
 * {@link #getRange(int, ValueFilter, Range)}; range iteration, surrogate-aware
 * range lookup and string iteration are built on top of those two methods.
 *
 * @draft ICU 63
 * @provisional This API might change or be removed in a future release.
 */
public abstract class CodePointMap implements Iterable<CodePointMap.Range> {
    /** Highest valid Unicode code point, U+10FFFF. */
    private static final int MAX_CODE_POINT = Character.MAX_CODE_POINT;
    /** Last code point before the surrogate block, U+D7FF. */
    private static final int LAST_BEFORE_SURROGATES = Character.MIN_SURROGATE - 1;
    /** Last lead (high) surrogate, U+DBFF. */
    private static final int LAST_LEAD_SURROGATE = Character.MAX_HIGH_SURROGATE;
    /** Last surrogate (lead or trail), U+DFFF. */
    private static final int LAST_SURROGATE = Character.MAX_SURROGATE;

    /**
     * Selectors for how getRange() should report value ranges overlapping with surrogates.
     * Most users should use NORMAL.
     *
     * @see #getRange
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    public enum RangeOption {
        /**
         * getRange() enumerates all same-value ranges as stored in the map.
         * Most users should use this option.
         *
         * @draft ICU 63
         * @provisional This API might change or be removed in a future release.
         */
        NORMAL,
        /**
         * getRange() enumerates all same-value ranges as stored in the map,
         * except that lead surrogates (U+D800..U+DBFF) are treated as having the
         * surrogateValue, which is passed to getRange() as a separate parameter.
         * The surrogateValue is not transformed via filter().
         * See {@link Character#isHighSurrogate}.
         *
         * <p>Most users should use NORMAL instead.
         *
         * <p>This option is useful for maps that map surrogate code *units* to
         * special values optimized for UTF-16 string processing
         * or for special error behavior for unpaired surrogates,
         * but those values are not to be associated with the lead surrogate code *points*.
         *
         * @draft ICU 63
         * @provisional This API might change or be removed in a future release.
         */
        FIXED_LEAD_SURROGATES,
        /**
         * getRange() enumerates all same-value ranges as stored in the map,
         * except that all surrogates (U+D800..U+DFFF) are treated as having the
         * surrogateValue, which is passed to getRange() as a separate parameter.
         * The surrogateValue is not transformed via filter().
         * See {@link Character#isSurrogate}.
         *
         * <p>Most users should use NORMAL instead.
         *
         * <p>This option is useful for maps that map surrogate code *units* to
         * special values optimized for UTF-16 string processing
         * or for special error behavior for unpaired surrogates,
         * but those values are not to be associated with the surrogate code *points*.
         *
         * @draft ICU 63
         * @provisional This API might change or be removed in a future release.
         */
        FIXED_ALL_SURROGATES
    }

    /**
     * Callback function interface: Modifies a map value.
     * Optionally called by getRange().
     * The modified value will be returned by the getRange() function.
     *
     * <p>Can be used to ignore some of the value bits,
     * make a filter for one of several values,
     * return a value index computed from the map value, etc.
     *
     * @see #getRange
     * @see #iterator
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    @FunctionalInterface
    public interface ValueFilter {
        /**
         * Modifies the map value.
         *
         * @param value map value
         * @return modified value
         * @draft ICU 63
         * @provisional This API might change or be removed in a future release.
         */
        int apply(int value);
    }

    /**
     * Range iteration result data.
     * Code points from start to end map to the same value.
     * The value may have been modified by {@link ValueFilter#apply(int)},
     * or it may be the surrogateValue if a RangeOption other than "normal" was used.
     *
     * <p>Instances are mutable; the range iterator returned by
     * {@link CodePointMap#iterator()} reuses a single instance.
     *
     * @see #getRange
     * @see #iterator
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    public static final class Range {
        private int start;
        private int end;
        private int value;

        /**
         * Constructor. Sets start and end to -1 and value to 0.
         *
         * @draft ICU 63
         * @provisional This API might change or be removed in a future release.
         */
        public Range() {
            start = end = -1;
            value = 0;
        }

        /**
         * @return the start code point
         * @draft ICU 63
         * @provisional This API might change or be removed in a future release.
         */
        public int getStart() { return start; }
        /**
         * @return the (inclusive) end code point
         * @draft ICU 63
         * @provisional This API might change or be removed in a future release.
         */
        public int getEnd() { return end; }
        /**
         * @return the range value
         * @draft ICU 63
         * @provisional This API might change or be removed in a future release.
         */
        public int getValue() { return value; }
        /**
         * Sets the range. When using {@link CodePointMap#iterator()},
         * iteration will resume after the newly set end.
         *
         * @param start new start code point
         * @param end new end code point
         * @param value new value
         * @draft ICU 63
         * @provisional This API might change or be removed in a future release.
         */
        public void set(int start, int end, int value) {
            this.start = start;
            this.end = end;
            this.value = value;
        }
    }

    /**
     * Iterator over all same-value ranges of the enclosing map, without filtering.
     * Always hands out the same {@link Range} object; the next range starts
     * right after that object's current end.
     */
    private final class RangeIterator implements Iterator<Range> {
        /** The single, reused result object; end == -1 means "not started yet". */
        private final Range range = new Range();

        @Override
        public boolean hasNext() {
            return -1 <= range.end && range.end < MAX_CODE_POINT;
        }

        @Override
        public Range next() {
            // getRange() returns false when the requested start is out of range,
            // which is how exhaustion is detected here.
            if (!getRange(range.end + 1, null, range)) {
                throw new NoSuchElementException();
            }
            return range;
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    /**
     * Iterates over code points of a string and fetches map values.
     * This does not implement java.util.Iterator.
     *
     * <pre>
     * void onString(CodePointMap map, CharSequence s, int start) {
     *     CodePointMap.StringIterator iter = map.stringIterator(s, start);
     *     while (iter.next()) {
     *         int end = iter.getIndex();  // code point from between start and end
     *         useValue(s, start, end, iter.getCodePoint(), iter.getValue());
     *         start = end;
     *     }
     * }
     * </pre>
     *
     * <p>This class is not intended for public subclassing.
     *
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    public class StringIterator {
        /**
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        protected CharSequence s;
        /**
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        protected int sIndex;
        /**
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        protected int c;
        /**
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        protected int value;

        /**
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        protected StringIterator(CharSequence s, int sIndex) {
            this.s = s;
            this.sIndex = sIndex;
            c = -1;
            value = 0;
        }

        /**
         * Resets the iterator to a new string and/or a new string index.
         * The current code point becomes -1 and the current value 0.
         *
         * @param s string to iterate over
         * @param sIndex string index where the iteration will start
         * @draft ICU 63
         * @provisional This API might change or be removed in a future release.
         */
        public void reset(CharSequence s, int sIndex) {
            this.s = s;
            this.sIndex = sIndex;
            c = -1;
            value = 0;
        }

        /**
         * Reads the next code point, post-increments the string index,
         * and gets a value from the map.
         * Sets an implementation-defined error value if the code point is an unpaired surrogate.
         *
         * @return true if the string index was not yet at the end of the string;
         *         otherwise the iterator did not advance
         * @draft ICU 63
         * @provisional This API might change or be removed in a future release.
         */
        public boolean next() {
            if (sIndex >= s.length()) {
                return false;
            }
            c = Character.codePointAt(s, sIndex);
            sIndex += Character.charCount(c);
            value = get(c);
            return true;
        }

        /**
         * Reads the previous code point, pre-decrements the string index,
         * and gets a value from the map.
         * Sets an implementation-defined error value if the code point is an unpaired surrogate.
         *
         * @return true if the string index was not yet at the start of the string;
         *         otherwise the iterator did not advance
         * @draft ICU 63
         * @provisional This API might change or be removed in a future release.
         */
        public boolean previous() {
            if (sIndex <= 0) {
                return false;
            }
            c = Character.codePointBefore(s, sIndex);
            sIndex -= Character.charCount(c);
            value = get(c);
            return true;
        }
        /**
         * @return the string index
         * @draft ICU 63
         * @provisional This API might change or be removed in a future release.
         */
        public final int getIndex() { return sIndex; }
        /**
         * @return the code point
         * @draft ICU 63
         * @provisional This API might change or be removed in a future release.
         */
        public final int getCodePoint() { return c; }
        /**
         * @return the map value,
         *         or an implementation-defined error value if
         *         the code point is an unpaired surrogate
         * @draft ICU 63
         * @provisional This API might change or be removed in a future release.
         */
        public final int getValue() { return value; }
    }

    /**
     * Protected no-args constructor.
     *
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    protected CodePointMap() {
    }

    /**
     * Returns the value for a code point as stored in the map, with range checking.
     * Returns an implementation-defined error value if c is not in the range 0..U+10FFFF.
     *
     * @param c the code point
     * @return the map value,
     *         or an implementation-defined error value if
     *         the code point is not in the range 0..U+10FFFF
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    public abstract int get(int c);

    /**
     * Sets the range object to a range of code points beginning with the start parameter.
     * The range start is the same as the start input parameter
     * (even if there are preceding code points that have the same value).
     * The range end is the last code point such that
     * all those from start to there have the same value.
     * Returns false if start is not 0..U+10FFFF.
     * Can be used to efficiently iterate over all same-value ranges in a map.
     * (This is normally faster than iterating over code points and get()ting each value,
     * but may be much slower than a data structure that stores ranges directly.)
     *
     * <p>If the {@link ValueFilter} parameter is not null, then
     * the value to be delivered is passed through that filter, and the return value is the end
     * of the range where all values are modified to the same actual value.
     * The value is unchanged if that parameter is null.
     *
     * <p>Example:
     * <pre>
     * int start = 0;
     * CodePointMap.Range range = new CodePointMap.Range();
     * while (map.getRange(start, null, range)) {
     *     int end = range.getEnd();
     *     int value = range.getValue();
     *     // Work with the range start..end and its value.
     *     start = end + 1;
     * }
     * </pre>
     *
     * @param start range start
     * @param filter an object that may modify the map data value,
     *     or null if the values from the map are to be used unmodified
     * @param range the range object that will be set to the code point range and value
     * @return true if start is 0..U+10FFFF; otherwise no new range is fetched
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    public abstract boolean getRange(int start, ValueFilter filter, Range range);

    /**
     * Sets the range object to a range of code points beginning with the start parameter.
     * The range start is the same as the start input parameter
     * (even if there are preceding code points that have the same value).
     * The range end is the last code point such that
     * all those from start to there have the same value.
     * Returns false if start is not 0..U+10FFFF.
     *
     * <p>Same as the simpler {@link #getRange(int, ValueFilter, Range)} but optionally
     * modifies the range if it overlaps with surrogate code points.
     *
     * <p>The option must not be null; this is only checked by an {@code assert}.
     *
     * @param start range start
     * @param option defines whether surrogates are treated normally,
     *               or as having the surrogateValue; usually {@link RangeOption#NORMAL}
     * @param surrogateValue value for surrogates; ignored if option=={@link RangeOption#NORMAL}
     * @param filter an object that may modify the map data value,
     *     or null if the values from the map are to be used unmodified
     * @param range the range object that will be set to the code point range and value
     * @return true if start is 0..U+10FFFF; otherwise no new range is fetched
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    public boolean getRange(int start, RangeOption option, int surrogateValue,
            ValueFilter filter, Range range) {
        assert option != null;
        if (!getRange(start, filter, range)) {
            return false;
        }
        if (option == RangeOption.NORMAL) {
            return true;
        }
        // Last code point that is forced to surrogateValue under the chosen option.
        int surrEnd = option == RangeOption.FIXED_ALL_SURROGATES
                ? LAST_SURROGATE : LAST_LEAD_SURROGATE;
        int end = range.end;
        if (end < LAST_BEFORE_SURROGATES || start > surrEnd) {
            // Entirely below U+D7FF or entirely above the fixed surrogates: nothing to adjust.
            return true;
        }
        // The range overlaps with surrogates, or ends just before the first one.
        if (range.value == surrogateValue) {
            if (end >= surrEnd) {
                // Surrogates followed by a non-surrValue range,
                // or surrogates are part of a larger surrValue range.
                return true;
            }
        } else {
            if (start <= LAST_BEFORE_SURROGATES) {
                // Non-surrValue range ends before surrValue surrogates.
                range.end = LAST_BEFORE_SURROGATES;
                return true;
            }
            // Start is a surrogate with a non-surrValue code *unit* value.
            // Return a surrValue code *point* range.
            range.value = surrogateValue;
            if (end > surrEnd) {
                range.end = surrEnd;  // Surrogate range ends before non-surrValue rest of range.
                return true;
            }
        }
        // See if the surrValue surrogate range can be merged with
        // an immediately following range.
        if (getRange(surrEnd + 1, filter, range) && range.value == surrogateValue) {
            range.start = start;
            return true;
        }
        // No merge: the lookup above overwrote the range, so restore the surrogate range.
        range.start = start;
        range.end = surrEnd;
        range.value = surrogateValue;
        return true;
    }

    /**
     * Convenience iterator over same-map-value code point ranges.
     * Same as looping over all ranges with {@link #getRange(int, ValueFilter, Range)}
     * without filtering.
     * Adjacent ranges have different map values.
     *
     * <p>The iterator always returns the same Range object.
     *
     * @return a Range iterator
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    @Override
    public Iterator<Range> iterator() {
        return new RangeIterator();
    }

    /**
     * Returns an iterator (not a java.util.Iterator) over code points of a string
     * for fetching map values.
     *
     * @param s string to iterate over
     * @param sIndex string index where the iteration will start
     * @return the iterator
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    public StringIterator stringIterator(CharSequence s, int sIndex) {
        return new StringIterator(s, sIndex);
    }
}
502