1 /*
2  * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.  Oracle designates this
8  * particular file as subject to the "Classpath" exception as provided
9  * by Oracle in the LICENSE file that accompanied this code.
10  *
11  * This code is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14  * version 2 for more details (a copy is included in the LICENSE file that
15  * accompanied this code).
16  *
17  * You should have received a copy of the GNU General Public License version
18  * 2 along with this work; if not, write to the Free Software Foundation,
19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20  *
21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22  * or visit www.oracle.com if you need additional information or have any
23  * questions.
24  */
25 
26 /*
27  ******************************************************************************
28  *
29  *   Copyright (C) 2009-2014, International Business Machines
30  *   Corporation and others.  All Rights Reserved.
31  *
32  ******************************************************************************
33  */
34 
35 package sun.text.normalizer;
36 
37 import java.util.ArrayList;
38 
39 import sun.text.normalizer.UnicodeSet.SpanCondition;
40 
41 /*
42  * Implement span() etc. for a set with strings.
43  * Avoid recursion because of its exponential complexity.
44  * Instead, try multiple paths at once and track them with an IndexList.
45  */
46 class UnicodeSetStringSpan {
47 
48     /*
49      * Which span() variant will be used? The object is either built for one variant and used once,
50      * or built for all and may be used many times.
51      */
52     public static final int WITH_COUNT    = 0x40;  // spanAndCount() may be called
53     public static final int FWD           = 0x20;
54     public static final int BACK          = 0x10;
55     // public static final int UTF16      = 8;
56     public static final int CONTAINED     = 2;
57     public static final int NOT_CONTAINED = 1;
58 
59     public static final int ALL = 0x7f;
60 
61     public static final int FWD_UTF16_CONTAINED      = FWD  | /* UTF16 | */    CONTAINED;
62     public static final int FWD_UTF16_NOT_CONTAINED  = FWD  | /* UTF16 | */NOT_CONTAINED;
63     public static final int BACK_UTF16_CONTAINED     = BACK | /* UTF16 | */    CONTAINED;
64     public static final int BACK_UTF16_NOT_CONTAINED = BACK | /* UTF16 | */NOT_CONTAINED;
65 
66     /**
67      * Special spanLength short values. (since Java has not unsigned byte type)
68      * All code points in the string are contained in the parent set.
69      */
70     static final short ALL_CP_CONTAINED = 0xff;
71 
72     /** The spanLength is >=0xfe. */
73     static final short LONG_SPAN = ALL_CP_CONTAINED - 1;
74 
75     /** Set for span(). Same as parent but without strings. */
76     private UnicodeSet spanSet;
77 
78     /**
79      * Set for span(not contained).
80      * Same as spanSet, plus characters that start or end strings.
81      */
82     private UnicodeSet spanNotSet;
83 
84     /** The strings of the parent set. */
85     private ArrayList<String> strings;
86 
87     /** The lengths of span(), spanBack() etc. for each string. */
88     private short[] spanLengths;
89 
90     /** Maximum lengths of relevant strings. */
91     private int maxLength16;
92 
93     /** Are there strings that are not fully contained in the code point set? */
94     private boolean someRelevant;
95 
96     /** Set up for all variants of span()? */
97     private boolean all;
98 
99     /** Span helper */
100     private OffsetList offsets;
101 
102     /**
103      * Constructs for all variants of span(), or only for any one variant.
104      * Initializes as little as possible, for single use.
105      */
UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which)106     public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) {
107         spanSet = new UnicodeSet(0, 0x10ffff);
108         // TODO: With Java 6, just take the parent set's strings as is,
109         // as a NavigableSet<String>, rather than as an ArrayList copy of the set of strings.
110         // Then iterate via the first() and higher() methods.
111         // (We do not want to create multiple Iterator objects in each span().)
112         // See ICU ticket #7454.
113         strings = setStrings;
114         all = (which == ALL);
115         spanSet.retainAll(set);
116         if (0 != (which & NOT_CONTAINED)) {
117             // Default to the same sets.
118             // addToSpanNotSet() will create a separate set if necessary.
119             spanNotSet = spanSet;
120         }
121         offsets = new OffsetList();
122 
123         // Determine if the strings even need to be taken into account at all for span() etc.
124         // If any string is relevant, then all strings need to be used for
125         // span(longest match) but only the relevant ones for span(while contained).
126         // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
127         // and do not store UTF-8 strings if !thisRelevant and CONTAINED.
128         // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
129         // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
130         int stringsLength = strings.size();
131 
132         int i, spanLength;
133         someRelevant = false;
134         for (i = 0; i < stringsLength; ++i) {
135             String string = strings.get(i);
136             int length16 = string.length();
137             spanLength = spanSet.span(string, SpanCondition.CONTAINED);
138             if (spanLength < length16) { // Relevant string.
139                 someRelevant = true;
140             }
141             if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) {
142                 maxLength16 = length16;
143             }
144         }
145         if (!someRelevant && (which & WITH_COUNT) == 0) {
146             return;
147         }
148 
149         // Freeze after checking for the need to use strings at all because freezing
150         // a set takes some time and memory which are wasted if there are no relevant strings.
151         if (all) {
152             spanSet.freeze();
153         }
154 
155         int spanBackLengthsOffset;
156 
157         // Allocate a block of meta data.
158         int allocSize;
159         if (all) {
160             // 2 sets of span lengths
161             allocSize = stringsLength * (2);
162         } else {
163             allocSize = stringsLength; // One set of span lengths.
164         }
165         spanLengths = new short[allocSize];
166 
167         if (all) {
168             // Store span lengths for all span() variants.
169             spanBackLengthsOffset = stringsLength;
170         } else {
171             // Store span lengths for only one span() variant.
172             spanBackLengthsOffset = 0;
173         }
174 
175         // Set the meta data and spanNotSet and write the UTF-8 strings.
176 
177         for (i = 0; i < stringsLength; ++i) {
178             String string = strings.get(i);
179             int length16 = string.length();
180             spanLength = spanSet.span(string, SpanCondition.CONTAINED);
181             if (spanLength < length16) { // Relevant string.
182                 if (true /* 0 != (which & UTF16) */) {
183                     if (0 != (which & CONTAINED)) {
184                         if (0 != (which & FWD)) {
185                             spanLengths[i] = makeSpanLengthByte(spanLength);
186                         }
187                         if (0 != (which & BACK)) {
188                             spanLength = length16
189                                     - spanSet.spanBack(string, length16, SpanCondition.CONTAINED);
190                             spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength);
191                         }
192                     } else /* not CONTAINED, not all, but NOT_CONTAINED */{
193                         spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; // Only store a relevant/irrelevant
194                                                                                      // flag.
195                     }
196                 }
197                 if (0 != (which & NOT_CONTAINED)) {
198                     // Add string start and end code points to the spanNotSet so that
199                     // a span(while not contained) stops before any string.
200                     int c;
201                     if (0 != (which & FWD)) {
202                         c = string.codePointAt(0);
203                         addToSpanNotSet(c);
204                     }
205                     if (0 != (which & BACK)) {
206                         c = string.codePointBefore(length16);
207                         addToSpanNotSet(c);
208                     }
209                 }
210             } else { // Irrelevant string.
211                 if (all) {
212                     spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED;
213                 } else {
214                     // All spanXYZLengths pointers contain the same address.
215                     spanLengths[i] = ALL_CP_CONTAINED;
216                 }
217             }
218         }
219 
220         // Finish.
221         if (all) {
222             spanNotSet.freeze();
223         }
224     }
225 
226     /**
227      * Do the strings need to be checked in span() etc.?
228      *
229      * @return true if strings need to be checked (call span() here),
230      *         false if not (use a BMPSet for best performance).
231      */
needsStringSpanUTF16()232     public boolean needsStringSpanUTF16() {
233         return someRelevant;
234     }
235 
236     /** For fast UnicodeSet::contains(c). */
contains(int c)237     public boolean contains(int c) {
238         return spanSet.contains(c);
239     }
240 
241     /**
242      * Adds a starting or ending string character to the spanNotSet
243      * so that a character span ends before any string.
244      */
addToSpanNotSet(int c)245     private void addToSpanNotSet(int c) {
246         if (spanNotSet == null || spanNotSet == spanSet) {
247             if (spanSet.contains(c)) {
248                 return; // Nothing to do.
249             }
250             spanNotSet = spanSet.cloneAsThawed();
251         }
252         spanNotSet.add(c);
253     }
254 
255     /*
256      * Note: In span() when spanLength==0
257      * (after a string match, or at the beginning after an empty code point span)
258      * and in spanNot() and spanNotUTF8(),
259      * string matching could use a binary search because all string matches are done
260      * from the same start index.
261      *
262      * For UTF-8, this would require a comparison function that returns UTF-16 order.
263      *
264      * This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets
265      * with strings have very few very short strings. For cases with many strings, it might be better to use a different
266      * API and implementation with a DFA (state machine).
267      */
268 
269     /*
270      * Algorithm for span(SpanCondition.CONTAINED)
271      *
272      * Theoretical algorithm:
273      * - Iterate through the string, and at each code point boundary:
274      *   + If the code point there is in the set, then remember to continue after it.
275      *   + If a set string matches at the current position, then remember to continue after it.
276      *   + Either recursively span for each code point or string match, or recursively span
277      *     for all but the shortest one and iteratively continue the span with the shortest local match.
278      *   + Remember the longest recursive span (the farthest end point).
279      *   + If there is no match at the current position,
280      *     neither for the code point there nor for any set string,
281      *     then stop and return the longest recursive span length.
282      *
283      * Optimized implementation:
284      *
285      * (We assume that most sets will have very few very short strings.
286      * A span using a string-less set is extremely fast.)
287      *
288      * Create and cache a spanSet which contains all of the single code points of the original set
289      * but none of its strings.
290      *
291      * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
292      * - Loop:
293      *   + Try to match each set string at the end of the spanLength.
294      *     ~ Set strings that start with set-contained code points
295      *       must be matched with a partial overlap
296      *       because the recursive algorithm would have tried to match them at every position.
297      *     ~ Set strings that entirely consist of set-contained code points
298      *       are irrelevant for span(SpanCondition.CONTAINED)
299      *       because the recursive algorithm would continue after them anyway and
300      *       find the longest recursive match from their end.
301      *     ~ Rather than recursing, note each end point of a set string match.
302      *   + If no set string matched after spanSet.span(),
303      *     then return with where the spanSet.span() ended.
304      *   + If at least one set string matched after spanSet.span(),
305      *     then pop the shortest string match end point and continue the loop,
306      *     trying to match all set strings from there.
307      *   + If at least one more set string matched after a previous string match, then test if the
308      *     code point after the previous string match is also contained in the set.
309      *     Continue the loop with the shortest end point of
310      *     either this code point or a matching set string.
311      *   + If no more set string matched after a previous string match,
312      *     then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
313      *     Stop if spanLength==0, otherwise continue the loop.
314      *
315      * By noting each end point of a set string match, the function visits each string position at most once and
316      * finishes in linear time.
317      *
318      * The recursive algorithm may visit the same string position many times
319      * if multiple paths lead to it and finishes in exponential time.
320      */
321 
322     /*
323      * Algorithm for span(SIMPLE)
324      *
325      * Theoretical algorithm:
326      * - Iterate through the string, and at each code point boundary:
327      *   + If the code point there is in the set, then remember to continue after it.
328      *   + If a set string matches at the current position, then remember to continue after it.
329      *   + Continue from the farthest match position and ignore all others.
330      *   + If there is no match at the current position, then stop and return the current position.
331      *
332      * Optimized implementation:
333      *
334      * (Same assumption and spanSet as above.)
335      *
336      * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
337      * - Loop:
338      *   + Try to match each set string at the end of the spanLength.
339      *     ~ Set strings that start with set-contained code points
340      *       must be matched with a partial overlap
341      *       because the standard algorithm would have tried to match them earlier.
342      *     ~ Set strings that entirely consist of set-contained code points
343      *       must be matched with a full overlap because the longest-match algorithm
344      *       would hide set string matches that end earlier.
345      *       Such set strings need not be matched earlier inside the code point span
346      *       because the standard algorithm would then have
347      *       continued after the set string match anyway.
348      *     ~ Remember the longest set string match (farthest end point)
349      *       from the earliest starting point.
350      *   + If no set string matched after spanSet.span(),
351      *     then return with where the spanSet.span() ended.
352      *   + If at least one set string matched,
353      *     then continue the loop after the longest match from the earliest position.
354      *   + If no more set string matched after a previous string match,
355      *     then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
356      *     Stop if spanLength==0, otherwise continue the loop.
357      */
358     /**
359      * Spans a string.
360      *
361      * @param s The string to be spanned
362      * @param start The start index that the span begins
363      * @param spanCondition The span condition
364      * @return the limit (exclusive end) of the span
365      */
span(CharSequence s, int start, SpanCondition spanCondition)366     public int span(CharSequence s, int start, SpanCondition spanCondition) {
367         if (spanCondition == SpanCondition.NOT_CONTAINED) {
368             return spanNot(s, start, null);
369         }
370         int spanLimit = spanSet.span(s, start, SpanCondition.CONTAINED);
371         if (spanLimit == s.length()) {
372             return spanLimit;
373         }
374         return spanWithStrings(s, start, spanLimit, spanCondition);
375     }
376 
377     /**
378      * Synchronized method for complicated spans using the offsets.
379      * Avoids synchronization for simple cases.
380      *
381      * @param spanLimit = spanSet.span(s, start, CONTAINED)
382      */
spanWithStrings(CharSequence s, int start, int spanLimit, SpanCondition spanCondition)383     private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit,
384             SpanCondition spanCondition) {
385         // Consider strings; they may overlap with the span.
386         int initSize = 0;
387         if (spanCondition == SpanCondition.CONTAINED) {
388             // Use offset list to try all possibilities.
389             initSize = maxLength16;
390         }
391         offsets.setMaxLength(initSize);
392         int length = s.length();
393         int pos = spanLimit, rest = length - spanLimit;
394         int spanLength = spanLimit - start;
395         int i, stringsLength = strings.size();
396         for (;;) {
397             if (spanCondition == SpanCondition.CONTAINED) {
398                 for (i = 0; i < stringsLength; ++i) {
399                     int overlap = spanLengths[i];
400                     if (overlap == ALL_CP_CONTAINED) {
401                         continue; // Irrelevant string.
402                     }
403                     String string = strings.get(i);
404 
405                     int length16 = string.length();
406 
407                     // Try to match this string at pos-overlap..pos.
408                     if (overlap >= LONG_SPAN) {
409                         overlap = length16;
410                         // While contained: No point matching fully inside the code point span.
411                         overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code
412                                                                           // point.
413                     }
414                     if (overlap > spanLength) {
415                         overlap = spanLength;
416                     }
417                     int inc = length16 - overlap; // Keep overlap+inc==length16.
418                     for (;;) {
419                         if (inc > rest) {
420                             break;
421                         }
422                         // Try to match if the increment is not listed already.
423                         if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) {
424                             if (inc == rest) {
425                                 return length; // Reached the end of the string.
426                             }
427                             offsets.addOffset(inc);
428                         }
429                         if (overlap == 0) {
430                             break;
431                         }
432                         --overlap;
433                         ++inc;
434                     }
435                 }
436             } else /* SIMPLE */{
437                 int maxInc = 0, maxOverlap = 0;
438                 for (i = 0; i < stringsLength; ++i) {
439                     int overlap = spanLengths[i];
440                     // For longest match, we do need to try to match even an all-contained string
441                     // to find the match from the earliest start.
442 
443                     String string = strings.get(i);
444 
445                     int length16 = string.length();
446 
447                     // Try to match this string at pos-overlap..pos.
448                     if (overlap >= LONG_SPAN) {
449                         overlap = length16;
450                         // Longest match: Need to match fully inside the code point span
451                         // to find the match from the earliest start.
452                     }
453                     if (overlap > spanLength) {
454                         overlap = spanLength;
455                     }
456                     int inc = length16 - overlap; // Keep overlap+inc==length16.
457                     for (;;) {
458                         if (inc > rest || overlap < maxOverlap) {
459                             break;
460                         }
461                         // Try to match if the string is longer or starts earlier.
462                         if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc)
463                                 && matches16CPB(s, pos - overlap, length, string, length16)) {
464                             maxInc = inc; // Longest match from earliest start.
465                             maxOverlap = overlap;
466                             break;
467                         }
468                         --overlap;
469                         ++inc;
470                     }
471                 }
472 
473                 if (maxInc != 0 || maxOverlap != 0) {
474                     // Longest-match algorithm, and there was a string match.
475                     // Simply continue after it.
476                     pos += maxInc;
477                     rest -= maxInc;
478                     if (rest == 0) {
479                         return length; // Reached the end of the string.
480                     }
481                     spanLength = 0; // Match strings from after a string match.
482                     continue;
483                 }
484             }
485             // Finished trying to match all strings at pos.
486 
487             if (spanLength != 0 || pos == 0) {
488                 // The position is after an unlimited code point span (spanLength!=0),
489                 // not after a string match.
490                 // The only position where spanLength==0 after a span is pos==0.
491                 // Otherwise, an unlimited code point span is only tried again when no
492                 // strings match, and if such a non-initial span fails we stop.
493                 if (offsets.isEmpty()) {
494                     return pos; // No strings matched after a span.
495                 }
496                 // Match strings from after the next string match.
497             } else {
498                 // The position is after a string match (or a single code point).
499                 if (offsets.isEmpty()) {
500                     // No more strings matched after a previous string match.
501                     // Try another code point span from after the last string match.
502                     spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED);
503                     spanLength = spanLimit - pos;
504                     if (spanLength == rest || // Reached the end of the string, or
505                             spanLength == 0 // neither strings nor span progressed.
506                     ) {
507                         return spanLimit;
508                     }
509                     pos += spanLength;
510                     rest -= spanLength;
511                     continue; // spanLength>0: Match strings from after a span.
512                 } else {
513                     // Try to match only one code point from after a string match if some
514                     // string matched beyond it, so that we try all possible positions
515                     // and don't overshoot.
516                     spanLength = spanOne(spanSet, s, pos, rest);
517                     if (spanLength > 0) {
518                         if (spanLength == rest) {
519                             return length; // Reached the end of the string.
520                         }
521                         // Match strings after this code point.
522                         // There cannot be any increments below it because UnicodeSet strings
523                         // contain multiple code points.
524                         pos += spanLength;
525                         rest -= spanLength;
526                         offsets.shift(spanLength);
527                         spanLength = 0;
528                         continue; // Match strings from after a single code point.
529                     }
530                     // Match strings from after the next string match.
531                 }
532             }
533             int minOffset = offsets.popMinimum(null);
534             pos += minOffset;
535             rest -= minOffset;
536             spanLength = 0; // Match strings from after a string match.
537         }
538     }
539 
540     /**
541      * Spans a string and counts the smallest number of set elements on any path across the span.
542      *
543      * <p>For proper counting, we cannot ignore strings that are fully contained in code point spans.
544      *
545      * <p>If the set does not have any fully-contained strings, then we could optimize this
546      * like span(), but such sets are likely rare, and this is at least still linear.
547      *
548      * @param s The string to be spanned
549      * @param start The start index that the span begins
550      * @param spanCondition The span condition
551      * @param outCount The count
552      * @return the limit (exclusive end) of the span
553      */
spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount)554     public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition,
555             OutputInt outCount) {
556         if (spanCondition == SpanCondition.NOT_CONTAINED) {
557             return spanNot(s, start, outCount);
558         }
559         // Consider strings; they may overlap with the span,
560         // and they may result in a smaller count that with just code points.
561         if (spanCondition == SpanCondition.CONTAINED) {
562             return spanContainedAndCount(s, start, outCount);
563         }
564         // SIMPLE (not synchronized, does not use offsets)
565         int stringsLength = strings.size();
566         int length = s.length();
567         int pos = start;
568         int rest = length - start;
569         int count = 0;
570         while (rest != 0) {
571             // Try to match the next code point.
572             int cpLength = spanOne(spanSet, s, pos, rest);
573             int maxInc = (cpLength > 0) ? cpLength : 0;
574             // Try to match all of the strings.
575             for (int i = 0; i < stringsLength; ++i) {
576                 String string = strings.get(i);
577                 int length16 = string.length();
578                 if (maxInc < length16 && length16 <= rest &&
579                         matches16CPB(s, pos, length, string, length16)) {
580                     maxInc = length16;
581                 }
582             }
583             // We are done if there is no match beyond pos.
584             if (maxInc == 0) {
585                 outCount.value = count;
586                 return pos;
587             }
588             // Continue from the longest match.
589             ++count;
590             pos += maxInc;
591             rest -= maxInc;
592         }
593         outCount.value = count;
594         return pos;
595     }
596 
spanContainedAndCount(CharSequence s, int start, OutputInt outCount)597     private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) {
598         // Use offset list to try all possibilities.
599         offsets.setMaxLength(maxLength16);
600         int stringsLength = strings.size();
601         int length = s.length();
602         int pos = start;
603         int rest = length - start;
604         int count = 0;
605         while (rest != 0) {
606             // Try to match the next code point.
607             int cpLength = spanOne(spanSet, s, pos, rest);
608             if (cpLength > 0) {
609                 offsets.addOffsetAndCount(cpLength, count + 1);
610             }
611             // Try to match all of the strings.
612             for (int i = 0; i < stringsLength; ++i) {
613                 String string = strings.get(i);
614                 int length16 = string.length();
615                 // Note: If the strings were sorted by length, then we could also
616                 // avoid trying to match if there is already a match of the same length.
617                 if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) &&
618                         matches16CPB(s, pos, length, string, length16)) {
619                     offsets.addOffsetAndCount(length16, count + 1);
620                 }
621             }
622             // We are done if there is no match beyond pos.
623             if (offsets.isEmpty()) {
624                 outCount.value = count;
625                 return pos;
626             }
627             // Continue from the nearest match.
628             int minOffset = offsets.popMinimum(outCount);
629             count = outCount.value;
630             pos += minOffset;
631             rest -= minOffset;
632         }
633         outCount.value = count;
634         return pos;
635     }
636 
637     /**
638      * Span a string backwards.
639      *
640      * @param s The string to be spanned
641      * @param spanCondition The span condition
642      * @return The string index which starts the span (i.e. inclusive).
643      */
spanBack(CharSequence s, int length, SpanCondition spanCondition)644     public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) {
645         if (spanCondition == SpanCondition.NOT_CONTAINED) {
646             return spanNotBack(s, length);
647         }
648         int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED);
649         if (pos == 0) {
650             return 0;
651         }
652         int spanLength = length - pos;
653 
654         // Consider strings; they may overlap with the span.
655         int initSize = 0;
656         if (spanCondition == SpanCondition.CONTAINED) {
657             // Use offset list to try all possibilities.
658             initSize = maxLength16;
659         }
660         offsets.setMaxLength(initSize);
661         int i, stringsLength = strings.size();
662         int spanBackLengthsOffset = 0;
663         if (all) {
664             spanBackLengthsOffset = stringsLength;
665         }
666         for (;;) {
667             if (spanCondition == SpanCondition.CONTAINED) {
668                 for (i = 0; i < stringsLength; ++i) {
669                     int overlap = spanLengths[spanBackLengthsOffset + i];
670                     if (overlap == ALL_CP_CONTAINED) {
671                         continue; // Irrelevant string.
672                     }
673                     String string = strings.get(i);
674 
675                     int length16 = string.length();
676 
677                     // Try to match this string at pos-(length16-overlap)..pos-length16.
678                     if (overlap >= LONG_SPAN) {
679                         overlap = length16;
680                         // While contained: No point matching fully inside the code point span.
681                         int len1 = 0;
682                         len1 = string.offsetByCodePoints(0, 1);
683                         overlap -= len1; // Length of the string minus the first code point.
684                     }
685                     if (overlap > spanLength) {
686                         overlap = spanLength;
687                     }
688                     int dec = length16 - overlap; // Keep dec+overlap==length16.
689                     for (;;) {
690                         if (dec > pos) {
691                             break;
692                         }
693                         // Try to match if the decrement is not listed already.
694                         if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) {
695                             if (dec == pos) {
696                                 return 0; // Reached the start of the string.
697                             }
698                             offsets.addOffset(dec);
699                         }
700                         if (overlap == 0) {
701                             break;
702                         }
703                         --overlap;
704                         ++dec;
705                     }
706                 }
707             } else /* SIMPLE */{
708                 int maxDec = 0, maxOverlap = 0;
709                 for (i = 0; i < stringsLength; ++i) {
710                     int overlap = spanLengths[spanBackLengthsOffset + i];
711                     // For longest match, we do need to try to match even an all-contained string
712                     // to find the match from the latest end.
713 
714                     String string = strings.get(i);
715 
716                     int length16 = string.length();
717 
718                     // Try to match this string at pos-(length16-overlap)..pos-length16.
719                     if (overlap >= LONG_SPAN) {
720                         overlap = length16;
721                         // Longest match: Need to match fully inside the code point span
722                         // to find the match from the latest end.
723                     }
724                     if (overlap > spanLength) {
725                       overlap = spanLength;
726                     }
727                     int dec = length16 - overlap; // Keep dec+overlap==length16.
728                     for (;;) {
729                         if (dec > pos || overlap < maxOverlap) {
730                             break;
731                         }
732                         // Try to match if the string is longer or ends later.
733                         if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec)
734                                 && matches16CPB(s, pos - dec, length, string, length16)) {
735                             maxDec = dec; // Longest match from latest end.
736                             maxOverlap = overlap;
737                             break;
738                         }
739                         --overlap;
740                         ++dec;
741                     }
742                 }
743 
744                 if (maxDec != 0 || maxOverlap != 0) {
745                     // Longest-match algorithm, and there was a string match.
746                     // Simply continue before it.
747                     pos -= maxDec;
748                     if (pos == 0) {
749                         return 0; // Reached the start of the string.
750                     }
751                     spanLength = 0; // Match strings from before a string match.
752                     continue;
753                 }
754             }
755             // Finished trying to match all strings at pos.
756 
757             if (spanLength != 0 || pos == length) {
758                 // The position is before an unlimited code point span (spanLength!=0),
759                 // not before a string match.
760                 // The only position where spanLength==0 before a span is pos==length.
761                 // Otherwise, an unlimited code point span is only tried again when no
762                 // strings match, and if such a non-initial span fails we stop.
763                 if (offsets.isEmpty()) {
764                     return pos; // No strings matched before a span.
765                 }
766                 // Match strings from before the next string match.
767             } else {
768                 // The position is before a string match (or a single code point).
769                 if (offsets.isEmpty()) {
770                     // No more strings matched before a previous string match.
771                     // Try another code point span from before the last string match.
772                     int oldPos = pos;
773                     pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED);
774                     spanLength = oldPos - pos;
775                     if (pos == 0 || // Reached the start of the string, or
776                             spanLength == 0 // neither strings nor span progressed.
777                     ) {
778                         return pos;
779                     }
780                     continue; // spanLength>0: Match strings from before a span.
781                 } else {
782                     // Try to match only one code point from before a string match if some
783                     // string matched beyond it, so that we try all possible positions
784                     // and don't overshoot.
785                     spanLength = spanOneBack(spanSet, s, pos);
786                     if (spanLength > 0) {
787                         if (spanLength == pos) {
788                             return 0; // Reached the start of the string.
789                         }
790                         // Match strings before this code point.
791                         // There cannot be any decrements below it because UnicodeSet strings
792                         // contain multiple code points.
793                         pos -= spanLength;
794                         offsets.shift(spanLength);
795                         spanLength = 0;
796                         continue; // Match strings from before a single code point.
797                     }
798                     // Match strings from before the next string match.
799                 }
800             }
801             pos -= offsets.popMinimum(null);
802             spanLength = 0; // Match strings from before a string match.
803         }
804     }
805 
806     /**
807      * Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED)
808      *
809      * Theoretical algorithm:
810      * - Iterate through the string, and at each code point boundary:
811      *   + If the code point there is in the set, then return with the current position.
812      *   + If a set string matches at the current position, then return with the current position.
813      *
814      * Optimized implementation:
815      *
816      * (Same assumption as for span() above.)
817      *
818      * Create and cache a spanNotSet which contains
819      * all of the single code points of the original set but none of its strings.
820      * For each set string add its initial code point to the spanNotSet.
821      * (Also add its final code point for spanNotBack().)
822      *
823      * - Loop:
824      *   + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED).
825      *   + If the current code point is in the original set, then return the current position.
826      *   + If any set string matches at the current position, then return the current position.
827      *   + If there is no match at the current position, neither for the code point
828      *     there nor for any set string, then skip this code point and continue the loop.
829      *     This happens for set-string-initial code points that were added to spanNotSet
830      *     when there is not actually a match for such a set string.
831      *
832      * @param s The string to be spanned
833      * @param start The start index that the span begins
834      * @param outCount If not null: Receives the number of code points across the span.
835      * @return the limit (exclusive end) of the span
836      */
spanNot(CharSequence s, int start, OutputInt outCount)837     private int spanNot(CharSequence s, int start, OutputInt outCount) {
838         int length = s.length();
839         int pos = start, rest = length - start;
840         int stringsLength = strings.size();
841         int count = 0;
842         do {
843             // Span until we find a code point from the set,
844             // or a code point that starts or ends some string.
845             int spanLimit;
846             if (outCount == null) {
847                 spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED);
848             } else {
849                 spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount);
850                 outCount.value = count = count + outCount.value;
851             }
852             if (spanLimit == length) {
853                 return length; // Reached the end of the string.
854             }
855             pos = spanLimit;
856             rest = length - spanLimit;
857 
858             // Check whether the current code point is in the original set,
859             // without the string starts and ends.
860             int cpLength = spanOne(spanSet, s, pos, rest);
861             if (cpLength > 0) {
862                 return pos; // There is a set element at pos.
863             }
864 
865             // Try to match the strings at pos.
866             for (int i = 0; i < stringsLength; ++i) {
867                 if (spanLengths[i] == ALL_CP_CONTAINED) {
868                     continue; // Irrelevant string.
869                 }
870                 String string = strings.get(i);
871 
872                 int length16 = string.length();
873                 if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) {
874                     return pos; // There is a set element at pos.
875                 }
876             }
877 
878             // The span(while not contained) ended on a string start/end which is
879             // not in the original set. Skip this code point and continue.
880             // cpLength<0
881             pos -= cpLength;
882             rest += cpLength;
883             ++count;
884         } while (rest != 0);
885         if (outCount != null) {
886             outCount.value = count;
887         }
888         return length; // Reached the end of the string.
889     }
890 
spanNotBack(CharSequence s, int length)891     private int spanNotBack(CharSequence s, int length) {
892         int pos = length;
893         int i, stringsLength = strings.size();
894         do {
895             // Span until we find a code point from the set,
896             // or a code point that starts or ends some string.
897             pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED);
898             if (pos == 0) {
899                 return 0; // Reached the start of the string.
900             }
901 
902             // Check whether the current code point is in the original set,
903             // without the string starts and ends.
904             int cpLength = spanOneBack(spanSet, s, pos);
905             if (cpLength > 0) {
906                 return pos; // There is a set element at pos.
907             }
908 
909             // Try to match the strings at pos.
910             for (i = 0; i < stringsLength; ++i) {
911                 // Use spanLengths rather than a spanLengths pointer because
912                 // it is easier and we only need to know whether the string is irrelevant
913                 // which is the same in either array.
914                 if (spanLengths[i] == ALL_CP_CONTAINED) {
915                     continue; // Irrelevant string.
916                 }
917                 String string = strings.get(i);
918 
919                 int length16 = string.length();
920                 if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) {
921                     return pos; // There is a set element at pos.
922                 }
923             }
924 
925             // The span(while not contained) ended on a string start/end which is
926             // not in the original set. Skip this code point and continue.
927             // cpLength<0
928             pos += cpLength;
929         } while (pos != 0);
930         return 0; // Reached the start of the string.
931     }
932 
makeSpanLengthByte(int spanLength)933     static short makeSpanLengthByte(int spanLength) {
934         // 0xfe==UnicodeSetStringSpan::LONG_SPAN
935         return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN;
936     }
937 
938     // Compare strings without any argument checks. Requires length>0.
matches16(CharSequence s, int start, final String t, int length)939     private static boolean matches16(CharSequence s, int start, final String t, int length) {
940         int end = start + length;
941         while (length-- > 0) {
942             if (s.charAt(--end) != t.charAt(length)) {
943                 return false;
944             }
945         }
946         return true;
947     }
948 
949     /**
950      * Compare 16-bit Unicode strings (which may be malformed UTF-16)
951      * at code point boundaries.
952      * That is, each edge of a match must not be in the middle of a surrogate pair.
953      * @param s       The string to match in.
954      * @param start   The start index of s.
955      * @param limit   The limit of the subsequence of s being spanned.
956      * @param t       The substring to be matched in s.
957      * @param tlength The length of t.
958      */
matches16CPB(CharSequence s, int start, int limit, final String t, int tlength)959     static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) {
960         return matches16(s, start, t, tlength)
961                 && !(0 < start && Character.isHighSurrogate(s.charAt(start - 1)) &&
962                         Character.isLowSurrogate(s.charAt(start)))
963                 && !((start + tlength) < limit && Character.isHighSurrogate(s.charAt(start + tlength - 1)) &&
964                         Character.isLowSurrogate(s.charAt(start + tlength)));
965     }
966 
967     /**
968      * Does the set contain the next code point?
969      * If so, return its length; otherwise return its negative length.
970      */
spanOne(final UnicodeSet set, CharSequence s, int start, int length)971     static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) {
972         char c = s.charAt(start);
973         if (c >= 0xd800 && c <= 0xdbff && length >= 2) {
974             char c2 = s.charAt(start + 1);
975             if (UTF16.isTrailSurrogate(c2)) {
976                 int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
977                 return set.contains(supplementary) ? 2 : -2;
978             }
979         }
980         return set.contains(c) ? 1 : -1;
981     }
982 
spanOneBack(final UnicodeSet set, CharSequence s, int length)983     static int spanOneBack(final UnicodeSet set, CharSequence s, int length) {
984         char c = s.charAt(length - 1);
985         if (c >= 0xdc00 && c <= 0xdfff && length >= 2) {
986             char c2 = s.charAt(length - 2);
987             if (UTF16.isLeadSurrogate(c2)) {
988                 int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
989                 return set.contains(supplementary) ? 2 : -2;
990             }
991         }
992         return set.contains(c) ? 1 : -1;
993     }
994 
995     /**
996      * Helper class for UnicodeSetStringSpan.
997      *
998      * <p>List of offsets from the current position from where to try matching
999      * a code point or a string.
1000      * Stores offsets rather than indexes to simplify the code and use the same list
1001      * for both increments (in span()) and decrements (in spanBack()).
1002      *
1003      * <p>Assumption: The maximum offset is limited, and the offsets that are stored at any one time
1004      * are relatively dense, that is,
1005      * there are normally no gaps of hundreds or thousands of offset values.
1006      *
1007      * <p>This class optionally also tracks the minimum non-negative count for each position,
1008      * intended to count the smallest number of elements of any path leading to that position.
1009      *
1010      * <p>The implementation uses a circular buffer of count integers,
1011      * each indicating whether the corresponding offset is in the list,
1012      * and its path element count.
1013      * This avoids inserting into a sorted list of offsets (or absolute indexes)
1014      * and physically moving part of the list.
1015      *
1016      * <p>Note: In principle, the caller should setMaxLength() to
1017      * the maximum of the max string length and U16_LENGTH/U8_LENGTH
1018      * to account for "long" single code points.
1019      *
1020      * <p>Note: An earlier version did not track counts and stored only byte flags.
1021      * With boolean flags, if maxLength were guaranteed to be no more than 32 or 64,
1022      * the list could be stored as bit flags in a single integer.
1023      * Rather than handling a circular buffer with a start list index,
1024      * the integer would simply be shifted when lower offsets are removed.
1025      * UnicodeSet does not have a limit on the lengths of strings.
1026      */
1027     private static final class OffsetList {
1028         private int[] list;
1029         private int length;
1030         private int start;
1031 
OffsetList()1032         public OffsetList() {
1033             list = new int[16];  // default size
1034         }
1035 
setMaxLength(int maxLength)1036         public void setMaxLength(int maxLength) {
1037             if (maxLength > list.length) {
1038                 list = new int[maxLength];
1039             }
1040             clear();
1041         }
1042 
clear()1043         public void clear() {
1044             for (int i = list.length; i-- > 0;) {
1045                 list[i] = 0;
1046             }
1047             start = length = 0;
1048         }
1049 
isEmpty()1050         public boolean isEmpty() {
1051             return (length == 0);
1052         }
1053 
1054         /**
1055          * Reduces all stored offsets by delta, used when the current position moves by delta.
1056          * There must not be any offsets lower than delta.
1057          * If there is an offset equal to delta, it is removed.
1058          *
1059          * @param delta [1..maxLength]
1060          */
shift(int delta)1061         public void shift(int delta) {
1062             int i = start + delta;
1063             if (i >= list.length) {
1064                 i -= list.length;
1065             }
1066             if (list[i] != 0) {
1067                 list[i] = 0;
1068                 --length;
1069             }
1070             start = i;
1071         }
1072 
1073         /**
1074          * Adds an offset. The list must not contain it yet.
1075          * @param offset [1..maxLength]
1076          */
addOffset(int offset)1077         public void addOffset(int offset) {
1078             int i = start + offset;
1079             if (i >= list.length) {
1080                 i -= list.length;
1081             }
1082             assert list[i] == 0;
1083             list[i] = 1;
1084             ++length;
1085         }
1086 
1087         /**
1088          * Adds an offset and updates its count.
1089          * The list may already contain the offset.
1090          * @param offset [1..maxLength]
1091          */
addOffsetAndCount(int offset, int count)1092         public void addOffsetAndCount(int offset, int count) {
1093             assert count > 0;
1094             int i = start + offset;
1095             if (i >= list.length) {
1096                 i -= list.length;
1097             }
1098             if (list[i] == 0) {
1099                 list[i] = count;
1100                 ++length;
1101             } else if (count < list[i]) {
1102                 list[i] = count;
1103             }
1104         }
1105 
1106         /**
1107          * @param offset [1..maxLength]
1108          */
containsOffset(int offset)1109         public boolean containsOffset(int offset) {
1110             int i = start + offset;
1111             if (i >= list.length) {
1112                 i -= list.length;
1113             }
1114             return list[i] != 0;
1115         }
1116 
1117         /**
1118          * @param offset [1..maxLength]
1119          */
hasCountAtOffset(int offset, int count)1120         public boolean hasCountAtOffset(int offset, int count) {
1121             int i = start + offset;
1122             if (i >= list.length) {
1123                 i -= list.length;
1124             }
1125             int oldCount = list[i];
1126             return oldCount != 0 && oldCount <= count;
1127         }
1128 
1129         /**
1130          * Finds the lowest stored offset from a non-empty list, removes it,
1131          * and reduces all other offsets by this minimum.
1132          * @return min=[1..maxLength]
1133          */
popMinimum(OutputInt outCount)1134         public int popMinimum(OutputInt outCount) {
1135             // Look for the next offset in list[start+1..list.length-1].
1136             int i = start, result;
1137             while (++i < list.length) {
1138                 int count = list[i];
1139                 if (count != 0) {
1140                     list[i] = 0;
1141                     --length;
1142                     result = i - start;
1143                     start = i;
1144                     if (outCount != null) { outCount.value = count; }
1145                     return result;
1146                 }
1147             }
1148             // i==list.length
1149 
1150             // Wrap around and look for the next offset in list[0..start].
1151             // Since the list is not empty, there will be one.
1152             result = list.length - start;
1153             i = 0;
1154             int count;
1155             while ((count = list[i]) == 0) {
1156                 ++i;
1157             }
1158             list[i] = 0;
1159             --length;
1160             start = i;
1161             if (outCount != null) { outCount.value = count; }
1162             return result + i;
1163         }
1164     }
1165 }
1166