1 /*
2  * Copyright (c) 1996, 2013, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.  Oracle designates this
8  * particular file as subject to the "Classpath" exception as provided
9  * by Oracle in the LICENSE file that accompanied this code.
10  *
11  * This code is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14  * version 2 for more details (a copy is included in the LICENSE file that
15  * accompanied this code).
16  *
17  * You should have received a copy of the GNU General Public License version
18  * 2 along with this work; if not, write to the Free Software Foundation,
19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20  *
21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22  * or visit www.oracle.com if you need additional information or have any
23  * questions.
24  */
25 
26 /*
27  * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
28  * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
29  *
30  * The original version of this source code and documentation
31  * is copyrighted and owned by Taligent, Inc., a wholly-owned
32  * subsidiary of IBM. These materials are provided under terms
33  * of a License Agreement between Taligent and Sun. This technology
34  * is protected by multiple US and International patents.
35  *
36  * This notice and attribution to Taligent may not be removed.
37  * Taligent is a registered trademark of Taligent, Inc.
38  *
39  */
40 
41 package java.text;
42 
43 import java.lang.ref.SoftReference;
44 import java.text.spi.BreakIteratorProvider;
45 import java.util.Locale;
46 import sun.util.locale.provider.LocaleProviderAdapter;
47 import sun.util.locale.provider.LocaleServiceProviderPool;
48 
49 
50 /**
51  * The <code>BreakIterator</code> class implements methods for finding
52  * the location of boundaries in text. Instances of <code>BreakIterator</code>
53  * maintain a current position and scan over text
54  * returning the index of characters where boundaries occur.
55  * Internally, <code>BreakIterator</code> scans text using a
56  * <code>CharacterIterator</code>, and is thus able to scan text held
57  * by any object implementing that protocol. A <code>StringCharacterIterator</code>
58  * is used to scan <code>String</code> objects passed to <code>setText</code>.
59  *
60  * <p>
61  * You use the factory methods provided by this class to create
62  * instances of various types of break iterators. In particular,
63  * use <code>getWordInstance</code>, <code>getLineInstance</code>,
64  * <code>getSentenceInstance</code>, and <code>getCharacterInstance</code>
65  * to create <code>BreakIterator</code>s that perform
66  * word, line, sentence, and character boundary analysis respectively.
67  * A single <code>BreakIterator</code> can work only on one unit
68  * (word, line, sentence, and so on). You must use a different iterator
69  * for each unit boundary analysis you wish to perform.
70  *
71  * <p><a id="line"></a>
72  * Line boundary analysis determines where a text string can be
73  * broken when line-wrapping. The mechanism correctly handles
74  * punctuation and hyphenated words. Actual line breaking needs
75  * to also consider the available line width and is handled by
76  * higher-level software.
77  *
78  * <p><a id="sentence"></a>
79  * Sentence boundary analysis allows selection with correct interpretation
80  * of periods within numbers and abbreviations, and trailing punctuation
81  * marks such as quotation marks and parentheses.
82  *
83  * <p><a id="word"></a>
84  * Word boundary analysis is used by search and replace functions, as
85  * well as within text editing applications that allow the user to
86  * select words with a double click. Word selection provides correct
87  * interpretation of punctuation marks within and following
88  * words. Characters that are not part of a word, such as symbols
89  * or punctuation marks, have word-breaks on both sides.
90  *
91  * <p><a id="character"></a>
92  * Character boundary analysis allows users to interact with characters
93  * as they expect to, for example, when moving the cursor through a text
94  * string. Character boundary analysis provides correct navigation
95  * through character strings, regardless of how the character is stored.
96  * The boundaries returned may be those of supplementary characters,
97  * combining character sequences, or ligature clusters.
98  * For example, an accented character might be stored as a base character
99  * and a diacritical mark. What users consider to be a character can
100  * differ between languages.
101  *
102  * <p>
103  * The <code>BreakIterator</code> instances returned by the factory methods
104  * of this class are intended for use with natural languages only, not for
105  * programming language text. It is however possible to define subclasses
106  * that tokenize a programming language.
107  *
108  * <P>
109  * <strong>Examples</strong>:<P>
110  * Creating and using text boundaries:
111  * <blockquote>
112  * <pre>
113  * public static void main(String args[]) {
114  *      if (args.length == 1) {
115  *          String stringToExamine = args[0];
116  *          //print each word in order
117  *          BreakIterator boundary = BreakIterator.getWordInstance();
118  *          boundary.setText(stringToExamine);
119  *          printEachForward(boundary, stringToExamine);
120  *          //print each sentence in reverse order
121  *          boundary = BreakIterator.getSentenceInstance(Locale.US);
122  *          boundary.setText(stringToExamine);
123  *          printEachBackward(boundary, stringToExamine);
124  *          printFirst(boundary, stringToExamine);
125  *          printLast(boundary, stringToExamine);
126  *      }
127  * }
128  * </pre>
129  * </blockquote>
130  *
131  * Print each element in order:
132  * <blockquote>
133  * <pre>
134  * public static void printEachForward(BreakIterator boundary, String source) {
135  *     int start = boundary.first();
136  *     for (int end = boundary.next();
137  *          end != BreakIterator.DONE;
138  *          start = end, end = boundary.next()) {
139  *          System.out.println(source.substring(start,end));
140  *     }
141  * }
142  * </pre>
143  * </blockquote>
144  *
145  * Print each element in reverse order:
146  * <blockquote>
147  * <pre>
148  * public static void printEachBackward(BreakIterator boundary, String source) {
149  *     int end = boundary.last();
150  *     for (int start = boundary.previous();
151  *          start != BreakIterator.DONE;
152  *          end = start, start = boundary.previous()) {
153  *         System.out.println(source.substring(start,end));
154  *     }
155  * }
156  * </pre>
157  * </blockquote>
158  *
159  * Print first element:
160  * <blockquote>
161  * <pre>
162  * public static void printFirst(BreakIterator boundary, String source) {
163  *     int start = boundary.first();
164  *     int end = boundary.next();
165  *     System.out.println(source.substring(start,end));
166  * }
167  * </pre>
168  * </blockquote>
169  *
170  * Print last element:
171  * <blockquote>
172  * <pre>
173  * public static void printLast(BreakIterator boundary, String source) {
174  *     int end = boundary.last();
175  *     int start = boundary.previous();
176  *     System.out.println(source.substring(start,end));
177  * }
178  * </pre>
179  * </blockquote>
180  *
181  * Print the element at a specified position:
182  * <blockquote>
183  * <pre>
184  * public static void printAt(BreakIterator boundary, int pos, String source) {
185  *     int end = boundary.following(pos);
186  *     int start = boundary.previous();
187  *     System.out.println(source.substring(start,end));
188  * }
189  * </pre>
190  * </blockquote>
191  *
192  * Find the next word:
193  * <blockquote>
194  * <pre>{@code
195  * public static int nextWordStartAfter(int pos, String text) {
196  *     BreakIterator wb = BreakIterator.getWordInstance();
197  *     wb.setText(text);
198  *     int last = wb.following(pos);
199  *     int current = wb.next();
200  *     while (current != BreakIterator.DONE) {
201  *         for (int p = last; p < current; p++) {
202  *             if (Character.isLetter(text.codePointAt(p)))
203  *                 return last;
204  *         }
205  *         last = current;
206  *         current = wb.next();
207  *     }
208  *     return BreakIterator.DONE;
209  * }
210  * }</pre>
211  * (The iterator returned by BreakIterator.getWordInstance() is unique in that
212  * the break positions it returns don't represent both the start and end of the
213  * thing being iterated over.  That is, a sentence-break iterator returns breaks
214  * that each represent the end of one sentence and the beginning of the next.
215  * With the word-break iterator, the characters between two boundaries might be a
216  * word, or they might be the punctuation or whitespace between two words.  The
217  * above code uses a simple heuristic to determine which boundary is the beginning
218  * of a word: If the characters between this boundary and the next boundary
219  * include at least one letter (this can be an alphabetical letter, a CJK ideograph,
220  * a Hangul syllable, a Kana character, etc.), then the text between this boundary
221  * and the next is a word; otherwise, it's the material between words.)
222  * </blockquote>
223  *
224  * @since 1.1
225  * @see CharacterIterator
226  *
227  */
228 
229 public abstract class BreakIterator implements Cloneable
230 {
231     /**
232      * Constructor. BreakIterator is stateless and has no default behavior.
233      */
BreakIterator()234     protected BreakIterator()
235     {
236     }
237 
238     /**
239      * Create a copy of this iterator
240      * @return A copy of this
241      */
242     @Override
clone()243     public Object clone()
244     {
245         try {
246             return super.clone();
247         }
248         catch (CloneNotSupportedException e) {
249             throw new InternalError(e);
250         }
251     }
252 
253     /**
254      * DONE is returned by previous(), next(), next(int), preceding(int)
255      * and following(int) when either the first or last text boundary has been
256      * reached.
257      */
258     public static final int DONE = -1;
259 
260     /**
261      * Returns the first boundary. The iterator's current position is set
262      * to the first text boundary.
263      * @return The character index of the first text boundary.
264      */
first()265     public abstract int first();
266 
267     /**
268      * Returns the last boundary. The iterator's current position is set
269      * to the last text boundary.
270      * @return The character index of the last text boundary.
271      */
last()272     public abstract int last();
273 
274     /**
275      * Returns the nth boundary from the current boundary. If either
276      * the first or last text boundary has been reached, it returns
277      * <code>BreakIterator.DONE</code> and the current position is set to either
278      * the first or last text boundary depending on which one is reached. Otherwise,
279      * the iterator's current position is set to the new boundary.
280      * For example, if the iterator's current position is the mth text boundary
281      * and three more boundaries exist from the current boundary to the last text
282      * boundary, the next(2) call will return m + 2. The new text position is set
283      * to the (m + 2)th text boundary. A next(4) call would return
284      * <code>BreakIterator.DONE</code> and the last text boundary would become the
285      * new text position.
286      * @param n which boundary to return.  A value of 0
287      * does nothing.  Negative values move to previous boundaries
288      * and positive values move to later boundaries.
289      * @return The character index of the nth boundary from the current position
290      * or <code>BreakIterator.DONE</code> if either first or last text boundary
291      * has been reached.
292      */
next(int n)293     public abstract int next(int n);
294 
295     /**
296      * Returns the boundary following the current boundary. If the current boundary
297      * is the last text boundary, it returns <code>BreakIterator.DONE</code> and
298      * the iterator's current position is unchanged. Otherwise, the iterator's
299      * current position is set to the boundary following the current boundary.
300      * @return The character index of the next text boundary or
301      * <code>BreakIterator.DONE</code> if the current boundary is the last text
302      * boundary.
303      * Equivalent to next(1).
304      * @see #next(int)
305      */
next()306     public abstract int next();
307 
308     /**
309      * Returns the boundary preceding the current boundary. If the current boundary
310      * is the first text boundary, it returns <code>BreakIterator.DONE</code> and
311      * the iterator's current position is unchanged. Otherwise, the iterator's
312      * current position is set to the boundary preceding the current boundary.
313      * @return The character index of the previous text boundary or
314      * <code>BreakIterator.DONE</code> if the current boundary is the first text
315      * boundary.
316      */
previous()317     public abstract int previous();
318 
319     /**
320      * Returns the first boundary following the specified character offset. If the
321      * specified offset equals to the last text boundary, it returns
322      * <code>BreakIterator.DONE</code> and the iterator's current position is unchanged.
323      * Otherwise, the iterator's current position is set to the returned boundary.
324      * The value returned is always greater than the offset or the value
325      * <code>BreakIterator.DONE</code>.
326      * @param offset the character offset to begin scanning.
327      * @return The first boundary after the specified offset or
328      * <code>BreakIterator.DONE</code> if the last text boundary is passed in
329      * as the offset.
330      * @exception  IllegalArgumentException if the specified offset is less than
331      * the first text boundary or greater than the last text boundary.
332      */
following(int offset)333     public abstract int following(int offset);
334 
335     /**
336      * Returns the last boundary preceding the specified character offset. If the
337      * specified offset equals to the first text boundary, it returns
338      * <code>BreakIterator.DONE</code> and the iterator's current position is unchanged.
339      * Otherwise, the iterator's current position is set to the returned boundary.
340      * The value returned is always less than the offset or the value
341      * <code>BreakIterator.DONE</code>.
342      * @param offset the character offset to begin scanning.
343      * @return The last boundary before the specified offset or
344      * <code>BreakIterator.DONE</code> if the first text boundary is passed in
345      * as the offset.
346      * @exception   IllegalArgumentException if the specified offset is less than
347      * the first text boundary or greater than the last text boundary.
348      * @since 1.2
349      */
preceding(int offset)350     public int preceding(int offset) {
351         // NOTE:  This implementation is here solely because we can't add new
352         // abstract methods to an existing class.  There is almost ALWAYS a
353         // better, faster way to do this.
354         int pos = following(offset);
355         while (pos >= offset && pos != DONE) {
356             pos = previous();
357         }
358         return pos;
359     }
360 
361     /**
362      * Returns true if the specified character offset is a text boundary.
363      * @param offset the character offset to check.
364      * @return <code>true</code> if "offset" is a boundary position,
365      * <code>false</code> otherwise.
366      * @exception   IllegalArgumentException if the specified offset is less than
367      * the first text boundary or greater than the last text boundary.
368      * @since 1.2
369      */
isBoundary(int offset)370     public boolean isBoundary(int offset) {
371         // NOTE: This implementation probably is wrong for most situations
372         // because it fails to take into account the possibility that a
373         // CharacterIterator passed to setText() may not have a begin offset
374         // of 0.  But since the abstract BreakIterator doesn't have that
375         // knowledge, it assumes the begin offset is 0.  If you subclass
376         // BreakIterator, copy the SimpleTextBoundary implementation of this
377         // function into your subclass.  [This should have been abstract at
378         // this level, but it's too late to fix that now.]
379         if (offset == 0) {
380             return true;
381         }
382         int boundary = following(offset - 1);
383         if (boundary == DONE) {
384             throw new IllegalArgumentException();
385         }
386         return boundary == offset;
387     }
388 
389     /**
390      * Returns character index of the text boundary that was most
391      * recently returned by next(), next(int), previous(), first(), last(),
392      * following(int) or preceding(int). If any of these methods returns
393      * <code>BreakIterator.DONE</code> because either first or last text boundary
394      * has been reached, it returns the first or last text boundary depending on
395      * which one is reached.
396      * @return The text boundary returned from the above methods, first or last
397      * text boundary.
398      * @see #next()
399      * @see #next(int)
400      * @see #previous()
401      * @see #first()
402      * @see #last()
403      * @see #following(int)
404      * @see #preceding(int)
405      */
current()406     public abstract int current();
407 
408     /**
409      * Get the text being scanned
410      * @return the text being scanned
411      */
getText()412     public abstract CharacterIterator getText();
413 
414     /**
415      * Set a new text string to be scanned.  The current scan
416      * position is reset to first().
417      * @param newText new text to scan.
418      */
setText(String newText)419     public void setText(String newText)
420     {
421         setText(new StringCharacterIterator(newText));
422     }
423 
424     /**
425      * Set a new text for scanning.  The current scan
426      * position is reset to first().
427      * @param newText new text to scan.
428      */
setText(CharacterIterator newText)429     public abstract void setText(CharacterIterator newText);
430 
431     private static final int CHARACTER_INDEX = 0;
432     private static final int WORD_INDEX = 1;
433     private static final int LINE_INDEX = 2;
434     private static final int SENTENCE_INDEX = 3;
435 
436     @SuppressWarnings("unchecked")
437     private static final SoftReference<BreakIteratorCache>[] iterCache = (SoftReference<BreakIteratorCache>[]) new SoftReference<?>[4];
438 
439     /**
440      * Returns a new <code>BreakIterator</code> instance
441      * for <a href="BreakIterator.html#word">word breaks</a>
442      * for the {@linkplain Locale#getDefault() default locale}.
443      * @return A break iterator for word breaks
444      */
getWordInstance()445     public static BreakIterator getWordInstance()
446     {
447         return getWordInstance(Locale.getDefault());
448     }
449 
450     /**
451      * Returns a new <code>BreakIterator</code> instance
452      * for <a href="BreakIterator.html#word">word breaks</a>
453      * for the given locale.
454      * @param locale the desired locale
455      * @return A break iterator for word breaks
456      * @exception NullPointerException if <code>locale</code> is null
457      */
getWordInstance(Locale locale)458     public static BreakIterator getWordInstance(Locale locale)
459     {
460         return getBreakInstance(locale, WORD_INDEX);
461     }
462 
463     /**
464      * Returns a new <code>BreakIterator</code> instance
465      * for <a href="BreakIterator.html#line">line breaks</a>
466      * for the {@linkplain Locale#getDefault() default locale}.
467      * @return A break iterator for line breaks
468      */
getLineInstance()469     public static BreakIterator getLineInstance()
470     {
471         return getLineInstance(Locale.getDefault());
472     }
473 
474     /**
475      * Returns a new <code>BreakIterator</code> instance
476      * for <a href="BreakIterator.html#line">line breaks</a>
477      * for the given locale.
478      * @param locale the desired locale
479      * @return A break iterator for line breaks
480      * @exception NullPointerException if <code>locale</code> is null
481      */
getLineInstance(Locale locale)482     public static BreakIterator getLineInstance(Locale locale)
483     {
484         return getBreakInstance(locale, LINE_INDEX);
485     }
486 
487     /**
488      * Returns a new <code>BreakIterator</code> instance
489      * for <a href="BreakIterator.html#character">character breaks</a>
490      * for the {@linkplain Locale#getDefault() default locale}.
491      * @return A break iterator for character breaks
492      */
getCharacterInstance()493     public static BreakIterator getCharacterInstance()
494     {
495         return getCharacterInstance(Locale.getDefault());
496     }
497 
498     /**
499      * Returns a new <code>BreakIterator</code> instance
500      * for <a href="BreakIterator.html#character">character breaks</a>
501      * for the given locale.
502      * @param locale the desired locale
503      * @return A break iterator for character breaks
504      * @exception NullPointerException if <code>locale</code> is null
505      */
getCharacterInstance(Locale locale)506     public static BreakIterator getCharacterInstance(Locale locale)
507     {
508         return getBreakInstance(locale, CHARACTER_INDEX);
509     }
510 
511     /**
512      * Returns a new <code>BreakIterator</code> instance
513      * for <a href="BreakIterator.html#sentence">sentence breaks</a>
514      * for the {@linkplain Locale#getDefault() default locale}.
515      * @return A break iterator for sentence breaks
516      */
getSentenceInstance()517     public static BreakIterator getSentenceInstance()
518     {
519         return getSentenceInstance(Locale.getDefault());
520     }
521 
522     /**
523      * Returns a new <code>BreakIterator</code> instance
524      * for <a href="BreakIterator.html#sentence">sentence breaks</a>
525      * for the given locale.
526      * @param locale the desired locale
527      * @return A break iterator for sentence breaks
528      * @exception NullPointerException if <code>locale</code> is null
529      */
getSentenceInstance(Locale locale)530     public static BreakIterator getSentenceInstance(Locale locale)
531     {
532         return getBreakInstance(locale, SENTENCE_INDEX);
533     }
534 
getBreakInstance(Locale locale, int type)535     private static BreakIterator getBreakInstance(Locale locale, int type) {
536         if (iterCache[type] != null) {
537             BreakIteratorCache cache = iterCache[type].get();
538             if (cache != null) {
539                 if (cache.getLocale().equals(locale)) {
540                     return cache.createBreakInstance();
541                 }
542             }
543         }
544 
545         BreakIterator result = createBreakInstance(locale, type);
546         BreakIteratorCache cache = new BreakIteratorCache(locale, result);
547         iterCache[type] = new SoftReference<>(cache);
548         return result;
549     }
550 
createBreakInstance(Locale locale, int type)551     private static BreakIterator createBreakInstance(Locale locale,
552                                                      int type) {
553         LocaleProviderAdapter adapter = LocaleProviderAdapter.getAdapter(BreakIteratorProvider.class, locale);
554         BreakIterator iterator = createBreakInstance(adapter, locale, type);
555         if (iterator == null) {
556             iterator = createBreakInstance(LocaleProviderAdapter.forJRE(), locale, type);
557         }
558         return iterator;
559     }
560 
createBreakInstance(LocaleProviderAdapter adapter, Locale locale, int type)561     private static BreakIterator createBreakInstance(LocaleProviderAdapter adapter, Locale locale, int type) {
562         BreakIteratorProvider breakIteratorProvider = adapter.getBreakIteratorProvider();
563         BreakIterator iterator = null;
564         switch (type) {
565         case CHARACTER_INDEX:
566             iterator = breakIteratorProvider.getCharacterInstance(locale);
567             break;
568         case WORD_INDEX:
569             iterator = breakIteratorProvider.getWordInstance(locale);
570             break;
571         case LINE_INDEX:
572             iterator = breakIteratorProvider.getLineInstance(locale);
573             break;
574         case SENTENCE_INDEX:
575             iterator = breakIteratorProvider.getSentenceInstance(locale);
576             break;
577         }
578         return iterator;
579     }
580 
581     /**
582      * Returns an array of all locales for which the
583      * <code>get*Instance</code> methods of this class can return
584      * localized instances.
585      * The returned array represents the union of locales supported by the Java
586      * runtime and by installed
587      * {@link java.text.spi.BreakIteratorProvider BreakIteratorProvider} implementations.
588      * It must contain at least a <code>Locale</code>
589      * instance equal to {@link java.util.Locale#US Locale.US}.
590      *
591      * @return An array of locales for which localized
592      *         <code>BreakIterator</code> instances are available.
593      */
getAvailableLocales()594     public static synchronized Locale[] getAvailableLocales()
595     {
596         LocaleServiceProviderPool pool =
597             LocaleServiceProviderPool.getPool(BreakIteratorProvider.class);
598         return pool.getAvailableLocales();
599     }
600 
601     private static final class BreakIteratorCache {
602 
603         private BreakIterator iter;
604         private Locale locale;
605 
BreakIteratorCache(Locale locale, BreakIterator iter)606         BreakIteratorCache(Locale locale, BreakIterator iter) {
607             this.locale = locale;
608             this.iter = (BreakIterator) iter.clone();
609         }
610 
getLocale()611         Locale getLocale() {
612             return locale;
613         }
614 
createBreakInstance()615         BreakIterator createBreakInstance() {
616             return (BreakIterator) iter.clone();
617         }
618     }
619 }
620