1 /*
2 * Copyright (C) 2006 Lars Knoll <lars@trolltech.com>
3 * Copyright (C) 2007 Apple Inc. All rights reserved.
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 *
20 */
21
22 #include "config.h"
23 #include "TextBreakIterator.h"
24
25 #include "PlatformString.h"
26 #include "TextBreakIteratorInternalICU.h"
27 #include <unicode/ubrk.h>
28 #include <wtf/Assertions.h>
29
30 using namespace std;
31
32 namespace WebCore {
33
setUpIterator(bool & createdIterator,TextBreakIterator * & iterator,UBreakIteratorType type,const UChar * string,int length)34 static TextBreakIterator* setUpIterator(bool& createdIterator, TextBreakIterator*& iterator,
35 UBreakIteratorType type, const UChar* string, int length)
36 {
37 if (!string)
38 return 0;
39
40 if (!createdIterator) {
41 UErrorCode openStatus = U_ZERO_ERROR;
42 iterator = reinterpret_cast<TextBreakIterator*>(ubrk_open(type, currentTextBreakLocaleID(), 0, 0, &openStatus));
43 createdIterator = true;
44 ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
45 }
46 if (!iterator)
47 return 0;
48
49 UErrorCode setTextStatus = U_ZERO_ERROR;
50 ubrk_setText(reinterpret_cast<UBreakIterator*>(iterator), string, length, &setTextStatus);
51 if (U_FAILURE(setTextStatus))
52 return 0;
53
54 return iterator;
55 }
56
characterBreakIterator(const UChar * string,int length)57 TextBreakIterator* characterBreakIterator(const UChar* string, int length)
58 {
59 static bool createdCharacterBreakIterator = false;
60 static TextBreakIterator* staticCharacterBreakIterator;
61 return setUpIterator(createdCharacterBreakIterator,
62 staticCharacterBreakIterator, UBRK_CHARACTER, string, length);
63 }
64
wordBreakIterator(const UChar * string,int length)65 TextBreakIterator* wordBreakIterator(const UChar* string, int length)
66 {
67 static bool createdWordBreakIterator = false;
68 static TextBreakIterator* staticWordBreakIterator;
69 return setUpIterator(createdWordBreakIterator,
70 staticWordBreakIterator, UBRK_WORD, string, length);
71 }
72
73 static bool createdLineBreakIterator = false;
74 static TextBreakIterator* staticLineBreakIterator;
75
acquireLineBreakIterator(const UChar * string,int length)76 TextBreakIterator* acquireLineBreakIterator(const UChar* string, int length)
77 {
78 TextBreakIterator* lineBreakIterator = 0;
79 if (!createdLineBreakIterator || staticLineBreakIterator) {
80 setUpIterator(createdLineBreakIterator, staticLineBreakIterator, UBRK_LINE, string, length);
81 swap(staticLineBreakIterator, lineBreakIterator);
82 }
83
84 if (!lineBreakIterator) {
85 bool createdNewLineBreakIterator = false;
86 setUpIterator(createdNewLineBreakIterator, lineBreakIterator, UBRK_LINE, string, length);
87 }
88
89 return lineBreakIterator;
90 }
91
releaseLineBreakIterator(TextBreakIterator * iterator)92 void releaseLineBreakIterator(TextBreakIterator* iterator)
93 {
94 ASSERT(createdLineBreakIterator);
95 ASSERT(iterator);
96
97 if (!staticLineBreakIterator)
98 staticLineBreakIterator = iterator;
99 else
100 ubrk_close(reinterpret_cast<UBreakIterator*>(iterator));
101 }
102
sentenceBreakIterator(const UChar * string,int length)103 TextBreakIterator* sentenceBreakIterator(const UChar* string, int length)
104 {
105 static bool createdSentenceBreakIterator = false;
106 static TextBreakIterator* staticSentenceBreakIterator;
107 return setUpIterator(createdSentenceBreakIterator,
108 staticSentenceBreakIterator, UBRK_SENTENCE, string, length);
109 }
110
textBreakFirst(TextBreakIterator * iterator)111 int textBreakFirst(TextBreakIterator* iterator)
112 {
113 return ubrk_first(reinterpret_cast<UBreakIterator*>(iterator));
114 }
115
textBreakLast(TextBreakIterator * iterator)116 int textBreakLast(TextBreakIterator* iterator)
117 {
118 return ubrk_last(reinterpret_cast<UBreakIterator*>(iterator));
119 }
120
textBreakNext(TextBreakIterator * iterator)121 int textBreakNext(TextBreakIterator* iterator)
122 {
123 return ubrk_next(reinterpret_cast<UBreakIterator*>(iterator));
124 }
125
textBreakPrevious(TextBreakIterator * iterator)126 int textBreakPrevious(TextBreakIterator* iterator)
127 {
128 return ubrk_previous(reinterpret_cast<UBreakIterator*>(iterator));
129 }
130
textBreakPreceding(TextBreakIterator * iterator,int pos)131 int textBreakPreceding(TextBreakIterator* iterator, int pos)
132 {
133 return ubrk_preceding(reinterpret_cast<UBreakIterator*>(iterator), pos);
134 }
135
textBreakFollowing(TextBreakIterator * iterator,int pos)136 int textBreakFollowing(TextBreakIterator* iterator, int pos)
137 {
138 return ubrk_following(reinterpret_cast<UBreakIterator*>(iterator), pos);
139 }
140
textBreakCurrent(TextBreakIterator * iterator)141 int textBreakCurrent(TextBreakIterator* iterator)
142 {
143 return ubrk_current(reinterpret_cast<UBreakIterator*>(iterator));
144 }
145
isTextBreak(TextBreakIterator * iterator,int position)146 bool isTextBreak(TextBreakIterator* iterator, int position)
147 {
148 return ubrk_isBoundary(reinterpret_cast<UBreakIterator*>(iterator), position);
149 }
150
setUpIteratorWithRules(bool & createdIterator,TextBreakIterator * & iterator,const char * breakRules,const UChar * string,int length)151 static TextBreakIterator* setUpIteratorWithRules(bool& createdIterator, TextBreakIterator*& iterator,
152 const char* breakRules, const UChar* string, int length)
153 {
154 if (!string)
155 return 0;
156
157 if (!createdIterator) {
158 UParseError parseStatus;
159 UErrorCode openStatus = U_ZERO_ERROR;
160 String rules(breakRules);
161 iterator = reinterpret_cast<TextBreakIterator*>(ubrk_openRules(rules.characters(), rules.length(), 0, 0, &parseStatus, &openStatus));
162 createdIterator = true;
163 ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
164 }
165 if (!iterator)
166 return 0;
167
168 UErrorCode setTextStatus = U_ZERO_ERROR;
169 ubrk_setText(reinterpret_cast<UBreakIterator*>(iterator), string, length, &setTextStatus);
170 if (U_FAILURE(setTextStatus))
171 return 0;
172
173 return iterator;
174 }
175
cursorMovementIterator(const UChar * string,int length)176 TextBreakIterator* cursorMovementIterator(const UChar* string, int length)
177 {
178 // This rule set is based on character-break iterator rules of ICU 4.0
179 // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>.
180 // The major differences from the original ones are listed below:
181 // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier;
182 // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342);
183 // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and;
184 // * Added rules that prevent a cursor from moving before Japanese half-width katakara voiced marks.
185 static const char* kRules =
186 "$CR = [\\p{Grapheme_Cluster_Break = CR}];"
187 "$LF = [\\p{Grapheme_Cluster_Break = LF}];"
188 "$Control = [\\p{Grapheme_Cluster_Break = Control}];"
189 "$VoiceMarks = [\\uFF9E\\uFF9F];" // Japanese half-width katakana voiced marks
190 "$Extend = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];"
191 "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];"
192 "$L = [\\p{Grapheme_Cluster_Break = L}];"
193 "$V = [\\p{Grapheme_Cluster_Break = V}];"
194 "$T = [\\p{Grapheme_Cluster_Break = T}];"
195 "$LV = [\\p{Grapheme_Cluster_Break = LV}];"
196 "$LVT = [\\p{Grapheme_Cluster_Break = LVT}];"
197 "$Hin0 = [\\u0905-\\u0939];" // Devanagari Letter A,...,Ha
198 "$HinV = \\u094D;" // Devanagari Sign Virama
199 "$Hin1 = [\\u0915-\\u0939];" // Devanagari Letter Ka,...,Ha
200 "$Ben0 = [\\u0985-\\u09B9];" // Bengali Letter A,...,Ha
201 "$BenV = \\u09CD;" // Bengali Sign Virama
202 "$Ben1 = [\\u0995-\\u09B9];" // Bengali Letter Ka,...,Ha
203 "$Pan0 = [\\u0A05-\\u0A39];" // Gurmukhi Letter A,...,Ha
204 "$PanV = \\u0A4D;" // Gurmukhi Sign Virama
205 "$Pan1 = [\\u0A15-\\u0A39];" // Gurmukhi Letter Ka,...,Ha
206 "$Guj0 = [\\u0A85-\\u0AB9];" // Gujarati Letter A,...,Ha
207 "$GujV = \\u0ACD;" // Gujarati Sign Virama
208 "$Guj1 = [\\u0A95-\\u0AB9];" // Gujarati Letter Ka,...,Ha
209 "$Ori0 = [\\u0B05-\\u0B39];" // Oriya Letter A,...,Ha
210 "$OriV = \\u0B4D;" // Oriya Sign Virama
211 "$Ori1 = [\\u0B15-\\u0B39];" // Oriya Letter Ka,...,Ha
212 "$Tel0 = [\\u0C05-\\u0C39];" // Telugu Letter A,...,Ha
213 "$TelV = \\u0C4D;" // Telugu Sign Virama
214 "$Tel1 = [\\u0C14-\\u0C39];" // Telugu Letter Ka,...,Ha
215 "$Kan0 = [\\u0C85-\\u0CB9];" // Kannada Letter A,...,Ha
216 "$KanV = \\u0CCD;" // Kannada Sign Virama
217 "$Kan1 = [\\u0C95-\\u0CB9];" // Kannada Letter A,...,Ha
218 "$Mal0 = [\\u0D05-\\u0D39];" // Malayalam Letter A,...,Ha
219 "$MalV = \\u0D4D;" // Malayalam Sign Virama
220 "$Mal1 = [\\u0D15-\\u0D39];" // Malayalam Letter A,...,Ha
221 "!!chain;"
222 "!!forward;"
223 "$CR $LF;"
224 "$L ($L | $V | $LV | $LVT);"
225 "($LV | $V) ($V | $T);"
226 "($LVT | $T) $T;"
227 "[^$Control $CR $LF] $Extend;"
228 "[^$Control $CR $LF] $SpacingMark;"
229 "$Hin0 $HinV $Hin1;" // Devanagari Virama (forward)
230 "$Ben0 $BenV $Ben1;" // Bengali Virama (forward)
231 "$Pan0 $PanV $Pan1;" // Gurmukhi Virama (forward)
232 "$Guj0 $GujV $Guj1;" // Gujarati Virama (forward)
233 "$Ori0 $OriV $Ori1;" // Oriya Virama (forward)
234 "$Tel0 $TelV $Tel1;" // Telugu Virama (forward)
235 "$Kan0 $KanV $Kan1;" // Kannada Virama (forward)
236 "$Mal0 $MalV $Mal1;" // Malayalam Virama (forward)
237 "!!reverse;"
238 "$LF $CR;"
239 "($L | $V | $LV | $LVT) $L;"
240 "($V | $T) ($LV | $V);"
241 "$T ($LVT | $T);"
242 "$Extend [^$Control $CR $LF];"
243 "$SpacingMark [^$Control $CR $LF];"
244 "$Hin1 $HinV $Hin0;" // Devanagari Virama (backward)
245 "$Ben1 $BenV $Ben0;" // Bengali Virama (backward)
246 "$Pan1 $PanV $Pan0;" // Gurmukhi Virama (backward)
247 "$Guj1 $GujV $Guj0;" // Gujarati Virama (backward)
248 "$Ori1 $OriV $Ori0;" // Gujarati Virama (backward)
249 "$Tel1 $TelV $Tel0;" // Telugu Virama (backward)
250 "$Kan1 $KanV $Kan0;" // Kannada Virama (backward)
251 "$Mal1 $MalV $Mal0;" // Malayalam Virama (backward)
252 "!!safe_reverse;"
253 "!!safe_forward;";
254 static bool createdCursorMovementIterator = false;
255 static TextBreakIterator* staticCursorMovementIterator;
256 return setUpIteratorWithRules(createdCursorMovementIterator, staticCursorMovementIterator, kRules, string, length);
257 }
258
259 }
260