1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
5 * You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7 /* Classes to iterate over grapheme, word, sentence, or line. */
8
9 #include "mozilla/intl/Segmenter.h"
10
11 #include "mozilla/intl/LineBreaker.h"
12 #include "mozilla/intl/WordBreaker.h"
13 #include "mozilla/intl/UnicodeProperties.h"
14 #include "nsUnicodeProperties.h"
15 #include "nsCharTraits.h"
16
17 using namespace mozilla::unicode;
18
19 namespace mozilla::intl {
20
SegmentIteratorUtf16(Span<const char16_t> aText)21 SegmentIteratorUtf16::SegmentIteratorUtf16(Span<const char16_t> aText)
22 : mText(aText) {}
23
Seek(uint32_t aPos)24 Maybe<uint32_t> SegmentIteratorUtf16::Seek(uint32_t aPos) {
25 if (mPos < aPos) {
26 mPos = aPos;
27 }
28 return Next();
29 }
30
LineBreakIteratorUtf16(Span<const char16_t> aText,const LineBreakOptions & aOptions)31 LineBreakIteratorUtf16::LineBreakIteratorUtf16(Span<const char16_t> aText,
32 const LineBreakOptions& aOptions)
33 : SegmentIteratorUtf16(aText), mOptions(aOptions) {}
34
Next()35 Maybe<uint32_t> LineBreakIteratorUtf16::Next() {
36 const int32_t nextPos =
37 LineBreaker::Next(mText.Elements(), mText.Length(), mPos);
38 if (nextPos == NS_LINEBREAKER_NEED_MORE_TEXT) {
39 return Nothing();
40 }
41 mPos = nextPos;
42 return Some(mPos);
43 }
44
WordBreakIteratorUtf16(Span<const char16_t> aText)45 WordBreakIteratorUtf16::WordBreakIteratorUtf16(Span<const char16_t> aText)
46 : SegmentIteratorUtf16(aText) {}
47
Next()48 Maybe<uint32_t> WordBreakIteratorUtf16::Next() {
49 const int32_t nextPos =
50 WordBreaker::Next(mText.Elements(), mText.Length(), mPos);
51 if (nextPos == NS_WORDBREAKER_NEED_MORE_TEXT) {
52 return Nothing();
53 }
54 mPos = nextPos;
55 return Some(mPos);
56 }
57
GraphemeClusterBreakIteratorUtf16(Span<const char16_t> aText)58 GraphemeClusterBreakIteratorUtf16::GraphemeClusterBreakIteratorUtf16(
59 Span<const char16_t> aText)
60 : SegmentIteratorUtf16(aText) {}
61
62 enum HSType {
63 HST_NONE = U_HST_NOT_APPLICABLE,
64 HST_L = U_HST_LEADING_JAMO,
65 HST_V = U_HST_VOWEL_JAMO,
66 HST_T = U_HST_TRAILING_JAMO,
67 HST_LV = U_HST_LV_SYLLABLE,
68 HST_LVT = U_HST_LVT_SYLLABLE
69 };
70
GetHangulSyllableType(uint32_t aCh)71 static HSType GetHangulSyllableType(uint32_t aCh) {
72 return HSType(UnicodeProperties::GetIntPropertyValue(
73 aCh, UnicodeProperties::IntProperty::HangulSyllableType));
74 }
75
Next()76 Maybe<uint32_t> GraphemeClusterBreakIteratorUtf16::Next() {
77 const auto len = mText.Length();
78 if (mPos >= len) {
79 // The iterator has already reached the end.
80 return Nothing();
81 }
82
83 uint32_t ch = mText[mPos++];
84
85 if (mPos < len && NS_IS_SURROGATE_PAIR(ch, mText[mPos])) {
86 ch = SURROGATE_TO_UCS4(ch, mText[mPos++]);
87 } else if ((ch & ~0xff) == 0x1100 || (ch >= 0xa960 && ch <= 0xa97f) ||
88 (ch >= 0xac00 && ch <= 0xd7ff)) {
89 // Handle conjoining Jamo that make Hangul syllables
90 HSType hangulState = GetHangulSyllableType(ch);
91 while (mPos < len) {
92 ch = mText[mPos];
93 HSType hangulType = GetHangulSyllableType(ch);
94 switch (hangulType) {
95 case HST_L:
96 case HST_LV:
97 case HST_LVT:
98 if (hangulState == HST_L) {
99 hangulState = hangulType;
100 mPos++;
101 continue;
102 }
103 break;
104 case HST_V:
105 if ((hangulState != HST_NONE) && (hangulState != HST_T) &&
106 (hangulState != HST_LVT)) {
107 hangulState = hangulType;
108 mPos++;
109 continue;
110 }
111 break;
112 case HST_T:
113 if (hangulState != HST_NONE && hangulState != HST_L) {
114 hangulState = hangulType;
115 mPos++;
116 continue;
117 }
118 break;
119 default:
120 break;
121 }
122 break;
123 }
124 }
125
126 const uint32_t kVS16 = 0xfe0f;
127 const uint32_t kZWJ = 0x200d;
128 // UTF-16 surrogate values for Fitzpatrick type modifiers
129 const uint32_t kFitzpatrickHigh = 0xD83C;
130 const uint32_t kFitzpatrickLowFirst = 0xDFFB;
131 const uint32_t kFitzpatrickLowLast = 0xDFFF;
132
133 bool baseIsEmoji = (GetEmojiPresentation(ch) == EmojiDefault) ||
134 (GetEmojiPresentation(ch) == TextDefault &&
135 ((mPos < len && mText[mPos] == kVS16) ||
136 (mPos + 1 < len && mText[mPos] == kFitzpatrickHigh &&
137 mText[mPos + 1] >= kFitzpatrickLowFirst &&
138 mText[mPos + 1] <= kFitzpatrickLowLast)));
139 bool prevWasZwj = false;
140
141 while (mPos < len) {
142 ch = mText[mPos];
143 size_t chLen = 1;
144
145 // Check for surrogate pairs; note that isolated surrogates will just
146 // be treated as generic (non-cluster-extending) characters here,
147 // which is fine for cluster-iterating purposes
148 if (mPos < len - 1 && NS_IS_SURROGATE_PAIR(ch, mText[mPos + 1])) {
149 ch = SURROGATE_TO_UCS4(ch, mText[mPos + 1]);
150 chLen = 2;
151 }
152
153 bool extendCluster =
154 IsClusterExtender(ch) ||
155 (baseIsEmoji && prevWasZwj &&
156 ((GetEmojiPresentation(ch) == EmojiDefault) ||
157 (GetEmojiPresentation(ch) == TextDefault && mPos + chLen < len &&
158 mText[mPos + chLen] == kVS16)));
159 if (!extendCluster) {
160 break;
161 }
162
163 prevWasZwj = (ch == kZWJ);
164 mPos += chLen;
165 }
166
167 MOZ_ASSERT(mPos <= len, "Next() has overshot the string!");
168 return Some(mPos);
169 }
170
171 GraphemeClusterBreakReverseIteratorUtf16::
GraphemeClusterBreakReverseIteratorUtf16(Span<const char16_t> aText)172 GraphemeClusterBreakReverseIteratorUtf16(Span<const char16_t> aText)
173 : SegmentIteratorUtf16(aText) {
174 mPos = mText.Length();
175 }
176
Next()177 Maybe<uint32_t> GraphemeClusterBreakReverseIteratorUtf16::Next() {
178 if (mPos == 0) {
179 return Nothing();
180 }
181
182 uint32_t ch;
183 do {
184 ch = mText[--mPos];
185
186 if (mPos > 0 && NS_IS_SURROGATE_PAIR(mText[mPos - 1], ch)) {
187 ch = SURROGATE_TO_UCS4(mText[--mPos], ch);
188 }
189
190 if (!IsClusterExtender(ch)) {
191 break;
192 }
193 } while (mPos > 0);
194
195 // XXX May need to handle conjoining Jamo
196
197 return Some(mPos);
198 }
199
Seek(uint32_t aPos)200 Maybe<uint32_t> GraphemeClusterBreakReverseIteratorUtf16::Seek(uint32_t aPos) {
201 if (mPos > aPos) {
202 mPos = aPos;
203 }
204 return Next();
205 }
206
TryCreate(Span<const char> aLocale,const SegmenterOptions & aOptions)207 Result<UniquePtr<Segmenter>, ICUError> Segmenter::TryCreate(
208 Span<const char> aLocale, const SegmenterOptions& aOptions) {
209 if (aOptions.mGranularity == SegmenterGranularity::Sentence) {
210 // Grapheme and Sentence iterator are not yet implemented.
211 return Err(ICUError::InternalError);
212 }
213 return MakeUnique<Segmenter>(aLocale, aOptions);
214 }
215
Segment(Span<const char16_t> aText) const216 UniquePtr<SegmentIteratorUtf16> Segmenter::Segment(
217 Span<const char16_t> aText) const {
218 switch (mOptions.mGranularity) {
219 case SegmenterGranularity::Grapheme:
220 return MakeUnique<GraphemeClusterBreakIteratorUtf16>(aText);
221 case SegmenterGranularity::Sentence:
222 MOZ_ASSERT_UNREACHABLE("Unimplemented yet!");
223 return nullptr;
224 case SegmenterGranularity::Word:
225 return MakeUnique<WordBreakIteratorUtf16>(aText);
226 case SegmenterGranularity::Line:
227 return MakeUnique<LineBreakIteratorUtf16>(aText);
228 }
229 MOZ_ASSERT_UNREACHABLE("All granularities must be handled!");
230 return nullptr;
231 }
232
233 } // namespace mozilla::intl
234