1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this file,
5  * You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 /* Classes to iterate over grapheme, word, sentence, or line. */
8 
9 #include "mozilla/intl/Segmenter.h"
10 
11 #include "mozilla/intl/LineBreaker.h"
12 #include "mozilla/intl/WordBreaker.h"
13 #include "mozilla/intl/UnicodeProperties.h"
14 #include "nsUnicodeProperties.h"
15 #include "nsCharTraits.h"
16 
17 using namespace mozilla::unicode;
18 
19 namespace mozilla::intl {
20 
// Constructs the base iterator over aText. The span is kept in mText, which
// the Next() implementations of the concrete iterators in this file read from.
SegmentIteratorUtf16::SegmentIteratorUtf16(Span<const char16_t> aText)
    : mText(aText) {}
23 
Seek(uint32_t aPos)24 Maybe<uint32_t> SegmentIteratorUtf16::Seek(uint32_t aPos) {
25   if (mPos < aPos) {
26     mPos = aPos;
27   }
28   return Next();
29 }
30 
// Constructs a line-break iterator over aText and stores a copy of aOptions
// in mOptions.
// NOTE(review): the Next() implementation visible in this file does not pass
// mOptions on to LineBreaker::Next() -- confirm whether that is intended.
LineBreakIteratorUtf16::LineBreakIteratorUtf16(Span<const char16_t> aText,
                                               const LineBreakOptions& aOptions)
    : SegmentIteratorUtf16(aText), mOptions(aOptions) {}
34 
Next()35 Maybe<uint32_t> LineBreakIteratorUtf16::Next() {
36   const int32_t nextPos =
37       LineBreaker::Next(mText.Elements(), mText.Length(), mPos);
38   if (nextPos == NS_LINEBREAKER_NEED_MORE_TEXT) {
39     return Nothing();
40   }
41   mPos = nextPos;
42   return Some(mPos);
43 }
44 
// Constructs a word-break iterator over aText; all state lives in the
// SegmentIteratorUtf16 base.
WordBreakIteratorUtf16::WordBreakIteratorUtf16(Span<const char16_t> aText)
    : SegmentIteratorUtf16(aText) {}
47 
Next()48 Maybe<uint32_t> WordBreakIteratorUtf16::Next() {
49   const int32_t nextPos =
50       WordBreaker::Next(mText.Elements(), mText.Length(), mPos);
51   if (nextPos == NS_WORDBREAKER_NEED_MORE_TEXT) {
52     return Nothing();
53   }
54   mPos = nextPos;
55   return Some(mPos);
56 }
57 
// Constructs a forward grapheme-cluster iterator over aText; all state lives
// in the SegmentIteratorUtf16 base.
GraphemeClusterBreakIteratorUtf16::GraphemeClusterBreakIteratorUtf16(
    Span<const char16_t> aText)
    : SegmentIteratorUtf16(aText) {}
61 
// Hangul syllable types, as short aliases for the corresponding U_HST_*
// constants. GraphemeClusterBreakIteratorUtf16::Next() uses them as the states
// of its conjoining-Jamo state machine.
enum HSType {
  HST_NONE = U_HST_NOT_APPLICABLE,  // not part of a Hangul syllable
  HST_L = U_HST_LEADING_JAMO,       // leading Jamo
  HST_V = U_HST_VOWEL_JAMO,         // vowel Jamo
  HST_T = U_HST_TRAILING_JAMO,      // trailing Jamo
  HST_LV = U_HST_LV_SYLLABLE,       // LV syllable
  HST_LVT = U_HST_LVT_SYLLABLE      // LVT syllable
};
70 
GetHangulSyllableType(uint32_t aCh)71 static HSType GetHangulSyllableType(uint32_t aCh) {
72   return HSType(UnicodeProperties::GetIntPropertyValue(
73       aCh, UnicodeProperties::IntProperty::HangulSyllableType));
74 }
75 
// Advances the iterator past one grapheme cluster starting at mPos.
//
// Returns Some(mPos) with mPos updated to the offset (in UTF-16 code units)
// just past the cluster, or Nothing() if mPos was already at or beyond the end
// of the text.
//
// The cluster is built in three phases:
//   1. consume the base (one code unit, a surrogate pair, or a run of
//      conjoining Hangul Jamo / syllables);
//   2. decide whether the base should be treated as an emoji;
//   3. consume any following cluster extenders and, for emoji bases, emoji
//      joined by ZWJ.
Maybe<uint32_t> GraphemeClusterBreakIteratorUtf16::Next() {
  const auto len = mText.Length();
  if (mPos >= len) {
    // The iterator has already reached the end.
    return Nothing();
  }

  // The first code unit is always part of the cluster.
  uint32_t ch = mText[mPos++];

  if (mPos < len && NS_IS_SURROGATE_PAIR(ch, mText[mPos])) {
    // Supplementary-plane base: fold the pair into a single code point and
    // consume the low surrogate too.
    ch = SURROGATE_TO_UCS4(ch, mText[mPos++]);
  } else if ((ch & ~0xff) == 0x1100 || (ch >= 0xa960 && ch <= 0xa97f) ||
             (ch >= 0xac00 && ch <= 0xd7ff)) {
    // The range test above covers U+1100..U+11FF, U+A960..U+A97F and
    // U+AC00..U+D7FF.
    // Handle conjoining Jamo that make Hangul syllables
    //
    // hangulState is the type of the last consumed unit; each following unit
    // is consumed only if its type may legally follow that state:
    //   L / LV / LVT  after  L
    //   V             after  L, V, LV
    //   T             after  V, T, LV, LVT
    // NOTE(review): inside the loop `ch` is overwritten with the look-ahead
    // unit before it is known whether that unit will be consumed, so when the
    // loop stops on a non-matching unit `ch` holds that unconsumed unit rather
    // than the cluster base when baseIsEmoji is computed below -- confirm this
    // is harmless.
    HSType hangulState = GetHangulSyllableType(ch);
    while (mPos < len) {
      ch = mText[mPos];
      HSType hangulType = GetHangulSyllableType(ch);
      switch (hangulType) {
        case HST_L:
        case HST_LV:
        case HST_LVT:
          if (hangulState == HST_L) {
            hangulState = hangulType;
            mPos++;
            continue;
          }
          break;
        case HST_V:
          if ((hangulState != HST_NONE) && (hangulState != HST_T) &&
              (hangulState != HST_LVT)) {
            hangulState = hangulType;
            mPos++;
            continue;
          }
          break;
        case HST_T:
          if (hangulState != HST_NONE && hangulState != HST_L) {
            hangulState = hangulType;
            mPos++;
            continue;
          }
          break;
        default:
          break;
      }
      // Reached only when the look-ahead unit was not consumed (every
      // consuming path above uses `continue`): the Hangul run ends here.
      break;
    }
  }

  const uint32_t kVS16 = 0xfe0f;
  const uint32_t kZWJ = 0x200d;
  // UTF-16 surrogate values for Fitzpatrick type modifiers
  const uint32_t kFitzpatrickHigh = 0xD83C;
  const uint32_t kFitzpatrickLowFirst = 0xDFFB;
  const uint32_t kFitzpatrickLowLast = 0xDFFF;

  // The base counts as an emoji if its presentation is EmojiDefault, or if it
  // is TextDefault and is immediately followed by VS16 or by a Fitzpatrick
  // modifier (encoded as the surrogate pair D83C DFFB..DFFF).
  bool baseIsEmoji = (GetEmojiPresentation(ch) == EmojiDefault) ||
                     (GetEmojiPresentation(ch) == TextDefault &&
                      ((mPos < len && mText[mPos] == kVS16) ||
                       (mPos + 1 < len && mText[mPos] == kFitzpatrickHigh &&
                        mText[mPos + 1] >= kFitzpatrickLowFirst &&
                        mText[mPos + 1] <= kFitzpatrickLowLast)));
  bool prevWasZwj = false;

  // Extend the cluster over every following code point that either is a
  // cluster extender or, for an emoji base, is an emoji directly preceded by
  // a ZWJ.
  while (mPos < len) {
    ch = mText[mPos];
    size_t chLen = 1;

    // Check for surrogate pairs; note that isolated surrogates will just
    // be treated as generic (non-cluster-extending) characters here,
    // which is fine for cluster-iterating purposes
    // (`len - 1` cannot wrap: the loop condition guarantees len > mPos >= 0.)
    if (mPos < len - 1 && NS_IS_SURROGATE_PAIR(ch, mText[mPos + 1])) {
      ch = SURROGATE_TO_UCS4(ch, mText[mPos + 1]);
      chLen = 2;
    }

    // An emoji after a ZWJ qualifies if it is EmojiDefault, or TextDefault
    // and itself followed by VS16.
    bool extendCluster =
        IsClusterExtender(ch) ||
        (baseIsEmoji && prevWasZwj &&
         ((GetEmojiPresentation(ch) == EmojiDefault) ||
          (GetEmojiPresentation(ch) == TextDefault && mPos + chLen < len &&
           mText[mPos + chLen] == kVS16)));
    if (!extendCluster) {
      break;
    }

    // Remember whether the code point just accepted was a ZWJ, so the next
    // iteration knows whether an emoji may join the cluster.
    prevWasZwj = (ch == kZWJ);
    mPos += chLen;
  }

  MOZ_ASSERT(mPos <= len, "Next() has overshot the string!");
  return Some(mPos);
}
170 
// Constructs a reverse grapheme-cluster iterator over aText. Unlike the
// forward iterators, it starts at the end of the text and Next() walks
// towards offset 0.
GraphemeClusterBreakReverseIteratorUtf16::
    GraphemeClusterBreakReverseIteratorUtf16(Span<const char16_t> aText)
    : SegmentIteratorUtf16(aText) {
  // Start at the end of the text.
  mPos = mText.Length();
}
176 
Next()177 Maybe<uint32_t> GraphemeClusterBreakReverseIteratorUtf16::Next() {
178   if (mPos == 0) {
179     return Nothing();
180   }
181 
182   uint32_t ch;
183   do {
184     ch = mText[--mPos];
185 
186     if (mPos > 0 && NS_IS_SURROGATE_PAIR(mText[mPos - 1], ch)) {
187       ch = SURROGATE_TO_UCS4(mText[--mPos], ch);
188     }
189 
190     if (!IsClusterExtender(ch)) {
191       break;
192     }
193   } while (mPos > 0);
194 
195   // XXX May need to handle conjoining Jamo
196 
197   return Some(mPos);
198 }
199 
Seek(uint32_t aPos)200 Maybe<uint32_t> GraphemeClusterBreakReverseIteratorUtf16::Seek(uint32_t aPos) {
201   if (mPos > aPos) {
202     mPos = aPos;
203   }
204   return Next();
205 }
206 
TryCreate(Span<const char> aLocale,const SegmenterOptions & aOptions)207 Result<UniquePtr<Segmenter>, ICUError> Segmenter::TryCreate(
208     Span<const char> aLocale, const SegmenterOptions& aOptions) {
209   if (aOptions.mGranularity == SegmenterGranularity::Sentence) {
210     // Grapheme and Sentence iterator are not yet implemented.
211     return Err(ICUError::InternalError);
212   }
213   return MakeUnique<Segmenter>(aLocale, aOptions);
214 }
215 
Segment(Span<const char16_t> aText) const216 UniquePtr<SegmentIteratorUtf16> Segmenter::Segment(
217     Span<const char16_t> aText) const {
218   switch (mOptions.mGranularity) {
219     case SegmenterGranularity::Grapheme:
220       return MakeUnique<GraphemeClusterBreakIteratorUtf16>(aText);
221     case SegmenterGranularity::Sentence:
222       MOZ_ASSERT_UNREACHABLE("Unimplemented yet!");
223       return nullptr;
224     case SegmenterGranularity::Word:
225       return MakeUnique<WordBreakIteratorUtf16>(aText);
226     case SegmenterGranularity::Line:
227       return MakeUnique<LineBreakIteratorUtf16>(aText);
228   }
229   MOZ_ASSERT_UNREACHABLE("All granularities must be handled!");
230   return nullptr;
231 }
232 
233 }  // namespace mozilla::intl
234