1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3  * License, v. 2.0. If a copy of the MPL was not distributed with this
4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 
6 #include "mozInlineSpellWordUtil.h"
7 
8 #include <algorithm>
9 #include <utility>
10 
11 #include "mozilla/BinarySearch.h"
12 #include "mozilla/EditorBase.h"
13 #include "mozilla/HTMLEditor.h"
14 #include "mozilla/Logging.h"
15 #include "mozilla/dom/Element.h"
16 
17 #include "nsDebug.h"
18 #include "nsAtom.h"
19 #include "nsComponentManagerUtils.h"
20 #include "nsUnicodeProperties.h"
21 #include "nsServiceManagerUtils.h"
22 #include "nsIContent.h"
23 #include "nsTextFragment.h"
24 #include "nsRange.h"
25 #include "nsContentUtils.h"
26 #include "nsIFrame.h"
27 
28 using namespace mozilla;
29 
30 static LazyLogModule sInlineSpellWordUtilLog{"InlineSpellWordUtil"};
31 
32 // IsIgnorableCharacter
33 //
34 //    These characters are ones that we should ignore in input.
35 
// 8-bit overload: SOFT HYPHEN (byte 0xAD) is the only ignorable character
// tested for single-byte text.
inline bool IsIgnorableCharacter(char ch) {
  constexpr char kSoftHyphen = static_cast<char>(0xAD);
  return ch == kSoftHyphen;
}
39 
// UTF-16 overload: returns true for the code units that are dropped from
// words before they are handed to the spellchecker.
inline bool IsIgnorableCharacter(char16_t ch) {
  switch (ch) {
    case 0x00AD:  // SOFT HYPHEN
    case 0x1806:  // MONGOLIAN TODO SOFT HYPHEN
      return true;
    default:
      return false;
  }
}
44 
45 // IsConditionalPunctuation
46 //
47 //    Some characters (like apostrophes) require characters on each side to be
48 //    part of a word, and are otherwise punctuation.
49 
// 8-bit overload: the conditional punctuation characters that can occur in
// single-byte text.
inline bool IsConditionalPunctuation(char ch) {
  if (ch == '\'') {  // APOSTROPHE
    return true;
  }
  constexpr char kMiddleDot = static_cast<char>(0xB7);  // MIDDLE DOT
  return ch == kMiddleDot;
}
54 
// UTF-16 overload: characters that only belong to a word when they have word
// characters on both sides.
inline bool IsConditionalPunctuation(char16_t ch) {
  switch (ch) {
    case '\'':    // APOSTROPHE
    case 0x2019:  // RIGHT SINGLE QUOTATION MARK
    case 0x00B7:  // MIDDLE DOT
      return true;
    default:
      return false;
  }
}
59 
IsAmbiguousDOMWordSeprator(char16_t ch)60 static bool IsAmbiguousDOMWordSeprator(char16_t ch) {
61   // This class may be CHAR_CLASS_SEPARATOR, but it depends on context.
62   return (ch == '@' || ch == ':' || ch == '.' || ch == '/' || ch == '-' ||
63           IsConditionalPunctuation(ch));
64 }
65 
// 8-bit overload: widens the byte and defers to the char16_t overload, so both
// overloads agree on which characters are context-dependent separators.
static bool IsAmbiguousDOMWordSeprator(char ch) {
  // This class may be CHAR_CLASS_SEPARATOR, but it depends on context.
  //
  // Widen through unsigned char first. Where plain char is signed, casting
  // straight to char16_t sign-extends bytes >= 0x80, so MIDDLE DOT (0xB7)
  // would turn into 0xFFB7 and never match the 0x00B7 test in the char16_t
  // overload.
  return IsAmbiguousDOMWordSeprator(
      static_cast<char16_t>(static_cast<unsigned char>(ch)));
}
70 
71 // IsDOMWordSeparator
72 //
73 //    Determines if the given character should be considered as a DOM Word
74 //    separator. Basically, this is whitespace, although it could also have
75 //    certain punctuation that we know ALWAYS breaks words. This is important.
76 //    For example, we can't have any punctuation that could appear in a URL
77 //    or email address in this, because those need to always fit into a single
78 //    DOM word.
79 
// 8-bit overload: simple ASCII whitespace plus the no-break space byte.
static bool IsDOMWordSeparator(char ch) {
  switch (ch) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
      return true;
    default:
      // no-break space (0xA0)
      return ch == static_cast<char>(0xA0);
  }
}
85 
// UTF-16 overload: returns true for the whitespace characters that always
// terminate a DOM word.
static bool IsDOMWordSeparator(char16_t ch) {
  switch (ch) {
    // simple spaces
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    // complex (non-ASCII) spaces
    case 0x00A0:  // NO-BREAK SPACE
    case 0x2002:  // EN SPACE
    case 0x2003:  // EM SPACE
    case 0x2009:  // THIN SPACE
    case 0x3000:  // IDEOGRAPHIC SPACE
      return true;
    default:
      // otherwise not a space
      return false;
  }
}
101 
operator ==(const mozilla::RangeBoundary & aRangeBoundary) const102 bool NodeOffset::operator==(
103     const mozilla::RangeBoundary& aRangeBoundary) const {
104   if (aRangeBoundary.Container() != mNode) {
105     return false;
106   }
107 
108   const Maybe<uint32_t> rangeBoundaryOffset =
109       aRangeBoundary.Offset(RangeBoundary::OffsetFilter::kValidOffsets);
110 
111   MOZ_ASSERT(mOffset >= 0);
112   return rangeBoundaryOffset &&
113          (*rangeBoundaryOffset == static_cast<uint32_t>(mOffset));
114 }
115 
operator ==(const nsRange & aRange) const116 bool NodeOffsetRange::operator==(const nsRange& aRange) const {
117   return mBegin == aRange.StartRef() && mEnd == aRange.EndRef();
118 }
119 
120 // static
Create(const EditorBase & aEditorBase)121 Maybe<mozInlineSpellWordUtil> mozInlineSpellWordUtil::Create(
122     const EditorBase& aEditorBase) {
123   dom::Document* document = aEditorBase.GetDocument();
124   if (NS_WARN_IF(!document)) {
125     return Nothing();
126   }
127 
128   const bool isContentEditableOrDesignMode = aEditorBase.IsHTMLEditor();
129 
130   // Find the root node for the editor. For contenteditable the mRootNode could
131   // change to shadow root if the begin and end are inside the shadowDOM.
132   nsINode* rootNode = aEditorBase.GetRoot();
133   if (NS_WARN_IF(!rootNode)) {
134     return Nothing();
135   }
136 
137   mozInlineSpellWordUtil util{*document, isContentEditableOrDesignMode,
138                               *rootNode};
139   return Some(std::move(util));
140 }
141 
IsSpellCheckingTextNode(nsINode * aNode)142 static inline bool IsSpellCheckingTextNode(nsINode* aNode) {
143   nsIContent* parent = aNode->GetParent();
144   if (parent &&
145       parent->IsAnyOfHTMLElements(nsGkAtoms::script, nsGkAtoms::style))
146     return false;
147   return aNode->IsText();
148 }
149 
150 typedef void (*OnLeaveNodeFunPtr)(nsINode* aNode, void* aClosure);
151 
152 // Find the next node in the DOM tree in preorder.
153 // Calls OnLeaveNodeFunPtr when the traversal leaves a node, which is
154 // why we can't just use GetNextNode here, sadly.
FindNextNode(nsINode * aNode,const nsINode * aRoot,OnLeaveNodeFunPtr aOnLeaveNode,void * aClosure)155 static nsINode* FindNextNode(nsINode* aNode, const nsINode* aRoot,
156                              OnLeaveNodeFunPtr aOnLeaveNode, void* aClosure) {
157   MOZ_ASSERT(aNode, "Null starting node?");
158 
159   nsINode* next = aNode->GetFirstChild();
160   if (next) return next;
161 
162   // Don't look at siblings or otherwise outside of aRoot
163   if (aNode == aRoot) return nullptr;
164 
165   next = aNode->GetNextSibling();
166   if (next) return next;
167 
168   // Go up
169   for (;;) {
170     if (aOnLeaveNode) {
171       aOnLeaveNode(aNode, aClosure);
172     }
173 
174     next = aNode->GetParent();
175     if (next == aRoot || !next) return nullptr;
176     aNode = next;
177 
178     next = aNode->GetNextSibling();
179     if (next) return next;
180   }
181 }
182 
183 // aNode is not a text node. Find the first text node starting at aNode/aOffset
184 // in a preorder DOM traversal.
FindNextTextNode(nsINode * aNode,int32_t aOffset,const nsINode * aRoot)185 static nsINode* FindNextTextNode(nsINode* aNode, int32_t aOffset,
186                                  const nsINode* aRoot) {
187   MOZ_ASSERT(aNode, "Null starting node?");
188   MOZ_ASSERT(!IsSpellCheckingTextNode(aNode),
189              "FindNextTextNode should start with a non-text node");
190 
191   nsINode* checkNode;
192   // Need to start at the aOffset'th child
193   nsIContent* child = aNode->GetChildAt_Deprecated(aOffset);
194 
195   if (child) {
196     checkNode = child;
197   } else {
198     // aOffset was beyond the end of the child list.
199     // goto next node after the last descendant of aNode in
200     // a preorder DOM traversal.
201     checkNode = aNode->GetNextNonChildNode(aRoot);
202   }
203 
204   while (checkNode && !IsSpellCheckingTextNode(checkNode)) {
205     checkNode = checkNode->GetNextNode(aRoot);
206   }
207   return checkNode;
208 }
209 
210 // mozInlineSpellWordUtil::SetPositionAndEnd
211 //
212 //    We have two ranges "hard" and "soft". The hard boundary is simply
213 //    the scope of the root node. The soft boundary is that which is set
214 //    by the caller of this class by calling this function. If this function is
215 //    not called, the soft boundary is the same as the hard boundary.
216 //
217 //    When we reach the soft boundary (mSoftText.GetEnd()), we keep
218 //    going until we reach the end of a word. This allows the caller to set the
219 //    end of the range to anything, and we will always check whole multiples of
220 //    words. When we reach the hard boundary we stop no matter what.
221 //
//    There is no beginning soft boundary. This is because we only go to the
//    previous node once, when finding the previous word boundary in
//    SetPositionAndEnd(). You might think of the soft boundary as being this
//    initial position.
226 
SetPositionAndEnd(nsINode * aPositionNode,int32_t aPositionOffset,nsINode * aEndNode,int32_t aEndOffset)227 nsresult mozInlineSpellWordUtil::SetPositionAndEnd(nsINode* aPositionNode,
228                                                    int32_t aPositionOffset,
229                                                    nsINode* aEndNode,
230                                                    int32_t aEndOffset) {
231   MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
232           ("%s: pos=(%p, %i), end=(%p, %i)", __FUNCTION__, aPositionNode,
233            aPositionOffset, aEndNode, aEndOffset));
234 
235   MOZ_ASSERT(aPositionNode, "Null begin node?");
236   MOZ_ASSERT(aEndNode, "Null end node?");
237 
238   MOZ_ASSERT(mRootNode, "Not initialized");
239 
240   // Find a appropriate root if we are dealing with contenteditable nodes which
241   // are in the shadow DOM.
242   if (mIsContentEditableOrDesignMode) {
243     nsINode* rootNode = aPositionNode->SubtreeRoot();
244     if (rootNode != aEndNode->SubtreeRoot()) {
245       return NS_ERROR_FAILURE;
246     }
247 
248     if (mozilla::dom::ShadowRoot::FromNode(rootNode)) {
249       mRootNode = rootNode;
250     }
251   }
252 
253   mSoftText.Invalidate();
254 
255   if (!IsSpellCheckingTextNode(aPositionNode)) {
256     // Start at the start of the first text node after aNode/aOffset.
257     aPositionNode = FindNextTextNode(aPositionNode, aPositionOffset, mRootNode);
258     aPositionOffset = 0;
259   }
260   NodeOffset softBegin = NodeOffset(aPositionNode, aPositionOffset);
261 
262   if (!IsSpellCheckingTextNode(aEndNode)) {
263     // End at the start of the first text node after aEndNode/aEndOffset.
264     aEndNode = FindNextTextNode(aEndNode, aEndOffset, mRootNode);
265     aEndOffset = 0;
266   }
267   NodeOffset softEnd = NodeOffset(aEndNode, aEndOffset);
268 
269   nsresult rv = EnsureWords(std::move(softBegin), std::move(softEnd));
270   if (NS_FAILED(rv)) {
271     return rv;
272   }
273 
274   int32_t textOffset = MapDOMPositionToSoftTextOffset(mSoftText.GetBegin());
275   if (textOffset < 0) {
276     return NS_OK;
277   }
278 
279   mNextWordIndex = FindRealWordContaining(textOffset, HINT_END, true);
280   return NS_OK;
281 }
282 
EnsureWords(NodeOffset aSoftBegin,NodeOffset aSoftEnd)283 nsresult mozInlineSpellWordUtil::EnsureWords(NodeOffset aSoftBegin,
284                                              NodeOffset aSoftEnd) {
285   if (mSoftText.mIsValid) return NS_OK;
286   mSoftText.AdjustBeginAndBuildText(std::move(aSoftBegin), std::move(aSoftEnd),
287                                     mRootNode);
288 
289   mRealWords.Clear();
290   Result<RealWords, nsresult> realWords = BuildRealWords();
291   if (realWords.isErr()) {
292     return realWords.unwrapErr();
293   }
294 
295   mRealWords = realWords.unwrap();
296   mSoftText.mIsValid = true;
297   return NS_OK;
298 }
299 
MakeRangeForWord(const RealWord & aWord,nsRange ** aRange) const300 nsresult mozInlineSpellWordUtil::MakeRangeForWord(const RealWord& aWord,
301                                                   nsRange** aRange) const {
302   NodeOffset begin =
303       MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN);
304   NodeOffset end = MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END);
305   return MakeRange(begin, end, aRange);
306 }
MakeNodeOffsetRangeForWord(const RealWord & aWord,NodeOffsetRange * aNodeOffsetRange)307 void mozInlineSpellWordUtil::MakeNodeOffsetRangeForWord(
308     const RealWord& aWord, NodeOffsetRange* aNodeOffsetRange) {
309   NodeOffset begin =
310       MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN);
311   NodeOffset end = MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END);
312   *aNodeOffsetRange = NodeOffsetRange(begin, end);
313 }
314 
315 // mozInlineSpellWordUtil::GetRangeForWord
316 
GetRangeForWord(nsINode * aWordNode,int32_t aWordOffset,nsRange ** aRange)317 nsresult mozInlineSpellWordUtil::GetRangeForWord(nsINode* aWordNode,
318                                                  int32_t aWordOffset,
319                                                  nsRange** aRange) {
320   // Set our soft end and start
321   NodeOffset pt(aWordNode, aWordOffset);
322 
323   if (!mSoftText.mIsValid || pt != mSoftText.GetBegin() ||
324       pt != mSoftText.GetEnd()) {
325     mSoftText.Invalidate();
326     NodeOffset softBegin = pt;
327     NodeOffset softEnd = pt;
328     nsresult rv = EnsureWords(std::move(softBegin), std::move(softEnd));
329     if (NS_FAILED(rv)) {
330       return rv;
331     }
332   }
333 
334   int32_t offset = MapDOMPositionToSoftTextOffset(pt);
335   if (offset < 0) return MakeRange(pt, pt, aRange);
336   int32_t wordIndex = FindRealWordContaining(offset, HINT_BEGIN, false);
337   if (wordIndex < 0) return MakeRange(pt, pt, aRange);
338   return MakeRangeForWord(mRealWords[wordIndex], aRange);
339 }
340 
341 // This is to fix characters that the spellchecker may not like
NormalizeWord(const nsAString & aInput,int32_t aPos,int32_t aLen,nsAString & aOutput)342 static void NormalizeWord(const nsAString& aInput, int32_t aPos, int32_t aLen,
343                           nsAString& aOutput) {
344   aOutput.Truncate();
345   for (int32_t i = 0; i < aLen; i++) {
346     char16_t ch = aInput.CharAt(i + aPos);
347 
348     // remove ignorable characters from the word
349     if (IsIgnorableCharacter(ch)) continue;
350 
351     // the spellchecker doesn't handle curly apostrophes in all languages
352     if (ch == 0x2019) {  // RIGHT SINGLE QUOTATION MARK
353       ch = '\'';
354     }
355 
356     aOutput.Append(ch);
357   }
358 }
359 
360 // mozInlineSpellWordUtil::GetNextWord
361 //
362 //    FIXME-optimization: we shouldn't have to generate a range every single
363 //    time. It would be better if the inline spellchecker didn't require a
364 //    range unless the word was misspelled. This may or may not be possible.
365 
GetNextWord(Word & aWord)366 bool mozInlineSpellWordUtil::GetNextWord(Word& aWord) {
367   MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
368           ("%s: mNextWordIndex=%d", __FUNCTION__, mNextWordIndex));
369 
370   if (mNextWordIndex < 0 || mNextWordIndex >= int32_t(mRealWords.Length())) {
371     mNextWordIndex = -1;
372     aWord.mSkipChecking = true;
373     return false;
374   }
375 
376   const RealWord& realWord = mRealWords[mNextWordIndex];
377   MakeNodeOffsetRangeForWord(realWord, &aWord.mNodeOffsetRange);
378   ++mNextWordIndex;
379   aWord.mSkipChecking = !realWord.mCheckableWord;
380   ::NormalizeWord(mSoftText.GetValue(), realWord.mSoftTextOffset,
381                   realWord.mLength, aWord.mText);
382 
383   MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
384           ("%s: returning: %s (skip=%d)", __FUNCTION__,
385            NS_ConvertUTF16toUTF8(aWord.mText).get(), aWord.mSkipChecking));
386 
387   return true;
388 }
389 
390 // mozInlineSpellWordUtil::MakeRange
391 //
392 //    Convenience function for creating a range over the current document.
393 
MakeRange(NodeOffset aBegin,NodeOffset aEnd,nsRange ** aRange) const394 nsresult mozInlineSpellWordUtil::MakeRange(NodeOffset aBegin, NodeOffset aEnd,
395                                            nsRange** aRange) const {
396   NS_ENSURE_ARG_POINTER(aBegin.mNode);
397   if (!mDocument) {
398     return NS_ERROR_NOT_INITIALIZED;
399   }
400 
401   ErrorResult error;
402   RefPtr<nsRange> range = nsRange::Create(aBegin.mNode, aBegin.mOffset,
403                                           aEnd.mNode, aEnd.mOffset, error);
404   if (NS_WARN_IF(error.Failed())) {
405     return error.StealNSResult();
406   }
407   MOZ_ASSERT(range);
408   range.forget(aRange);
409   return NS_OK;
410 }
411 
412 // static
MakeRange(const NodeOffsetRange & aRange)413 already_AddRefed<nsRange> mozInlineSpellWordUtil::MakeRange(
414     const NodeOffsetRange& aRange) {
415   IgnoredErrorResult ignoredError;
416   RefPtr<nsRange> range =
417       nsRange::Create(aRange.Begin().Node(), aRange.Begin().Offset(),
418                       aRange.End().Node(), aRange.End().Offset(), ignoredError);
419   NS_WARNING_ASSERTION(!ignoredError.Failed(), "Creating a range failed");
420   return range.forget();
421 }
422 
423 /*********** Word Splitting ************/
424 
// Classification of a character within a DOM word.
enum CharClass {
  // The character is part of a word.
  CHAR_CLASS_WORD = 0,
  // The character separates words.
  CHAR_CLASS_SEPARATOR = 1,
  // There is no character: the current offset is at or past the end of the
  // text.
  CHAR_CLASS_END_OF_INPUT = 2
};
431 
432 // Encapsulates DOM-word to real-word splitting
433 template <class T>
434 struct MOZ_STACK_CLASS WordSplitState {
435   const T& mDOMWordText;
436   int32_t mDOMWordOffset;
437   CharClass mCurCharClass;
438 
WordSplitStateWordSplitState439   explicit WordSplitState(const T& aString)
440       : mDOMWordText(aString),
441         mDOMWordOffset(0),
442         mCurCharClass(CHAR_CLASS_END_OF_INPUT) {}
443 
444   CharClass ClassifyCharacter(int32_t aIndex, bool aRecurse) const;
445   void Advance();
446   void AdvanceThroughSeparators();
447   void AdvanceThroughWord();
448 
449   // Finds special words like email addresses and URLs that may start at the
450   // current position, and returns their length, or 0 if not found. This allows
451   // arbitrary word breaking rules to be used for these special entities, as
452   // long as they can not contain whitespace.
453   bool IsSpecialWord() const;
454 
455   // Similar to IsSpecialWord except that this takes a split word as
456   // input. This checks for things that do not require special word-breaking
457   // rules.
458   bool ShouldSkipWord(int32_t aStart, int32_t aLength) const;
459 
460   // Finds the last sequence of DOM word separators before aBeforeOffset and
461   // returns the offset to its first element.
462   Maybe<int32_t> FindOffsetOfLastDOMWordSeparatorSequence(
463       int32_t aBeforeOffset) const;
464 
465   char16_t GetUnicharAt(int32_t aIndex) const;
466 };
467 
468 // WordSplitState::ClassifyCharacter
469 template <class T>
ClassifyCharacter(int32_t aIndex,bool aRecurse) const470 CharClass WordSplitState<T>::ClassifyCharacter(int32_t aIndex,
471                                                bool aRecurse) const {
472   MOZ_ASSERT(aIndex >= 0 && aIndex <= int32_t(mDOMWordText.Length()),
473              "Index out of range");
474   if (aIndex == int32_t(mDOMWordText.Length())) return CHAR_CLASS_SEPARATOR;
475 
476   // this will classify the character, we want to treat "ignorable" characters
477   // such as soft hyphens, and also ZWJ and ZWNJ as word characters.
478   nsUGenCategory charCategory =
479       mozilla::unicode::GetGenCategory(GetUnicharAt(aIndex));
480   if (charCategory == nsUGenCategory::kLetter ||
481       IsIgnorableCharacter(mDOMWordText[aIndex]) ||
482       mDOMWordText[aIndex] == 0x200C /* ZWNJ */ ||
483       mDOMWordText[aIndex] == 0x200D /* ZWJ */)
484     return CHAR_CLASS_WORD;
485 
486   // If conditional punctuation is surrounded immediately on both sides by word
487   // characters it also counts as a word character.
488   if (IsConditionalPunctuation(mDOMWordText[aIndex])) {
489     if (!aRecurse) {
490       // not allowed to look around, this punctuation counts like a separator
491       return CHAR_CLASS_SEPARATOR;
492     }
493 
494     // check the left-hand character
495     if (aIndex == 0) return CHAR_CLASS_SEPARATOR;
496     if (ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD)
497       return CHAR_CLASS_SEPARATOR;
498     // If the previous charatcer is a word-char, make sure that it's not a
499     // special dot character.
500     if (mDOMWordText[aIndex - 1] == '.') return CHAR_CLASS_SEPARATOR;
501 
502     // now we know left char is a word-char, check the right-hand character
503     if (aIndex == int32_t(mDOMWordText.Length() - 1)) {
504       return CHAR_CLASS_SEPARATOR;
505     }
506 
507     if (ClassifyCharacter(aIndex + 1, false) != CHAR_CLASS_WORD)
508       return CHAR_CLASS_SEPARATOR;
509     // If the next charatcer is a word-char, make sure that it's not a
510     // special dot character.
511     if (mDOMWordText[aIndex + 1] == '.') return CHAR_CLASS_SEPARATOR;
512 
513     // char on either side is a word, this counts as a word
514     return CHAR_CLASS_WORD;
515   }
516 
517   // The dot character, if appearing at the end of a word, should
518   // be considered part of that word.  Example: "etc.", or
519   // abbreviations
520   if (aIndex > 0 && mDOMWordText[aIndex] == '.' &&
521       mDOMWordText[aIndex - 1] != '.' &&
522       ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD) {
523     return CHAR_CLASS_WORD;
524   }
525 
526   // all other punctuation
527   if (charCategory == nsUGenCategory::kSeparator ||
528       charCategory == nsUGenCategory::kOther ||
529       charCategory == nsUGenCategory::kPunctuation ||
530       charCategory == nsUGenCategory::kSymbol) {
531     // Don't break on hyphens, as hunspell handles them on its own.
532     if (aIndex > 0 && mDOMWordText[aIndex] == '-' &&
533         mDOMWordText[aIndex - 1] != '-' &&
534         ClassifyCharacter(aIndex - 1, false) == CHAR_CLASS_WORD) {
535       // A hyphen is only meaningful as a separator inside a word
536       // if the previous and next characters are a word character.
537       if (aIndex == int32_t(mDOMWordText.Length()) - 1)
538         return CHAR_CLASS_SEPARATOR;
539       if (mDOMWordText[aIndex + 1] != '.' &&
540           ClassifyCharacter(aIndex + 1, false) == CHAR_CLASS_WORD)
541         return CHAR_CLASS_WORD;
542     }
543     return CHAR_CLASS_SEPARATOR;
544   }
545 
546   // any other character counts as a word
547   return CHAR_CLASS_WORD;
548 }
549 
550 // WordSplitState::Advance
551 template <class T>
Advance()552 void WordSplitState<T>::Advance() {
553   MOZ_ASSERT(mDOMWordOffset >= 0, "Negative word index");
554   MOZ_ASSERT(mDOMWordOffset < (int32_t)mDOMWordText.Length(),
555              "Length beyond end");
556 
557   mDOMWordOffset++;
558   if (mDOMWordOffset >= (int32_t)mDOMWordText.Length())
559     mCurCharClass = CHAR_CLASS_END_OF_INPUT;
560   else
561     mCurCharClass = ClassifyCharacter(mDOMWordOffset, true);
562 }
563 
564 // WordSplitState::AdvanceThroughSeparators
565 template <class T>
AdvanceThroughSeparators()566 void WordSplitState<T>::AdvanceThroughSeparators() {
567   while (mCurCharClass == CHAR_CLASS_SEPARATOR) Advance();
568 }
569 
570 // WordSplitState::AdvanceThroughWord
571 template <class T>
AdvanceThroughWord()572 void WordSplitState<T>::AdvanceThroughWord() {
573   while (mCurCharClass == CHAR_CLASS_WORD) Advance();
574 }
575 
576 // WordSplitState::IsSpecialWord
577 template <class T>
IsSpecialWord() const578 bool WordSplitState<T>::IsSpecialWord() const {
579   // Search for email addresses. We simply define these as any sequence of
580   // characters with an '@' character in the middle. The DOM word is already
581   // split on whitepace, so we know that everything to the end is the address
582   int32_t firstColon = -1;
583   for (int32_t i = mDOMWordOffset; i < int32_t(mDOMWordText.Length()); i++) {
584     if (mDOMWordText[i] == '@') {
585       // only accept this if there are unambiguous word characters (don't bother
586       // recursing to disambiguate apostrophes) on each side. This prevents
587       // classifying, e.g. "@home" as an email address
588 
589       // Use this condition to only accept words with '@' in the middle of
590       // them. It works, but the inlinespellcker doesn't like this. The problem
591       // is that you type "fhsgfh@" that's a misspelled word followed by a
592       // symbol, but when you type another letter "fhsgfh@g" that first word
593       // need to be unmarked misspelled. It doesn't do this. it only checks the
594       // current position for potentially removing a spelling range.
595       if (i > 0 && ClassifyCharacter(i - 1, false) == CHAR_CLASS_WORD &&
596           i < (int32_t)mDOMWordText.Length() - 1 &&
597           ClassifyCharacter(i + 1, false) == CHAR_CLASS_WORD) {
598         return true;
599       }
600     } else if (mDOMWordText[i] == ':' && firstColon < 0) {
601       firstColon = i;
602 
603       // If the first colon is followed by a slash, consider it a URL
604       // This will catch things like asdf://foo.com
605       if (firstColon < (int32_t)mDOMWordText.Length() - 1 &&
606           mDOMWordText[firstColon + 1] == '/') {
607         return true;
608       }
609     }
610   }
611 
612   // Check the text before the first colon against some known protocols. It
613   // is impossible to check against all protocols, especially since you can
614   // plug in new protocols. We also don't want to waste time here checking
615   // against a lot of obscure protocols.
616   if (firstColon > mDOMWordOffset) {
617     nsString protocol(
618         Substring(mDOMWordText, mDOMWordOffset, firstColon - mDOMWordOffset));
619     if (protocol.EqualsIgnoreCase("http") ||
620         protocol.EqualsIgnoreCase("https") ||
621         protocol.EqualsIgnoreCase("news") ||
622         protocol.EqualsIgnoreCase("file") ||
623         protocol.EqualsIgnoreCase("javascript") ||
624         protocol.EqualsIgnoreCase("data") || protocol.EqualsIgnoreCase("ftp")) {
625       return true;
626     }
627   }
628 
629   // not anything special
630   return false;
631 }
632 
633 // WordSplitState::ShouldSkipWord
634 template <class T>
ShouldSkipWord(int32_t aStart,int32_t aLength) const635 bool WordSplitState<T>::ShouldSkipWord(int32_t aStart, int32_t aLength) const {
636   int32_t last = aStart + aLength;
637 
638   // check to see if the word contains a digit
639   for (int32_t i = aStart; i < last; i++) {
640     if (mozilla::unicode::GetGenCategory(GetUnicharAt(i)) ==
641         nsUGenCategory::kNumber) {
642       return true;
643     }
644   }
645 
646   // not special
647   return false;
648 }
649 
650 template <class T>
FindOffsetOfLastDOMWordSeparatorSequence(const int32_t aBeforeOffset) const651 Maybe<int32_t> WordSplitState<T>::FindOffsetOfLastDOMWordSeparatorSequence(
652     const int32_t aBeforeOffset) const {
653   for (int32_t i = aBeforeOffset - 1; i >= 0; --i) {
654     if (IsDOMWordSeparator(mDOMWordText[i]) ||
655         (!IsAmbiguousDOMWordSeprator(mDOMWordText[i]) &&
656          ClassifyCharacter(i, true) == CHAR_CLASS_SEPARATOR)) {
657       // Be greedy, find as many separators as we can
658       for (int32_t j = i - 1; j >= 0; --j) {
659         if (IsDOMWordSeparator(mDOMWordText[j]) ||
660             (!IsAmbiguousDOMWordSeprator(mDOMWordText[j]) &&
661              ClassifyCharacter(j, true) == CHAR_CLASS_SEPARATOR)) {
662           i = j;
663         } else {
664           break;
665         }
666       }
667       return Some(i);
668     }
669   }
670   return Nothing();
671 }
672 
673 template <>
GetUnicharAt(int32_t aIndex) const674 char16_t WordSplitState<nsDependentSubstring>::GetUnicharAt(
675     int32_t aIndex) const {
676   return mDOMWordText[aIndex];
677 }
678 
679 template <>
GetUnicharAt(int32_t aIndex) const680 char16_t WordSplitState<nsDependentCSubstring>::GetUnicharAt(
681     int32_t aIndex) const {
682   return static_cast<char16_t>(static_cast<uint8_t>(mDOMWordText[aIndex]));
683 }
684 
IsBRElement(nsINode * aNode)685 static inline bool IsBRElement(nsINode* aNode) {
686   return aNode->IsHTMLElement(nsGkAtoms::br);
687 }
688 
689 /**
690  * Given a TextNode, finds the last sequence of DOM word separators before
691  * aBeforeOffset and returns the offset to its first element.
692  *
693  * @param aContent the TextNode to check.
694  * @param aBeforeOffset the offset in the TextNode before which we will search
695  *        for the DOM separator. You can pass INT32_MAX to search the entire
696  *        length of the string.
697  */
FindOffsetOfLastDOMWordSeparatorSequence(nsIContent * aContent,int32_t aBeforeOffset)698 static Maybe<int32_t> FindOffsetOfLastDOMWordSeparatorSequence(
699     nsIContent* aContent, int32_t aBeforeOffset) {
700   const nsTextFragment* textFragment = aContent->GetText();
701   MOZ_ASSERT(textFragment, "Where is our text?");
702   int32_t end = std::min(aBeforeOffset, int32_t(textFragment->GetLength()));
703 
704   if (textFragment->Is2b()) {
705     nsDependentSubstring targetText(textFragment->Get2b(), end);
706     WordSplitState<nsDependentSubstring> state(targetText);
707     return state.FindOffsetOfLastDOMWordSeparatorSequence(end);
708   }
709 
710   nsDependentCSubstring targetText(textFragment->Get1b(), end);
711   WordSplitState<nsDependentCSubstring> state(targetText);
712   return state.FindOffsetOfLastDOMWordSeparatorSequence(end);
713 }
714 
715 /**
716  * Check if there's a DOM word separator before aBeforeOffset in this node.
717  * Always returns true if it's a BR element.
718  * aSeparatorOffset is set to the index of the first character in the last
719  * separator if any is found (0 for BR elements).
720  *
721  * This function does not modify aSeparatorOffset when it returns false.
722  */
ContainsDOMWordSeparator(nsINode * aNode,int32_t aBeforeOffset,int32_t * aSeparatorOffset)723 static bool ContainsDOMWordSeparator(nsINode* aNode, int32_t aBeforeOffset,
724                                      int32_t* aSeparatorOffset) {
725   if (IsBRElement(aNode)) {
726     *aSeparatorOffset = 0;
727     return true;
728   }
729 
730   if (!IsSpellCheckingTextNode(aNode)) return false;
731 
732   const Maybe<int32_t> separatorOffset =
733       FindOffsetOfLastDOMWordSeparatorSequence(aNode->AsContent(),
734                                                aBeforeOffset);
735   if (separatorOffset) {
736     *aSeparatorOffset = *separatorOffset;
737     return true;
738   }
739 
740   return false;
741 }
742 
IsBreakElement(nsINode * aNode)743 static bool IsBreakElement(nsINode* aNode) {
744   if (!aNode->IsElement()) {
745     return false;
746   }
747 
748   dom::Element* element = aNode->AsElement();
749   if (element->IsHTMLElement(nsGkAtoms::br)) {
750     return true;
751   }
752 
753   // If we don't have a frame, we don't consider ourselves a break
754   // element.  In particular, words can span us.
755   nsIFrame* frame = element->GetPrimaryFrame();
756   if (!frame) {
757     return false;
758   }
759 
760   auto* disp = frame->StyleDisplay();
761   // Anything that's not an inline element is a break element.
762   // XXXbz should replaced inlines be break elements, though?
763   // Also should inline-block and such be break elements?
764   //
765   // FIXME(emilio): We should teach the spell checker to deal with generated
766   // content (it doesn't at all), then remove the IsListItem() check, as there
767   // could be no marker, etc...
768   return !disp->IsInlineFlow() || disp->IsListItem();
769 }
770 
// Closure handed (as void*) to CheckLeavingBreakElement().
struct CheckLeavingBreakElementClosure {
  // Set to true by CheckLeavingBreakElement() once it has been called with a
  // node for which IsBreakElement() is true. That function never resets it.
  bool mLeftBreakElement;
};
774 
CheckLeavingBreakElement(nsINode * aNode,void * aClosure)775 static void CheckLeavingBreakElement(nsINode* aNode, void* aClosure) {
776   CheckLeavingBreakElementClosure* cl =
777       static_cast<CheckLeavingBreakElementClosure*>(aClosure);
778   if (!cl->mLeftBreakElement && IsBreakElement(aNode)) {
779     cl->mLeftBreakElement = true;
780   }
781 }
782 
NormalizeWord(nsAString & aWord)783 void mozInlineSpellWordUtil::NormalizeWord(nsAString& aWord) {
784   nsAutoString result;
785   ::NormalizeWord(aWord, 0, aWord.Length(), result);
786   aWord = result;
787 }
788 
AdjustBeginAndBuildText(NodeOffset aBegin,NodeOffset aEnd,const nsINode * aRootNode)789 void mozInlineSpellWordUtil::SoftText::AdjustBeginAndBuildText(
790     NodeOffset aBegin, NodeOffset aEnd, const nsINode* aRootNode) {
791   MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug, ("%s", __FUNCTION__));
792 
793   mBegin = std::move(aBegin);
794   mEnd = std::move(aEnd);
795 
796   // First we have to work backwards from mBegin to find a text node
797   // containing a DOM word separator, a non-inline-element
798   // boundary, or the hard start node. That's where we'll start building the
799   // soft string from.
800   nsINode* node = mBegin.mNode;
801   int32_t firstOffsetInNode = 0;
802   int32_t checkBeforeOffset = mBegin.mOffset;
803   while (node) {
804     if (ContainsDOMWordSeparator(node, checkBeforeOffset, &firstOffsetInNode)) {
805       if (node == mBegin.mNode) {
806         // If we find a word separator on the first node, look at the preceding
807         // word on the text node as well.
808         if (firstOffsetInNode > 0) {
809           // Try to find the previous word boundary in the current node. If
810           // we can't find one, start checking previous sibling nodes (if any
811           // adjacent ones exist) to see if we can find any text nodes with
812           // DOM word separators. We bail out as soon as we see a node that is
813           // not a text node, or we run out of previous sibling nodes. In the
814           // event that we simply cannot find any preceding word separator, the
815           // offset is set to 0, and the soft text beginning node is set to the
816           // "most previous" text node before the original starting node, or
817           // kept at the original starting node if no previous text nodes exist.
818           int32_t newOffset = 0;
819           if (!ContainsDOMWordSeparator(node, firstOffsetInNode - 1,
820                                         &newOffset)) {
821             nsIContent* prevNode = node->GetPreviousSibling();
822             while (prevNode && IsSpellCheckingTextNode(prevNode)) {
823               mBegin.mNode = prevNode;
824               const Maybe<int32_t> separatorOffset =
825                   FindOffsetOfLastDOMWordSeparatorSequence(prevNode, INT32_MAX);
826               if (separatorOffset) {
827                 newOffset = *separatorOffset;
828                 break;
829               }
830               prevNode = prevNode->GetPreviousSibling();
831             }
832           }
833           firstOffsetInNode = newOffset;
834         } else {
835           firstOffsetInNode = 0;
836         }
837 
838         MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
839                 ("%s: adjusting mBegin.mOffset from %i to %i.", __FUNCTION__,
840                  mBegin.mOffset, firstOffsetInNode));
841         mBegin.mOffset = firstOffsetInNode;
842       }
843       break;
844     }
845     checkBeforeOffset = INT32_MAX;
846     if (IsBreakElement(node)) {
847       // Since GetPreviousContent follows tree *preorder*, we're about to
848       // traverse up out of 'node'. Since node induces breaks (e.g., it's a
849       // block), don't bother trying to look outside it, just stop now.
850       break;
851     }
852     // GetPreviousContent below expects aRootNode to be an ancestor of node.
853     if (!node->IsInclusiveDescendantOf(aRootNode)) {
854       break;
855     }
856     node = node->GetPreviousContent(aRootNode);
857   }
858 
859   // Now build up the string moving forward through the DOM until we reach
860   // the soft end and *then* see a DOM word separator, a non-inline-element
861   // boundary, or the hard end node.
862   mValue.Truncate();
863   mDOMMapping.Clear();
864   bool seenSoftEnd = false;
865   // Leave this outside the loop so large heap string allocations can be reused
866   // across iterations
867   while (node) {
868     if (node == mEnd.mNode) {
869       seenSoftEnd = true;
870     }
871 
872     bool exit = false;
873     if (IsSpellCheckingTextNode(node)) {
874       nsIContent* content = static_cast<nsIContent*>(node);
875       MOZ_ASSERT(content, "Where is our content?");
876       const nsTextFragment* textFragment = content->GetText();
877       MOZ_ASSERT(textFragment, "Where is our text?");
878       int32_t lastOffsetInNode = textFragment->GetLength();
879 
880       if (seenSoftEnd) {
881         // check whether we can stop after this
882         for (int32_t i = node == mEnd.mNode ? mEnd.mOffset : 0;
883              i < int32_t(textFragment->GetLength()); ++i) {
884           if (IsDOMWordSeparator(textFragment->CharAt(i))) {
885             exit = true;
886             // stop at the first separator after the soft end point
887             lastOffsetInNode = i;
888             break;
889           }
890         }
891       }
892 
893       if (firstOffsetInNode < lastOffsetInNode) {
894         int32_t len = lastOffsetInNode - firstOffsetInNode;
895         mDOMMapping.AppendElement(DOMTextMapping(
896             NodeOffset(node, firstOffsetInNode), mValue.Length(), len));
897 
898         bool ok = textFragment->AppendTo(mValue, firstOffsetInNode, len,
899                                          mozilla::fallible);
900         if (!ok) {
901           // probably out of memory, remove from mDOMMapping
902           mDOMMapping.RemoveLastElement();
903           exit = true;
904         }
905       }
906 
907       firstOffsetInNode = 0;
908     }
909 
910     if (exit) break;
911 
912     CheckLeavingBreakElementClosure closure = {false};
913     node = FindNextNode(node, aRootNode, CheckLeavingBreakElement, &closure);
914     if (closure.mLeftBreakElement || (node && IsBreakElement(node))) {
915       // We left, or are entering, a break element (e.g., block). Maybe we can
916       // stop now.
917       if (seenSoftEnd) break;
918       // Record the break
919       mValue.Append(' ');
920     }
921   }
922 
923   MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
924           ("%s: got DOM string: %s", __FUNCTION__,
925            NS_ConvertUTF16toUTF8(mValue).get()));
926 }
927 
BuildRealWords() const928 auto mozInlineSpellWordUtil::BuildRealWords() const
929     -> Result<RealWords, nsresult> {
930   // This is pretty simple. We just have to walk mSoftText.GetValue(),
931   // tokenizing it into "real words". We do an outer traversal of words
932   // delimited by IsDOMWordSeparator, calling SplitDOMWordAndAppendTo on each of
933   // those DOM words
934   int32_t wordStart = -1;
935   RealWords realWords;
936   for (int32_t i = 0; i < int32_t(mSoftText.GetValue().Length()); ++i) {
937     if (IsDOMWordSeparator(mSoftText.GetValue().CharAt(i))) {
938       if (wordStart >= 0) {
939         nsresult rv = SplitDOMWordAndAppendTo(wordStart, i, realWords);
940         if (NS_FAILED(rv)) {
941           return Err(rv);
942         }
943         wordStart = -1;
944       }
945     } else {
946       if (wordStart < 0) {
947         wordStart = i;
948       }
949     }
950   }
951   if (wordStart >= 0) {
952     nsresult rv = SplitDOMWordAndAppendTo(
953         wordStart, mSoftText.GetValue().Length(), realWords);
954     if (NS_FAILED(rv)) {
955       return Err(rv);
956     }
957   }
958 
959   return realWords;
960 }
961 
962 /*********** DOM/realwords<->mSoftText.GetValue() mapping functions
963  * ************/
964 
MapDOMPositionToSoftTextOffset(const NodeOffset & aNodeOffset) const965 int32_t mozInlineSpellWordUtil::MapDOMPositionToSoftTextOffset(
966     const NodeOffset& aNodeOffset) const {
967   if (!mSoftText.mIsValid) {
968     NS_ERROR("Soft text must be valid if we're to map into it");
969     return -1;
970   }
971 
972   for (int32_t i = 0; i < int32_t(mSoftText.GetDOMMapping().Length()); ++i) {
973     const DOMTextMapping& map = mSoftText.GetDOMMapping()[i];
974     if (map.mNodeOffset.mNode == aNodeOffset.mNode) {
975       // Allow offsets at either end of the string, in particular, allow the
976       // offset that's at the end of the contributed string
977       int32_t offsetInContributedString =
978           aNodeOffset.mOffset - map.mNodeOffset.mOffset;
979       if (offsetInContributedString >= 0 &&
980           offsetInContributedString <= map.mLength)
981         return map.mSoftTextOffset + offsetInContributedString;
982       return -1;
983     }
984   }
985   return -1;
986 }
987 
988 namespace {
989 
// Comparator for BinarySearchIf() that steers the search towards the first
// element whose mSoftTextOffset is larger than the offset given at
// construction. T only needs an mSoftTextOffset member comparable to int32_t.
template <class T>
class FirstLargerOffset {
 public:
  explicit FirstLargerOffset(int32_t aSoftTextOffset)
      : mSoftTextOffset(aSoftTextOffset) {}

  // Returns -1 when aElement's offset is larger than the target, otherwise 1.
  // 0 is deliberately never returned: reporting a match would stop the search
  // early, before the first larger offset has been located.
  int operator()(const T& aElement) const {
    if (mSoftTextOffset < aElement.mSoftTextOffset) {
      return -1;
    }
    return 1;
  }

 private:
  int32_t mSoftTextOffset;
};
1003 
1004 template <class T>
FindLastNongreaterOffset(const nsTArray<T> & aContainer,int32_t aSoftTextOffset,size_t * aIndex)1005 bool FindLastNongreaterOffset(const nsTArray<T>& aContainer,
1006                               int32_t aSoftTextOffset, size_t* aIndex) {
1007   if (aContainer.Length() == 0) {
1008     return false;
1009   }
1010 
1011   BinarySearchIf(aContainer, 0, aContainer.Length(),
1012                  FirstLargerOffset<T>(aSoftTextOffset), aIndex);
1013   if (*aIndex > 0) {
1014     // There was at least one mapping with offset <= aSoftTextOffset. Step back
1015     // to find the last element with |mSoftTextOffset <= aSoftTextOffset|.
1016     *aIndex -= 1;
1017   } else {
1018     // Every mapping had offset greater than aSoftTextOffset.
1019     MOZ_ASSERT(aContainer[*aIndex].mSoftTextOffset > aSoftTextOffset);
1020   }
1021   return true;
1022 }
1023 
1024 }  // namespace
1025 
MapSoftTextOffsetToDOMPosition(int32_t aSoftTextOffset,DOMMapHint aHint) const1026 NodeOffset mozInlineSpellWordUtil::MapSoftTextOffsetToDOMPosition(
1027     int32_t aSoftTextOffset, DOMMapHint aHint) const {
1028   MOZ_ASSERT(mSoftText.mIsValid,
1029              "Soft text must be valid if we're to map out of it");
1030   if (!mSoftText.mIsValid) return NodeOffset(nullptr, -1);
1031 
1032   // Find the last mapping, if any, such that mSoftTextOffset <= aSoftTextOffset
1033   size_t index;
1034   bool found = FindLastNongreaterOffset(mSoftText.GetDOMMapping(),
1035                                         aSoftTextOffset, &index);
1036   if (!found) {
1037     return NodeOffset(nullptr, -1);
1038   }
1039 
1040   // 'index' is now the last mapping, if any, such that
1041   // mSoftTextOffset <= aSoftTextOffset.
1042   // If we're doing HINT_END, then we may want to return the end of the
1043   // the previous mapping instead of the start of this mapping
1044   if (aHint == HINT_END && index > 0) {
1045     const DOMTextMapping& map = mSoftText.GetDOMMapping()[index - 1];
1046     if (map.mSoftTextOffset + map.mLength == aSoftTextOffset)
1047       return NodeOffset(map.mNodeOffset.mNode,
1048                         map.mNodeOffset.mOffset + map.mLength);
1049   }
1050 
1051   // We allow ourselves to return the end of this mapping even if we're
1052   // doing HINT_START. This will only happen if there is no mapping which this
1053   // point is the start of. I'm not 100% sure this is OK...
1054   const DOMTextMapping& map = mSoftText.GetDOMMapping()[index];
1055   int32_t offset = aSoftTextOffset - map.mSoftTextOffset;
1056   if (offset >= 0 && offset <= map.mLength)
1057     return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + offset);
1058 
1059   return NodeOffset(nullptr, -1);
1060 }
1061 
1062 // static
ToString(const DOMMapHint aHint,nsACString & aResult)1063 void mozInlineSpellWordUtil::ToString(const DOMMapHint aHint,
1064                                       nsACString& aResult) {
1065   switch (aHint) {
1066     case HINT_BEGIN:
1067       aResult.AssignLiteral("begin");
1068       break;
1069     case HINT_END:
1070       aResult.AssignLiteral("end");
1071       break;
1072   }
1073 }
1074 
FindRealWordContaining(int32_t aSoftTextOffset,DOMMapHint aHint,bool aSearchForward) const1075 int32_t mozInlineSpellWordUtil::FindRealWordContaining(
1076     int32_t aSoftTextOffset, DOMMapHint aHint, bool aSearchForward) const {
1077   if (MOZ_LOG_TEST(sInlineSpellWordUtilLog, LogLevel::Debug)) {
1078     nsAutoCString hint;
1079     mozInlineSpellWordUtil::ToString(aHint, hint);
1080 
1081     MOZ_LOG(
1082         sInlineSpellWordUtilLog, LogLevel::Debug,
1083         ("%s: offset=%i, hint=%s, searchForward=%i.", __FUNCTION__,
1084          aSoftTextOffset, hint.get(), static_cast<int32_t>(aSearchForward)));
1085   }
1086 
1087   MOZ_ASSERT(mSoftText.mIsValid,
1088              "Soft text must be valid if we're to map out of it");
1089   if (!mSoftText.mIsValid) return -1;
1090 
1091   // Find the last word, if any, such that mRealWords[index].mSoftTextOffset
1092   // <= aSoftTextOffset
1093   size_t index;
1094   bool found = FindLastNongreaterOffset(mRealWords, aSoftTextOffset, &index);
1095   if (!found) {
1096     return -1;
1097   }
1098 
1099   // 'index' is now the last word, if any, such that
1100   // mSoftTextOffset <= aSoftTextOffset.
1101   // If we're doing HINT_END, then we may want to return the end of the
1102   // the previous word instead of the start of this word
1103   if (aHint == HINT_END && index > 0) {
1104     const RealWord& word = mRealWords[index - 1];
1105     if (word.EndOffset() == aSoftTextOffset) {
1106       return index - 1;
1107     }
1108   }
1109 
1110   // We allow ourselves to return the end of this word even if we're
1111   // doing HINT_BEGIN. This will only happen if there is no word which this
1112   // point is the start of. I'm not 100% sure this is OK...
1113   const RealWord& word = mRealWords[index];
1114   int32_t offset = aSoftTextOffset - word.mSoftTextOffset;
1115   if (offset >= 0 && offset <= static_cast<int32_t>(word.mLength)) return index;
1116 
1117   if (aSearchForward) {
1118     if (mRealWords[0].mSoftTextOffset > aSoftTextOffset) {
1119       // All words have mSoftTextOffset > aSoftTextOffset
1120       return 0;
1121     }
1122     // 'index' is the last word such that mSoftTextOffset <= aSoftTextOffset.
1123     // Word index+1, if it exists, will be the first with
1124     // mSoftTextOffset > aSoftTextOffset.
1125     if (index + 1 < mRealWords.Length()) return index + 1;
1126   }
1127 
1128   return -1;
1129 }
1130 
1131 // mozInlineSpellWordUtil::SplitDOMWordAndAppendTo
1132 
SplitDOMWordAndAppendTo(int32_t aStart,int32_t aEnd,nsTArray<RealWord> & aRealWords) const1133 nsresult mozInlineSpellWordUtil::SplitDOMWordAndAppendTo(
1134     int32_t aStart, int32_t aEnd, nsTArray<RealWord>& aRealWords) const {
1135   nsDependentSubstring targetText(mSoftText.GetValue(), aStart, aEnd - aStart);
1136   WordSplitState<nsDependentSubstring> state(targetText);
1137   state.mCurCharClass = state.ClassifyCharacter(0, true);
1138 
1139   state.AdvanceThroughSeparators();
1140   if (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT && state.IsSpecialWord()) {
1141     int32_t specialWordLength =
1142         state.mDOMWordText.Length() - state.mDOMWordOffset;
1143     if (!aRealWords.AppendElement(
1144             RealWord(aStart + state.mDOMWordOffset, specialWordLength, false),
1145             fallible)) {
1146       return NS_ERROR_OUT_OF_MEMORY;
1147     }
1148 
1149     return NS_OK;
1150   }
1151 
1152   while (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT) {
1153     state.AdvanceThroughSeparators();
1154     if (state.mCurCharClass == CHAR_CLASS_END_OF_INPUT) break;
1155 
1156     // save the beginning of the word
1157     int32_t wordOffset = state.mDOMWordOffset;
1158 
1159     // find the end of the word
1160     state.AdvanceThroughWord();
1161     int32_t wordLen = state.mDOMWordOffset - wordOffset;
1162     if (!aRealWords.AppendElement(
1163             RealWord(aStart + wordOffset, wordLen,
1164                      !state.ShouldSkipWord(wordOffset, wordLen)),
1165             fallible)) {
1166       return NS_ERROR_OUT_OF_MEMORY;
1167     }
1168   }
1169 
1170   return NS_OK;
1171 }
1172