1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 #include "nsTextFrameUtils.h"
8 
9 #include "mozilla/dom/Text.h"
10 #include "nsBidiUtils.h"
11 #include "nsCharTraits.h"
12 #include "nsIContent.h"
13 #include "nsStyleStruct.h"
14 #include "nsTextFragment.h"
15 #include "nsUnicharUtils.h"
16 #include "nsUnicodeProperties.h"
17 #include <algorithm>
18 
19 using namespace mozilla;
20 using namespace mozilla::dom;
21 
22 // static
IsSpaceCombiningSequenceTail(const char16_t * aChars,int32_t aLength)23 bool nsTextFrameUtils::IsSpaceCombiningSequenceTail(const char16_t* aChars,
24                                                     int32_t aLength) {
25   return aLength > 0 &&
26          (mozilla::unicode::IsClusterExtender(aChars[0]) ||
27           (IsBidiControl(aChars[0]) &&
28            IsSpaceCombiningSequenceTail(aChars + 1, aLength - 1)));
29 }
30 
IsDiscardable(char16_t ch,nsTextFrameUtils::Flags * aFlags)31 static bool IsDiscardable(char16_t ch, nsTextFrameUtils::Flags* aFlags) {
32   // Unlike IS_DISCARDABLE, we don't discard \r. \r will be ignored by
33   // gfxTextRun and discarding it would force us to copy text in many cases of
34   // preformatted text containing \r\n.
35   if (ch == CH_SHY) {
36     *aFlags |= nsTextFrameUtils::Flags::HasShy;
37     return true;
38   }
39   return IsBidiControl(ch);
40 }
41 
IsDiscardable(uint8_t ch,nsTextFrameUtils::Flags * aFlags)42 static bool IsDiscardable(uint8_t ch, nsTextFrameUtils::Flags* aFlags) {
43   if (ch == CH_SHY) {
44     *aFlags |= nsTextFrameUtils::Flags::HasShy;
45     return true;
46   }
47   return false;
48 }
49 
// A segment break is a line feed character ('\n').
static bool IsSegmentBreak(char16_t aCh) {
  const bool isLineFeed = (aCh == '\n');
  return isLineFeed;
}
51 
// True for a space (' ') or a horizontal tab ('\t'), false otherwise.
static bool IsSpaceOrTab(char16_t aCh) {
  switch (aCh) {
    case ' ':
    case '\t':
      return true;
    default:
      return false;
  }
}
53 
// True for any of the three document white space characters handled by this
// file: space (' '), tab ('\t') and the segment break ('\n').
static bool IsSpaceOrTabOrSegmentBreak(char16_t aCh) {
  switch (aCh) {
    case ' ':
    case '\t':
    case '\n':
      return true;
    default:
      return false;
  }
}
57 
58 template <typename CharT>
59 /* static */
IsSkippableCharacterForTransformText(CharT aChar)60 bool nsTextFrameUtils::IsSkippableCharacterForTransformText(CharT aChar) {
61   return aChar == ' ' || aChar == '\t' || aChar == '\n' || aChar == CH_SHY ||
62          (aChar > 0xFF && IsBidiControl(aChar));
63 }
64 
#ifdef DEBUG
// Debug-only consistency check: walks aSkipChars from original offset
// aSkipCharsOffset to its end and asserts that every character marked as
// skipped satisfies IsSkippableCharacterForTransformText. aText[0] corresponds
// to original offset aSkipCharsOffset.
template <typename CharT>
static void AssertSkippedExpectedChars(const CharT* aText,
                                       const gfxSkipChars& aSkipChars,
                                       int32_t aSkipCharsOffset) {
  gfxSkipCharsIterator iter(aSkipChars);
  for (iter.AdvanceOriginal(aSkipCharsOffset);
       iter.GetOriginalOffset() < iter.GetOriginalEnd();
       iter.AdvanceOriginal(1)) {
    const CharT ch = aText[iter.GetOriginalOffset() - aSkipCharsOffset];
    const bool wasSkipped = iter.IsOriginalCharSkipped();
    MOZ_ASSERT(!wasSkipped ||
                   nsTextFrameUtils::IsSkippableCharacterForTransformText(ch),
               "skipped unexpected character; need to update "
               "IsSkippableCharacterForTransformText?");
  }
}
#endif
82 
83 template <class CharT>
TransformWhiteSpaces(const CharT * aText,uint32_t aLength,uint32_t aBegin,uint32_t aEnd,bool aHasSegmentBreak,bool & aInWhitespace,CharT * aOutput,nsTextFrameUtils::Flags & aFlags,nsTextFrameUtils::CompressionMode aCompression,gfxSkipChars * aSkipChars)84 static CharT* TransformWhiteSpaces(
85     const CharT* aText, uint32_t aLength, uint32_t aBegin, uint32_t aEnd,
86     bool aHasSegmentBreak, bool& aInWhitespace, CharT* aOutput,
87     nsTextFrameUtils::Flags& aFlags,
88     nsTextFrameUtils::CompressionMode aCompression, gfxSkipChars* aSkipChars) {
89   MOZ_ASSERT(aCompression == nsTextFrameUtils::COMPRESS_WHITESPACE ||
90                  aCompression == nsTextFrameUtils::COMPRESS_WHITESPACE_NEWLINE,
91              "whitespaces should be skippable!!");
92   // Get the context preceding/following this white space range.
93   // For 8-bit text (sizeof CharT == 1), the checks here should get optimized
94   // out, and isSegmentBreakSkippable should be initialized to be 'false'.
95   bool isSegmentBreakSkippable =
96       sizeof(CharT) > 1 &&
97       ((aBegin > 0 && IS_ZERO_WIDTH_SPACE(aText[aBegin - 1])) ||
98        (aEnd < aLength && IS_ZERO_WIDTH_SPACE(aText[aEnd])));
99   if (sizeof(CharT) > 1 && !isSegmentBreakSkippable && aBegin > 0 &&
100       aEnd < aLength) {
101     uint32_t ucs4before;
102     uint32_t ucs4after;
103     if (aBegin > 1 &&
104         NS_IS_SURROGATE_PAIR(aText[aBegin - 2], aText[aBegin - 1])) {
105       ucs4before = SURROGATE_TO_UCS4(aText[aBegin - 2], aText[aBegin - 1]);
106     } else {
107       ucs4before = aText[aBegin - 1];
108     }
109     if (aEnd + 1 < aLength &&
110         NS_IS_SURROGATE_PAIR(aText[aEnd], aText[aEnd + 1])) {
111       ucs4after = SURROGATE_TO_UCS4(aText[aEnd], aText[aEnd + 1]);
112     } else {
113       ucs4after = aText[aEnd];
114     }
115     // Discard newlines between characters that have F, W, or H
116     // EastAsianWidth property and neither side is Hangul.
117     isSegmentBreakSkippable =
118         IsSegmentBreakSkipChar(ucs4before) && IsSegmentBreakSkipChar(ucs4after);
119   }
120 
121   for (uint32_t i = aBegin; i < aEnd; ++i) {
122     CharT ch = aText[i];
123     bool keepChar = false;
124     bool keepTransformedWhiteSpace = false;
125     if (IsDiscardable(ch, &aFlags)) {
126       aSkipChars->SkipChar();
127       continue;
128     }
129     if (IsSpaceOrTab(ch)) {
130       if (aHasSegmentBreak) {
131         // If white-space is set to normal, nowrap, or pre-line, white space
132         // characters are considered collapsible and all spaces and tabs
133         // immediately preceding or following a segment break are removed.
134         aSkipChars->SkipChar();
135         continue;
136       }
137 
138       if (aInWhitespace) {
139         aSkipChars->SkipChar();
140         continue;
141       } else {
142         keepTransformedWhiteSpace = true;
143       }
144     } else {
145       // Apply Segment Break Transformation Rules (CSS Text 3 - 4.1.2) for
146       // segment break characters.
147       if (aCompression == nsTextFrameUtils::COMPRESS_WHITESPACE ||
148           // XXX: According to CSS Text 3, a lone CR should not always be
149           //      kept, but still go through the Segment Break Transformation
150           //      Rules. However, this is what current modern browser engines
151           //      (webkit/blink/edge) do. So, once we can get some clarity
152           //      from the specification issue, we should either remove the
153           //      lone CR condition here, or leave it here with this comment
154           //      being rephrased.
155           //      Please see https://github.com/w3c/csswg-drafts/issues/855.
156           ch == '\r') {
157         keepChar = true;
158       } else {
159         // aCompression == COMPRESS_WHITESPACE_NEWLINE
160 
161         // Any collapsible segment break immediately following another
162         // collapsible segment break is removed.  Then the remaining segment
163         // break is either transformed into a space (U+0020) or removed
164         // depending on the context before and after the break.
165         if (isSegmentBreakSkippable || aInWhitespace) {
166           aSkipChars->SkipChar();
167           continue;
168         }
169         isSegmentBreakSkippable = true;
170         keepTransformedWhiteSpace = true;
171       }
172     }
173 
174     if (keepChar) {
175       *aOutput++ = ch;
176       aSkipChars->KeepChar();
177       aInWhitespace = IsSpaceOrTab(ch);
178     } else if (keepTransformedWhiteSpace) {
179       *aOutput++ = ' ';
180       aSkipChars->KeepChar();
181       aInWhitespace = true;
182     } else {
183       MOZ_ASSERT_UNREACHABLE("Should've skipped the character!!");
184     }
185   }
186   return aOutput;
187 }
188 
189 template <class CharT>
TransformText(const CharT * aText,uint32_t aLength,CharT * aOutput,CompressionMode aCompression,uint8_t * aIncomingFlags,gfxSkipChars * aSkipChars,Flags * aAnalysisFlags)190 CharT* nsTextFrameUtils::TransformText(const CharT* aText, uint32_t aLength,
191                                        CharT* aOutput,
192                                        CompressionMode aCompression,
193                                        uint8_t* aIncomingFlags,
194                                        gfxSkipChars* aSkipChars,
195                                        Flags* aAnalysisFlags) {
196   Flags flags = Flags();
197 #ifdef DEBUG
198   int32_t skipCharsOffset = aSkipChars->GetOriginalCharCount();
199 #endif
200 
201   bool lastCharArabic = false;
202   if (aCompression == COMPRESS_NONE ||
203       aCompression == COMPRESS_NONE_TRANSFORM_TO_SPACE) {
204     // Skip discardables.
205     uint32_t i;
206     for (i = 0; i < aLength; ++i) {
207       CharT ch = aText[i];
208       if (IsDiscardable(ch, &flags)) {
209         aSkipChars->SkipChar();
210       } else {
211         aSkipChars->KeepChar();
212         if (ch > ' ') {
213           lastCharArabic = IS_ARABIC_CHAR(ch);
214         } else if (aCompression == COMPRESS_NONE_TRANSFORM_TO_SPACE) {
215           if (ch == '\t' || ch == '\n') {
216             ch = ' ';
217           }
218         } else {
219           // aCompression == COMPRESS_NONE
220           if (ch == '\t') {
221             flags |= Flags::HasTab;
222           }
223         }
224         *aOutput++ = ch;
225       }
226     }
227     if (lastCharArabic) {
228       *aIncomingFlags |= INCOMING_ARABICCHAR;
229     } else {
230       *aIncomingFlags &= ~INCOMING_ARABICCHAR;
231     }
232     *aIncomingFlags &= ~INCOMING_WHITESPACE;
233   } else {
234     bool inWhitespace = (*aIncomingFlags & INCOMING_WHITESPACE) != 0;
235     uint32_t i;
236     for (i = 0; i < aLength; ++i) {
237       CharT ch = aText[i];
238       // CSS Text 3 - 4.1. The White Space Processing Rules
239       // White space processing in CSS affects only the document white space
240       // characters: spaces (U+0020), tabs (U+0009), and segment breaks.
241       // Since we need the context of segment breaks and their surrounding
242       // white spaces to proceed the white space processing, a consecutive run
243       // of spaces/tabs/segment breaks is collected in a first pass loop, then
244       // we apply the collapsing and transformation rules to this run in a
245       // second pass loop.
246       if (IsSpaceOrTabOrSegmentBreak(ch)) {
247         bool keepLastSpace = false;
248         bool hasSegmentBreak = IsSegmentBreak(ch);
249         uint32_t countTrailingDiscardables = 0;
250         uint32_t j;
251         for (j = i + 1; j < aLength && (IsSpaceOrTabOrSegmentBreak(aText[j]) ||
252                                         IsDiscardable(aText[j], &flags));
253              j++) {
254           if (IsSegmentBreak(aText[j])) {
255             hasSegmentBreak = true;
256           }
257         }
258         // Exclude trailing discardables before checking space combining
259         // sequence tail.
260         for (; IsDiscardable(aText[j - 1], &flags); j--) {
261           countTrailingDiscardables++;
262         }
263         // If the last white space is followed by a combining sequence tail,
264         // exclude it from the range of TransformWhiteSpaces.
265         if (sizeof(CharT) > 1 && aText[j - 1] == ' ' && j < aLength &&
266             IsSpaceCombiningSequenceTail(&aText[j], aLength - j)) {
267           keepLastSpace = true;
268           j--;
269         }
270         if (j > i) {
271           aOutput = TransformWhiteSpaces(aText, aLength, i, j, hasSegmentBreak,
272                                          inWhitespace, aOutput, flags,
273                                          aCompression, aSkipChars);
274         }
275         // We need to keep KeepChar()/SkipChar() in order, so process the
276         // last white space first, then process the trailing discardables.
277         if (keepLastSpace) {
278           keepLastSpace = false;
279           *aOutput++ = ' ';
280           aSkipChars->KeepChar();
281           lastCharArabic = false;
282           j++;
283         }
284         for (; countTrailingDiscardables > 0; countTrailingDiscardables--) {
285           aSkipChars->SkipChar();
286           j++;
287         }
288         i = j - 1;
289         continue;
290       }
291       // Process characters other than the document white space characters.
292       if (IsDiscardable(ch, &flags)) {
293         aSkipChars->SkipChar();
294       } else {
295         *aOutput++ = ch;
296         aSkipChars->KeepChar();
297       }
298       lastCharArabic = IS_ARABIC_CHAR(ch);
299       inWhitespace = false;
300     }
301 
302     if (lastCharArabic) {
303       *aIncomingFlags |= INCOMING_ARABICCHAR;
304     } else {
305       *aIncomingFlags &= ~INCOMING_ARABICCHAR;
306     }
307     if (inWhitespace) {
308       *aIncomingFlags |= INCOMING_WHITESPACE;
309     } else {
310       *aIncomingFlags &= ~INCOMING_WHITESPACE;
311     }
312   }
313 
314   *aAnalysisFlags = flags;
315 
316 #ifdef DEBUG
317   AssertSkippedExpectedChars(aText, *aSkipChars, skipCharsOffset);
318 #endif
319   return aOutput;
320 }
321 
/*
 * NOTE: The TransformText and IsSkippableCharacterForTransformText template
 * functions are part of the public API of nsTextFrameUtils, but their
 * function bodies are not available in the header. Calls to them may stop
 * working (fail to resolve the symbol at link time) once their callsites are
 * moved to a different translation unit (e.g. a different unified source
 * file). Explicitly instantiating these function templates with `uint8_t`
 * and `char16_t` protects us from that risk.
 */
331 template uint8_t* nsTextFrameUtils::TransformText(
332     const uint8_t* aText, uint32_t aLength, uint8_t* aOutput,
333     CompressionMode aCompression, uint8_t* aIncomingFlags,
334     gfxSkipChars* aSkipChars, Flags* aAnalysisFlags);
335 template char16_t* nsTextFrameUtils::TransformText(
336     const char16_t* aText, uint32_t aLength, char16_t* aOutput,
337     CompressionMode aCompression, uint8_t* aIncomingFlags,
338     gfxSkipChars* aSkipChars, Flags* aAnalysisFlags);
339 template bool nsTextFrameUtils::IsSkippableCharacterForTransformText(
340     uint8_t aChar);
341 template bool nsTextFrameUtils::IsSkippableCharacterForTransformText(
342     char16_t aChar);
343 
344 template <typename CharT>
DoComputeApproximateLengthWithWhitespaceCompression(const CharT * aChars,uint32_t aLength,const nsStyleText * aStyleText)345 static uint32_t DoComputeApproximateLengthWithWhitespaceCompression(
346     const CharT* aChars, uint32_t aLength, const nsStyleText* aStyleText) {
347   // This is an approximation so we don't really need anything
348   // too fancy here.
349   uint32_t len;
350   if (aStyleText->WhiteSpaceIsSignificant()) {
351     return aLength;
352   }
353   bool prevWS = true;  // more important to ignore blocks with
354                        // only whitespace than get inline boundaries
355                        // exactly right
356   len = 0;
357   for (uint32_t i = 0; i < aLength; ++i) {
358     CharT c = aChars[i];
359     if (c == ' ' || c == '\n' || c == '\t' || c == '\r') {
360       if (!prevWS) {
361         ++len;
362       }
363       prevWS = true;
364     } else {
365       ++len;
366       prevWS = false;
367     }
368   }
369   return len;
370 }
371 
ComputeApproximateLengthWithWhitespaceCompression(Text * aText,const nsStyleText * aStyleText)372 uint32_t nsTextFrameUtils::ComputeApproximateLengthWithWhitespaceCompression(
373     Text* aText, const nsStyleText* aStyleText) {
374   const nsTextFragment* frag = &aText->TextFragment();
375   if (frag->Is2b()) {
376     return DoComputeApproximateLengthWithWhitespaceCompression(
377         frag->Get2b(), frag->GetLength(), aStyleText);
378   }
379   return DoComputeApproximateLengthWithWhitespaceCompression(
380       frag->Get1b(), frag->GetLength(), aStyleText);
381 }
382 
ComputeApproximateLengthWithWhitespaceCompression(const nsAString & aString,const nsStyleText * aStyleText)383 uint32_t nsTextFrameUtils::ComputeApproximateLengthWithWhitespaceCompression(
384     const nsAString& aString, const nsStyleText* aStyleText) {
385   return DoComputeApproximateLengthWithWhitespaceCompression(
386       aString.BeginReading(), aString.Length(), aStyleText);
387 }
388 
NextRun()389 bool nsSkipCharsRunIterator::NextRun() {
390   do {
391     if (mRunLength) {
392       mIterator.AdvanceOriginal(mRunLength);
393       NS_ASSERTION(mRunLength > 0,
394                    "No characters in run (initial length too large?)");
395       if (!mSkipped || mLengthIncludesSkipped) {
396         mRemainingLength -= mRunLength;
397       }
398     }
399     if (!mRemainingLength) {
400       return false;
401     }
402     int32_t length;
403     mSkipped = mIterator.IsOriginalCharSkipped(&length);
404     mRunLength = std::min(length, mRemainingLength);
405   } while (!mVisitSkipped && mSkipped);
406 
407   return true;
408 }
409