1 /*
2  * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010 Apple Inc. All rights reserved.
3  * Copyright (C) 2005 Alexey Proskuryakov.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 #include "config.h"
28 #include "TextIterator.h"
29 
30 #include "Document.h"
31 #include "Frame.h"
32 #include "HTMLElement.h"
33 #include "HTMLNames.h"
34 #include "htmlediting.h"
35 #include "InlineTextBox.h"
36 #include "Range.h"
37 #include "RenderTableCell.h"
38 #include "RenderTableRow.h"
39 #include "RenderTextControl.h"
40 #include "RenderTextFragment.h"
41 #include "TextBoundaries.h"
42 #include "TextBreakIterator.h"
43 #include "VisiblePosition.h"
44 #include "visible_units.h"
45 #include <wtf/unicode/CharacterNames.h>
46 
47 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
48 #include "TextBreakIteratorInternalICU.h"
49 #include <unicode/usearch.h>
50 #endif
51 
52 using namespace WTF::Unicode;
53 using namespace std;
54 
55 namespace WebCore {
56 
57 using namespace HTMLNames;
58 
59 // Buffer that knows how to compare with a search target.
60 // Keeps enough of the previous text to be able to search in the future, but no more.
61 // Non-breaking spaces are always equal to normal spaces.
62 // Case folding is also done if the CaseInsensitive option is specified.
63 // Matches are further filtered if the AtWordStarts option is specified, although some
64 // matches inside a word are permitted if TreatMedialCapitalAsWordStart is specified as well.
65 class SearchBuffer {
66     WTF_MAKE_NONCOPYABLE(SearchBuffer);
67 public:
68     SearchBuffer(const String& target, FindOptions);
69     ~SearchBuffer();
70 
71     // Returns number of characters appended; guaranteed to be in the range [1, length].
72     size_t append(const UChar*, size_t length);
73     bool needsMoreContext() const;
74     void prependContext(const UChar*, size_t length);
75     void reachedBreak();
76 
77     // Result is the size in characters of what was found.
78     // And <startOffset> is the number of characters back to the start of what was found.
79     size_t search(size_t& startOffset);
80     bool atBreak() const;
81 
82 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
83 
84 private:
85     bool isBadMatch(const UChar*, size_t length) const;
86     bool isWordStartMatch(size_t start, size_t length) const;
87 
88     String m_target;
89     FindOptions m_options;
90 
91     Vector<UChar> m_buffer;
92     size_t m_overlap;
93     size_t m_prefixLength;
94     bool m_atBreak;
95     bool m_needsMoreContext;
96 
97     bool m_targetRequiresKanaWorkaround;
98     Vector<UChar> m_normalizedTarget;
99     mutable Vector<UChar> m_normalizedMatch;
100 
101 #else
102 
103 private:
104     void append(UChar, bool isCharacterStart);
105     size_t length() const;
106 
107     String m_target;
108     FindOptions m_options;
109 
110     Vector<UChar> m_buffer;
111     Vector<bool> m_isCharacterStartBuffer;
112     bool m_isBufferFull;
113     size_t m_cursor;
114 
115 #endif
116 };
117 
118 // --------
119 
120 static const unsigned bitsInWord = sizeof(unsigned) * 8;
121 static const unsigned bitInWordMask = bitsInWord - 1;
122 
BitStack()123 BitStack::BitStack()
124     : m_size(0)
125 {
126 }
127 
~BitStack()128 BitStack::~BitStack()
129 {
130 }
131 
push(bool bit)132 void BitStack::push(bool bit)
133 {
134     unsigned index = m_size / bitsInWord;
135     unsigned shift = m_size & bitInWordMask;
136     if (!shift && index == m_words.size()) {
137         m_words.grow(index + 1);
138         m_words[index] = 0;
139     }
140     unsigned& word = m_words[index];
141     unsigned mask = 1U << shift;
142     if (bit)
143         word |= mask;
144     else
145         word &= ~mask;
146     ++m_size;
147 }
148 
pop()149 void BitStack::pop()
150 {
151     if (m_size)
152         --m_size;
153 }
154 
top() const155 bool BitStack::top() const
156 {
157     if (!m_size)
158         return false;
159     unsigned shift = (m_size - 1) & bitInWordMask;
160     return m_words.last() & (1U << shift);
161 }
162 
size() const163 unsigned BitStack::size() const
164 {
165     return m_size;
166 }
167 
168 // --------
169 
170 #if !ASSERT_DISABLED
171 
depthCrossingShadowBoundaries(Node * node)172 static unsigned depthCrossingShadowBoundaries(Node* node)
173 {
174     unsigned depth = 0;
175     for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
176         ++depth;
177     return depth;
178 }
179 
180 #endif
181 
182 // This function is like Range::pastLastNode, except for the fact that it can climb up out of shadow trees.
nextInPreOrderCrossingShadowBoundaries(Node * rangeEndContainer,int rangeEndOffset)183 static Node* nextInPreOrderCrossingShadowBoundaries(Node* rangeEndContainer, int rangeEndOffset)
184 {
185     if (!rangeEndContainer)
186         return 0;
187     if (rangeEndOffset >= 0 && !rangeEndContainer->offsetInCharacters()) {
188         if (Node* next = rangeEndContainer->childNode(rangeEndOffset))
189             return next;
190     }
191     for (Node* node = rangeEndContainer; node; node = node->parentOrHostNode()) {
192         if (Node* next = node->nextSibling())
193             return next;
194     }
195     return 0;
196 }
197 
198 // --------
199 
fullyClipsContents(Node * node)200 static inline bool fullyClipsContents(Node* node)
201 {
202     RenderObject* renderer = node->renderer();
203     if (!renderer || !renderer->isBox() || !renderer->hasOverflowClip())
204         return false;
205     return toRenderBox(renderer)->size().isEmpty();
206 }
207 
ignoresContainerClip(Node * node)208 static inline bool ignoresContainerClip(Node* node)
209 {
210     RenderObject* renderer = node->renderer();
211     if (!renderer || renderer->isText())
212         return false;
213     EPosition position = renderer->style()->position();
214     return position == AbsolutePosition || position == FixedPosition;
215 }
216 
pushFullyClippedState(BitStack & stack,Node * node)217 static void pushFullyClippedState(BitStack& stack, Node* node)
218 {
219     ASSERT(stack.size() == depthCrossingShadowBoundaries(node));
220 
221     // Push true if this node full clips its contents, or if a parent already has fully
222     // clipped and this is not a node that ignores its container's clip.
223     stack.push(fullyClipsContents(node) || (stack.top() && !ignoresContainerClip(node)));
224 }
225 
setUpFullyClippedStack(BitStack & stack,Node * node)226 static void setUpFullyClippedStack(BitStack& stack, Node* node)
227 {
228     // Put the nodes in a vector so we can iterate in reverse order.
229     Vector<Node*, 100> ancestry;
230     for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
231         ancestry.append(parent);
232 
233     // Call pushFullyClippedState on each node starting with the earliest ancestor.
234     size_t size = ancestry.size();
235     for (size_t i = 0; i < size; ++i)
236         pushFullyClippedState(stack, ancestry[size - i - 1]);
237     pushFullyClippedState(stack, node);
238 
239     ASSERT(stack.size() == 1 + depthCrossingShadowBoundaries(node));
240 }
241 
242 // --------
243 
TextIterator()244 TextIterator::TextIterator()
245     : m_startContainer(0)
246     , m_startOffset(0)
247     , m_endContainer(0)
248     , m_endOffset(0)
249     , m_positionNode(0)
250     , m_textCharacters(0)
251     , m_textLength(0)
252     , m_remainingTextBox(0)
253     , m_firstLetterText(0)
254     , m_lastCharacter(0)
255     , m_emitsCharactersBetweenAllVisiblePositions(false)
256     , m_entersTextControls(false)
257     , m_emitsTextWithoutTranscoding(false)
258     , m_handledFirstLetter(false)
259     , m_ignoresStyleVisibility(false)
260     , m_emitsObjectReplacementCharacters(false)
261 {
262 }
263 
TextIterator(const Range * r,TextIteratorBehavior behavior)264 TextIterator::TextIterator(const Range* r, TextIteratorBehavior behavior)
265     : m_startContainer(0)
266     , m_startOffset(0)
267     , m_endContainer(0)
268     , m_endOffset(0)
269     , m_positionNode(0)
270     , m_textCharacters(0)
271     , m_textLength(0)
272     , m_remainingTextBox(0)
273     , m_firstLetterText(0)
274     , m_emitsCharactersBetweenAllVisiblePositions(behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions)
275     , m_entersTextControls(behavior & TextIteratorEntersTextControls)
276     , m_emitsTextWithoutTranscoding(behavior & TextIteratorEmitsTextsWithoutTranscoding)
277     , m_handledFirstLetter(false)
278     , m_ignoresStyleVisibility(behavior & TextIteratorIgnoresStyleVisibility)
279     , m_emitsObjectReplacementCharacters(behavior & TextIteratorEmitsObjectReplacementCharacters)
280 {
281     if (!r)
282         return;
283 
284     // get and validate the range endpoints
285     Node* startContainer = r->startContainer();
286     if (!startContainer)
287         return;
288     int startOffset = r->startOffset();
289     Node* endContainer = r->endContainer();
290     int endOffset = r->endOffset();
291 
292     // Callers should be handing us well-formed ranges. If we discover that this isn't
293     // the case, we could consider changing this assertion to an early return.
294     ASSERT(r->boundaryPointsValid());
295 
296     // remember range - this does not change
297     m_startContainer = startContainer;
298     m_startOffset = startOffset;
299     m_endContainer = endContainer;
300     m_endOffset = endOffset;
301 
302     // set up the current node for processing
303     m_node = r->firstNode();
304     if (!m_node)
305         return;
306     setUpFullyClippedStack(m_fullyClippedStack, m_node);
307     m_offset = m_node == m_startContainer ? m_startOffset : 0;
308     m_handledNode = false;
309     m_handledChildren = false;
310 
311     // calculate first out of bounds node
312     m_pastEndNode = nextInPreOrderCrossingShadowBoundaries(endContainer, endOffset);
313 
314     // initialize node processing state
315     m_needsAnotherNewline = false;
316     m_textBox = 0;
317 
318     // initialize record of previous node processing
319     m_hasEmitted = false;
320     m_lastTextNode = 0;
321     m_lastTextNodeEndedWithCollapsedSpace = false;
322     m_lastCharacter = 0;
323 
324 #ifndef NDEBUG
325     // need this just because of the assert in advance()
326     m_positionNode = m_node;
327 #endif
328 
329     // identify the first run
330     advance();
331 }
332 
~TextIterator()333 TextIterator::~TextIterator()
334 {
335 }
336 
advance()337 void TextIterator::advance()
338 {
339     // reset the run information
340     m_positionNode = 0;
341     m_textLength = 0;
342 
343     // handle remembered node that needed a newline after the text node's newline
344     if (m_needsAnotherNewline) {
345         // Emit the extra newline, and position it *inside* m_node, after m_node's
346         // contents, in case it's a block, in the same way that we position the first
347         // newline.  The range for the emitted newline should start where the line
348         // break begins.
349         // FIXME: It would be cleaner if we emitted two newlines during the last
350         // iteration, instead of using m_needsAnotherNewline.
351         Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
352         emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
353         m_needsAnotherNewline = false;
354         return;
355     }
356 
357     if (!m_textBox && m_remainingTextBox) {
358         m_textBox = m_remainingTextBox;
359         m_remainingTextBox = 0;
360         m_firstLetterText = 0;
361         m_offset = 0;
362     }
363     // handle remembered text box
364     if (m_textBox) {
365         handleTextBox();
366         if (m_positionNode)
367             return;
368     }
369 
370     while (m_node && m_node != m_pastEndNode) {
371         // if the range ends at offset 0 of an element, represent the
372         // position, but not the content, of that element e.g. if the
373         // node is a blockflow element, emit a newline that
374         // precedes the element
375         if (m_node == m_endContainer && m_endOffset == 0) {
376             representNodeOffsetZero();
377             m_node = 0;
378             return;
379         }
380 
381         RenderObject* renderer = m_node->renderer();
382         if (!renderer) {
383             m_handledNode = true;
384             m_handledChildren = true;
385         } else {
386             // handle current node according to its type
387             if (!m_handledNode) {
388                 if (renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) // FIXME: What about CDATA_SECTION_NODE?
389                     m_handledNode = handleTextNode();
390                 else if (renderer && (renderer->isImage() || renderer->isWidget() ||
391                          (renderer->node() && renderer->node()->isElementNode() &&
392                           static_cast<Element*>(renderer->node())->isFormControlElement())))
393                     m_handledNode = handleReplacedElement();
394                 else
395                     m_handledNode = handleNonTextNode();
396                 if (m_positionNode)
397                     return;
398             }
399         }
400 
401         // find a new current node to handle in depth-first manner,
402         // calling exitNode() as we come back thru a parent node
403         Node* next = m_handledChildren ? 0 : m_node->firstChild();
404         m_offset = 0;
405         if (!next) {
406             next = m_node->nextSibling();
407             if (!next) {
408                 bool pastEnd = m_node->traverseNextNode() == m_pastEndNode;
409                 Node* parentNode = m_node->parentOrHostNode();
410                 while (!next && parentNode) {
411                     if ((pastEnd && parentNode == m_endContainer) || m_endContainer->isDescendantOf(parentNode))
412                         return;
413                     bool haveRenderer = m_node->renderer();
414                     m_node = parentNode;
415                     m_fullyClippedStack.pop();
416                     parentNode = m_node->parentOrHostNode();
417                     if (haveRenderer)
418                         exitNode();
419                     if (m_positionNode) {
420                         m_handledNode = true;
421                         m_handledChildren = true;
422                         return;
423                     }
424                     next = m_node->nextSibling();
425                 }
426             }
427             m_fullyClippedStack.pop();
428         }
429 
430         // set the new current node
431         m_node = next;
432         if (m_node)
433             pushFullyClippedState(m_fullyClippedStack, m_node);
434         m_handledNode = false;
435         m_handledChildren = false;
436         m_handledFirstLetter = false;
437         m_firstLetterText = 0;
438 
439         // how would this ever be?
440         if (m_positionNode)
441             return;
442     }
443 }
444 
handleTextNode()445 bool TextIterator::handleTextNode()
446 {
447     if (m_fullyClippedStack.top() && !m_ignoresStyleVisibility)
448         return false;
449 
450     RenderText* renderer = toRenderText(m_node->renderer());
451 
452     m_lastTextNode = m_node;
453     String str = renderer->text();
454 
455     // handle pre-formatted text
456     if (!renderer->style()->collapseWhiteSpace()) {
457         int runStart = m_offset;
458         if (m_lastTextNodeEndedWithCollapsedSpace && hasVisibleTextNode(renderer)) {
459             emitCharacter(' ', m_node, 0, runStart, runStart);
460             return false;
461         }
462         if (!m_handledFirstLetter && renderer->isTextFragment() && !m_offset) {
463             handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
464             if (m_firstLetterText) {
465                 String firstLetter = m_firstLetterText->text();
466                 emitText(m_node, m_firstLetterText, m_offset, m_offset + firstLetter.length());
467                 m_firstLetterText = 0;
468                 m_textBox = 0;
469                 return false;
470             }
471         }
472         if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
473             return false;
474         int strLength = str.length();
475         int end = (m_node == m_endContainer) ? m_endOffset : INT_MAX;
476         int runEnd = min(strLength, end);
477 
478         if (runStart >= runEnd)
479             return true;
480 
481         emitText(m_node, runStart, runEnd);
482         return true;
483     }
484 
485     if (!renderer->firstTextBox() && str.length() > 0) {
486         if (!m_handledFirstLetter && renderer->isTextFragment()) {
487             handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
488             if (m_firstLetterText) {
489                 handleTextBox();
490                 return false;
491             }
492         }
493         if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
494             return false;
495         m_lastTextNodeEndedWithCollapsedSpace = true; // entire block is collapsed space
496         return true;
497     }
498 
499 
500     m_textBox = renderer->firstTextBox();
501     if (!m_handledFirstLetter && renderer->isTextFragment() && !m_offset)
502         handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
503 
504     if (m_firstLetterText)
505         renderer = m_firstLetterText;
506 
507     // Used when text boxes are out of order (Hebrew/Arabic w/ embeded LTR text)
508     if (renderer->containsReversedText()) {
509         m_sortedTextBoxes.clear();
510         for (InlineTextBox* textBox = renderer->firstTextBox(); textBox; textBox = textBox->nextTextBox()) {
511             m_sortedTextBoxes.append(textBox);
512         }
513         std::sort(m_sortedTextBoxes.begin(), m_sortedTextBoxes.end(), InlineTextBox::compareByStart);
514         m_sortedTextBoxesPosition = 0;
515         m_textBox = m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0];
516     }
517 
518     handleTextBox();
519     return true;
520 }
521 
handleTextBox()522 void TextIterator::handleTextBox()
523 {
524     RenderText* renderer = m_firstLetterText ? m_firstLetterText : toRenderText(m_node->renderer());
525     if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility) {
526         m_textBox = 0;
527         return;
528     }
529     String str = renderer->text();
530     unsigned start = m_offset;
531     unsigned end = (m_node == m_endContainer) ? static_cast<unsigned>(m_endOffset) : UINT_MAX;
532     while (m_textBox) {
533         unsigned textBoxStart = m_textBox->start();
534         unsigned runStart = max(textBoxStart, start);
535 
536         // Check for collapsed space at the start of this run.
537         InlineTextBox* firstTextBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox();
538         bool needSpace = m_lastTextNodeEndedWithCollapsedSpace
539             || (m_textBox == firstTextBox && textBoxStart == runStart && runStart > 0);
540         if (needSpace && !isCollapsibleWhitespace(m_lastCharacter) && m_lastCharacter) {
541             if (m_lastTextNode == m_node && runStart > 0 && str[runStart - 1] == ' ') {
542                 unsigned spaceRunStart = runStart - 1;
543                 while (spaceRunStart > 0 && str[spaceRunStart - 1] == ' ')
544                     --spaceRunStart;
545                 emitText(m_node, renderer, spaceRunStart, spaceRunStart + 1);
546             } else
547                 emitCharacter(' ', m_node, 0, runStart, runStart);
548             return;
549         }
550         unsigned textBoxEnd = textBoxStart + m_textBox->len();
551         unsigned runEnd = min(textBoxEnd, end);
552 
553         // Determine what the next text box will be, but don't advance yet
554         InlineTextBox* nextTextBox = 0;
555         if (renderer->containsReversedText()) {
556             if (m_sortedTextBoxesPosition + 1 < m_sortedTextBoxes.size())
557                 nextTextBox = m_sortedTextBoxes[m_sortedTextBoxesPosition + 1];
558         } else
559             nextTextBox = m_textBox->nextTextBox();
560 
561         if (runStart < runEnd) {
562             // Handle either a single newline character (which becomes a space),
563             // or a run of characters that does not include a newline.
564             // This effectively translates newlines to spaces without copying the text.
565             if (str[runStart] == '\n') {
566                 emitCharacter(' ', m_node, 0, runStart, runStart + 1);
567                 m_offset = runStart + 1;
568             } else {
569                 size_t subrunEnd = str.find('\n', runStart);
570                 if (subrunEnd == notFound || subrunEnd > runEnd)
571                     subrunEnd = runEnd;
572 
573                 m_offset = subrunEnd;
574                 emitText(m_node, renderer, runStart, subrunEnd);
575             }
576 
577             // If we are doing a subrun that doesn't go to the end of the text box,
578             // come back again to finish handling this text box; don't advance to the next one.
579             if (static_cast<unsigned>(m_positionEndOffset) < textBoxEnd)
580                 return;
581 
582             // Advance and return
583             unsigned nextRunStart = nextTextBox ? nextTextBox->start() : str.length();
584             if (nextRunStart > runEnd)
585                 m_lastTextNodeEndedWithCollapsedSpace = true; // collapsed space between runs or at the end
586             m_textBox = nextTextBox;
587             if (renderer->containsReversedText())
588                 ++m_sortedTextBoxesPosition;
589             return;
590         }
591         // Advance and continue
592         m_textBox = nextTextBox;
593         if (renderer->containsReversedText())
594             ++m_sortedTextBoxesPosition;
595     }
596     if (!m_textBox && m_remainingTextBox) {
597         m_textBox = m_remainingTextBox;
598         m_remainingTextBox = 0;
599         m_firstLetterText = 0;
600         m_offset = 0;
601         handleTextBox();
602     }
603 }
604 
firstRenderTextInFirstLetter(RenderObject * firstLetter)605 static inline RenderText* firstRenderTextInFirstLetter(RenderObject* firstLetter)
606 {
607     if (!firstLetter)
608         return 0;
609 
610     // FIXME: Should this check descendent objects?
611     for (RenderObject* current = firstLetter->firstChild(); current; current = current->nextSibling()) {
612         if (current->isText())
613             return toRenderText(current);
614     }
615     return 0;
616 }
617 
handleTextNodeFirstLetter(RenderTextFragment * renderer)618 void TextIterator::handleTextNodeFirstLetter(RenderTextFragment* renderer)
619 {
620     if (renderer->firstLetter()) {
621         RenderObject* r = renderer->firstLetter();
622         if (r->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
623             return;
624         if (RenderText* firstLetter = firstRenderTextInFirstLetter(r)) {
625             m_handledFirstLetter = true;
626             m_remainingTextBox = m_textBox;
627             m_textBox = firstLetter->firstTextBox();
628             m_firstLetterText = firstLetter;
629         }
630     }
631     m_handledFirstLetter = true;
632 }
633 
handleReplacedElement()634 bool TextIterator::handleReplacedElement()
635 {
636     if (m_fullyClippedStack.top())
637         return false;
638 
639     RenderObject* renderer = m_node->renderer();
640     if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
641         return false;
642 
643     if (m_lastTextNodeEndedWithCollapsedSpace) {
644         emitCharacter(' ', m_lastTextNode->parentNode(), m_lastTextNode, 1, 1);
645         return false;
646     }
647 
648     if (m_entersTextControls && renderer->isTextControl()) {
649         if (HTMLElement* innerTextElement = toRenderTextControl(renderer)->innerTextElement()) {
650             m_node = innerTextElement->shadowTreeRootNode();
651             pushFullyClippedState(m_fullyClippedStack, m_node);
652             m_offset = 0;
653             return false;
654         }
655     }
656 
657     m_hasEmitted = true;
658 
659     if (m_emitsObjectReplacementCharacters && renderer && renderer->isReplaced()) {
660         emitCharacter(objectReplacementCharacter, m_node->parentNode(), m_node, 0, 1);
661         return true;
662     }
663 
664     if (m_emitsCharactersBetweenAllVisiblePositions) {
665         // We want replaced elements to behave like punctuation for boundary
666         // finding, and to simply take up space for the selection preservation
667         // code in moveParagraphs, so we use a comma.
668         emitCharacter(',', m_node->parentNode(), m_node, 0, 1);
669         return true;
670     }
671 
672     m_positionNode = m_node->parentNode();
673     m_positionOffsetBaseNode = m_node;
674     m_positionStartOffset = 0;
675     m_positionEndOffset = 1;
676 
677     m_textCharacters = 0;
678     m_textLength = 0;
679 
680     m_lastCharacter = 0;
681 
682     return true;
683 }
684 
hasVisibleTextNode(RenderText * renderer)685 bool TextIterator::hasVisibleTextNode(RenderText* renderer)
686 {
687     if (renderer->style()->visibility() == VISIBLE)
688         return true;
689     if (renderer->isTextFragment()) {
690         RenderTextFragment* fragment = static_cast<RenderTextFragment*>(renderer);
691         if (fragment->firstLetter() && fragment->firstLetter()->style()->visibility() == VISIBLE)
692             return true;
693     }
694     return false;
695 }
696 
shouldEmitTabBeforeNode(Node * node)697 static bool shouldEmitTabBeforeNode(Node* node)
698 {
699     RenderObject* r = node->renderer();
700 
701     // Table cells are delimited by tabs.
702     if (!r || !isTableCell(node))
703         return false;
704 
705     // Want a tab before every cell other than the first one
706     RenderTableCell* rc = toRenderTableCell(r);
707     RenderTable* t = rc->table();
708     return t && (t->cellBefore(rc) || t->cellAbove(rc));
709 }
710 
shouldEmitNewlineForNode(Node * node)711 static bool shouldEmitNewlineForNode(Node* node)
712 {
713     // br elements are represented by a single newline.
714     RenderObject* r = node->renderer();
715     if (!r)
716         return node->hasTagName(brTag);
717 
718     return r->isBR();
719 }
720 
shouldEmitNewlinesBeforeAndAfterNode(Node * node)721 static bool shouldEmitNewlinesBeforeAndAfterNode(Node* node)
722 {
723     // Block flow (versus inline flow) is represented by having
724     // a newline both before and after the element.
725     RenderObject* r = node->renderer();
726     if (!r) {
727         return (node->hasTagName(blockquoteTag)
728                 || node->hasTagName(ddTag)
729                 || node->hasTagName(divTag)
730                 || node->hasTagName(dlTag)
731                 || node->hasTagName(dtTag)
732                 || node->hasTagName(h1Tag)
733                 || node->hasTagName(h2Tag)
734                 || node->hasTagName(h3Tag)
735                 || node->hasTagName(h4Tag)
736                 || node->hasTagName(h5Tag)
737                 || node->hasTagName(h6Tag)
738                 || node->hasTagName(hrTag)
739                 || node->hasTagName(liTag)
740                 || node->hasTagName(listingTag)
741                 || node->hasTagName(olTag)
742                 || node->hasTagName(pTag)
743                 || node->hasTagName(preTag)
744                 || node->hasTagName(trTag)
745                 || node->hasTagName(ulTag));
746     }
747 
748     // Need to make an exception for table cells, because they are blocks, but we
749     // want them tab-delimited rather than having newlines before and after.
750     if (isTableCell(node))
751         return false;
752 
753     // Need to make an exception for table row elements, because they are neither
754     // "inline" or "RenderBlock", but we want newlines for them.
755     if (r->isTableRow()) {
756         RenderTable* t = toRenderTableRow(r)->table();
757         if (t && !t->isInline())
758             return true;
759     }
760 
761     return !r->isInline() && r->isRenderBlock() && !r->isFloatingOrPositioned() && !r->isBody();
762 }
763 
shouldEmitNewlineAfterNode(Node * node)764 static bool shouldEmitNewlineAfterNode(Node* node)
765 {
766     // FIXME: It should be better but slower to create a VisiblePosition here.
767     if (!shouldEmitNewlinesBeforeAndAfterNode(node))
768         return false;
769     // Check if this is the very last renderer in the document.
770     // If so, then we should not emit a newline.
771     while ((node = node->traverseNextSibling()))
772         if (node->renderer())
773             return true;
774     return false;
775 }
776 
shouldEmitNewlineBeforeNode(Node * node)777 static bool shouldEmitNewlineBeforeNode(Node* node)
778 {
779     return shouldEmitNewlinesBeforeAndAfterNode(node);
780 }
781 
shouldEmitExtraNewlineForNode(Node * node)782 static bool shouldEmitExtraNewlineForNode(Node* node)
783 {
784     // When there is a significant collapsed bottom margin, emit an extra
785     // newline for a more realistic result.  We end up getting the right
786     // result even without margin collapsing. For example: <div><p>text</p></div>
787     // will work right even if both the <div> and the <p> have bottom margins.
788     RenderObject* r = node->renderer();
789     if (!r || !r->isBox())
790         return false;
791 
792     // NOTE: We only do this for a select set of nodes, and fwiw WinIE appears
793     // not to do this at all
794     if (node->hasTagName(h1Tag)
795         || node->hasTagName(h2Tag)
796         || node->hasTagName(h3Tag)
797         || node->hasTagName(h4Tag)
798         || node->hasTagName(h5Tag)
799         || node->hasTagName(h6Tag)
800         || node->hasTagName(pTag)) {
801         RenderStyle* style = r->style();
802         if (style) {
803             int bottomMargin = toRenderBox(r)->collapsedMarginAfter();
804             int fontSize = style->fontDescription().computedPixelSize();
805             if (bottomMargin * 2 >= fontSize)
806                 return true;
807         }
808     }
809 
810     return false;
811 }
812 
collapsedSpaceLength(RenderText * renderer,int textEnd)813 static int collapsedSpaceLength(RenderText* renderer, int textEnd)
814 {
815     const UChar* characters = renderer->text()->characters();
816     int length = renderer->text()->length();
817     for (int i = textEnd; i < length; ++i) {
818         if (!renderer->style()->isCollapsibleWhiteSpace(characters[i]))
819             return i - textEnd;
820     }
821 
822     return length - textEnd;
823 }
824 
maxOffsetIncludingCollapsedSpaces(Node * node)825 static int maxOffsetIncludingCollapsedSpaces(Node* node)
826 {
827     int offset = caretMaxOffset(node);
828 
829     if (node->renderer() && node->renderer()->isText())
830         offset += collapsedSpaceLength(toRenderText(node->renderer()), offset);
831 
832     return offset;
833 }
834 
835 // Whether or not we should emit a character as we enter m_node (if it's a container) or as we hit it (if it's atomic).
shouldRepresentNodeOffsetZero()836 bool TextIterator::shouldRepresentNodeOffsetZero()
837 {
838     if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isTable())
839         return true;
840 
841     // Leave element positioned flush with start of a paragraph
842     // (e.g. do not insert tab before a table cell at the start of a paragraph)
843     if (m_lastCharacter == '\n')
844         return false;
845 
846     // Otherwise, show the position if we have emitted any characters
847     if (m_hasEmitted)
848         return true;
849 
850     // We've not emitted anything yet. Generally, there is no need for any positioning then.
851     // The only exception is when the element is visually not in the same line as
852     // the start of the range (e.g. the range starts at the end of the previous paragraph).
853     // NOTE: Creating VisiblePositions and comparing them is relatively expensive, so we
854     // make quicker checks to possibly avoid that. Another check that we could make is
855     // is whether the inline vs block flow changed since the previous visible element.
856     // I think we're already in a special enough case that that won't be needed, tho.
857 
858     // No character needed if this is the first node in the range.
859     if (m_node == m_startContainer)
860         return false;
861 
862     // If we are outside the start container's subtree, assume we need to emit.
863     // FIXME: m_startContainer could be an inline block
864     if (!m_node->isDescendantOf(m_startContainer))
865         return true;
866 
867     // If we started as m_startContainer offset 0 and the current node is a descendant of
868     // the start container, we already had enough context to correctly decide whether to
869     // emit after a preceding block. We chose not to emit (m_hasEmitted is false),
870     // so don't second guess that now.
871     // NOTE: Is this really correct when m_node is not a leftmost descendant? Probably
872     // immaterial since we likely would have already emitted something by now.
873     if (m_startOffset == 0)
874         return false;
875 
876     // If this node is unrendered or invisible the VisiblePosition checks below won't have much meaning.
877     // Additionally, if the range we are iterating over contains huge sections of unrendered content,
878     // we would create VisiblePositions on every call to this function without this check.
879     if (!m_node->renderer() || m_node->renderer()->style()->visibility() != VISIBLE)
880         return false;
881 
882     // The startPos.isNotNull() check is needed because the start could be before the body,
883     // and in that case we'll get null. We don't want to put in newlines at the start in that case.
884     // The currPos.isNotNull() check is needed because positions in non-HTML content
885     // (like SVG) do not have visible positions, and we don't want to emit for them either.
886     VisiblePosition startPos = VisiblePosition(Position(m_startContainer, m_startOffset, Position::PositionIsOffsetInAnchor), DOWNSTREAM);
887     VisiblePosition currPos = VisiblePosition(positionBeforeNode(m_node), DOWNSTREAM);
888     return startPos.isNotNull() && currPos.isNotNull() && !inSameLine(startPos, currPos);
889 }
890 
shouldEmitSpaceBeforeAndAfterNode(Node * node)891 bool TextIterator::shouldEmitSpaceBeforeAndAfterNode(Node* node)
892 {
893     return node->renderer() && node->renderer()->isTable() && (node->renderer()->isInline() || m_emitsCharactersBetweenAllVisiblePositions);
894 }
895 
representNodeOffsetZero()896 void TextIterator::representNodeOffsetZero()
897 {
898     // Emit a character to show the positioning of m_node.
899 
900     // When we haven't been emitting any characters, shouldRepresentNodeOffsetZero() can
901     // create VisiblePositions, which is expensive.  So, we perform the inexpensive checks
902     // on m_node to see if it necessitates emitting a character first and will early return
903     // before encountering shouldRepresentNodeOffsetZero()s worse case behavior.
904     if (shouldEmitTabBeforeNode(m_node)) {
905         if (shouldRepresentNodeOffsetZero())
906             emitCharacter('\t', m_node->parentNode(), m_node, 0, 0);
907     } else if (shouldEmitNewlineBeforeNode(m_node)) {
908         if (shouldRepresentNodeOffsetZero())
909             emitCharacter('\n', m_node->parentNode(), m_node, 0, 0);
910     } else if (shouldEmitSpaceBeforeAndAfterNode(m_node)) {
911         if (shouldRepresentNodeOffsetZero())
912             emitCharacter(' ', m_node->parentNode(), m_node, 0, 0);
913     }
914 }
915 
handleNonTextNode()916 bool TextIterator::handleNonTextNode()
917 {
918     if (shouldEmitNewlineForNode(m_node))
919         emitCharacter('\n', m_node->parentNode(), m_node, 0, 1);
920     else if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isHR())
921         emitCharacter(' ', m_node->parentNode(), m_node, 0, 1);
922     else
923         representNodeOffsetZero();
924 
925     return true;
926 }
927 
exitNode()928 void TextIterator::exitNode()
929 {
930     // prevent emitting a newline when exiting a collapsed block at beginning of the range
931     // FIXME: !m_hasEmitted does not necessarily mean there was a collapsed block... it could
932     // have been an hr (e.g.). Also, a collapsed block could have height (e.g. a table) and
933     // therefore look like a blank line.
934     if (!m_hasEmitted)
935         return;
936 
937     // Emit with a position *inside* m_node, after m_node's contents, in
938     // case it is a block, because the run should start where the
939     // emitted character is positioned visually.
940     Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
941     // FIXME: This shouldn't require the m_lastTextNode to be true, but we can't change that without making
942     // the logic in _web_attributedStringFromRange match.  We'll get that for free when we switch to use
943     // TextIterator in _web_attributedStringFromRange.
944     // See <rdar://problem/5428427> for an example of how this mismatch will cause problems.
945     if (m_lastTextNode && shouldEmitNewlineAfterNode(m_node)) {
946         // use extra newline to represent margin bottom, as needed
947         bool addNewline = shouldEmitExtraNewlineForNode(m_node);
948 
949         // FIXME: We need to emit a '\n' as we leave an empty block(s) that
950         // contain a VisiblePosition when doing selection preservation.
951         if (m_lastCharacter != '\n') {
952             // insert a newline with a position following this block's contents.
953             emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
954             // remember whether to later add a newline for the current node
955             ASSERT(!m_needsAnotherNewline);
956             m_needsAnotherNewline = addNewline;
957         } else if (addNewline)
958             // insert a newline with a position following this block's contents.
959             emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
960     }
961 
962     // If nothing was emitted, see if we need to emit a space.
963     if (!m_positionNode && shouldEmitSpaceBeforeAndAfterNode(m_node))
964         emitCharacter(' ', baseNode->parentNode(), baseNode, 1, 1);
965 }
966 
emitCharacter(UChar c,Node * textNode,Node * offsetBaseNode,int textStartOffset,int textEndOffset)967 void TextIterator::emitCharacter(UChar c, Node* textNode, Node* offsetBaseNode, int textStartOffset, int textEndOffset)
968 {
969     m_hasEmitted = true;
970 
971     // remember information with which to construct the TextIterator::range()
972     // NOTE: textNode is often not a text node, so the range will specify child nodes of positionNode
973     m_positionNode = textNode;
974     m_positionOffsetBaseNode = offsetBaseNode;
975     m_positionStartOffset = textStartOffset;
976     m_positionEndOffset = textEndOffset;
977 
978     // remember information with which to construct the TextIterator::characters() and length()
979     m_singleCharacterBuffer = c;
980     m_textCharacters = &m_singleCharacterBuffer;
981     m_textLength = 1;
982 
983     // remember some iteration state
984     m_lastTextNodeEndedWithCollapsedSpace = false;
985     m_lastCharacter = c;
986 }
987 
emitText(Node * textNode,RenderObject * renderObject,int textStartOffset,int textEndOffset)988 void TextIterator::emitText(Node* textNode, RenderObject* renderObject, int textStartOffset, int textEndOffset)
989 {
990     RenderText* renderer = toRenderText(renderObject);
991     m_text = m_emitsTextWithoutTranscoding ? renderer->textWithoutTranscoding() : renderer->text();
992     ASSERT(m_text.characters());
993     ASSERT(0 <= textStartOffset && textStartOffset < static_cast<int>(m_text.length()));
994     ASSERT(0 <= textEndOffset && textEndOffset <= static_cast<int>(m_text.length()));
995     ASSERT(textStartOffset <= textEndOffset);
996 
997     m_positionNode = textNode;
998     m_positionOffsetBaseNode = 0;
999     m_positionStartOffset = textStartOffset;
1000     m_positionEndOffset = textEndOffset;
1001     m_textCharacters = m_text.characters() + textStartOffset;
1002     m_textLength = textEndOffset - textStartOffset;
1003     m_lastCharacter = m_text[textEndOffset - 1];
1004 
1005     m_lastTextNodeEndedWithCollapsedSpace = false;
1006     m_hasEmitted = true;
1007 }
1008 
emitText(Node * textNode,int textStartOffset,int textEndOffset)1009 void TextIterator::emitText(Node* textNode, int textStartOffset, int textEndOffset)
1010 {
1011     emitText(textNode, m_node->renderer(), textStartOffset, textEndOffset);
1012 }
1013 
range() const1014 PassRefPtr<Range> TextIterator::range() const
1015 {
1016     // use the current run information, if we have it
1017     if (m_positionNode) {
1018         if (m_positionOffsetBaseNode) {
1019             int index = m_positionOffsetBaseNode->nodeIndex();
1020             m_positionStartOffset += index;
1021             m_positionEndOffset += index;
1022             m_positionOffsetBaseNode = 0;
1023         }
1024         return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1025     }
1026 
1027     // otherwise, return the end of the overall range we were given
1028     if (m_endContainer)
1029         return Range::create(m_endContainer->document(), m_endContainer, m_endOffset, m_endContainer, m_endOffset);
1030 
1031     return 0;
1032 }
1033 
node() const1034 Node* TextIterator::node() const
1035 {
1036     RefPtr<Range> textRange = range();
1037     if (!textRange)
1038         return 0;
1039 
1040     Node* node = textRange->startContainer();
1041     if (!node)
1042         return 0;
1043     if (node->offsetInCharacters())
1044         return node;
1045 
1046     return node->childNode(textRange->startOffset());
1047 }
1048 
1049 // --------
1050 
SimplifiedBackwardsTextIterator()1051 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator()
1052     : m_behavior(TextIteratorDefaultBehavior)
1053     , m_node(0)
1054     , m_offset(0)
1055     , m_handledNode(false)
1056     , m_handledChildren(false)
1057     , m_startNode(0)
1058     , m_startOffset(0)
1059     , m_endNode(0)
1060     , m_endOffset(0)
1061     , m_positionNode(0)
1062     , m_positionStartOffset(0)
1063     , m_positionEndOffset(0)
1064     , m_textCharacters(0)
1065     , m_textLength(0)
1066     , m_lastTextNode(0)
1067     , m_lastCharacter(0)
1068     , m_singleCharacterBuffer(0)
1069     , m_havePassedStartNode(false)
1070     , m_shouldHandleFirstLetter(false)
1071 {
1072 }
1073 
SimplifiedBackwardsTextIterator(const Range * r,TextIteratorBehavior behavior)1074 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator(const Range* r, TextIteratorBehavior behavior)
1075     : m_behavior(behavior)
1076     , m_node(0)
1077     , m_offset(0)
1078     , m_handledNode(false)
1079     , m_handledChildren(false)
1080     , m_startNode(0)
1081     , m_startOffset(0)
1082     , m_endNode(0)
1083     , m_endOffset(0)
1084     , m_positionNode(0)
1085     , m_positionStartOffset(0)
1086     , m_positionEndOffset(0)
1087     , m_textCharacters(0)
1088     , m_textLength(0)
1089     , m_lastTextNode(0)
1090     , m_lastCharacter(0)
1091     , m_singleCharacterBuffer(0)
1092     , m_havePassedStartNode(false)
1093     , m_shouldHandleFirstLetter(false)
1094 {
1095     ASSERT(m_behavior == TextIteratorDefaultBehavior);
1096 
1097     if (!r)
1098         return;
1099 
1100     Node* startNode = r->startContainer();
1101     if (!startNode)
1102         return;
1103     Node* endNode = r->endContainer();
1104     int startOffset = r->startOffset();
1105     int endOffset = r->endOffset();
1106 
1107     if (!startNode->offsetInCharacters()) {
1108         if (startOffset >= 0 && startOffset < static_cast<int>(startNode->childNodeCount())) {
1109             startNode = startNode->childNode(startOffset);
1110             startOffset = 0;
1111         }
1112     }
1113     if (!endNode->offsetInCharacters()) {
1114         if (endOffset > 0 && endOffset <= static_cast<int>(endNode->childNodeCount())) {
1115             endNode = endNode->childNode(endOffset - 1);
1116             endOffset = lastOffsetInNode(endNode);
1117         }
1118     }
1119 
1120     m_node = endNode;
1121     setUpFullyClippedStack(m_fullyClippedStack, m_node);
1122     m_offset = endOffset;
1123     m_handledNode = false;
1124     m_handledChildren = endOffset == 0;
1125 
1126     m_startNode = startNode;
1127     m_startOffset = startOffset;
1128     m_endNode = endNode;
1129     m_endOffset = endOffset;
1130 
1131 #ifndef NDEBUG
1132     // Need this just because of the assert.
1133     m_positionNode = endNode;
1134 #endif
1135 
1136     m_lastTextNode = 0;
1137     m_lastCharacter = '\n';
1138 
1139     m_havePassedStartNode = false;
1140 
1141     advance();
1142 }
1143 
advance()1144 void SimplifiedBackwardsTextIterator::advance()
1145 {
1146     ASSERT(m_positionNode);
1147 
1148     m_positionNode = 0;
1149     m_textLength = 0;
1150 
1151     while (m_node && !m_havePassedStartNode) {
1152         // Don't handle node if we start iterating at [node, 0].
1153         if (!m_handledNode && !(m_node == m_endNode && m_endOffset == 0)) {
1154             RenderObject* renderer = m_node->renderer();
1155             if (renderer && renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) {
1156                 // FIXME: What about CDATA_SECTION_NODE?
1157                 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
1158                     m_handledNode = handleTextNode();
1159             } else if (renderer && (renderer->isImage() || renderer->isWidget())) {
1160                 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
1161                     m_handledNode = handleReplacedElement();
1162             } else
1163                 m_handledNode = handleNonTextNode();
1164             if (m_positionNode)
1165                 return;
1166         }
1167 
1168         if (!m_handledChildren && m_node->hasChildNodes()) {
1169             m_node = m_node->lastChild();
1170             pushFullyClippedState(m_fullyClippedStack, m_node);
1171         } else {
1172             // Exit empty containers as we pass over them or containers
1173             // where [container, 0] is where we started iterating.
1174             if (!m_handledNode
1175                     && canHaveChildrenForEditing(m_node)
1176                     && m_node->parentNode()
1177                     && (!m_node->lastChild() || (m_node == m_endNode && !m_endOffset))) {
1178                 exitNode();
1179                 if (m_positionNode) {
1180                     m_handledNode = true;
1181                     m_handledChildren = true;
1182                     return;
1183                 }
1184             }
1185 
1186             // Exit all other containers.
1187             while (!m_node->previousSibling()) {
1188                 if (!advanceRespectingRange(m_node->parentOrHostNode()))
1189                     break;
1190                 m_fullyClippedStack.pop();
1191                 exitNode();
1192                 if (m_positionNode) {
1193                     m_handledNode = true;
1194                     m_handledChildren = true;
1195                     return;
1196                 }
1197             }
1198 
1199             m_fullyClippedStack.pop();
1200             if (advanceRespectingRange(m_node->previousSibling()))
1201                 pushFullyClippedState(m_fullyClippedStack, m_node);
1202             else
1203                 m_node = 0;
1204         }
1205 
1206         // For the purpose of word boundary detection,
1207         // we should iterate all visible text and trailing (collapsed) whitespaces.
1208         m_offset = m_node ? maxOffsetIncludingCollapsedSpaces(m_node) : 0;
1209         m_handledNode = false;
1210         m_handledChildren = false;
1211 
1212         if (m_positionNode)
1213             return;
1214     }
1215 }
1216 
handleTextNode()1217 bool SimplifiedBackwardsTextIterator::handleTextNode()
1218 {
1219     m_lastTextNode = m_node;
1220 
1221     int startOffset;
1222     int offsetInNode;
1223     RenderText* renderer = handleFirstLetter(startOffset, offsetInNode);
1224     if (!renderer)
1225         return true;
1226 
1227     String text = renderer->text();
1228     if (!renderer->firstTextBox() && text.length() > 0)
1229         return true;
1230 
1231     m_positionEndOffset = m_offset;
1232     m_offset = startOffset + offsetInNode;
1233     m_positionNode = m_node;
1234     m_positionStartOffset = m_offset;
1235 
1236     ASSERT(0 <= m_positionStartOffset - offsetInNode && m_positionStartOffset - offsetInNode <= static_cast<int>(text.length()));
1237     ASSERT(1 <= m_positionEndOffset - offsetInNode && m_positionEndOffset - offsetInNode <= static_cast<int>(text.length()));
1238     ASSERT(m_positionStartOffset <= m_positionEndOffset);
1239 
1240     m_textLength = m_positionEndOffset - m_positionStartOffset;
1241     m_textCharacters = text.characters() + (m_positionStartOffset - offsetInNode);
1242     ASSERT(m_textCharacters >= text.characters());
1243     ASSERT(m_textCharacters + m_textLength <= text.characters() + static_cast<int>(text.length()));
1244 
1245     m_lastCharacter = text[m_positionEndOffset - 1];
1246 
1247     return !m_shouldHandleFirstLetter;
1248 }
1249 
handleFirstLetter(int & startOffset,int & offsetInNode)1250 RenderText* SimplifiedBackwardsTextIterator::handleFirstLetter(int& startOffset, int& offsetInNode)
1251 {
1252     RenderText* renderer = toRenderText(m_node->renderer());
1253     startOffset = (m_node == m_startNode) ? m_startOffset : 0;
1254 
1255     if (!renderer->isTextFragment()) {
1256         offsetInNode = 0;
1257         return renderer;
1258     }
1259 
1260     RenderTextFragment* fragment = toRenderTextFragment(renderer);
1261     int offsetAfterFirstLetter = fragment->start();
1262     if (startOffset >= offsetAfterFirstLetter) {
1263         ASSERT(!m_shouldHandleFirstLetter);
1264         offsetInNode = offsetAfterFirstLetter;
1265         return renderer;
1266     }
1267 
1268     if (!m_shouldHandleFirstLetter && offsetAfterFirstLetter < m_offset) {
1269         m_shouldHandleFirstLetter = true;
1270         offsetInNode = offsetAfterFirstLetter;
1271         return renderer;
1272     }
1273 
1274     m_shouldHandleFirstLetter = false;
1275     offsetInNode = 0;
1276     return firstRenderTextInFirstLetter(fragment->firstLetter());
1277 }
1278 
handleReplacedElement()1279 bool SimplifiedBackwardsTextIterator::handleReplacedElement()
1280 {
1281     unsigned index = m_node->nodeIndex();
1282     // We want replaced elements to behave like punctuation for boundary
1283     // finding, and to simply take up space for the selection preservation
1284     // code in moveParagraphs, so we use a comma.  Unconditionally emit
1285     // here because this iterator is only used for boundary finding.
1286     emitCharacter(',', m_node->parentNode(), index, index + 1);
1287     return true;
1288 }
1289 
handleNonTextNode()1290 bool SimplifiedBackwardsTextIterator::handleNonTextNode()
1291 {
1292     // We can use a linefeed in place of a tab because this simple iterator is only used to
1293     // find boundaries, not actual content.  A linefeed breaks words, sentences, and paragraphs.
1294     if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineAfterNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
1295         unsigned index = m_node->nodeIndex();
1296         // The start of this emitted range is wrong. Ensuring correctness would require
1297         // VisiblePositions and so would be slow. previousBoundary expects this.
1298         emitCharacter('\n', m_node->parentNode(), index + 1, index + 1);
1299     }
1300     return true;
1301 }
1302 
exitNode()1303 void SimplifiedBackwardsTextIterator::exitNode()
1304 {
1305     if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineBeforeNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
1306         // The start of this emitted range is wrong. Ensuring correctness would require
1307         // VisiblePositions and so would be slow. previousBoundary expects this.
1308         emitCharacter('\n', m_node, 0, 0);
1309     }
1310 }
1311 
emitCharacter(UChar c,Node * node,int startOffset,int endOffset)1312 void SimplifiedBackwardsTextIterator::emitCharacter(UChar c, Node* node, int startOffset, int endOffset)
1313 {
1314     m_singleCharacterBuffer = c;
1315     m_positionNode = node;
1316     m_positionStartOffset = startOffset;
1317     m_positionEndOffset = endOffset;
1318     m_textCharacters = &m_singleCharacterBuffer;
1319     m_textLength = 1;
1320     m_lastCharacter = c;
1321 }
1322 
advanceRespectingRange(Node * next)1323 bool SimplifiedBackwardsTextIterator::advanceRespectingRange(Node* next)
1324 {
1325     if (!next)
1326         return false;
1327     m_havePassedStartNode |= m_node == m_startNode;
1328     if (m_havePassedStartNode)
1329         return false;
1330     m_node = next;
1331     return true;
1332 }
1333 
range() const1334 PassRefPtr<Range> SimplifiedBackwardsTextIterator::range() const
1335 {
1336     if (m_positionNode)
1337         return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1338 
1339     return Range::create(m_startNode->document(), m_startNode, m_startOffset, m_startNode, m_startOffset);
1340 }
1341 
1342 // --------
1343 
CharacterIterator()1344 CharacterIterator::CharacterIterator()
1345     : m_offset(0)
1346     , m_runOffset(0)
1347     , m_atBreak(true)
1348 {
1349 }
1350 
CharacterIterator(const Range * r,TextIteratorBehavior behavior)1351 CharacterIterator::CharacterIterator(const Range* r, TextIteratorBehavior behavior)
1352     : m_offset(0)
1353     , m_runOffset(0)
1354     , m_atBreak(true)
1355     , m_textIterator(r, behavior)
1356 {
1357     while (!atEnd() && m_textIterator.length() == 0)
1358         m_textIterator.advance();
1359 }
1360 
range() const1361 PassRefPtr<Range> CharacterIterator::range() const
1362 {
1363     RefPtr<Range> r = m_textIterator.range();
1364     if (!m_textIterator.atEnd()) {
1365         if (m_textIterator.length() <= 1) {
1366             ASSERT(m_runOffset == 0);
1367         } else {
1368             Node* n = r->startContainer();
1369             ASSERT(n == r->endContainer());
1370             int offset = r->startOffset() + m_runOffset;
1371             ExceptionCode ec = 0;
1372             r->setStart(n, offset, ec);
1373             r->setEnd(n, offset + 1, ec);
1374             ASSERT(!ec);
1375         }
1376     }
1377     return r.release();
1378 }
1379 
advance(int count)1380 void CharacterIterator::advance(int count)
1381 {
1382     if (count <= 0) {
1383         ASSERT(count == 0);
1384         return;
1385     }
1386 
1387     m_atBreak = false;
1388 
1389     // easy if there is enough left in the current m_textIterator run
1390     int remaining = m_textIterator.length() - m_runOffset;
1391     if (count < remaining) {
1392         m_runOffset += count;
1393         m_offset += count;
1394         return;
1395     }
1396 
1397     // exhaust the current m_textIterator run
1398     count -= remaining;
1399     m_offset += remaining;
1400 
1401     // move to a subsequent m_textIterator run
1402     for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
1403         int runLength = m_textIterator.length();
1404         if (runLength == 0)
1405             m_atBreak = true;
1406         else {
1407             // see whether this is m_textIterator to use
1408             if (count < runLength) {
1409                 m_runOffset = count;
1410                 m_offset += count;
1411                 return;
1412             }
1413 
1414             // exhaust this m_textIterator run
1415             count -= runLength;
1416             m_offset += runLength;
1417         }
1418     }
1419 
1420     // ran to the end of the m_textIterator... no more runs left
1421     m_atBreak = true;
1422     m_runOffset = 0;
1423 }
1424 
string(int numChars)1425 String CharacterIterator::string(int numChars)
1426 {
1427     Vector<UChar> result;
1428     result.reserveInitialCapacity(numChars);
1429     while (numChars > 0 && !atEnd()) {
1430         int runSize = min(numChars, length());
1431         result.append(characters(), runSize);
1432         numChars -= runSize;
1433         advance(runSize);
1434     }
1435     return String::adopt(result);
1436 }
1437 
characterSubrange(CharacterIterator & it,int offset,int length)1438 static PassRefPtr<Range> characterSubrange(CharacterIterator& it, int offset, int length)
1439 {
1440     it.advance(offset);
1441     RefPtr<Range> start = it.range();
1442 
1443     if (length > 1)
1444         it.advance(length - 1);
1445     RefPtr<Range> end = it.range();
1446 
1447     return Range::create(start->startContainer()->document(),
1448         start->startContainer(), start->startOffset(),
1449         end->endContainer(), end->endOffset());
1450 }
1451 
BackwardsCharacterIterator()1452 BackwardsCharacterIterator::BackwardsCharacterIterator()
1453     : m_offset(0)
1454     , m_runOffset(0)
1455     , m_atBreak(true)
1456 {
1457 }
1458 
BackwardsCharacterIterator(const Range * range,TextIteratorBehavior behavior)1459 BackwardsCharacterIterator::BackwardsCharacterIterator(const Range* range, TextIteratorBehavior behavior)
1460     : m_offset(0)
1461     , m_runOffset(0)
1462     , m_atBreak(true)
1463     , m_textIterator(range, behavior)
1464 {
1465     while (!atEnd() && !m_textIterator.length())
1466         m_textIterator.advance();
1467 }
1468 
range() const1469 PassRefPtr<Range> BackwardsCharacterIterator::range() const
1470 {
1471     RefPtr<Range> r = m_textIterator.range();
1472     if (!m_textIterator.atEnd()) {
1473         if (m_textIterator.length() <= 1)
1474             ASSERT(m_runOffset == 0);
1475         else {
1476             Node* n = r->startContainer();
1477             ASSERT(n == r->endContainer());
1478             int offset = r->endOffset() - m_runOffset;
1479             ExceptionCode ec = 0;
1480             r->setStart(n, offset - 1, ec);
1481             r->setEnd(n, offset, ec);
1482             ASSERT(!ec);
1483         }
1484     }
1485     return r.release();
1486 }
1487 
advance(int count)1488 void BackwardsCharacterIterator::advance(int count)
1489 {
1490     if (count <= 0) {
1491         ASSERT(!count);
1492         return;
1493     }
1494 
1495     m_atBreak = false;
1496 
1497     int remaining = m_textIterator.length() - m_runOffset;
1498     if (count < remaining) {
1499         m_runOffset += count;
1500         m_offset += count;
1501         return;
1502     }
1503 
1504     count -= remaining;
1505     m_offset += remaining;
1506 
1507     for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
1508         int runLength = m_textIterator.length();
1509         if (runLength == 0)
1510             m_atBreak = true;
1511         else {
1512             if (count < runLength) {
1513                 m_runOffset = count;
1514                 m_offset += count;
1515                 return;
1516             }
1517 
1518             count -= runLength;
1519             m_offset += runLength;
1520         }
1521     }
1522 
1523     m_atBreak = true;
1524     m_runOffset = 0;
1525 }
1526 
1527 // --------
1528 
WordAwareIterator()1529 WordAwareIterator::WordAwareIterator()
1530     : m_previousText(0)
1531     , m_didLookAhead(false)
1532 {
1533 }
1534 
WordAwareIterator(const Range * r)1535 WordAwareIterator::WordAwareIterator(const Range* r)
1536     : m_previousText(0)
1537     , m_didLookAhead(true) // so we consider the first chunk from the text iterator
1538     , m_textIterator(r)
1539 {
1540     advance(); // get in position over the first chunk of text
1541 }
1542 
~WordAwareIterator()1543 WordAwareIterator::~WordAwareIterator()
1544 {
1545 }
1546 
1547 // We're always in one of these modes:
1548 // - The current chunk in the text iterator is our current chunk
1549 //      (typically its a piece of whitespace, or text that ended with whitespace)
1550 // - The previous chunk in the text iterator is our current chunk
1551 //      (we looked ahead to the next chunk and found a word boundary)
1552 // - We built up our own chunk of text from many chunks from the text iterator
1553 
1554 // FIXME: Performance could be bad for huge spans next to each other that don't fall on word boundaries.
1555 
advance()1556 void WordAwareIterator::advance()
1557 {
1558     m_previousText = 0;
1559     m_buffer.clear();      // toss any old buffer we built up
1560 
1561     // If last time we did a look-ahead, start with that looked-ahead chunk now
1562     if (!m_didLookAhead) {
1563         ASSERT(!m_textIterator.atEnd());
1564         m_textIterator.advance();
1565     }
1566     m_didLookAhead = false;
1567 
1568     // Go to next non-empty chunk
1569     while (!m_textIterator.atEnd() && m_textIterator.length() == 0)
1570         m_textIterator.advance();
1571     m_range = m_textIterator.range();
1572 
1573     if (m_textIterator.atEnd())
1574         return;
1575 
1576     while (1) {
1577         // If this chunk ends in whitespace we can just use it as our chunk.
1578         if (isSpaceOrNewline(m_textIterator.characters()[m_textIterator.length() - 1]))
1579             return;
1580 
1581         // If this is the first chunk that failed, save it in previousText before look ahead
1582         if (m_buffer.isEmpty()) {
1583             m_previousText = m_textIterator.characters();
1584             m_previousLength = m_textIterator.length();
1585         }
1586 
1587         // Look ahead to next chunk.  If it is whitespace or a break, we can use the previous stuff
1588         m_textIterator.advance();
1589         if (m_textIterator.atEnd() || m_textIterator.length() == 0 || isSpaceOrNewline(m_textIterator.characters()[0])) {
1590             m_didLookAhead = true;
1591             return;
1592         }
1593 
1594         if (m_buffer.isEmpty()) {
1595             // Start gobbling chunks until we get to a suitable stopping point
1596             m_buffer.append(m_previousText, m_previousLength);
1597             m_previousText = 0;
1598         }
1599         m_buffer.append(m_textIterator.characters(), m_textIterator.length());
1600         int exception = 0;
1601         m_range->setEnd(m_textIterator.range()->endContainer(), m_textIterator.range()->endOffset(), exception);
1602     }
1603 }
1604 
length() const1605 int WordAwareIterator::length() const
1606 {
1607     if (!m_buffer.isEmpty())
1608         return m_buffer.size();
1609     if (m_previousText)
1610         return m_previousLength;
1611     return m_textIterator.length();
1612 }
1613 
characters() const1614 const UChar* WordAwareIterator::characters() const
1615 {
1616     if (!m_buffer.isEmpty())
1617         return m_buffer.data();
1618     if (m_previousText)
1619         return m_previousText;
1620     return m_textIterator.characters();
1621 }
1622 
1623 // --------
1624 
foldQuoteMarkOrSoftHyphen(UChar c)1625 static inline UChar foldQuoteMarkOrSoftHyphen(UChar c)
1626 {
1627     switch (c) {
1628         case hebrewPunctuationGershayim:
1629         case leftDoubleQuotationMark:
1630         case rightDoubleQuotationMark:
1631             return '"';
1632         case hebrewPunctuationGeresh:
1633         case leftSingleQuotationMark:
1634         case rightSingleQuotationMark:
1635             return '\'';
1636         case softHyphen:
1637             // Replace soft hyphen with an ignorable character so that their presence or absence will
1638             // not affect string comparison.
1639             return 0;
1640         default:
1641             return c;
1642     }
1643 }
1644 
foldQuoteMarksAndSoftHyphens(String & s)1645 static inline void foldQuoteMarksAndSoftHyphens(String& s)
1646 {
1647     s.replace(hebrewPunctuationGeresh, '\'');
1648     s.replace(hebrewPunctuationGershayim, '"');
1649     s.replace(leftDoubleQuotationMark, '"');
1650     s.replace(leftSingleQuotationMark, '\'');
1651     s.replace(rightDoubleQuotationMark, '"');
1652     s.replace(rightSingleQuotationMark, '\'');
1653     // Replace soft hyphen with an ignorable character so that their presence or absence will
1654     // not affect string comparison.
1655     s.replace(softHyphen, 0);
1656 }
1657 
1658 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
1659 
foldQuoteMarksAndSoftHyphens(UChar * data,size_t length)1660 static inline void foldQuoteMarksAndSoftHyphens(UChar* data, size_t length)
1661 {
1662     for (size_t i = 0; i < length; ++i)
1663         data[i] = foldQuoteMarkOrSoftHyphen(data[i]);
1664 }
1665 
1666 static const size_t minimumSearchBufferSize = 8192;
1667 
1668 #ifndef NDEBUG
1669 static bool searcherInUse;
1670 #endif
1671 
createSearcher()1672 static UStringSearch* createSearcher()
1673 {
1674     // Provide a non-empty pattern and non-empty text so usearch_open will not fail,
1675     // but it doesn't matter exactly what it is, since we don't perform any searches
1676     // without setting both the pattern and the text.
1677     UErrorCode status = U_ZERO_ERROR;
1678     UStringSearch* searcher = usearch_open(&newlineCharacter, 1, &newlineCharacter, 1, currentSearchLocaleID(), 0, &status);
1679     ASSERT(status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING || status == U_USING_DEFAULT_WARNING);
1680     return searcher;
1681 }
1682 
searcher()1683 static UStringSearch* searcher()
1684 {
1685     static UStringSearch* searcher = createSearcher();
1686     return searcher;
1687 }
1688 
lockSearcher()1689 static inline void lockSearcher()
1690 {
1691 #ifndef NDEBUG
1692     ASSERT(!searcherInUse);
1693     searcherInUse = true;
1694 #endif
1695 }
1696 
unlockSearcher()1697 static inline void unlockSearcher()
1698 {
1699 #ifndef NDEBUG
1700     ASSERT(searcherInUse);
1701     searcherInUse = false;
1702 #endif
1703 }
1704 
1705 // ICU's search ignores the distinction between small kana letters and ones
1706 // that are not small, and also characters that differ only in the voicing
1707 // marks when considering only primary collation strength diffrences.
1708 // This is not helpful for end users, since these differences make words
1709 // distinct, so for our purposes we need these to be considered.
1710 // The Unicode folks do not think the collation algorithm should be
1711 // changed. To work around this, we would like to tailor the ICU searcher,
1712 // but we can't get that to work yet. So instead, we check for cases where
1713 // these differences occur, and skip those matches.
1714 
1715 // We refer to the above technique as the "kana workaround". The next few
1716 // functions are helper functinos for the kana workaround.
1717 
isKanaLetter(UChar character)1718 static inline bool isKanaLetter(UChar character)
1719 {
1720     // Hiragana letters.
1721     if (character >= 0x3041 && character <= 0x3096)
1722         return true;
1723 
1724     // Katakana letters.
1725     if (character >= 0x30A1 && character <= 0x30FA)
1726         return true;
1727     if (character >= 0x31F0 && character <= 0x31FF)
1728         return true;
1729 
1730     // Halfwidth katakana letters.
1731     if (character >= 0xFF66 && character <= 0xFF9D && character != 0xFF70)
1732         return true;
1733 
1734     return false;
1735 }
1736 
isSmallKanaLetter(UChar character)1737 static inline bool isSmallKanaLetter(UChar character)
1738 {
1739     ASSERT(isKanaLetter(character));
1740 
1741     switch (character) {
1742     case 0x3041: // HIRAGANA LETTER SMALL A
1743     case 0x3043: // HIRAGANA LETTER SMALL I
1744     case 0x3045: // HIRAGANA LETTER SMALL U
1745     case 0x3047: // HIRAGANA LETTER SMALL E
1746     case 0x3049: // HIRAGANA LETTER SMALL O
1747     case 0x3063: // HIRAGANA LETTER SMALL TU
1748     case 0x3083: // HIRAGANA LETTER SMALL YA
1749     case 0x3085: // HIRAGANA LETTER SMALL YU
1750     case 0x3087: // HIRAGANA LETTER SMALL YO
1751     case 0x308E: // HIRAGANA LETTER SMALL WA
1752     case 0x3095: // HIRAGANA LETTER SMALL KA
1753     case 0x3096: // HIRAGANA LETTER SMALL KE
1754     case 0x30A1: // KATAKANA LETTER SMALL A
1755     case 0x30A3: // KATAKANA LETTER SMALL I
1756     case 0x30A5: // KATAKANA LETTER SMALL U
1757     case 0x30A7: // KATAKANA LETTER SMALL E
1758     case 0x30A9: // KATAKANA LETTER SMALL O
1759     case 0x30C3: // KATAKANA LETTER SMALL TU
1760     case 0x30E3: // KATAKANA LETTER SMALL YA
1761     case 0x30E5: // KATAKANA LETTER SMALL YU
1762     case 0x30E7: // KATAKANA LETTER SMALL YO
1763     case 0x30EE: // KATAKANA LETTER SMALL WA
1764     case 0x30F5: // KATAKANA LETTER SMALL KA
1765     case 0x30F6: // KATAKANA LETTER SMALL KE
1766     case 0x31F0: // KATAKANA LETTER SMALL KU
1767     case 0x31F1: // KATAKANA LETTER SMALL SI
1768     case 0x31F2: // KATAKANA LETTER SMALL SU
1769     case 0x31F3: // KATAKANA LETTER SMALL TO
1770     case 0x31F4: // KATAKANA LETTER SMALL NU
1771     case 0x31F5: // KATAKANA LETTER SMALL HA
1772     case 0x31F6: // KATAKANA LETTER SMALL HI
1773     case 0x31F7: // KATAKANA LETTER SMALL HU
1774     case 0x31F8: // KATAKANA LETTER SMALL HE
1775     case 0x31F9: // KATAKANA LETTER SMALL HO
1776     case 0x31FA: // KATAKANA LETTER SMALL MU
1777     case 0x31FB: // KATAKANA LETTER SMALL RA
1778     case 0x31FC: // KATAKANA LETTER SMALL RI
1779     case 0x31FD: // KATAKANA LETTER SMALL RU
1780     case 0x31FE: // KATAKANA LETTER SMALL RE
1781     case 0x31FF: // KATAKANA LETTER SMALL RO
1782     case 0xFF67: // HALFWIDTH KATAKANA LETTER SMALL A
1783     case 0xFF68: // HALFWIDTH KATAKANA LETTER SMALL I
1784     case 0xFF69: // HALFWIDTH KATAKANA LETTER SMALL U
1785     case 0xFF6A: // HALFWIDTH KATAKANA LETTER SMALL E
1786     case 0xFF6B: // HALFWIDTH KATAKANA LETTER SMALL O
1787     case 0xFF6C: // HALFWIDTH KATAKANA LETTER SMALL YA
1788     case 0xFF6D: // HALFWIDTH KATAKANA LETTER SMALL YU
1789     case 0xFF6E: // HALFWIDTH KATAKANA LETTER SMALL YO
1790     case 0xFF6F: // HALFWIDTH KATAKANA LETTER SMALL TU
1791         return true;
1792     }
1793     return false;
1794 }
1795 
1796 enum VoicedSoundMarkType { NoVoicedSoundMark, VoicedSoundMark, SemiVoicedSoundMark };
1797 
composedVoicedSoundMark(UChar character)1798 static inline VoicedSoundMarkType composedVoicedSoundMark(UChar character)
1799 {
1800     ASSERT(isKanaLetter(character));
1801 
1802     switch (character) {
1803     case 0x304C: // HIRAGANA LETTER GA
1804     case 0x304E: // HIRAGANA LETTER GI
1805     case 0x3050: // HIRAGANA LETTER GU
1806     case 0x3052: // HIRAGANA LETTER GE
1807     case 0x3054: // HIRAGANA LETTER GO
1808     case 0x3056: // HIRAGANA LETTER ZA
1809     case 0x3058: // HIRAGANA LETTER ZI
1810     case 0x305A: // HIRAGANA LETTER ZU
1811     case 0x305C: // HIRAGANA LETTER ZE
1812     case 0x305E: // HIRAGANA LETTER ZO
1813     case 0x3060: // HIRAGANA LETTER DA
1814     case 0x3062: // HIRAGANA LETTER DI
1815     case 0x3065: // HIRAGANA LETTER DU
1816     case 0x3067: // HIRAGANA LETTER DE
1817     case 0x3069: // HIRAGANA LETTER DO
1818     case 0x3070: // HIRAGANA LETTER BA
1819     case 0x3073: // HIRAGANA LETTER BI
1820     case 0x3076: // HIRAGANA LETTER BU
1821     case 0x3079: // HIRAGANA LETTER BE
1822     case 0x307C: // HIRAGANA LETTER BO
1823     case 0x3094: // HIRAGANA LETTER VU
1824     case 0x30AC: // KATAKANA LETTER GA
1825     case 0x30AE: // KATAKANA LETTER GI
1826     case 0x30B0: // KATAKANA LETTER GU
1827     case 0x30B2: // KATAKANA LETTER GE
1828     case 0x30B4: // KATAKANA LETTER GO
1829     case 0x30B6: // KATAKANA LETTER ZA
1830     case 0x30B8: // KATAKANA LETTER ZI
1831     case 0x30BA: // KATAKANA LETTER ZU
1832     case 0x30BC: // KATAKANA LETTER ZE
1833     case 0x30BE: // KATAKANA LETTER ZO
1834     case 0x30C0: // KATAKANA LETTER DA
1835     case 0x30C2: // KATAKANA LETTER DI
1836     case 0x30C5: // KATAKANA LETTER DU
1837     case 0x30C7: // KATAKANA LETTER DE
1838     case 0x30C9: // KATAKANA LETTER DO
1839     case 0x30D0: // KATAKANA LETTER BA
1840     case 0x30D3: // KATAKANA LETTER BI
1841     case 0x30D6: // KATAKANA LETTER BU
1842     case 0x30D9: // KATAKANA LETTER BE
1843     case 0x30DC: // KATAKANA LETTER BO
1844     case 0x30F4: // KATAKANA LETTER VU
1845     case 0x30F7: // KATAKANA LETTER VA
1846     case 0x30F8: // KATAKANA LETTER VI
1847     case 0x30F9: // KATAKANA LETTER VE
1848     case 0x30FA: // KATAKANA LETTER VO
1849         return VoicedSoundMark;
1850     case 0x3071: // HIRAGANA LETTER PA
1851     case 0x3074: // HIRAGANA LETTER PI
1852     case 0x3077: // HIRAGANA LETTER PU
1853     case 0x307A: // HIRAGANA LETTER PE
1854     case 0x307D: // HIRAGANA LETTER PO
1855     case 0x30D1: // KATAKANA LETTER PA
1856     case 0x30D4: // KATAKANA LETTER PI
1857     case 0x30D7: // KATAKANA LETTER PU
1858     case 0x30DA: // KATAKANA LETTER PE
1859     case 0x30DD: // KATAKANA LETTER PO
1860         return SemiVoicedSoundMark;
1861     }
1862     return NoVoicedSoundMark;
1863 }
1864 
isCombiningVoicedSoundMark(UChar character)1865 static inline bool isCombiningVoicedSoundMark(UChar character)
1866 {
1867     switch (character) {
1868     case 0x3099: // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
1869     case 0x309A: // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
1870         return true;
1871     }
1872     return false;
1873 }
1874 
containsKanaLetters(const String & pattern)1875 static inline bool containsKanaLetters(const String& pattern)
1876 {
1877     const UChar* characters = pattern.characters();
1878     unsigned length = pattern.length();
1879     for (unsigned i = 0; i < length; ++i) {
1880         if (isKanaLetter(characters[i]))
1881             return true;
1882     }
1883     return false;
1884 }
1885 
normalizeCharacters(const UChar * characters,unsigned length,Vector<UChar> & buffer)1886 static void normalizeCharacters(const UChar* characters, unsigned length, Vector<UChar>& buffer)
1887 {
1888     ASSERT(length);
1889 
1890     buffer.resize(length);
1891 
1892     UErrorCode status = U_ZERO_ERROR;
1893     size_t bufferSize = unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), length, &status);
1894     ASSERT(status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR);
1895     ASSERT(bufferSize);
1896 
1897     buffer.resize(bufferSize);
1898 
1899     if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING)
1900         return;
1901 
1902     status = U_ZERO_ERROR;
1903     unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), bufferSize, &status);
1904     ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
1905 }
1906 
isNonLatin1Separator(UChar32 character)1907 static bool isNonLatin1Separator(UChar32 character)
1908 {
1909     ASSERT_ARG(character, character >= 256);
1910 
1911     return U_GET_GC_MASK(character) & (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK);
1912 }
1913 
isSeparator(UChar32 character)1914 static inline bool isSeparator(UChar32 character)
1915 {
1916     static const bool latin1SeparatorTable[256] = {
1917         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1918         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1919         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // space ! " # $ % & ' ( ) * + , - . /
1920         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, //                         : ; < = > ?
1921         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //   @
1922         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, //                         [ \ ] ^ _
1923         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //   `
1924         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, //                           { | } ~
1925         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1926         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1927         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1928         1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1929         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1930         0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
1931         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1932         0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
1933     };
1934 
1935     if (character < 256)
1936         return latin1SeparatorTable[character];
1937 
1938     return isNonLatin1Separator(character);
1939 }
1940 
SearchBuffer(const String & target,FindOptions options)1941 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
1942     : m_target(target)
1943     , m_options(options)
1944     , m_prefixLength(0)
1945     , m_atBreak(true)
1946     , m_needsMoreContext(options & AtWordStarts)
1947     , m_targetRequiresKanaWorkaround(containsKanaLetters(m_target))
1948 {
1949     ASSERT(!m_target.isEmpty());
1950 
1951     // FIXME: We'd like to tailor the searcher to fold quote marks for us instead
1952     // of doing it in a separate replacement pass here, but ICU doesn't offer a way
1953     // to add tailoring on top of the locale-specific tailoring as of this writing.
1954     foldQuoteMarksAndSoftHyphens(m_target);
1955 
1956     size_t targetLength = m_target.length();
1957     m_buffer.reserveInitialCapacity(max(targetLength * 8, minimumSearchBufferSize));
1958     m_overlap = m_buffer.capacity() / 4;
1959 
1960     if ((m_options & AtWordStarts) && targetLength) {
1961         UChar32 targetFirstCharacter;
1962         U16_GET(m_target.characters(), 0, 0, targetLength, targetFirstCharacter);
1963         // Characters in the separator category never really occur at the beginning of a word,
1964         // so if the target begins with such a character, we just ignore the AtWordStart option.
1965         if (isSeparator(targetFirstCharacter)) {
1966             m_options &= ~AtWordStarts;
1967             m_needsMoreContext = false;
1968         }
1969     }
1970 
1971     // Grab the single global searcher.
1972     // If we ever have a reason to do more than once search buffer at once, we'll have
1973     // to move to multiple searchers.
1974     lockSearcher();
1975 
1976     UStringSearch* searcher = WebCore::searcher();
1977     UCollator* collator = usearch_getCollator(searcher);
1978 
1979     UCollationStrength strength = m_options & CaseInsensitive ? UCOL_PRIMARY : UCOL_TERTIARY;
1980     if (ucol_getStrength(collator) != strength) {
1981         ucol_setStrength(collator, strength);
1982         usearch_reset(searcher);
1983     }
1984 
1985     UErrorCode status = U_ZERO_ERROR;
1986     usearch_setPattern(searcher, m_target.characters(), targetLength, &status);
1987     ASSERT(status == U_ZERO_ERROR);
1988 
1989     // The kana workaround requires a normalized copy of the target string.
1990     if (m_targetRequiresKanaWorkaround)
1991         normalizeCharacters(m_target.characters(), m_target.length(), m_normalizedTarget);
1992 }
1993 
~SearchBuffer()1994 inline SearchBuffer::~SearchBuffer()
1995 {
1996     // Leave the static object pointing to a valid string.
1997     UErrorCode status = U_ZERO_ERROR;
1998     usearch_setPattern(WebCore::searcher(), &newlineCharacter, 1, &status);
1999     ASSERT(status == U_ZERO_ERROR);
2000 
2001     unlockSearcher();
2002 }
2003 
append(const UChar * characters,size_t length)2004 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
2005 {
2006     ASSERT(length);
2007 
2008     if (m_atBreak) {
2009         m_buffer.shrink(0);
2010         m_prefixLength = 0;
2011         m_atBreak = false;
2012     } else if (m_buffer.size() == m_buffer.capacity()) {
2013         memcpy(m_buffer.data(), m_buffer.data() + m_buffer.size() - m_overlap, m_overlap * sizeof(UChar));
2014         m_prefixLength -= min(m_prefixLength, m_buffer.size() - m_overlap);
2015         m_buffer.shrink(m_overlap);
2016     }
2017 
2018     size_t oldLength = m_buffer.size();
2019     size_t usableLength = min(m_buffer.capacity() - oldLength, length);
2020     ASSERT(usableLength);
2021     m_buffer.append(characters, usableLength);
2022     foldQuoteMarksAndSoftHyphens(m_buffer.data() + oldLength, usableLength);
2023     return usableLength;
2024 }
2025 
needsMoreContext() const2026 inline bool SearchBuffer::needsMoreContext() const
2027 {
2028     return m_needsMoreContext;
2029 }
2030 
prependContext(const UChar * characters,size_t length)2031 inline void SearchBuffer::prependContext(const UChar* characters, size_t length)
2032 {
2033     ASSERT(m_needsMoreContext);
2034     ASSERT(m_prefixLength == m_buffer.size());
2035 
2036     if (!length)
2037         return;
2038 
2039     m_atBreak = false;
2040 
2041     size_t wordBoundaryContextStart = length;
2042     if (wordBoundaryContextStart) {
2043         U16_BACK_1(characters, 0, wordBoundaryContextStart);
2044         wordBoundaryContextStart = startOfLastWordBoundaryContext(characters, wordBoundaryContextStart);
2045     }
2046 
2047     size_t usableLength = min(m_buffer.capacity() - m_prefixLength, length - wordBoundaryContextStart);
2048     m_buffer.prepend(characters + length - usableLength, usableLength);
2049     m_prefixLength += usableLength;
2050 
2051     if (wordBoundaryContextStart || m_prefixLength == m_buffer.capacity())
2052         m_needsMoreContext = false;
2053 }
2054 
atBreak() const2055 inline bool SearchBuffer::atBreak() const
2056 {
2057     return m_atBreak;
2058 }
2059 
reachedBreak()2060 inline void SearchBuffer::reachedBreak()
2061 {
2062     m_atBreak = true;
2063 }
2064 
isBadMatch(const UChar * match,size_t matchLength) const2065 inline bool SearchBuffer::isBadMatch(const UChar* match, size_t matchLength) const
2066 {
2067     // This function implements the kana workaround. If usearch treats
2068     // it as a match, but we do not want to, then it's a "bad match".
2069     if (!m_targetRequiresKanaWorkaround)
2070         return false;
2071 
2072     // Normalize into a match buffer. We reuse a single buffer rather than
2073     // creating a new one each time.
2074     normalizeCharacters(match, matchLength, m_normalizedMatch);
2075 
2076     const UChar* a = m_normalizedTarget.begin();
2077     const UChar* aEnd = m_normalizedTarget.end();
2078 
2079     const UChar* b = m_normalizedMatch.begin();
2080     const UChar* bEnd = m_normalizedMatch.end();
2081 
2082     while (true) {
2083         // Skip runs of non-kana-letter characters. This is necessary so we can
2084         // correctly handle strings where the target and match have different-length
2085         // runs of characters that match, while still double checking the correctness
2086         // of matches of kana letters with other kana letters.
2087         while (a != aEnd && !isKanaLetter(*a))
2088             ++a;
2089         while (b != bEnd && !isKanaLetter(*b))
2090             ++b;
2091 
2092         // If we reached the end of either the target or the match, we should have
2093         // reached the end of both; both should have the same number of kana letters.
2094         if (a == aEnd || b == bEnd) {
2095             ASSERT(a == aEnd);
2096             ASSERT(b == bEnd);
2097             return false;
2098         }
2099 
2100         // Check for differences in the kana letter character itself.
2101         if (isSmallKanaLetter(*a) != isSmallKanaLetter(*b))
2102             return true;
2103         if (composedVoicedSoundMark(*a) != composedVoicedSoundMark(*b))
2104             return true;
2105         ++a;
2106         ++b;
2107 
2108         // Check for differences in combining voiced sound marks found after the letter.
2109         while (1) {
2110             if (!(a != aEnd && isCombiningVoicedSoundMark(*a))) {
2111                 if (b != bEnd && isCombiningVoicedSoundMark(*b))
2112                     return true;
2113                 break;
2114             }
2115             if (!(b != bEnd && isCombiningVoicedSoundMark(*b)))
2116                 return true;
2117             if (*a != *b)
2118                 return true;
2119             ++a;
2120             ++b;
2121         }
2122     }
2123 }
2124 
isWordStartMatch(size_t start,size_t length) const2125 inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const
2126 {
2127     ASSERT(m_options & AtWordStarts);
2128 
2129     if (!start)
2130         return true;
2131 
2132     if (m_options & TreatMedialCapitalAsWordStart) {
2133         int size = m_buffer.size();
2134         int offset = start;
2135         UChar32 firstCharacter;
2136         U16_GET(m_buffer.data(), 0, offset, size, firstCharacter);
2137         UChar32 previousCharacter;
2138         U16_PREV(m_buffer.data(), 0, offset, previousCharacter);
2139 
2140         if (isSeparator(firstCharacter)) {
2141             // The start of a separator run is a word start (".org" in "webkit.org").
2142             if (!isSeparator(previousCharacter))
2143                 return true;
2144         } else if (isASCIIUpper(firstCharacter)) {
2145             // The start of an uppercase run is a word start ("Kit" in "WebKit").
2146             if (!isASCIIUpper(previousCharacter))
2147                 return true;
2148             // The last character of an uppercase run followed by a non-separator, non-digit
2149             // is a word start ("Request" in "XMLHTTPRequest").
2150             offset = start;
2151             U16_FWD_1(m_buffer.data(), offset, size);
2152             UChar32 nextCharacter = 0;
2153             if (offset < size)
2154                 U16_GET(m_buffer.data(), 0, offset, size, nextCharacter);
2155             if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) && !isSeparator(nextCharacter))
2156                 return true;
2157         } else if (isASCIIDigit(firstCharacter)) {
2158             // The start of a digit run is a word start ("2" in "WebKit2").
2159             if (!isASCIIDigit(previousCharacter))
2160                 return true;
2161         } else if (isSeparator(previousCharacter) || isASCIIDigit(previousCharacter)) {
2162             // The start of a non-separator, non-uppercase, non-digit run is a word start,
2163             // except after an uppercase. ("org" in "webkit.org", but not "ore" in "WebCore").
2164             return true;
2165         }
2166     }
2167 
2168     size_t wordBreakSearchStart = start + length;
2169     while (wordBreakSearchStart > start)
2170         wordBreakSearchStart = findNextWordFromIndex(m_buffer.data(), m_buffer.size(), wordBreakSearchStart, false /* backwards */);
2171     return wordBreakSearchStart == start;
2172 }
2173 
search(size_t & start)2174 inline size_t SearchBuffer::search(size_t& start)
2175 {
2176     size_t size = m_buffer.size();
2177     if (m_atBreak) {
2178         if (!size)
2179             return 0;
2180     } else {
2181         if (size != m_buffer.capacity())
2182             return 0;
2183     }
2184 
2185     UStringSearch* searcher = WebCore::searcher();
2186 
2187     UErrorCode status = U_ZERO_ERROR;
2188     usearch_setText(searcher, m_buffer.data(), size, &status);
2189     ASSERT(status == U_ZERO_ERROR);
2190 
2191     usearch_setOffset(searcher, m_prefixLength, &status);
2192     ASSERT(status == U_ZERO_ERROR);
2193 
2194     int matchStart = usearch_next(searcher, &status);
2195     ASSERT(status == U_ZERO_ERROR);
2196 
2197 nextMatch:
2198     if (!(matchStart >= 0 && static_cast<size_t>(matchStart) < size)) {
2199         ASSERT(matchStart == USEARCH_DONE);
2200         return 0;
2201     }
2202 
2203     // Matches that start in the overlap area are only tentative.
2204     // The same match may appear later, matching more characters,
2205     // possibly including a combining character that's not yet in the buffer.
2206     if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) {
2207         size_t overlap = m_overlap;
2208         if (m_options & AtWordStarts) {
2209             // Ensure that there is sufficient context before matchStart the next time around for
2210             // determining if it is at a word boundary.
2211             int wordBoundaryContextStart = matchStart;
2212             U16_BACK_1(m_buffer.data(), 0, wordBoundaryContextStart);
2213             wordBoundaryContextStart = startOfLastWordBoundaryContext(m_buffer.data(), wordBoundaryContextStart);
2214             overlap = min(size - 1, max(overlap, size - wordBoundaryContextStart));
2215         }
2216         memcpy(m_buffer.data(), m_buffer.data() + size - overlap, overlap * sizeof(UChar));
2217         m_prefixLength -= min(m_prefixLength, size - overlap);
2218         m_buffer.shrink(overlap);
2219         return 0;
2220     }
2221 
2222     size_t matchedLength = usearch_getMatchedLength(searcher);
2223     ASSERT(matchStart + matchedLength <= size);
2224 
2225     // If this match is "bad", move on to the next match.
2226     if (isBadMatch(m_buffer.data() + matchStart, matchedLength) || ((m_options & AtWordStarts) && !isWordStartMatch(matchStart, matchedLength))) {
2227         matchStart = usearch_next(searcher, &status);
2228         ASSERT(status == U_ZERO_ERROR);
2229         goto nextMatch;
2230     }
2231 
2232     size_t newSize = size - (matchStart + 1);
2233     memmove(m_buffer.data(), m_buffer.data() + matchStart + 1, newSize * sizeof(UChar));
2234     m_prefixLength -= min<size_t>(m_prefixLength, matchStart + 1);
2235     m_buffer.shrink(newSize);
2236 
2237     start = size - matchStart;
2238     return matchedLength;
2239 }
2240 
2241 #else // !ICU_UNICODE
2242 
SearchBuffer(const String & target,FindOptions options)2243 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
2244     : m_target(options & CaseInsensitive ? target.foldCase() : target)
2245     , m_options(options)
2246     , m_buffer(m_target.length())
2247     , m_isCharacterStartBuffer(m_target.length())
2248     , m_isBufferFull(false)
2249     , m_cursor(0)
2250 {
2251     ASSERT(!m_target.isEmpty());
2252     m_target.replace(noBreakSpace, ' ');
2253     foldQuoteMarksAndSoftHyphens(m_target);
2254 }
2255 
~SearchBuffer()2256 inline SearchBuffer::~SearchBuffer()
2257 {
2258 }
2259 
reachedBreak()2260 inline void SearchBuffer::reachedBreak()
2261 {
2262     m_cursor = 0;
2263     m_isBufferFull = false;
2264 }
2265 
atBreak() const2266 inline bool SearchBuffer::atBreak() const
2267 {
2268     return !m_cursor && !m_isBufferFull;
2269 }
2270 
append(UChar c,bool isStart)2271 inline void SearchBuffer::append(UChar c, bool isStart)
2272 {
2273     m_buffer[m_cursor] = c == noBreakSpace ? ' ' : foldQuoteMarkOrSoftHyphen(c);
2274     m_isCharacterStartBuffer[m_cursor] = isStart;
2275     if (++m_cursor == m_target.length()) {
2276         m_cursor = 0;
2277         m_isBufferFull = true;
2278     }
2279 }
2280 
append(const UChar * characters,size_t length)2281 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
2282 {
2283     ASSERT(length);
2284     if (!(m_options & CaseInsensitive)) {
2285         append(characters[0], true);
2286         return 1;
2287     }
2288     const int maxFoldedCharacters = 16; // sensible maximum is 3, this should be more than enough
2289     UChar foldedCharacters[maxFoldedCharacters];
2290     bool error;
2291     int numFoldedCharacters = foldCase(foldedCharacters, maxFoldedCharacters, characters, 1, &error);
2292     ASSERT(!error);
2293     ASSERT(numFoldedCharacters);
2294     ASSERT(numFoldedCharacters <= maxFoldedCharacters);
2295     if (!error && numFoldedCharacters) {
2296         numFoldedCharacters = min(numFoldedCharacters, maxFoldedCharacters);
2297         append(foldedCharacters[0], true);
2298         for (int i = 1; i < numFoldedCharacters; ++i)
2299             append(foldedCharacters[i], false);
2300     }
2301     return 1;
2302 }
2303 
needsMoreContext() const2304 inline bool SearchBuffer::needsMoreContext() const
2305 {
2306     return false;
2307 }
2308 
prependContext(const UChar *,size_t)2309 void SearchBuffer::prependContext(const UChar*, size_t)
2310 {
2311     ASSERT_NOT_REACHED();
2312 }
2313 
search(size_t & start)2314 inline size_t SearchBuffer::search(size_t& start)
2315 {
2316     if (!m_isBufferFull)
2317         return 0;
2318     if (!m_isCharacterStartBuffer[m_cursor])
2319         return 0;
2320 
2321     size_t tailSpace = m_target.length() - m_cursor;
2322     if (memcmp(&m_buffer[m_cursor], m_target.characters(), tailSpace * sizeof(UChar)) != 0)
2323         return 0;
2324     if (memcmp(&m_buffer[0], m_target.characters() + tailSpace, m_cursor * sizeof(UChar)) != 0)
2325         return 0;
2326 
2327     start = length();
2328 
2329     // Now that we've found a match once, we don't want to find it again, because those
2330     // are the SearchBuffer semantics, allowing for a buffer where you append more than one
2331     // character at a time. To do this we take advantage of m_isCharacterStartBuffer, but if
2332     // we want to get rid of that in the future we could track this with a separate boolean
2333     // or even move the characters to the start of the buffer and set m_isBufferFull to false.
2334     m_isCharacterStartBuffer[m_cursor] = false;
2335 
2336     return start;
2337 }
2338 
2339 // Returns the number of characters that were appended to the buffer (what we are searching in).
2340 // That's not necessarily the same length as the passed-in target string, because case folding
2341 // can make two strings match even though they're not the same length.
length() const2342 size_t SearchBuffer::length() const
2343 {
2344     size_t bufferSize = m_target.length();
2345     size_t length = 0;
2346     for (size_t i = 0; i < bufferSize; ++i)
2347         length += m_isCharacterStartBuffer[i];
2348     return length;
2349 }
2350 
2351 #endif // !ICU_UNICODE
2352 
2353 // --------
2354 
rangeLength(const Range * r,bool forSelectionPreservation)2355 int TextIterator::rangeLength(const Range* r, bool forSelectionPreservation)
2356 {
2357     int length = 0;
2358     for (TextIterator it(r, forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior); !it.atEnd(); it.advance())
2359         length += it.length();
2360 
2361     return length;
2362 }
2363 
subrange(Range * entireRange,int characterOffset,int characterCount)2364 PassRefPtr<Range> TextIterator::subrange(Range* entireRange, int characterOffset, int characterCount)
2365 {
2366     CharacterIterator entireRangeIterator(entireRange);
2367     return characterSubrange(entireRangeIterator, characterOffset, characterCount);
2368 }
2369 
rangeFromLocationAndLength(Element * scope,int rangeLocation,int rangeLength,bool forSelectionPreservation)2370 PassRefPtr<Range> TextIterator::rangeFromLocationAndLength(Element* scope, int rangeLocation, int rangeLength, bool forSelectionPreservation)
2371 {
2372     RefPtr<Range> resultRange = scope->document()->createRange();
2373 
2374     int docTextPosition = 0;
2375     int rangeEnd = rangeLocation + rangeLength;
2376     bool startRangeFound = false;
2377 
2378     RefPtr<Range> textRunRange;
2379 
2380     TextIterator it(rangeOfContents(scope).get(), forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior);
2381 
2382     // FIXME: the atEnd() check shouldn't be necessary, workaround for <http://bugs.webkit.org/show_bug.cgi?id=6289>.
2383     if (rangeLocation == 0 && rangeLength == 0 && it.atEnd()) {
2384         textRunRange = it.range();
2385 
2386         ExceptionCode ec = 0;
2387         resultRange->setStart(textRunRange->startContainer(), 0, ec);
2388         ASSERT(!ec);
2389         resultRange->setEnd(textRunRange->startContainer(), 0, ec);
2390         ASSERT(!ec);
2391 
2392         return resultRange.release();
2393     }
2394 
2395     for (; !it.atEnd(); it.advance()) {
2396         int len = it.length();
2397         textRunRange = it.range();
2398 
2399         bool foundStart = rangeLocation >= docTextPosition && rangeLocation <= docTextPosition + len;
2400         bool foundEnd = rangeEnd >= docTextPosition && rangeEnd <= docTextPosition + len;
2401 
2402         // Fix textRunRange->endPosition(), but only if foundStart || foundEnd, because it is only
2403         // in those cases that textRunRange is used.
2404         if (foundEnd) {
2405             // FIXME: This is a workaround for the fact that the end of a run is often at the wrong
2406             // position for emitted '\n's.
2407             if (len == 1 && it.characters()[0] == '\n') {
2408                 scope->document()->updateLayoutIgnorePendingStylesheets();
2409                 it.advance();
2410                 if (!it.atEnd()) {
2411                     RefPtr<Range> range = it.range();
2412                     ExceptionCode ec = 0;
2413                     textRunRange->setEnd(range->startContainer(), range->startOffset(), ec);
2414                     ASSERT(!ec);
2415                 } else {
2416                     Position runStart = textRunRange->startPosition();
2417                     Position runEnd = VisiblePosition(runStart).next().deepEquivalent();
2418                     if (runEnd.isNotNull()) {
2419                         ExceptionCode ec = 0;
2420                         textRunRange->setEnd(runEnd.containerNode(), runEnd.computeOffsetInContainerNode(), ec);
2421                         ASSERT(!ec);
2422                     }
2423                 }
2424             }
2425         }
2426 
2427         if (foundStart) {
2428             startRangeFound = true;
2429             int exception = 0;
2430             if (textRunRange->startContainer()->isTextNode()) {
2431                 int offset = rangeLocation - docTextPosition;
2432                 resultRange->setStart(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
2433             } else {
2434                 if (rangeLocation == docTextPosition)
2435                     resultRange->setStart(textRunRange->startContainer(), textRunRange->startOffset(), exception);
2436                 else
2437                     resultRange->setStart(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2438             }
2439         }
2440 
2441         if (foundEnd) {
2442             int exception = 0;
2443             if (textRunRange->startContainer()->isTextNode()) {
2444                 int offset = rangeEnd - docTextPosition;
2445                 resultRange->setEnd(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
2446             } else {
2447                 if (rangeEnd == docTextPosition)
2448                     resultRange->setEnd(textRunRange->startContainer(), textRunRange->startOffset(), exception);
2449                 else
2450                     resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2451             }
2452             docTextPosition += len;
2453             break;
2454         }
2455         docTextPosition += len;
2456     }
2457 
2458     if (!startRangeFound)
2459         return 0;
2460 
2461     if (rangeLength != 0 && rangeEnd > docTextPosition) { // rangeEnd is out of bounds
2462         int exception = 0;
2463         resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2464     }
2465 
2466     return resultRange.release();
2467 }
2468 
locationAndLengthFromRange(const Range * range,size_t & location,size_t & length)2469 bool TextIterator::locationAndLengthFromRange(const Range* range, size_t& location, size_t& length)
2470 {
2471     location = notFound;
2472     length = 0;
2473 
2474     if (!range->startContainer())
2475         return false;
2476 
2477     Element* selectionRoot = range->ownerDocument()->frame()->selection()->rootEditableElement();
2478     Element* scope = selectionRoot ? selectionRoot : range->ownerDocument()->documentElement();
2479 
2480     // The critical assumption is that this only gets called with ranges that
2481     // concentrate on a given area containing the selection root. This is done
2482     // because of text fields and textareas. The DOM for those is not
2483     // directly in the document DOM, so ensure that the range does not cross a
2484     // boundary of one of those.
2485     if (range->startContainer() != scope && !range->startContainer()->isDescendantOf(scope))
2486         return false;
2487     if (range->endContainer() != scope && !range->endContainer()->isDescendantOf(scope))
2488         return false;
2489 
2490     RefPtr<Range> testRange = Range::create(scope->document(), scope, 0, range->startContainer(), range->startOffset());
2491     ASSERT(testRange->startContainer() == scope);
2492     location = TextIterator::rangeLength(testRange.get());
2493 
2494     ExceptionCode ec;
2495     testRange->setEnd(range->endContainer(), range->endOffset(), ec);
2496     ASSERT(testRange->startContainer() == scope);
2497     length = TextIterator::rangeLength(testRange.get()) - location;
2498     return true;
2499 }
2500 
2501 // --------
2502 
plainTextToMallocAllocatedBuffer(const Range * r,unsigned & bufferLength,bool isDisplayString,TextIteratorBehavior defaultBehavior)2503 UChar* plainTextToMallocAllocatedBuffer(const Range* r, unsigned& bufferLength, bool isDisplayString, TextIteratorBehavior defaultBehavior)
2504 {
2505     UChar* result = 0;
2506 
2507     // Do this in pieces to avoid massive reallocations if there is a large amount of text.
2508     // Use system malloc for buffers since they can consume lots of memory and current TCMalloc is unable return it back to OS.
2509     static const unsigned cMaxSegmentSize = 1 << 16;
2510     bufferLength = 0;
2511     typedef pair<UChar*, unsigned> TextSegment;
2512     OwnPtr<Vector<TextSegment> > textSegments;
2513     Vector<UChar> textBuffer;
2514     textBuffer.reserveInitialCapacity(cMaxSegmentSize);
2515     TextIteratorBehavior behavior = defaultBehavior;
2516     if (!isDisplayString)
2517         behavior = static_cast<TextIteratorBehavior>(behavior | TextIteratorEmitsTextsWithoutTranscoding);
2518 
2519     for (TextIterator it(r, behavior); !it.atEnd(); it.advance()) {
2520         if (textBuffer.size() && textBuffer.size() + it.length() > cMaxSegmentSize) {
2521             UChar* newSegmentBuffer = static_cast<UChar*>(malloc(textBuffer.size() * sizeof(UChar)));
2522             if (!newSegmentBuffer)
2523                 goto exit;
2524             memcpy(newSegmentBuffer, textBuffer.data(), textBuffer.size() * sizeof(UChar));
2525             if (!textSegments)
2526                 textSegments = adoptPtr(new Vector<TextSegment>);
2527             textSegments->append(make_pair(newSegmentBuffer, (unsigned)textBuffer.size()));
2528             textBuffer.clear();
2529         }
2530         textBuffer.append(it.characters(), it.length());
2531         bufferLength += it.length();
2532     }
2533 
2534     if (!bufferLength)
2535         return 0;
2536 
2537     // Since we know the size now, we can make a single buffer out of the pieces with one big alloc
2538     result = static_cast<UChar*>(malloc(bufferLength * sizeof(UChar)));
2539     if (!result)
2540         goto exit;
2541 
2542     {
2543         UChar* resultPos = result;
2544         if (textSegments) {
2545             unsigned size = textSegments->size();
2546             for (unsigned i = 0; i < size; ++i) {
2547                 const TextSegment& segment = textSegments->at(i);
2548                 memcpy(resultPos, segment.first, segment.second * sizeof(UChar));
2549                 resultPos += segment.second;
2550             }
2551         }
2552         memcpy(resultPos, textBuffer.data(), textBuffer.size() * sizeof(UChar));
2553     }
2554 
2555 exit:
2556     if (textSegments) {
2557         unsigned size = textSegments->size();
2558         for (unsigned i = 0; i < size; ++i)
2559             free(textSegments->at(i).first);
2560     }
2561 
2562     if (isDisplayString && r->ownerDocument())
2563         r->ownerDocument()->displayBufferModifiedByEncoding(result, bufferLength);
2564 
2565     return result;
2566 }
2567 
plainText(const Range * r,TextIteratorBehavior defaultBehavior)2568 String plainText(const Range* r, TextIteratorBehavior defaultBehavior)
2569 {
2570     unsigned length;
2571     UChar* buf = plainTextToMallocAllocatedBuffer(r, length, false, defaultBehavior);
2572     if (!buf)
2573         return "";
2574     String result(buf, length);
2575     free(buf);
2576     return result;
2577 }
2578 
isAllCollapsibleWhitespace(const String & string)2579 static inline bool isAllCollapsibleWhitespace(const String& string)
2580 {
2581     const UChar* characters = string.characters();
2582     unsigned length = string.length();
2583     for (unsigned i = 0; i < length; ++i) {
2584         if (!isCollapsibleWhitespace(characters[i]))
2585             return false;
2586     }
2587     return true;
2588 }
2589 
collapsedToBoundary(const Range * range,bool forward)2590 static PassRefPtr<Range> collapsedToBoundary(const Range* range, bool forward)
2591 {
2592     ExceptionCode ec = 0;
2593     RefPtr<Range> result = range->cloneRange(ec);
2594     ASSERT(!ec);
2595     result->collapse(!forward, ec);
2596     ASSERT(!ec);
2597     return result.release();
2598 }
2599 
findPlainText(CharacterIterator & it,const String & target,FindOptions options,size_t & matchStart)2600 static size_t findPlainText(CharacterIterator& it, const String& target, FindOptions options, size_t& matchStart)
2601 {
2602     matchStart = 0;
2603     size_t matchLength = 0;
2604 
2605     SearchBuffer buffer(target, options);
2606 
2607     if (buffer.needsMoreContext()) {
2608         RefPtr<Range> startRange = it.range();
2609         RefPtr<Range> beforeStartRange = startRange->ownerDocument()->createRange();
2610         ExceptionCode ec = 0;
2611         beforeStartRange->setEnd(startRange->startContainer(), startRange->startOffset(), ec);
2612         for (SimplifiedBackwardsTextIterator backwardsIterator(beforeStartRange.get()); !backwardsIterator.atEnd(); backwardsIterator.advance()) {
2613             buffer.prependContext(backwardsIterator.characters(), backwardsIterator.length());
2614             if (!buffer.needsMoreContext())
2615                 break;
2616         }
2617     }
2618 
2619     while (!it.atEnd()) {
2620         it.advance(buffer.append(it.characters(), it.length()));
2621 tryAgain:
2622         size_t matchStartOffset;
2623         if (size_t newMatchLength = buffer.search(matchStartOffset)) {
2624             // Note that we found a match, and where we found it.
2625             size_t lastCharacterInBufferOffset = it.characterOffset();
2626             ASSERT(lastCharacterInBufferOffset >= matchStartOffset);
2627             matchStart = lastCharacterInBufferOffset - matchStartOffset;
2628             matchLength = newMatchLength;
2629             // If searching forward, stop on the first match.
2630             // If searching backward, don't stop, so we end up with the last match.
2631             if (!(options & Backwards))
2632                 break;
2633             goto tryAgain;
2634         }
2635         if (it.atBreak() && !buffer.atBreak()) {
2636             buffer.reachedBreak();
2637             goto tryAgain;
2638         }
2639     }
2640 
2641     return matchLength;
2642 }
2643 
findPlainText(const Range * range,const String & target,FindOptions options)2644 PassRefPtr<Range> findPlainText(const Range* range, const String& target, FindOptions options)
2645 {
2646     // CharacterIterator requires renderers to be up-to-date
2647     range->ownerDocument()->updateLayout();
2648 
2649     // First, find the text.
2650     size_t matchStart;
2651     size_t matchLength;
2652     {
2653         CharacterIterator findIterator(range, TextIteratorEntersTextControls);
2654         matchLength = findPlainText(findIterator, target, options, matchStart);
2655         if (!matchLength)
2656             return collapsedToBoundary(range, !(options & Backwards));
2657     }
2658 
2659     // Then, find the document position of the start and the end of the text.
2660     CharacterIterator computeRangeIterator(range, TextIteratorEntersTextControls);
2661     return characterSubrange(computeRangeIterator, matchStart, matchLength);
2662 }
2663 
2664 }
2665