1 /*
2 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010 Apple Inc. All rights reserved.
3 * Copyright (C) 2005 Alexey Proskuryakov.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27 #include "config.h"
28 #include "TextIterator.h"
29
30 #include "Document.h"
31 #include "Frame.h"
32 #include "HTMLElement.h"
33 #include "HTMLNames.h"
34 #include "htmlediting.h"
35 #include "InlineTextBox.h"
36 #include "Range.h"
37 #include "RenderTableCell.h"
38 #include "RenderTableRow.h"
39 #include "RenderTextControl.h"
40 #include "RenderTextFragment.h"
41 #include "TextBoundaries.h"
42 #include "TextBreakIterator.h"
43 #include "VisiblePosition.h"
44 #include "visible_units.h"
45 #include <wtf/unicode/CharacterNames.h>
46
47 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
48 #include "TextBreakIteratorInternalICU.h"
49 #include <unicode/usearch.h>
50 #endif
51
52 using namespace WTF::Unicode;
53 using namespace std;
54
55 namespace WebCore {
56
57 using namespace HTMLNames;
58
59 // Buffer that knows how to compare with a search target.
60 // Keeps enough of the previous text to be able to search in the future, but no more.
61 // Non-breaking spaces are always equal to normal spaces.
62 // Case folding is also done if the CaseInsensitive option is specified.
63 // Matches are further filtered if the AtWordStarts option is specified, although some
64 // matches inside a word are permitted if TreatMedialCapitalAsWordStart is specified as well.
65 class SearchBuffer {
66 WTF_MAKE_NONCOPYABLE(SearchBuffer);
67 public:
68 SearchBuffer(const String& target, FindOptions);
69 ~SearchBuffer();
70
71 // Returns number of characters appended; guaranteed to be in the range [1, length].
72 size_t append(const UChar*, size_t length);
73 bool needsMoreContext() const;
74 void prependContext(const UChar*, size_t length);
75 void reachedBreak();
76
77 // Result is the size in characters of what was found.
78 // And <startOffset> is the number of characters back to the start of what was found.
79 size_t search(size_t& startOffset);
80 bool atBreak() const;
81
82 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
83
84 private:
85 bool isBadMatch(const UChar*, size_t length) const;
86 bool isWordStartMatch(size_t start, size_t length) const;
87
88 String m_target;
89 FindOptions m_options;
90
91 Vector<UChar> m_buffer;
92 size_t m_overlap;
93 size_t m_prefixLength;
94 bool m_atBreak;
95 bool m_needsMoreContext;
96
97 bool m_targetRequiresKanaWorkaround;
98 Vector<UChar> m_normalizedTarget;
99 mutable Vector<UChar> m_normalizedMatch;
100
101 #else
102
103 private:
104 void append(UChar, bool isCharacterStart);
105 size_t length() const;
106
107 String m_target;
108 FindOptions m_options;
109
110 Vector<UChar> m_buffer;
111 Vector<bool> m_isCharacterStartBuffer;
112 bool m_isBufferFull;
113 size_t m_cursor;
114
115 #endif
116 };
117
118 // --------
119
120 static const unsigned bitsInWord = sizeof(unsigned) * 8;
121 static const unsigned bitInWordMask = bitsInWord - 1;
122
BitStack()123 BitStack::BitStack()
124 : m_size(0)
125 {
126 }
127
~BitStack()128 BitStack::~BitStack()
129 {
130 }
131
push(bool bit)132 void BitStack::push(bool bit)
133 {
134 unsigned index = m_size / bitsInWord;
135 unsigned shift = m_size & bitInWordMask;
136 if (!shift && index == m_words.size()) {
137 m_words.grow(index + 1);
138 m_words[index] = 0;
139 }
140 unsigned& word = m_words[index];
141 unsigned mask = 1U << shift;
142 if (bit)
143 word |= mask;
144 else
145 word &= ~mask;
146 ++m_size;
147 }
148
pop()149 void BitStack::pop()
150 {
151 if (m_size)
152 --m_size;
153 }
154
top() const155 bool BitStack::top() const
156 {
157 if (!m_size)
158 return false;
159 unsigned shift = (m_size - 1) & bitInWordMask;
160 return m_words.last() & (1U << shift);
161 }
162
size() const163 unsigned BitStack::size() const
164 {
165 return m_size;
166 }
167
168 // --------
169
170 #if !ASSERT_DISABLED
171
depthCrossingShadowBoundaries(Node * node)172 static unsigned depthCrossingShadowBoundaries(Node* node)
173 {
174 unsigned depth = 0;
175 for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
176 ++depth;
177 return depth;
178 }
179
180 #endif
181
182 // This function is like Range::pastLastNode, except for the fact that it can climb up out of shadow trees.
nextInPreOrderCrossingShadowBoundaries(Node * rangeEndContainer,int rangeEndOffset)183 static Node* nextInPreOrderCrossingShadowBoundaries(Node* rangeEndContainer, int rangeEndOffset)
184 {
185 if (!rangeEndContainer)
186 return 0;
187 if (rangeEndOffset >= 0 && !rangeEndContainer->offsetInCharacters()) {
188 if (Node* next = rangeEndContainer->childNode(rangeEndOffset))
189 return next;
190 }
191 for (Node* node = rangeEndContainer; node; node = node->parentOrHostNode()) {
192 if (Node* next = node->nextSibling())
193 return next;
194 }
195 return 0;
196 }
197
198 // --------
199
fullyClipsContents(Node * node)200 static inline bool fullyClipsContents(Node* node)
201 {
202 RenderObject* renderer = node->renderer();
203 if (!renderer || !renderer->isBox() || !renderer->hasOverflowClip())
204 return false;
205 return toRenderBox(renderer)->size().isEmpty();
206 }
207
ignoresContainerClip(Node * node)208 static inline bool ignoresContainerClip(Node* node)
209 {
210 RenderObject* renderer = node->renderer();
211 if (!renderer || renderer->isText())
212 return false;
213 EPosition position = renderer->style()->position();
214 return position == AbsolutePosition || position == FixedPosition;
215 }
216
pushFullyClippedState(BitStack & stack,Node * node)217 static void pushFullyClippedState(BitStack& stack, Node* node)
218 {
219 ASSERT(stack.size() == depthCrossingShadowBoundaries(node));
220
221 // Push true if this node full clips its contents, or if a parent already has fully
222 // clipped and this is not a node that ignores its container's clip.
223 stack.push(fullyClipsContents(node) || (stack.top() && !ignoresContainerClip(node)));
224 }
225
setUpFullyClippedStack(BitStack & stack,Node * node)226 static void setUpFullyClippedStack(BitStack& stack, Node* node)
227 {
228 // Put the nodes in a vector so we can iterate in reverse order.
229 Vector<Node*, 100> ancestry;
230 for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
231 ancestry.append(parent);
232
233 // Call pushFullyClippedState on each node starting with the earliest ancestor.
234 size_t size = ancestry.size();
235 for (size_t i = 0; i < size; ++i)
236 pushFullyClippedState(stack, ancestry[size - i - 1]);
237 pushFullyClippedState(stack, node);
238
239 ASSERT(stack.size() == 1 + depthCrossingShadowBoundaries(node));
240 }
241
242 // --------
243
TextIterator()244 TextIterator::TextIterator()
245 : m_startContainer(0)
246 , m_startOffset(0)
247 , m_endContainer(0)
248 , m_endOffset(0)
249 , m_positionNode(0)
250 , m_textCharacters(0)
251 , m_textLength(0)
252 , m_remainingTextBox(0)
253 , m_firstLetterText(0)
254 , m_lastCharacter(0)
255 , m_emitsCharactersBetweenAllVisiblePositions(false)
256 , m_entersTextControls(false)
257 , m_emitsTextWithoutTranscoding(false)
258 , m_handledFirstLetter(false)
259 , m_ignoresStyleVisibility(false)
260 , m_emitsObjectReplacementCharacters(false)
261 {
262 }
263
TextIterator(const Range * r,TextIteratorBehavior behavior)264 TextIterator::TextIterator(const Range* r, TextIteratorBehavior behavior)
265 : m_startContainer(0)
266 , m_startOffset(0)
267 , m_endContainer(0)
268 , m_endOffset(0)
269 , m_positionNode(0)
270 , m_textCharacters(0)
271 , m_textLength(0)
272 , m_remainingTextBox(0)
273 , m_firstLetterText(0)
274 , m_emitsCharactersBetweenAllVisiblePositions(behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions)
275 , m_entersTextControls(behavior & TextIteratorEntersTextControls)
276 , m_emitsTextWithoutTranscoding(behavior & TextIteratorEmitsTextsWithoutTranscoding)
277 , m_handledFirstLetter(false)
278 , m_ignoresStyleVisibility(behavior & TextIteratorIgnoresStyleVisibility)
279 , m_emitsObjectReplacementCharacters(behavior & TextIteratorEmitsObjectReplacementCharacters)
280 {
281 if (!r)
282 return;
283
284 // get and validate the range endpoints
285 Node* startContainer = r->startContainer();
286 if (!startContainer)
287 return;
288 int startOffset = r->startOffset();
289 Node* endContainer = r->endContainer();
290 int endOffset = r->endOffset();
291
292 // Callers should be handing us well-formed ranges. If we discover that this isn't
293 // the case, we could consider changing this assertion to an early return.
294 ASSERT(r->boundaryPointsValid());
295
296 // remember range - this does not change
297 m_startContainer = startContainer;
298 m_startOffset = startOffset;
299 m_endContainer = endContainer;
300 m_endOffset = endOffset;
301
302 // set up the current node for processing
303 m_node = r->firstNode();
304 if (!m_node)
305 return;
306 setUpFullyClippedStack(m_fullyClippedStack, m_node);
307 m_offset = m_node == m_startContainer ? m_startOffset : 0;
308 m_handledNode = false;
309 m_handledChildren = false;
310
311 // calculate first out of bounds node
312 m_pastEndNode = nextInPreOrderCrossingShadowBoundaries(endContainer, endOffset);
313
314 // initialize node processing state
315 m_needsAnotherNewline = false;
316 m_textBox = 0;
317
318 // initialize record of previous node processing
319 m_hasEmitted = false;
320 m_lastTextNode = 0;
321 m_lastTextNodeEndedWithCollapsedSpace = false;
322 m_lastCharacter = 0;
323
324 #ifndef NDEBUG
325 // need this just because of the assert in advance()
326 m_positionNode = m_node;
327 #endif
328
329 // identify the first run
330 advance();
331 }
332
~TextIterator()333 TextIterator::~TextIterator()
334 {
335 }
336
advance()337 void TextIterator::advance()
338 {
339 // reset the run information
340 m_positionNode = 0;
341 m_textLength = 0;
342
343 // handle remembered node that needed a newline after the text node's newline
344 if (m_needsAnotherNewline) {
345 // Emit the extra newline, and position it *inside* m_node, after m_node's
346 // contents, in case it's a block, in the same way that we position the first
347 // newline. The range for the emitted newline should start where the line
348 // break begins.
349 // FIXME: It would be cleaner if we emitted two newlines during the last
350 // iteration, instead of using m_needsAnotherNewline.
351 Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
352 emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
353 m_needsAnotherNewline = false;
354 return;
355 }
356
357 if (!m_textBox && m_remainingTextBox) {
358 m_textBox = m_remainingTextBox;
359 m_remainingTextBox = 0;
360 m_firstLetterText = 0;
361 m_offset = 0;
362 }
363 // handle remembered text box
364 if (m_textBox) {
365 handleTextBox();
366 if (m_positionNode)
367 return;
368 }
369
370 while (m_node && m_node != m_pastEndNode) {
371 // if the range ends at offset 0 of an element, represent the
372 // position, but not the content, of that element e.g. if the
373 // node is a blockflow element, emit a newline that
374 // precedes the element
375 if (m_node == m_endContainer && m_endOffset == 0) {
376 representNodeOffsetZero();
377 m_node = 0;
378 return;
379 }
380
381 RenderObject* renderer = m_node->renderer();
382 if (!renderer) {
383 m_handledNode = true;
384 m_handledChildren = true;
385 } else {
386 // handle current node according to its type
387 if (!m_handledNode) {
388 if (renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) // FIXME: What about CDATA_SECTION_NODE?
389 m_handledNode = handleTextNode();
390 else if (renderer && (renderer->isImage() || renderer->isWidget() ||
391 (renderer->node() && renderer->node()->isElementNode() &&
392 static_cast<Element*>(renderer->node())->isFormControlElement())))
393 m_handledNode = handleReplacedElement();
394 else
395 m_handledNode = handleNonTextNode();
396 if (m_positionNode)
397 return;
398 }
399 }
400
401 // find a new current node to handle in depth-first manner,
402 // calling exitNode() as we come back thru a parent node
403 Node* next = m_handledChildren ? 0 : m_node->firstChild();
404 m_offset = 0;
405 if (!next) {
406 next = m_node->nextSibling();
407 if (!next) {
408 bool pastEnd = m_node->traverseNextNode() == m_pastEndNode;
409 Node* parentNode = m_node->parentOrHostNode();
410 while (!next && parentNode) {
411 if ((pastEnd && parentNode == m_endContainer) || m_endContainer->isDescendantOf(parentNode))
412 return;
413 bool haveRenderer = m_node->renderer();
414 m_node = parentNode;
415 m_fullyClippedStack.pop();
416 parentNode = m_node->parentOrHostNode();
417 if (haveRenderer)
418 exitNode();
419 if (m_positionNode) {
420 m_handledNode = true;
421 m_handledChildren = true;
422 return;
423 }
424 next = m_node->nextSibling();
425 }
426 }
427 m_fullyClippedStack.pop();
428 }
429
430 // set the new current node
431 m_node = next;
432 if (m_node)
433 pushFullyClippedState(m_fullyClippedStack, m_node);
434 m_handledNode = false;
435 m_handledChildren = false;
436 m_handledFirstLetter = false;
437 m_firstLetterText = 0;
438
439 // how would this ever be?
440 if (m_positionNode)
441 return;
442 }
443 }
444
handleTextNode()445 bool TextIterator::handleTextNode()
446 {
447 if (m_fullyClippedStack.top() && !m_ignoresStyleVisibility)
448 return false;
449
450 RenderText* renderer = toRenderText(m_node->renderer());
451
452 m_lastTextNode = m_node;
453 String str = renderer->text();
454
455 // handle pre-formatted text
456 if (!renderer->style()->collapseWhiteSpace()) {
457 int runStart = m_offset;
458 if (m_lastTextNodeEndedWithCollapsedSpace && hasVisibleTextNode(renderer)) {
459 emitCharacter(' ', m_node, 0, runStart, runStart);
460 return false;
461 }
462 if (!m_handledFirstLetter && renderer->isTextFragment() && !m_offset) {
463 handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
464 if (m_firstLetterText) {
465 String firstLetter = m_firstLetterText->text();
466 emitText(m_node, m_firstLetterText, m_offset, m_offset + firstLetter.length());
467 m_firstLetterText = 0;
468 m_textBox = 0;
469 return false;
470 }
471 }
472 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
473 return false;
474 int strLength = str.length();
475 int end = (m_node == m_endContainer) ? m_endOffset : INT_MAX;
476 int runEnd = min(strLength, end);
477
478 if (runStart >= runEnd)
479 return true;
480
481 emitText(m_node, runStart, runEnd);
482 return true;
483 }
484
485 if (!renderer->firstTextBox() && str.length() > 0) {
486 if (!m_handledFirstLetter && renderer->isTextFragment()) {
487 handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
488 if (m_firstLetterText) {
489 handleTextBox();
490 return false;
491 }
492 }
493 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
494 return false;
495 m_lastTextNodeEndedWithCollapsedSpace = true; // entire block is collapsed space
496 return true;
497 }
498
499
500 m_textBox = renderer->firstTextBox();
501 if (!m_handledFirstLetter && renderer->isTextFragment() && !m_offset)
502 handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
503
504 if (m_firstLetterText)
505 renderer = m_firstLetterText;
506
507 // Used when text boxes are out of order (Hebrew/Arabic w/ embeded LTR text)
508 if (renderer->containsReversedText()) {
509 m_sortedTextBoxes.clear();
510 for (InlineTextBox* textBox = renderer->firstTextBox(); textBox; textBox = textBox->nextTextBox()) {
511 m_sortedTextBoxes.append(textBox);
512 }
513 std::sort(m_sortedTextBoxes.begin(), m_sortedTextBoxes.end(), InlineTextBox::compareByStart);
514 m_sortedTextBoxesPosition = 0;
515 m_textBox = m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0];
516 }
517
518 handleTextBox();
519 return true;
520 }
521
handleTextBox()522 void TextIterator::handleTextBox()
523 {
524 RenderText* renderer = m_firstLetterText ? m_firstLetterText : toRenderText(m_node->renderer());
525 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility) {
526 m_textBox = 0;
527 return;
528 }
529 String str = renderer->text();
530 unsigned start = m_offset;
531 unsigned end = (m_node == m_endContainer) ? static_cast<unsigned>(m_endOffset) : UINT_MAX;
532 while (m_textBox) {
533 unsigned textBoxStart = m_textBox->start();
534 unsigned runStart = max(textBoxStart, start);
535
536 // Check for collapsed space at the start of this run.
537 InlineTextBox* firstTextBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox();
538 bool needSpace = m_lastTextNodeEndedWithCollapsedSpace
539 || (m_textBox == firstTextBox && textBoxStart == runStart && runStart > 0);
540 if (needSpace && !isCollapsibleWhitespace(m_lastCharacter) && m_lastCharacter) {
541 if (m_lastTextNode == m_node && runStart > 0 && str[runStart - 1] == ' ') {
542 unsigned spaceRunStart = runStart - 1;
543 while (spaceRunStart > 0 && str[spaceRunStart - 1] == ' ')
544 --spaceRunStart;
545 emitText(m_node, renderer, spaceRunStart, spaceRunStart + 1);
546 } else
547 emitCharacter(' ', m_node, 0, runStart, runStart);
548 return;
549 }
550 unsigned textBoxEnd = textBoxStart + m_textBox->len();
551 unsigned runEnd = min(textBoxEnd, end);
552
553 // Determine what the next text box will be, but don't advance yet
554 InlineTextBox* nextTextBox = 0;
555 if (renderer->containsReversedText()) {
556 if (m_sortedTextBoxesPosition + 1 < m_sortedTextBoxes.size())
557 nextTextBox = m_sortedTextBoxes[m_sortedTextBoxesPosition + 1];
558 } else
559 nextTextBox = m_textBox->nextTextBox();
560
561 if (runStart < runEnd) {
562 // Handle either a single newline character (which becomes a space),
563 // or a run of characters that does not include a newline.
564 // This effectively translates newlines to spaces without copying the text.
565 if (str[runStart] == '\n') {
566 emitCharacter(' ', m_node, 0, runStart, runStart + 1);
567 m_offset = runStart + 1;
568 } else {
569 size_t subrunEnd = str.find('\n', runStart);
570 if (subrunEnd == notFound || subrunEnd > runEnd)
571 subrunEnd = runEnd;
572
573 m_offset = subrunEnd;
574 emitText(m_node, renderer, runStart, subrunEnd);
575 }
576
577 // If we are doing a subrun that doesn't go to the end of the text box,
578 // come back again to finish handling this text box; don't advance to the next one.
579 if (static_cast<unsigned>(m_positionEndOffset) < textBoxEnd)
580 return;
581
582 // Advance and return
583 unsigned nextRunStart = nextTextBox ? nextTextBox->start() : str.length();
584 if (nextRunStart > runEnd)
585 m_lastTextNodeEndedWithCollapsedSpace = true; // collapsed space between runs or at the end
586 m_textBox = nextTextBox;
587 if (renderer->containsReversedText())
588 ++m_sortedTextBoxesPosition;
589 return;
590 }
591 // Advance and continue
592 m_textBox = nextTextBox;
593 if (renderer->containsReversedText())
594 ++m_sortedTextBoxesPosition;
595 }
596 if (!m_textBox && m_remainingTextBox) {
597 m_textBox = m_remainingTextBox;
598 m_remainingTextBox = 0;
599 m_firstLetterText = 0;
600 m_offset = 0;
601 handleTextBox();
602 }
603 }
604
firstRenderTextInFirstLetter(RenderObject * firstLetter)605 static inline RenderText* firstRenderTextInFirstLetter(RenderObject* firstLetter)
606 {
607 if (!firstLetter)
608 return 0;
609
610 // FIXME: Should this check descendent objects?
611 for (RenderObject* current = firstLetter->firstChild(); current; current = current->nextSibling()) {
612 if (current->isText())
613 return toRenderText(current);
614 }
615 return 0;
616 }
617
handleTextNodeFirstLetter(RenderTextFragment * renderer)618 void TextIterator::handleTextNodeFirstLetter(RenderTextFragment* renderer)
619 {
620 if (renderer->firstLetter()) {
621 RenderObject* r = renderer->firstLetter();
622 if (r->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
623 return;
624 if (RenderText* firstLetter = firstRenderTextInFirstLetter(r)) {
625 m_handledFirstLetter = true;
626 m_remainingTextBox = m_textBox;
627 m_textBox = firstLetter->firstTextBox();
628 m_firstLetterText = firstLetter;
629 }
630 }
631 m_handledFirstLetter = true;
632 }
633
handleReplacedElement()634 bool TextIterator::handleReplacedElement()
635 {
636 if (m_fullyClippedStack.top())
637 return false;
638
639 RenderObject* renderer = m_node->renderer();
640 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
641 return false;
642
643 if (m_lastTextNodeEndedWithCollapsedSpace) {
644 emitCharacter(' ', m_lastTextNode->parentNode(), m_lastTextNode, 1, 1);
645 return false;
646 }
647
648 if (m_entersTextControls && renderer->isTextControl()) {
649 if (HTMLElement* innerTextElement = toRenderTextControl(renderer)->innerTextElement()) {
650 m_node = innerTextElement->shadowTreeRootNode();
651 pushFullyClippedState(m_fullyClippedStack, m_node);
652 m_offset = 0;
653 return false;
654 }
655 }
656
657 m_hasEmitted = true;
658
659 if (m_emitsObjectReplacementCharacters && renderer && renderer->isReplaced()) {
660 emitCharacter(objectReplacementCharacter, m_node->parentNode(), m_node, 0, 1);
661 return true;
662 }
663
664 if (m_emitsCharactersBetweenAllVisiblePositions) {
665 // We want replaced elements to behave like punctuation for boundary
666 // finding, and to simply take up space for the selection preservation
667 // code in moveParagraphs, so we use a comma.
668 emitCharacter(',', m_node->parentNode(), m_node, 0, 1);
669 return true;
670 }
671
672 m_positionNode = m_node->parentNode();
673 m_positionOffsetBaseNode = m_node;
674 m_positionStartOffset = 0;
675 m_positionEndOffset = 1;
676
677 m_textCharacters = 0;
678 m_textLength = 0;
679
680 m_lastCharacter = 0;
681
682 return true;
683 }
684
hasVisibleTextNode(RenderText * renderer)685 bool TextIterator::hasVisibleTextNode(RenderText* renderer)
686 {
687 if (renderer->style()->visibility() == VISIBLE)
688 return true;
689 if (renderer->isTextFragment()) {
690 RenderTextFragment* fragment = static_cast<RenderTextFragment*>(renderer);
691 if (fragment->firstLetter() && fragment->firstLetter()->style()->visibility() == VISIBLE)
692 return true;
693 }
694 return false;
695 }
696
shouldEmitTabBeforeNode(Node * node)697 static bool shouldEmitTabBeforeNode(Node* node)
698 {
699 RenderObject* r = node->renderer();
700
701 // Table cells are delimited by tabs.
702 if (!r || !isTableCell(node))
703 return false;
704
705 // Want a tab before every cell other than the first one
706 RenderTableCell* rc = toRenderTableCell(r);
707 RenderTable* t = rc->table();
708 return t && (t->cellBefore(rc) || t->cellAbove(rc));
709 }
710
shouldEmitNewlineForNode(Node * node)711 static bool shouldEmitNewlineForNode(Node* node)
712 {
713 // br elements are represented by a single newline.
714 RenderObject* r = node->renderer();
715 if (!r)
716 return node->hasTagName(brTag);
717
718 return r->isBR();
719 }
720
shouldEmitNewlinesBeforeAndAfterNode(Node * node)721 static bool shouldEmitNewlinesBeforeAndAfterNode(Node* node)
722 {
723 // Block flow (versus inline flow) is represented by having
724 // a newline both before and after the element.
725 RenderObject* r = node->renderer();
726 if (!r) {
727 return (node->hasTagName(blockquoteTag)
728 || node->hasTagName(ddTag)
729 || node->hasTagName(divTag)
730 || node->hasTagName(dlTag)
731 || node->hasTagName(dtTag)
732 || node->hasTagName(h1Tag)
733 || node->hasTagName(h2Tag)
734 || node->hasTagName(h3Tag)
735 || node->hasTagName(h4Tag)
736 || node->hasTagName(h5Tag)
737 || node->hasTagName(h6Tag)
738 || node->hasTagName(hrTag)
739 || node->hasTagName(liTag)
740 || node->hasTagName(listingTag)
741 || node->hasTagName(olTag)
742 || node->hasTagName(pTag)
743 || node->hasTagName(preTag)
744 || node->hasTagName(trTag)
745 || node->hasTagName(ulTag));
746 }
747
748 // Need to make an exception for table cells, because they are blocks, but we
749 // want them tab-delimited rather than having newlines before and after.
750 if (isTableCell(node))
751 return false;
752
753 // Need to make an exception for table row elements, because they are neither
754 // "inline" or "RenderBlock", but we want newlines for them.
755 if (r->isTableRow()) {
756 RenderTable* t = toRenderTableRow(r)->table();
757 if (t && !t->isInline())
758 return true;
759 }
760
761 return !r->isInline() && r->isRenderBlock() && !r->isFloatingOrPositioned() && !r->isBody();
762 }
763
shouldEmitNewlineAfterNode(Node * node)764 static bool shouldEmitNewlineAfterNode(Node* node)
765 {
766 // FIXME: It should be better but slower to create a VisiblePosition here.
767 if (!shouldEmitNewlinesBeforeAndAfterNode(node))
768 return false;
769 // Check if this is the very last renderer in the document.
770 // If so, then we should not emit a newline.
771 while ((node = node->traverseNextSibling()))
772 if (node->renderer())
773 return true;
774 return false;
775 }
776
shouldEmitNewlineBeforeNode(Node * node)777 static bool shouldEmitNewlineBeforeNode(Node* node)
778 {
779 return shouldEmitNewlinesBeforeAndAfterNode(node);
780 }
781
shouldEmitExtraNewlineForNode(Node * node)782 static bool shouldEmitExtraNewlineForNode(Node* node)
783 {
784 // When there is a significant collapsed bottom margin, emit an extra
785 // newline for a more realistic result. We end up getting the right
786 // result even without margin collapsing. For example: <div><p>text</p></div>
787 // will work right even if both the <div> and the <p> have bottom margins.
788 RenderObject* r = node->renderer();
789 if (!r || !r->isBox())
790 return false;
791
792 // NOTE: We only do this for a select set of nodes, and fwiw WinIE appears
793 // not to do this at all
794 if (node->hasTagName(h1Tag)
795 || node->hasTagName(h2Tag)
796 || node->hasTagName(h3Tag)
797 || node->hasTagName(h4Tag)
798 || node->hasTagName(h5Tag)
799 || node->hasTagName(h6Tag)
800 || node->hasTagName(pTag)) {
801 RenderStyle* style = r->style();
802 if (style) {
803 int bottomMargin = toRenderBox(r)->collapsedMarginAfter();
804 int fontSize = style->fontDescription().computedPixelSize();
805 if (bottomMargin * 2 >= fontSize)
806 return true;
807 }
808 }
809
810 return false;
811 }
812
collapsedSpaceLength(RenderText * renderer,int textEnd)813 static int collapsedSpaceLength(RenderText* renderer, int textEnd)
814 {
815 const UChar* characters = renderer->text()->characters();
816 int length = renderer->text()->length();
817 for (int i = textEnd; i < length; ++i) {
818 if (!renderer->style()->isCollapsibleWhiteSpace(characters[i]))
819 return i - textEnd;
820 }
821
822 return length - textEnd;
823 }
824
maxOffsetIncludingCollapsedSpaces(Node * node)825 static int maxOffsetIncludingCollapsedSpaces(Node* node)
826 {
827 int offset = caretMaxOffset(node);
828
829 if (node->renderer() && node->renderer()->isText())
830 offset += collapsedSpaceLength(toRenderText(node->renderer()), offset);
831
832 return offset;
833 }
834
835 // Whether or not we should emit a character as we enter m_node (if it's a container) or as we hit it (if it's atomic).
shouldRepresentNodeOffsetZero()836 bool TextIterator::shouldRepresentNodeOffsetZero()
837 {
838 if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isTable())
839 return true;
840
841 // Leave element positioned flush with start of a paragraph
842 // (e.g. do not insert tab before a table cell at the start of a paragraph)
843 if (m_lastCharacter == '\n')
844 return false;
845
846 // Otherwise, show the position if we have emitted any characters
847 if (m_hasEmitted)
848 return true;
849
850 // We've not emitted anything yet. Generally, there is no need for any positioning then.
851 // The only exception is when the element is visually not in the same line as
852 // the start of the range (e.g. the range starts at the end of the previous paragraph).
853 // NOTE: Creating VisiblePositions and comparing them is relatively expensive, so we
854 // make quicker checks to possibly avoid that. Another check that we could make is
855 // is whether the inline vs block flow changed since the previous visible element.
856 // I think we're already in a special enough case that that won't be needed, tho.
857
858 // No character needed if this is the first node in the range.
859 if (m_node == m_startContainer)
860 return false;
861
862 // If we are outside the start container's subtree, assume we need to emit.
863 // FIXME: m_startContainer could be an inline block
864 if (!m_node->isDescendantOf(m_startContainer))
865 return true;
866
867 // If we started as m_startContainer offset 0 and the current node is a descendant of
868 // the start container, we already had enough context to correctly decide whether to
869 // emit after a preceding block. We chose not to emit (m_hasEmitted is false),
870 // so don't second guess that now.
871 // NOTE: Is this really correct when m_node is not a leftmost descendant? Probably
872 // immaterial since we likely would have already emitted something by now.
873 if (m_startOffset == 0)
874 return false;
875
876 // If this node is unrendered or invisible the VisiblePosition checks below won't have much meaning.
877 // Additionally, if the range we are iterating over contains huge sections of unrendered content,
878 // we would create VisiblePositions on every call to this function without this check.
879 if (!m_node->renderer() || m_node->renderer()->style()->visibility() != VISIBLE)
880 return false;
881
882 // The startPos.isNotNull() check is needed because the start could be before the body,
883 // and in that case we'll get null. We don't want to put in newlines at the start in that case.
884 // The currPos.isNotNull() check is needed because positions in non-HTML content
885 // (like SVG) do not have visible positions, and we don't want to emit for them either.
886 VisiblePosition startPos = VisiblePosition(Position(m_startContainer, m_startOffset, Position::PositionIsOffsetInAnchor), DOWNSTREAM);
887 VisiblePosition currPos = VisiblePosition(positionBeforeNode(m_node), DOWNSTREAM);
888 return startPos.isNotNull() && currPos.isNotNull() && !inSameLine(startPos, currPos);
889 }
890
shouldEmitSpaceBeforeAndAfterNode(Node * node)891 bool TextIterator::shouldEmitSpaceBeforeAndAfterNode(Node* node)
892 {
893 return node->renderer() && node->renderer()->isTable() && (node->renderer()->isInline() || m_emitsCharactersBetweenAllVisiblePositions);
894 }
895
representNodeOffsetZero()896 void TextIterator::representNodeOffsetZero()
897 {
898 // Emit a character to show the positioning of m_node.
899
900 // When we haven't been emitting any characters, shouldRepresentNodeOffsetZero() can
901 // create VisiblePositions, which is expensive. So, we perform the inexpensive checks
902 // on m_node to see if it necessitates emitting a character first and will early return
903 // before encountering shouldRepresentNodeOffsetZero()s worse case behavior.
904 if (shouldEmitTabBeforeNode(m_node)) {
905 if (shouldRepresentNodeOffsetZero())
906 emitCharacter('\t', m_node->parentNode(), m_node, 0, 0);
907 } else if (shouldEmitNewlineBeforeNode(m_node)) {
908 if (shouldRepresentNodeOffsetZero())
909 emitCharacter('\n', m_node->parentNode(), m_node, 0, 0);
910 } else if (shouldEmitSpaceBeforeAndAfterNode(m_node)) {
911 if (shouldRepresentNodeOffsetZero())
912 emitCharacter(' ', m_node->parentNode(), m_node, 0, 0);
913 }
914 }
915
handleNonTextNode()916 bool TextIterator::handleNonTextNode()
917 {
918 if (shouldEmitNewlineForNode(m_node))
919 emitCharacter('\n', m_node->parentNode(), m_node, 0, 1);
920 else if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isHR())
921 emitCharacter(' ', m_node->parentNode(), m_node, 0, 1);
922 else
923 representNodeOffsetZero();
924
925 return true;
926 }
927
exitNode()928 void TextIterator::exitNode()
929 {
930 // prevent emitting a newline when exiting a collapsed block at beginning of the range
931 // FIXME: !m_hasEmitted does not necessarily mean there was a collapsed block... it could
932 // have been an hr (e.g.). Also, a collapsed block could have height (e.g. a table) and
933 // therefore look like a blank line.
934 if (!m_hasEmitted)
935 return;
936
937 // Emit with a position *inside* m_node, after m_node's contents, in
938 // case it is a block, because the run should start where the
939 // emitted character is positioned visually.
940 Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
941 // FIXME: This shouldn't require the m_lastTextNode to be true, but we can't change that without making
942 // the logic in _web_attributedStringFromRange match. We'll get that for free when we switch to use
943 // TextIterator in _web_attributedStringFromRange.
944 // See <rdar://problem/5428427> for an example of how this mismatch will cause problems.
945 if (m_lastTextNode && shouldEmitNewlineAfterNode(m_node)) {
946 // use extra newline to represent margin bottom, as needed
947 bool addNewline = shouldEmitExtraNewlineForNode(m_node);
948
949 // FIXME: We need to emit a '\n' as we leave an empty block(s) that
950 // contain a VisiblePosition when doing selection preservation.
951 if (m_lastCharacter != '\n') {
952 // insert a newline with a position following this block's contents.
953 emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
954 // remember whether to later add a newline for the current node
955 ASSERT(!m_needsAnotherNewline);
956 m_needsAnotherNewline = addNewline;
957 } else if (addNewline)
958 // insert a newline with a position following this block's contents.
959 emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
960 }
961
962 // If nothing was emitted, see if we need to emit a space.
963 if (!m_positionNode && shouldEmitSpaceBeforeAndAfterNode(m_node))
964 emitCharacter(' ', baseNode->parentNode(), baseNode, 1, 1);
965 }
966
emitCharacter(UChar c,Node * textNode,Node * offsetBaseNode,int textStartOffset,int textEndOffset)967 void TextIterator::emitCharacter(UChar c, Node* textNode, Node* offsetBaseNode, int textStartOffset, int textEndOffset)
968 {
969 m_hasEmitted = true;
970
971 // remember information with which to construct the TextIterator::range()
972 // NOTE: textNode is often not a text node, so the range will specify child nodes of positionNode
973 m_positionNode = textNode;
974 m_positionOffsetBaseNode = offsetBaseNode;
975 m_positionStartOffset = textStartOffset;
976 m_positionEndOffset = textEndOffset;
977
978 // remember information with which to construct the TextIterator::characters() and length()
979 m_singleCharacterBuffer = c;
980 m_textCharacters = &m_singleCharacterBuffer;
981 m_textLength = 1;
982
983 // remember some iteration state
984 m_lastTextNodeEndedWithCollapsedSpace = false;
985 m_lastCharacter = c;
986 }
987
emitText(Node * textNode,RenderObject * renderObject,int textStartOffset,int textEndOffset)988 void TextIterator::emitText(Node* textNode, RenderObject* renderObject, int textStartOffset, int textEndOffset)
989 {
990 RenderText* renderer = toRenderText(renderObject);
991 m_text = m_emitsTextWithoutTranscoding ? renderer->textWithoutTranscoding() : renderer->text();
992 ASSERT(m_text.characters());
993 ASSERT(0 <= textStartOffset && textStartOffset < static_cast<int>(m_text.length()));
994 ASSERT(0 <= textEndOffset && textEndOffset <= static_cast<int>(m_text.length()));
995 ASSERT(textStartOffset <= textEndOffset);
996
997 m_positionNode = textNode;
998 m_positionOffsetBaseNode = 0;
999 m_positionStartOffset = textStartOffset;
1000 m_positionEndOffset = textEndOffset;
1001 m_textCharacters = m_text.characters() + textStartOffset;
1002 m_textLength = textEndOffset - textStartOffset;
1003 m_lastCharacter = m_text[textEndOffset - 1];
1004
1005 m_lastTextNodeEndedWithCollapsedSpace = false;
1006 m_hasEmitted = true;
1007 }
1008
emitText(Node * textNode,int textStartOffset,int textEndOffset)1009 void TextIterator::emitText(Node* textNode, int textStartOffset, int textEndOffset)
1010 {
1011 emitText(textNode, m_node->renderer(), textStartOffset, textEndOffset);
1012 }
1013
range() const1014 PassRefPtr<Range> TextIterator::range() const
1015 {
1016 // use the current run information, if we have it
1017 if (m_positionNode) {
1018 if (m_positionOffsetBaseNode) {
1019 int index = m_positionOffsetBaseNode->nodeIndex();
1020 m_positionStartOffset += index;
1021 m_positionEndOffset += index;
1022 m_positionOffsetBaseNode = 0;
1023 }
1024 return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1025 }
1026
1027 // otherwise, return the end of the overall range we were given
1028 if (m_endContainer)
1029 return Range::create(m_endContainer->document(), m_endContainer, m_endOffset, m_endContainer, m_endOffset);
1030
1031 return 0;
1032 }
1033
node() const1034 Node* TextIterator::node() const
1035 {
1036 RefPtr<Range> textRange = range();
1037 if (!textRange)
1038 return 0;
1039
1040 Node* node = textRange->startContainer();
1041 if (!node)
1042 return 0;
1043 if (node->offsetInCharacters())
1044 return node;
1045
1046 return node->childNode(textRange->startOffset());
1047 }
1048
1049 // --------
1050
SimplifiedBackwardsTextIterator()1051 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator()
1052 : m_behavior(TextIteratorDefaultBehavior)
1053 , m_node(0)
1054 , m_offset(0)
1055 , m_handledNode(false)
1056 , m_handledChildren(false)
1057 , m_startNode(0)
1058 , m_startOffset(0)
1059 , m_endNode(0)
1060 , m_endOffset(0)
1061 , m_positionNode(0)
1062 , m_positionStartOffset(0)
1063 , m_positionEndOffset(0)
1064 , m_textCharacters(0)
1065 , m_textLength(0)
1066 , m_lastTextNode(0)
1067 , m_lastCharacter(0)
1068 , m_singleCharacterBuffer(0)
1069 , m_havePassedStartNode(false)
1070 , m_shouldHandleFirstLetter(false)
1071 {
1072 }
1073
SimplifiedBackwardsTextIterator(const Range * r,TextIteratorBehavior behavior)1074 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator(const Range* r, TextIteratorBehavior behavior)
1075 : m_behavior(behavior)
1076 , m_node(0)
1077 , m_offset(0)
1078 , m_handledNode(false)
1079 , m_handledChildren(false)
1080 , m_startNode(0)
1081 , m_startOffset(0)
1082 , m_endNode(0)
1083 , m_endOffset(0)
1084 , m_positionNode(0)
1085 , m_positionStartOffset(0)
1086 , m_positionEndOffset(0)
1087 , m_textCharacters(0)
1088 , m_textLength(0)
1089 , m_lastTextNode(0)
1090 , m_lastCharacter(0)
1091 , m_singleCharacterBuffer(0)
1092 , m_havePassedStartNode(false)
1093 , m_shouldHandleFirstLetter(false)
1094 {
1095 ASSERT(m_behavior == TextIteratorDefaultBehavior);
1096
1097 if (!r)
1098 return;
1099
1100 Node* startNode = r->startContainer();
1101 if (!startNode)
1102 return;
1103 Node* endNode = r->endContainer();
1104 int startOffset = r->startOffset();
1105 int endOffset = r->endOffset();
1106
1107 if (!startNode->offsetInCharacters()) {
1108 if (startOffset >= 0 && startOffset < static_cast<int>(startNode->childNodeCount())) {
1109 startNode = startNode->childNode(startOffset);
1110 startOffset = 0;
1111 }
1112 }
1113 if (!endNode->offsetInCharacters()) {
1114 if (endOffset > 0 && endOffset <= static_cast<int>(endNode->childNodeCount())) {
1115 endNode = endNode->childNode(endOffset - 1);
1116 endOffset = lastOffsetInNode(endNode);
1117 }
1118 }
1119
1120 m_node = endNode;
1121 setUpFullyClippedStack(m_fullyClippedStack, m_node);
1122 m_offset = endOffset;
1123 m_handledNode = false;
1124 m_handledChildren = endOffset == 0;
1125
1126 m_startNode = startNode;
1127 m_startOffset = startOffset;
1128 m_endNode = endNode;
1129 m_endOffset = endOffset;
1130
1131 #ifndef NDEBUG
1132 // Need this just because of the assert.
1133 m_positionNode = endNode;
1134 #endif
1135
1136 m_lastTextNode = 0;
1137 m_lastCharacter = '\n';
1138
1139 m_havePassedStartNode = false;
1140
1141 advance();
1142 }
1143
advance()1144 void SimplifiedBackwardsTextIterator::advance()
1145 {
1146 ASSERT(m_positionNode);
1147
1148 m_positionNode = 0;
1149 m_textLength = 0;
1150
1151 while (m_node && !m_havePassedStartNode) {
1152 // Don't handle node if we start iterating at [node, 0].
1153 if (!m_handledNode && !(m_node == m_endNode && m_endOffset == 0)) {
1154 RenderObject* renderer = m_node->renderer();
1155 if (renderer && renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) {
1156 // FIXME: What about CDATA_SECTION_NODE?
1157 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
1158 m_handledNode = handleTextNode();
1159 } else if (renderer && (renderer->isImage() || renderer->isWidget())) {
1160 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
1161 m_handledNode = handleReplacedElement();
1162 } else
1163 m_handledNode = handleNonTextNode();
1164 if (m_positionNode)
1165 return;
1166 }
1167
1168 if (!m_handledChildren && m_node->hasChildNodes()) {
1169 m_node = m_node->lastChild();
1170 pushFullyClippedState(m_fullyClippedStack, m_node);
1171 } else {
1172 // Exit empty containers as we pass over them or containers
1173 // where [container, 0] is where we started iterating.
1174 if (!m_handledNode
1175 && canHaveChildrenForEditing(m_node)
1176 && m_node->parentNode()
1177 && (!m_node->lastChild() || (m_node == m_endNode && !m_endOffset))) {
1178 exitNode();
1179 if (m_positionNode) {
1180 m_handledNode = true;
1181 m_handledChildren = true;
1182 return;
1183 }
1184 }
1185
1186 // Exit all other containers.
1187 while (!m_node->previousSibling()) {
1188 if (!advanceRespectingRange(m_node->parentOrHostNode()))
1189 break;
1190 m_fullyClippedStack.pop();
1191 exitNode();
1192 if (m_positionNode) {
1193 m_handledNode = true;
1194 m_handledChildren = true;
1195 return;
1196 }
1197 }
1198
1199 m_fullyClippedStack.pop();
1200 if (advanceRespectingRange(m_node->previousSibling()))
1201 pushFullyClippedState(m_fullyClippedStack, m_node);
1202 else
1203 m_node = 0;
1204 }
1205
1206 // For the purpose of word boundary detection,
1207 // we should iterate all visible text and trailing (collapsed) whitespaces.
1208 m_offset = m_node ? maxOffsetIncludingCollapsedSpaces(m_node) : 0;
1209 m_handledNode = false;
1210 m_handledChildren = false;
1211
1212 if (m_positionNode)
1213 return;
1214 }
1215 }
1216
handleTextNode()1217 bool SimplifiedBackwardsTextIterator::handleTextNode()
1218 {
1219 m_lastTextNode = m_node;
1220
1221 int startOffset;
1222 int offsetInNode;
1223 RenderText* renderer = handleFirstLetter(startOffset, offsetInNode);
1224 if (!renderer)
1225 return true;
1226
1227 String text = renderer->text();
1228 if (!renderer->firstTextBox() && text.length() > 0)
1229 return true;
1230
1231 m_positionEndOffset = m_offset;
1232 m_offset = startOffset + offsetInNode;
1233 m_positionNode = m_node;
1234 m_positionStartOffset = m_offset;
1235
1236 ASSERT(0 <= m_positionStartOffset - offsetInNode && m_positionStartOffset - offsetInNode <= static_cast<int>(text.length()));
1237 ASSERT(1 <= m_positionEndOffset - offsetInNode && m_positionEndOffset - offsetInNode <= static_cast<int>(text.length()));
1238 ASSERT(m_positionStartOffset <= m_positionEndOffset);
1239
1240 m_textLength = m_positionEndOffset - m_positionStartOffset;
1241 m_textCharacters = text.characters() + (m_positionStartOffset - offsetInNode);
1242 ASSERT(m_textCharacters >= text.characters());
1243 ASSERT(m_textCharacters + m_textLength <= text.characters() + static_cast<int>(text.length()));
1244
1245 m_lastCharacter = text[m_positionEndOffset - 1];
1246
1247 return !m_shouldHandleFirstLetter;
1248 }
1249
handleFirstLetter(int & startOffset,int & offsetInNode)1250 RenderText* SimplifiedBackwardsTextIterator::handleFirstLetter(int& startOffset, int& offsetInNode)
1251 {
1252 RenderText* renderer = toRenderText(m_node->renderer());
1253 startOffset = (m_node == m_startNode) ? m_startOffset : 0;
1254
1255 if (!renderer->isTextFragment()) {
1256 offsetInNode = 0;
1257 return renderer;
1258 }
1259
1260 RenderTextFragment* fragment = toRenderTextFragment(renderer);
1261 int offsetAfterFirstLetter = fragment->start();
1262 if (startOffset >= offsetAfterFirstLetter) {
1263 ASSERT(!m_shouldHandleFirstLetter);
1264 offsetInNode = offsetAfterFirstLetter;
1265 return renderer;
1266 }
1267
1268 if (!m_shouldHandleFirstLetter && offsetAfterFirstLetter < m_offset) {
1269 m_shouldHandleFirstLetter = true;
1270 offsetInNode = offsetAfterFirstLetter;
1271 return renderer;
1272 }
1273
1274 m_shouldHandleFirstLetter = false;
1275 offsetInNode = 0;
1276 return firstRenderTextInFirstLetter(fragment->firstLetter());
1277 }
1278
handleReplacedElement()1279 bool SimplifiedBackwardsTextIterator::handleReplacedElement()
1280 {
1281 unsigned index = m_node->nodeIndex();
1282 // We want replaced elements to behave like punctuation for boundary
1283 // finding, and to simply take up space for the selection preservation
1284 // code in moveParagraphs, so we use a comma. Unconditionally emit
1285 // here because this iterator is only used for boundary finding.
1286 emitCharacter(',', m_node->parentNode(), index, index + 1);
1287 return true;
1288 }
1289
handleNonTextNode()1290 bool SimplifiedBackwardsTextIterator::handleNonTextNode()
1291 {
1292 // We can use a linefeed in place of a tab because this simple iterator is only used to
1293 // find boundaries, not actual content. A linefeed breaks words, sentences, and paragraphs.
1294 if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineAfterNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
1295 unsigned index = m_node->nodeIndex();
1296 // The start of this emitted range is wrong. Ensuring correctness would require
1297 // VisiblePositions and so would be slow. previousBoundary expects this.
1298 emitCharacter('\n', m_node->parentNode(), index + 1, index + 1);
1299 }
1300 return true;
1301 }
1302
exitNode()1303 void SimplifiedBackwardsTextIterator::exitNode()
1304 {
1305 if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineBeforeNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
1306 // The start of this emitted range is wrong. Ensuring correctness would require
1307 // VisiblePositions and so would be slow. previousBoundary expects this.
1308 emitCharacter('\n', m_node, 0, 0);
1309 }
1310 }
1311
emitCharacter(UChar c,Node * node,int startOffset,int endOffset)1312 void SimplifiedBackwardsTextIterator::emitCharacter(UChar c, Node* node, int startOffset, int endOffset)
1313 {
1314 m_singleCharacterBuffer = c;
1315 m_positionNode = node;
1316 m_positionStartOffset = startOffset;
1317 m_positionEndOffset = endOffset;
1318 m_textCharacters = &m_singleCharacterBuffer;
1319 m_textLength = 1;
1320 m_lastCharacter = c;
1321 }
1322
advanceRespectingRange(Node * next)1323 bool SimplifiedBackwardsTextIterator::advanceRespectingRange(Node* next)
1324 {
1325 if (!next)
1326 return false;
1327 m_havePassedStartNode |= m_node == m_startNode;
1328 if (m_havePassedStartNode)
1329 return false;
1330 m_node = next;
1331 return true;
1332 }
1333
range() const1334 PassRefPtr<Range> SimplifiedBackwardsTextIterator::range() const
1335 {
1336 if (m_positionNode)
1337 return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1338
1339 return Range::create(m_startNode->document(), m_startNode, m_startOffset, m_startNode, m_startOffset);
1340 }
1341
1342 // --------
1343
CharacterIterator()1344 CharacterIterator::CharacterIterator()
1345 : m_offset(0)
1346 , m_runOffset(0)
1347 , m_atBreak(true)
1348 {
1349 }
1350
CharacterIterator(const Range * r,TextIteratorBehavior behavior)1351 CharacterIterator::CharacterIterator(const Range* r, TextIteratorBehavior behavior)
1352 : m_offset(0)
1353 , m_runOffset(0)
1354 , m_atBreak(true)
1355 , m_textIterator(r, behavior)
1356 {
1357 while (!atEnd() && m_textIterator.length() == 0)
1358 m_textIterator.advance();
1359 }
1360
range() const1361 PassRefPtr<Range> CharacterIterator::range() const
1362 {
1363 RefPtr<Range> r = m_textIterator.range();
1364 if (!m_textIterator.atEnd()) {
1365 if (m_textIterator.length() <= 1) {
1366 ASSERT(m_runOffset == 0);
1367 } else {
1368 Node* n = r->startContainer();
1369 ASSERT(n == r->endContainer());
1370 int offset = r->startOffset() + m_runOffset;
1371 ExceptionCode ec = 0;
1372 r->setStart(n, offset, ec);
1373 r->setEnd(n, offset + 1, ec);
1374 ASSERT(!ec);
1375 }
1376 }
1377 return r.release();
1378 }
1379
advance(int count)1380 void CharacterIterator::advance(int count)
1381 {
1382 if (count <= 0) {
1383 ASSERT(count == 0);
1384 return;
1385 }
1386
1387 m_atBreak = false;
1388
1389 // easy if there is enough left in the current m_textIterator run
1390 int remaining = m_textIterator.length() - m_runOffset;
1391 if (count < remaining) {
1392 m_runOffset += count;
1393 m_offset += count;
1394 return;
1395 }
1396
1397 // exhaust the current m_textIterator run
1398 count -= remaining;
1399 m_offset += remaining;
1400
1401 // move to a subsequent m_textIterator run
1402 for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
1403 int runLength = m_textIterator.length();
1404 if (runLength == 0)
1405 m_atBreak = true;
1406 else {
1407 // see whether this is m_textIterator to use
1408 if (count < runLength) {
1409 m_runOffset = count;
1410 m_offset += count;
1411 return;
1412 }
1413
1414 // exhaust this m_textIterator run
1415 count -= runLength;
1416 m_offset += runLength;
1417 }
1418 }
1419
1420 // ran to the end of the m_textIterator... no more runs left
1421 m_atBreak = true;
1422 m_runOffset = 0;
1423 }
1424
string(int numChars)1425 String CharacterIterator::string(int numChars)
1426 {
1427 Vector<UChar> result;
1428 result.reserveInitialCapacity(numChars);
1429 while (numChars > 0 && !atEnd()) {
1430 int runSize = min(numChars, length());
1431 result.append(characters(), runSize);
1432 numChars -= runSize;
1433 advance(runSize);
1434 }
1435 return String::adopt(result);
1436 }
1437
characterSubrange(CharacterIterator & it,int offset,int length)1438 static PassRefPtr<Range> characterSubrange(CharacterIterator& it, int offset, int length)
1439 {
1440 it.advance(offset);
1441 RefPtr<Range> start = it.range();
1442
1443 if (length > 1)
1444 it.advance(length - 1);
1445 RefPtr<Range> end = it.range();
1446
1447 return Range::create(start->startContainer()->document(),
1448 start->startContainer(), start->startOffset(),
1449 end->endContainer(), end->endOffset());
1450 }
1451
BackwardsCharacterIterator()1452 BackwardsCharacterIterator::BackwardsCharacterIterator()
1453 : m_offset(0)
1454 , m_runOffset(0)
1455 , m_atBreak(true)
1456 {
1457 }
1458
BackwardsCharacterIterator(const Range * range,TextIteratorBehavior behavior)1459 BackwardsCharacterIterator::BackwardsCharacterIterator(const Range* range, TextIteratorBehavior behavior)
1460 : m_offset(0)
1461 , m_runOffset(0)
1462 , m_atBreak(true)
1463 , m_textIterator(range, behavior)
1464 {
1465 while (!atEnd() && !m_textIterator.length())
1466 m_textIterator.advance();
1467 }
1468
range() const1469 PassRefPtr<Range> BackwardsCharacterIterator::range() const
1470 {
1471 RefPtr<Range> r = m_textIterator.range();
1472 if (!m_textIterator.atEnd()) {
1473 if (m_textIterator.length() <= 1)
1474 ASSERT(m_runOffset == 0);
1475 else {
1476 Node* n = r->startContainer();
1477 ASSERT(n == r->endContainer());
1478 int offset = r->endOffset() - m_runOffset;
1479 ExceptionCode ec = 0;
1480 r->setStart(n, offset - 1, ec);
1481 r->setEnd(n, offset, ec);
1482 ASSERT(!ec);
1483 }
1484 }
1485 return r.release();
1486 }
1487
advance(int count)1488 void BackwardsCharacterIterator::advance(int count)
1489 {
1490 if (count <= 0) {
1491 ASSERT(!count);
1492 return;
1493 }
1494
1495 m_atBreak = false;
1496
1497 int remaining = m_textIterator.length() - m_runOffset;
1498 if (count < remaining) {
1499 m_runOffset += count;
1500 m_offset += count;
1501 return;
1502 }
1503
1504 count -= remaining;
1505 m_offset += remaining;
1506
1507 for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
1508 int runLength = m_textIterator.length();
1509 if (runLength == 0)
1510 m_atBreak = true;
1511 else {
1512 if (count < runLength) {
1513 m_runOffset = count;
1514 m_offset += count;
1515 return;
1516 }
1517
1518 count -= runLength;
1519 m_offset += runLength;
1520 }
1521 }
1522
1523 m_atBreak = true;
1524 m_runOffset = 0;
1525 }
1526
1527 // --------
1528
WordAwareIterator()1529 WordAwareIterator::WordAwareIterator()
1530 : m_previousText(0)
1531 , m_didLookAhead(false)
1532 {
1533 }
1534
WordAwareIterator(const Range * r)1535 WordAwareIterator::WordAwareIterator(const Range* r)
1536 : m_previousText(0)
1537 , m_didLookAhead(true) // so we consider the first chunk from the text iterator
1538 , m_textIterator(r)
1539 {
1540 advance(); // get in position over the first chunk of text
1541 }
1542
~WordAwareIterator()1543 WordAwareIterator::~WordAwareIterator()
1544 {
1545 }
1546
1547 // We're always in one of these modes:
1548 // - The current chunk in the text iterator is our current chunk
1549 // (typically its a piece of whitespace, or text that ended with whitespace)
1550 // - The previous chunk in the text iterator is our current chunk
1551 // (we looked ahead to the next chunk and found a word boundary)
1552 // - We built up our own chunk of text from many chunks from the text iterator
1553
1554 // FIXME: Performance could be bad for huge spans next to each other that don't fall on word boundaries.
1555
advance()1556 void WordAwareIterator::advance()
1557 {
1558 m_previousText = 0;
1559 m_buffer.clear(); // toss any old buffer we built up
1560
1561 // If last time we did a look-ahead, start with that looked-ahead chunk now
1562 if (!m_didLookAhead) {
1563 ASSERT(!m_textIterator.atEnd());
1564 m_textIterator.advance();
1565 }
1566 m_didLookAhead = false;
1567
1568 // Go to next non-empty chunk
1569 while (!m_textIterator.atEnd() && m_textIterator.length() == 0)
1570 m_textIterator.advance();
1571 m_range = m_textIterator.range();
1572
1573 if (m_textIterator.atEnd())
1574 return;
1575
1576 while (1) {
1577 // If this chunk ends in whitespace we can just use it as our chunk.
1578 if (isSpaceOrNewline(m_textIterator.characters()[m_textIterator.length() - 1]))
1579 return;
1580
1581 // If this is the first chunk that failed, save it in previousText before look ahead
1582 if (m_buffer.isEmpty()) {
1583 m_previousText = m_textIterator.characters();
1584 m_previousLength = m_textIterator.length();
1585 }
1586
1587 // Look ahead to next chunk. If it is whitespace or a break, we can use the previous stuff
1588 m_textIterator.advance();
1589 if (m_textIterator.atEnd() || m_textIterator.length() == 0 || isSpaceOrNewline(m_textIterator.characters()[0])) {
1590 m_didLookAhead = true;
1591 return;
1592 }
1593
1594 if (m_buffer.isEmpty()) {
1595 // Start gobbling chunks until we get to a suitable stopping point
1596 m_buffer.append(m_previousText, m_previousLength);
1597 m_previousText = 0;
1598 }
1599 m_buffer.append(m_textIterator.characters(), m_textIterator.length());
1600 int exception = 0;
1601 m_range->setEnd(m_textIterator.range()->endContainer(), m_textIterator.range()->endOffset(), exception);
1602 }
1603 }
1604
length() const1605 int WordAwareIterator::length() const
1606 {
1607 if (!m_buffer.isEmpty())
1608 return m_buffer.size();
1609 if (m_previousText)
1610 return m_previousLength;
1611 return m_textIterator.length();
1612 }
1613
characters() const1614 const UChar* WordAwareIterator::characters() const
1615 {
1616 if (!m_buffer.isEmpty())
1617 return m_buffer.data();
1618 if (m_previousText)
1619 return m_previousText;
1620 return m_textIterator.characters();
1621 }
1622
1623 // --------
1624
foldQuoteMarkOrSoftHyphen(UChar c)1625 static inline UChar foldQuoteMarkOrSoftHyphen(UChar c)
1626 {
1627 switch (c) {
1628 case hebrewPunctuationGershayim:
1629 case leftDoubleQuotationMark:
1630 case rightDoubleQuotationMark:
1631 return '"';
1632 case hebrewPunctuationGeresh:
1633 case leftSingleQuotationMark:
1634 case rightSingleQuotationMark:
1635 return '\'';
1636 case softHyphen:
1637 // Replace soft hyphen with an ignorable character so that their presence or absence will
1638 // not affect string comparison.
1639 return 0;
1640 default:
1641 return c;
1642 }
1643 }
1644
foldQuoteMarksAndSoftHyphens(String & s)1645 static inline void foldQuoteMarksAndSoftHyphens(String& s)
1646 {
1647 s.replace(hebrewPunctuationGeresh, '\'');
1648 s.replace(hebrewPunctuationGershayim, '"');
1649 s.replace(leftDoubleQuotationMark, '"');
1650 s.replace(leftSingleQuotationMark, '\'');
1651 s.replace(rightDoubleQuotationMark, '"');
1652 s.replace(rightSingleQuotationMark, '\'');
1653 // Replace soft hyphen with an ignorable character so that their presence or absence will
1654 // not affect string comparison.
1655 s.replace(softHyphen, 0);
1656 }
1657
1658 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
1659
foldQuoteMarksAndSoftHyphens(UChar * data,size_t length)1660 static inline void foldQuoteMarksAndSoftHyphens(UChar* data, size_t length)
1661 {
1662 for (size_t i = 0; i < length; ++i)
1663 data[i] = foldQuoteMarkOrSoftHyphen(data[i]);
1664 }
1665
1666 static const size_t minimumSearchBufferSize = 8192;
1667
1668 #ifndef NDEBUG
1669 static bool searcherInUse;
1670 #endif
1671
createSearcher()1672 static UStringSearch* createSearcher()
1673 {
1674 // Provide a non-empty pattern and non-empty text so usearch_open will not fail,
1675 // but it doesn't matter exactly what it is, since we don't perform any searches
1676 // without setting both the pattern and the text.
1677 UErrorCode status = U_ZERO_ERROR;
1678 UStringSearch* searcher = usearch_open(&newlineCharacter, 1, &newlineCharacter, 1, currentSearchLocaleID(), 0, &status);
1679 ASSERT(status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING || status == U_USING_DEFAULT_WARNING);
1680 return searcher;
1681 }
1682
searcher()1683 static UStringSearch* searcher()
1684 {
1685 static UStringSearch* searcher = createSearcher();
1686 return searcher;
1687 }
1688
lockSearcher()1689 static inline void lockSearcher()
1690 {
1691 #ifndef NDEBUG
1692 ASSERT(!searcherInUse);
1693 searcherInUse = true;
1694 #endif
1695 }
1696
unlockSearcher()1697 static inline void unlockSearcher()
1698 {
1699 #ifndef NDEBUG
1700 ASSERT(searcherInUse);
1701 searcherInUse = false;
1702 #endif
1703 }
1704
1705 // ICU's search ignores the distinction between small kana letters and ones
1706 // that are not small, and also characters that differ only in the voicing
1707 // marks when considering only primary collation strength diffrences.
1708 // This is not helpful for end users, since these differences make words
1709 // distinct, so for our purposes we need these to be considered.
1710 // The Unicode folks do not think the collation algorithm should be
1711 // changed. To work around this, we would like to tailor the ICU searcher,
1712 // but we can't get that to work yet. So instead, we check for cases where
1713 // these differences occur, and skip those matches.
1714
1715 // We refer to the above technique as the "kana workaround". The next few
1716 // functions are helper functinos for the kana workaround.
1717
isKanaLetter(UChar character)1718 static inline bool isKanaLetter(UChar character)
1719 {
1720 // Hiragana letters.
1721 if (character >= 0x3041 && character <= 0x3096)
1722 return true;
1723
1724 // Katakana letters.
1725 if (character >= 0x30A1 && character <= 0x30FA)
1726 return true;
1727 if (character >= 0x31F0 && character <= 0x31FF)
1728 return true;
1729
1730 // Halfwidth katakana letters.
1731 if (character >= 0xFF66 && character <= 0xFF9D && character != 0xFF70)
1732 return true;
1733
1734 return false;
1735 }
1736
isSmallKanaLetter(UChar character)1737 static inline bool isSmallKanaLetter(UChar character)
1738 {
1739 ASSERT(isKanaLetter(character));
1740
1741 switch (character) {
1742 case 0x3041: // HIRAGANA LETTER SMALL A
1743 case 0x3043: // HIRAGANA LETTER SMALL I
1744 case 0x3045: // HIRAGANA LETTER SMALL U
1745 case 0x3047: // HIRAGANA LETTER SMALL E
1746 case 0x3049: // HIRAGANA LETTER SMALL O
1747 case 0x3063: // HIRAGANA LETTER SMALL TU
1748 case 0x3083: // HIRAGANA LETTER SMALL YA
1749 case 0x3085: // HIRAGANA LETTER SMALL YU
1750 case 0x3087: // HIRAGANA LETTER SMALL YO
1751 case 0x308E: // HIRAGANA LETTER SMALL WA
1752 case 0x3095: // HIRAGANA LETTER SMALL KA
1753 case 0x3096: // HIRAGANA LETTER SMALL KE
1754 case 0x30A1: // KATAKANA LETTER SMALL A
1755 case 0x30A3: // KATAKANA LETTER SMALL I
1756 case 0x30A5: // KATAKANA LETTER SMALL U
1757 case 0x30A7: // KATAKANA LETTER SMALL E
1758 case 0x30A9: // KATAKANA LETTER SMALL O
1759 case 0x30C3: // KATAKANA LETTER SMALL TU
1760 case 0x30E3: // KATAKANA LETTER SMALL YA
1761 case 0x30E5: // KATAKANA LETTER SMALL YU
1762 case 0x30E7: // KATAKANA LETTER SMALL YO
1763 case 0x30EE: // KATAKANA LETTER SMALL WA
1764 case 0x30F5: // KATAKANA LETTER SMALL KA
1765 case 0x30F6: // KATAKANA LETTER SMALL KE
1766 case 0x31F0: // KATAKANA LETTER SMALL KU
1767 case 0x31F1: // KATAKANA LETTER SMALL SI
1768 case 0x31F2: // KATAKANA LETTER SMALL SU
1769 case 0x31F3: // KATAKANA LETTER SMALL TO
1770 case 0x31F4: // KATAKANA LETTER SMALL NU
1771 case 0x31F5: // KATAKANA LETTER SMALL HA
1772 case 0x31F6: // KATAKANA LETTER SMALL HI
1773 case 0x31F7: // KATAKANA LETTER SMALL HU
1774 case 0x31F8: // KATAKANA LETTER SMALL HE
1775 case 0x31F9: // KATAKANA LETTER SMALL HO
1776 case 0x31FA: // KATAKANA LETTER SMALL MU
1777 case 0x31FB: // KATAKANA LETTER SMALL RA
1778 case 0x31FC: // KATAKANA LETTER SMALL RI
1779 case 0x31FD: // KATAKANA LETTER SMALL RU
1780 case 0x31FE: // KATAKANA LETTER SMALL RE
1781 case 0x31FF: // KATAKANA LETTER SMALL RO
1782 case 0xFF67: // HALFWIDTH KATAKANA LETTER SMALL A
1783 case 0xFF68: // HALFWIDTH KATAKANA LETTER SMALL I
1784 case 0xFF69: // HALFWIDTH KATAKANA LETTER SMALL U
1785 case 0xFF6A: // HALFWIDTH KATAKANA LETTER SMALL E
1786 case 0xFF6B: // HALFWIDTH KATAKANA LETTER SMALL O
1787 case 0xFF6C: // HALFWIDTH KATAKANA LETTER SMALL YA
1788 case 0xFF6D: // HALFWIDTH KATAKANA LETTER SMALL YU
1789 case 0xFF6E: // HALFWIDTH KATAKANA LETTER SMALL YO
1790 case 0xFF6F: // HALFWIDTH KATAKANA LETTER SMALL TU
1791 return true;
1792 }
1793 return false;
1794 }
1795
1796 enum VoicedSoundMarkType { NoVoicedSoundMark, VoicedSoundMark, SemiVoicedSoundMark };
1797
composedVoicedSoundMark(UChar character)1798 static inline VoicedSoundMarkType composedVoicedSoundMark(UChar character)
1799 {
1800 ASSERT(isKanaLetter(character));
1801
1802 switch (character) {
1803 case 0x304C: // HIRAGANA LETTER GA
1804 case 0x304E: // HIRAGANA LETTER GI
1805 case 0x3050: // HIRAGANA LETTER GU
1806 case 0x3052: // HIRAGANA LETTER GE
1807 case 0x3054: // HIRAGANA LETTER GO
1808 case 0x3056: // HIRAGANA LETTER ZA
1809 case 0x3058: // HIRAGANA LETTER ZI
1810 case 0x305A: // HIRAGANA LETTER ZU
1811 case 0x305C: // HIRAGANA LETTER ZE
1812 case 0x305E: // HIRAGANA LETTER ZO
1813 case 0x3060: // HIRAGANA LETTER DA
1814 case 0x3062: // HIRAGANA LETTER DI
1815 case 0x3065: // HIRAGANA LETTER DU
1816 case 0x3067: // HIRAGANA LETTER DE
1817 case 0x3069: // HIRAGANA LETTER DO
1818 case 0x3070: // HIRAGANA LETTER BA
1819 case 0x3073: // HIRAGANA LETTER BI
1820 case 0x3076: // HIRAGANA LETTER BU
1821 case 0x3079: // HIRAGANA LETTER BE
1822 case 0x307C: // HIRAGANA LETTER BO
1823 case 0x3094: // HIRAGANA LETTER VU
1824 case 0x30AC: // KATAKANA LETTER GA
1825 case 0x30AE: // KATAKANA LETTER GI
1826 case 0x30B0: // KATAKANA LETTER GU
1827 case 0x30B2: // KATAKANA LETTER GE
1828 case 0x30B4: // KATAKANA LETTER GO
1829 case 0x30B6: // KATAKANA LETTER ZA
1830 case 0x30B8: // KATAKANA LETTER ZI
1831 case 0x30BA: // KATAKANA LETTER ZU
1832 case 0x30BC: // KATAKANA LETTER ZE
1833 case 0x30BE: // KATAKANA LETTER ZO
1834 case 0x30C0: // KATAKANA LETTER DA
1835 case 0x30C2: // KATAKANA LETTER DI
1836 case 0x30C5: // KATAKANA LETTER DU
1837 case 0x30C7: // KATAKANA LETTER DE
1838 case 0x30C9: // KATAKANA LETTER DO
1839 case 0x30D0: // KATAKANA LETTER BA
1840 case 0x30D3: // KATAKANA LETTER BI
1841 case 0x30D6: // KATAKANA LETTER BU
1842 case 0x30D9: // KATAKANA LETTER BE
1843 case 0x30DC: // KATAKANA LETTER BO
1844 case 0x30F4: // KATAKANA LETTER VU
1845 case 0x30F7: // KATAKANA LETTER VA
1846 case 0x30F8: // KATAKANA LETTER VI
1847 case 0x30F9: // KATAKANA LETTER VE
1848 case 0x30FA: // KATAKANA LETTER VO
1849 return VoicedSoundMark;
1850 case 0x3071: // HIRAGANA LETTER PA
1851 case 0x3074: // HIRAGANA LETTER PI
1852 case 0x3077: // HIRAGANA LETTER PU
1853 case 0x307A: // HIRAGANA LETTER PE
1854 case 0x307D: // HIRAGANA LETTER PO
1855 case 0x30D1: // KATAKANA LETTER PA
1856 case 0x30D4: // KATAKANA LETTER PI
1857 case 0x30D7: // KATAKANA LETTER PU
1858 case 0x30DA: // KATAKANA LETTER PE
1859 case 0x30DD: // KATAKANA LETTER PO
1860 return SemiVoicedSoundMark;
1861 }
1862 return NoVoicedSoundMark;
1863 }
1864
isCombiningVoicedSoundMark(UChar character)1865 static inline bool isCombiningVoicedSoundMark(UChar character)
1866 {
1867 switch (character) {
1868 case 0x3099: // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
1869 case 0x309A: // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
1870 return true;
1871 }
1872 return false;
1873 }
1874
containsKanaLetters(const String & pattern)1875 static inline bool containsKanaLetters(const String& pattern)
1876 {
1877 const UChar* characters = pattern.characters();
1878 unsigned length = pattern.length();
1879 for (unsigned i = 0; i < length; ++i) {
1880 if (isKanaLetter(characters[i]))
1881 return true;
1882 }
1883 return false;
1884 }
1885
normalizeCharacters(const UChar * characters,unsigned length,Vector<UChar> & buffer)1886 static void normalizeCharacters(const UChar* characters, unsigned length, Vector<UChar>& buffer)
1887 {
1888 ASSERT(length);
1889
1890 buffer.resize(length);
1891
1892 UErrorCode status = U_ZERO_ERROR;
1893 size_t bufferSize = unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), length, &status);
1894 ASSERT(status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR);
1895 ASSERT(bufferSize);
1896
1897 buffer.resize(bufferSize);
1898
1899 if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING)
1900 return;
1901
1902 status = U_ZERO_ERROR;
1903 unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), bufferSize, &status);
1904 ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
1905 }
1906
isNonLatin1Separator(UChar32 character)1907 static bool isNonLatin1Separator(UChar32 character)
1908 {
1909 ASSERT_ARG(character, character >= 256);
1910
1911 return U_GET_GC_MASK(character) & (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK);
1912 }
1913
isSeparator(UChar32 character)1914 static inline bool isSeparator(UChar32 character)
1915 {
1916 static const bool latin1SeparatorTable[256] = {
1917 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1918 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1919 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // space ! " # $ % & ' ( ) * + , - . /
1920 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, // : ; < = > ?
1921 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // @
1922 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, // [ \ ] ^ _
1923 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // `
1924 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // { | } ~
1925 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1926 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1927 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1928 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1929 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1930 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
1931 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1932 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
1933 };
1934
1935 if (character < 256)
1936 return latin1SeparatorTable[character];
1937
1938 return isNonLatin1Separator(character);
1939 }
1940
SearchBuffer(const String & target,FindOptions options)1941 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
1942 : m_target(target)
1943 , m_options(options)
1944 , m_prefixLength(0)
1945 , m_atBreak(true)
1946 , m_needsMoreContext(options & AtWordStarts)
1947 , m_targetRequiresKanaWorkaround(containsKanaLetters(m_target))
1948 {
1949 ASSERT(!m_target.isEmpty());
1950
1951 // FIXME: We'd like to tailor the searcher to fold quote marks for us instead
1952 // of doing it in a separate replacement pass here, but ICU doesn't offer a way
1953 // to add tailoring on top of the locale-specific tailoring as of this writing.
1954 foldQuoteMarksAndSoftHyphens(m_target);
1955
1956 size_t targetLength = m_target.length();
1957 m_buffer.reserveInitialCapacity(max(targetLength * 8, minimumSearchBufferSize));
1958 m_overlap = m_buffer.capacity() / 4;
1959
1960 if ((m_options & AtWordStarts) && targetLength) {
1961 UChar32 targetFirstCharacter;
1962 U16_GET(m_target.characters(), 0, 0, targetLength, targetFirstCharacter);
1963 // Characters in the separator category never really occur at the beginning of a word,
1964 // so if the target begins with such a character, we just ignore the AtWordStart option.
1965 if (isSeparator(targetFirstCharacter)) {
1966 m_options &= ~AtWordStarts;
1967 m_needsMoreContext = false;
1968 }
1969 }
1970
1971 // Grab the single global searcher.
1972 // If we ever have a reason to do more than once search buffer at once, we'll have
1973 // to move to multiple searchers.
1974 lockSearcher();
1975
1976 UStringSearch* searcher = WebCore::searcher();
1977 UCollator* collator = usearch_getCollator(searcher);
1978
1979 UCollationStrength strength = m_options & CaseInsensitive ? UCOL_PRIMARY : UCOL_TERTIARY;
1980 if (ucol_getStrength(collator) != strength) {
1981 ucol_setStrength(collator, strength);
1982 usearch_reset(searcher);
1983 }
1984
1985 UErrorCode status = U_ZERO_ERROR;
1986 usearch_setPattern(searcher, m_target.characters(), targetLength, &status);
1987 ASSERT(status == U_ZERO_ERROR);
1988
1989 // The kana workaround requires a normalized copy of the target string.
1990 if (m_targetRequiresKanaWorkaround)
1991 normalizeCharacters(m_target.characters(), m_target.length(), m_normalizedTarget);
1992 }
1993
~SearchBuffer()1994 inline SearchBuffer::~SearchBuffer()
1995 {
1996 // Leave the static object pointing to a valid string.
1997 UErrorCode status = U_ZERO_ERROR;
1998 usearch_setPattern(WebCore::searcher(), &newlineCharacter, 1, &status);
1999 ASSERT(status == U_ZERO_ERROR);
2000
2001 unlockSearcher();
2002 }
2003
append(const UChar * characters,size_t length)2004 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
2005 {
2006 ASSERT(length);
2007
2008 if (m_atBreak) {
2009 m_buffer.shrink(0);
2010 m_prefixLength = 0;
2011 m_atBreak = false;
2012 } else if (m_buffer.size() == m_buffer.capacity()) {
2013 memcpy(m_buffer.data(), m_buffer.data() + m_buffer.size() - m_overlap, m_overlap * sizeof(UChar));
2014 m_prefixLength -= min(m_prefixLength, m_buffer.size() - m_overlap);
2015 m_buffer.shrink(m_overlap);
2016 }
2017
2018 size_t oldLength = m_buffer.size();
2019 size_t usableLength = min(m_buffer.capacity() - oldLength, length);
2020 ASSERT(usableLength);
2021 m_buffer.append(characters, usableLength);
2022 foldQuoteMarksAndSoftHyphens(m_buffer.data() + oldLength, usableLength);
2023 return usableLength;
2024 }
2025
needsMoreContext() const2026 inline bool SearchBuffer::needsMoreContext() const
2027 {
2028 return m_needsMoreContext;
2029 }
2030
prependContext(const UChar * characters,size_t length)2031 inline void SearchBuffer::prependContext(const UChar* characters, size_t length)
2032 {
2033 ASSERT(m_needsMoreContext);
2034 ASSERT(m_prefixLength == m_buffer.size());
2035
2036 if (!length)
2037 return;
2038
2039 m_atBreak = false;
2040
2041 size_t wordBoundaryContextStart = length;
2042 if (wordBoundaryContextStart) {
2043 U16_BACK_1(characters, 0, wordBoundaryContextStart);
2044 wordBoundaryContextStart = startOfLastWordBoundaryContext(characters, wordBoundaryContextStart);
2045 }
2046
2047 size_t usableLength = min(m_buffer.capacity() - m_prefixLength, length - wordBoundaryContextStart);
2048 m_buffer.prepend(characters + length - usableLength, usableLength);
2049 m_prefixLength += usableLength;
2050
2051 if (wordBoundaryContextStart || m_prefixLength == m_buffer.capacity())
2052 m_needsMoreContext = false;
2053 }
2054
atBreak() const2055 inline bool SearchBuffer::atBreak() const
2056 {
2057 return m_atBreak;
2058 }
2059
reachedBreak()2060 inline void SearchBuffer::reachedBreak()
2061 {
2062 m_atBreak = true;
2063 }
2064
isBadMatch(const UChar * match,size_t matchLength) const2065 inline bool SearchBuffer::isBadMatch(const UChar* match, size_t matchLength) const
2066 {
2067 // This function implements the kana workaround. If usearch treats
2068 // it as a match, but we do not want to, then it's a "bad match".
2069 if (!m_targetRequiresKanaWorkaround)
2070 return false;
2071
2072 // Normalize into a match buffer. We reuse a single buffer rather than
2073 // creating a new one each time.
2074 normalizeCharacters(match, matchLength, m_normalizedMatch);
2075
2076 const UChar* a = m_normalizedTarget.begin();
2077 const UChar* aEnd = m_normalizedTarget.end();
2078
2079 const UChar* b = m_normalizedMatch.begin();
2080 const UChar* bEnd = m_normalizedMatch.end();
2081
2082 while (true) {
2083 // Skip runs of non-kana-letter characters. This is necessary so we can
2084 // correctly handle strings where the target and match have different-length
2085 // runs of characters that match, while still double checking the correctness
2086 // of matches of kana letters with other kana letters.
2087 while (a != aEnd && !isKanaLetter(*a))
2088 ++a;
2089 while (b != bEnd && !isKanaLetter(*b))
2090 ++b;
2091
2092 // If we reached the end of either the target or the match, we should have
2093 // reached the end of both; both should have the same number of kana letters.
2094 if (a == aEnd || b == bEnd) {
2095 ASSERT(a == aEnd);
2096 ASSERT(b == bEnd);
2097 return false;
2098 }
2099
2100 // Check for differences in the kana letter character itself.
2101 if (isSmallKanaLetter(*a) != isSmallKanaLetter(*b))
2102 return true;
2103 if (composedVoicedSoundMark(*a) != composedVoicedSoundMark(*b))
2104 return true;
2105 ++a;
2106 ++b;
2107
2108 // Check for differences in combining voiced sound marks found after the letter.
2109 while (1) {
2110 if (!(a != aEnd && isCombiningVoicedSoundMark(*a))) {
2111 if (b != bEnd && isCombiningVoicedSoundMark(*b))
2112 return true;
2113 break;
2114 }
2115 if (!(b != bEnd && isCombiningVoicedSoundMark(*b)))
2116 return true;
2117 if (*a != *b)
2118 return true;
2119 ++a;
2120 ++b;
2121 }
2122 }
2123 }
2124
isWordStartMatch(size_t start,size_t length) const2125 inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const
2126 {
2127 ASSERT(m_options & AtWordStarts);
2128
2129 if (!start)
2130 return true;
2131
2132 if (m_options & TreatMedialCapitalAsWordStart) {
2133 int size = m_buffer.size();
2134 int offset = start;
2135 UChar32 firstCharacter;
2136 U16_GET(m_buffer.data(), 0, offset, size, firstCharacter);
2137 UChar32 previousCharacter;
2138 U16_PREV(m_buffer.data(), 0, offset, previousCharacter);
2139
2140 if (isSeparator(firstCharacter)) {
2141 // The start of a separator run is a word start (".org" in "webkit.org").
2142 if (!isSeparator(previousCharacter))
2143 return true;
2144 } else if (isASCIIUpper(firstCharacter)) {
2145 // The start of an uppercase run is a word start ("Kit" in "WebKit").
2146 if (!isASCIIUpper(previousCharacter))
2147 return true;
2148 // The last character of an uppercase run followed by a non-separator, non-digit
2149 // is a word start ("Request" in "XMLHTTPRequest").
2150 offset = start;
2151 U16_FWD_1(m_buffer.data(), offset, size);
2152 UChar32 nextCharacter = 0;
2153 if (offset < size)
2154 U16_GET(m_buffer.data(), 0, offset, size, nextCharacter);
2155 if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) && !isSeparator(nextCharacter))
2156 return true;
2157 } else if (isASCIIDigit(firstCharacter)) {
2158 // The start of a digit run is a word start ("2" in "WebKit2").
2159 if (!isASCIIDigit(previousCharacter))
2160 return true;
2161 } else if (isSeparator(previousCharacter) || isASCIIDigit(previousCharacter)) {
2162 // The start of a non-separator, non-uppercase, non-digit run is a word start,
2163 // except after an uppercase. ("org" in "webkit.org", but not "ore" in "WebCore").
2164 return true;
2165 }
2166 }
2167
2168 size_t wordBreakSearchStart = start + length;
2169 while (wordBreakSearchStart > start)
2170 wordBreakSearchStart = findNextWordFromIndex(m_buffer.data(), m_buffer.size(), wordBreakSearchStart, false /* backwards */);
2171 return wordBreakSearchStart == start;
2172 }
2173
search(size_t & start)2174 inline size_t SearchBuffer::search(size_t& start)
2175 {
2176 size_t size = m_buffer.size();
2177 if (m_atBreak) {
2178 if (!size)
2179 return 0;
2180 } else {
2181 if (size != m_buffer.capacity())
2182 return 0;
2183 }
2184
2185 UStringSearch* searcher = WebCore::searcher();
2186
2187 UErrorCode status = U_ZERO_ERROR;
2188 usearch_setText(searcher, m_buffer.data(), size, &status);
2189 ASSERT(status == U_ZERO_ERROR);
2190
2191 usearch_setOffset(searcher, m_prefixLength, &status);
2192 ASSERT(status == U_ZERO_ERROR);
2193
2194 int matchStart = usearch_next(searcher, &status);
2195 ASSERT(status == U_ZERO_ERROR);
2196
2197 nextMatch:
2198 if (!(matchStart >= 0 && static_cast<size_t>(matchStart) < size)) {
2199 ASSERT(matchStart == USEARCH_DONE);
2200 return 0;
2201 }
2202
2203 // Matches that start in the overlap area are only tentative.
2204 // The same match may appear later, matching more characters,
2205 // possibly including a combining character that's not yet in the buffer.
2206 if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) {
2207 size_t overlap = m_overlap;
2208 if (m_options & AtWordStarts) {
2209 // Ensure that there is sufficient context before matchStart the next time around for
2210 // determining if it is at a word boundary.
2211 int wordBoundaryContextStart = matchStart;
2212 U16_BACK_1(m_buffer.data(), 0, wordBoundaryContextStart);
2213 wordBoundaryContextStart = startOfLastWordBoundaryContext(m_buffer.data(), wordBoundaryContextStart);
2214 overlap = min(size - 1, max(overlap, size - wordBoundaryContextStart));
2215 }
2216 memcpy(m_buffer.data(), m_buffer.data() + size - overlap, overlap * sizeof(UChar));
2217 m_prefixLength -= min(m_prefixLength, size - overlap);
2218 m_buffer.shrink(overlap);
2219 return 0;
2220 }
2221
2222 size_t matchedLength = usearch_getMatchedLength(searcher);
2223 ASSERT(matchStart + matchedLength <= size);
2224
2225 // If this match is "bad", move on to the next match.
2226 if (isBadMatch(m_buffer.data() + matchStart, matchedLength) || ((m_options & AtWordStarts) && !isWordStartMatch(matchStart, matchedLength))) {
2227 matchStart = usearch_next(searcher, &status);
2228 ASSERT(status == U_ZERO_ERROR);
2229 goto nextMatch;
2230 }
2231
2232 size_t newSize = size - (matchStart + 1);
2233 memmove(m_buffer.data(), m_buffer.data() + matchStart + 1, newSize * sizeof(UChar));
2234 m_prefixLength -= min<size_t>(m_prefixLength, matchStart + 1);
2235 m_buffer.shrink(newSize);
2236
2237 start = size - matchStart;
2238 return matchedLength;
2239 }
2240
2241 #else // !ICU_UNICODE
2242
SearchBuffer(const String & target,FindOptions options)2243 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
2244 : m_target(options & CaseInsensitive ? target.foldCase() : target)
2245 , m_options(options)
2246 , m_buffer(m_target.length())
2247 , m_isCharacterStartBuffer(m_target.length())
2248 , m_isBufferFull(false)
2249 , m_cursor(0)
2250 {
2251 ASSERT(!m_target.isEmpty());
2252 m_target.replace(noBreakSpace, ' ');
2253 foldQuoteMarksAndSoftHyphens(m_target);
2254 }
2255
~SearchBuffer()2256 inline SearchBuffer::~SearchBuffer()
2257 {
2258 }
2259
reachedBreak()2260 inline void SearchBuffer::reachedBreak()
2261 {
2262 m_cursor = 0;
2263 m_isBufferFull = false;
2264 }
2265
atBreak() const2266 inline bool SearchBuffer::atBreak() const
2267 {
2268 return !m_cursor && !m_isBufferFull;
2269 }
2270
append(UChar c,bool isStart)2271 inline void SearchBuffer::append(UChar c, bool isStart)
2272 {
2273 m_buffer[m_cursor] = c == noBreakSpace ? ' ' : foldQuoteMarkOrSoftHyphen(c);
2274 m_isCharacterStartBuffer[m_cursor] = isStart;
2275 if (++m_cursor == m_target.length()) {
2276 m_cursor = 0;
2277 m_isBufferFull = true;
2278 }
2279 }
2280
append(const UChar * characters,size_t length)2281 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
2282 {
2283 ASSERT(length);
2284 if (!(m_options & CaseInsensitive)) {
2285 append(characters[0], true);
2286 return 1;
2287 }
2288 const int maxFoldedCharacters = 16; // sensible maximum is 3, this should be more than enough
2289 UChar foldedCharacters[maxFoldedCharacters];
2290 bool error;
2291 int numFoldedCharacters = foldCase(foldedCharacters, maxFoldedCharacters, characters, 1, &error);
2292 ASSERT(!error);
2293 ASSERT(numFoldedCharacters);
2294 ASSERT(numFoldedCharacters <= maxFoldedCharacters);
2295 if (!error && numFoldedCharacters) {
2296 numFoldedCharacters = min(numFoldedCharacters, maxFoldedCharacters);
2297 append(foldedCharacters[0], true);
2298 for (int i = 1; i < numFoldedCharacters; ++i)
2299 append(foldedCharacters[i], false);
2300 }
2301 return 1;
2302 }
2303
needsMoreContext() const2304 inline bool SearchBuffer::needsMoreContext() const
2305 {
2306 return false;
2307 }
2308
prependContext(const UChar *,size_t)2309 void SearchBuffer::prependContext(const UChar*, size_t)
2310 {
2311 ASSERT_NOT_REACHED();
2312 }
2313
search(size_t & start)2314 inline size_t SearchBuffer::search(size_t& start)
2315 {
2316 if (!m_isBufferFull)
2317 return 0;
2318 if (!m_isCharacterStartBuffer[m_cursor])
2319 return 0;
2320
2321 size_t tailSpace = m_target.length() - m_cursor;
2322 if (memcmp(&m_buffer[m_cursor], m_target.characters(), tailSpace * sizeof(UChar)) != 0)
2323 return 0;
2324 if (memcmp(&m_buffer[0], m_target.characters() + tailSpace, m_cursor * sizeof(UChar)) != 0)
2325 return 0;
2326
2327 start = length();
2328
2329 // Now that we've found a match once, we don't want to find it again, because those
2330 // are the SearchBuffer semantics, allowing for a buffer where you append more than one
2331 // character at a time. To do this we take advantage of m_isCharacterStartBuffer, but if
2332 // we want to get rid of that in the future we could track this with a separate boolean
2333 // or even move the characters to the start of the buffer and set m_isBufferFull to false.
2334 m_isCharacterStartBuffer[m_cursor] = false;
2335
2336 return start;
2337 }
2338
2339 // Returns the number of characters that were appended to the buffer (what we are searching in).
2340 // That's not necessarily the same length as the passed-in target string, because case folding
2341 // can make two strings match even though they're not the same length.
length() const2342 size_t SearchBuffer::length() const
2343 {
2344 size_t bufferSize = m_target.length();
2345 size_t length = 0;
2346 for (size_t i = 0; i < bufferSize; ++i)
2347 length += m_isCharacterStartBuffer[i];
2348 return length;
2349 }
2350
2351 #endif // !ICU_UNICODE
2352
2353 // --------
2354
rangeLength(const Range * r,bool forSelectionPreservation)2355 int TextIterator::rangeLength(const Range* r, bool forSelectionPreservation)
2356 {
2357 int length = 0;
2358 for (TextIterator it(r, forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior); !it.atEnd(); it.advance())
2359 length += it.length();
2360
2361 return length;
2362 }
2363
subrange(Range * entireRange,int characterOffset,int characterCount)2364 PassRefPtr<Range> TextIterator::subrange(Range* entireRange, int characterOffset, int characterCount)
2365 {
2366 CharacterIterator entireRangeIterator(entireRange);
2367 return characterSubrange(entireRangeIterator, characterOffset, characterCount);
2368 }
2369
rangeFromLocationAndLength(Element * scope,int rangeLocation,int rangeLength,bool forSelectionPreservation)2370 PassRefPtr<Range> TextIterator::rangeFromLocationAndLength(Element* scope, int rangeLocation, int rangeLength, bool forSelectionPreservation)
2371 {
2372 RefPtr<Range> resultRange = scope->document()->createRange();
2373
2374 int docTextPosition = 0;
2375 int rangeEnd = rangeLocation + rangeLength;
2376 bool startRangeFound = false;
2377
2378 RefPtr<Range> textRunRange;
2379
2380 TextIterator it(rangeOfContents(scope).get(), forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior);
2381
2382 // FIXME: the atEnd() check shouldn't be necessary, workaround for <http://bugs.webkit.org/show_bug.cgi?id=6289>.
2383 if (rangeLocation == 0 && rangeLength == 0 && it.atEnd()) {
2384 textRunRange = it.range();
2385
2386 ExceptionCode ec = 0;
2387 resultRange->setStart(textRunRange->startContainer(), 0, ec);
2388 ASSERT(!ec);
2389 resultRange->setEnd(textRunRange->startContainer(), 0, ec);
2390 ASSERT(!ec);
2391
2392 return resultRange.release();
2393 }
2394
2395 for (; !it.atEnd(); it.advance()) {
2396 int len = it.length();
2397 textRunRange = it.range();
2398
2399 bool foundStart = rangeLocation >= docTextPosition && rangeLocation <= docTextPosition + len;
2400 bool foundEnd = rangeEnd >= docTextPosition && rangeEnd <= docTextPosition + len;
2401
2402 // Fix textRunRange->endPosition(), but only if foundStart || foundEnd, because it is only
2403 // in those cases that textRunRange is used.
2404 if (foundEnd) {
2405 // FIXME: This is a workaround for the fact that the end of a run is often at the wrong
2406 // position for emitted '\n's.
2407 if (len == 1 && it.characters()[0] == '\n') {
2408 scope->document()->updateLayoutIgnorePendingStylesheets();
2409 it.advance();
2410 if (!it.atEnd()) {
2411 RefPtr<Range> range = it.range();
2412 ExceptionCode ec = 0;
2413 textRunRange->setEnd(range->startContainer(), range->startOffset(), ec);
2414 ASSERT(!ec);
2415 } else {
2416 Position runStart = textRunRange->startPosition();
2417 Position runEnd = VisiblePosition(runStart).next().deepEquivalent();
2418 if (runEnd.isNotNull()) {
2419 ExceptionCode ec = 0;
2420 textRunRange->setEnd(runEnd.containerNode(), runEnd.computeOffsetInContainerNode(), ec);
2421 ASSERT(!ec);
2422 }
2423 }
2424 }
2425 }
2426
2427 if (foundStart) {
2428 startRangeFound = true;
2429 int exception = 0;
2430 if (textRunRange->startContainer()->isTextNode()) {
2431 int offset = rangeLocation - docTextPosition;
2432 resultRange->setStart(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
2433 } else {
2434 if (rangeLocation == docTextPosition)
2435 resultRange->setStart(textRunRange->startContainer(), textRunRange->startOffset(), exception);
2436 else
2437 resultRange->setStart(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2438 }
2439 }
2440
2441 if (foundEnd) {
2442 int exception = 0;
2443 if (textRunRange->startContainer()->isTextNode()) {
2444 int offset = rangeEnd - docTextPosition;
2445 resultRange->setEnd(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
2446 } else {
2447 if (rangeEnd == docTextPosition)
2448 resultRange->setEnd(textRunRange->startContainer(), textRunRange->startOffset(), exception);
2449 else
2450 resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2451 }
2452 docTextPosition += len;
2453 break;
2454 }
2455 docTextPosition += len;
2456 }
2457
2458 if (!startRangeFound)
2459 return 0;
2460
2461 if (rangeLength != 0 && rangeEnd > docTextPosition) { // rangeEnd is out of bounds
2462 int exception = 0;
2463 resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2464 }
2465
2466 return resultRange.release();
2467 }
2468
locationAndLengthFromRange(const Range * range,size_t & location,size_t & length)2469 bool TextIterator::locationAndLengthFromRange(const Range* range, size_t& location, size_t& length)
2470 {
2471 location = notFound;
2472 length = 0;
2473
2474 if (!range->startContainer())
2475 return false;
2476
2477 Element* selectionRoot = range->ownerDocument()->frame()->selection()->rootEditableElement();
2478 Element* scope = selectionRoot ? selectionRoot : range->ownerDocument()->documentElement();
2479
2480 // The critical assumption is that this only gets called with ranges that
2481 // concentrate on a given area containing the selection root. This is done
2482 // because of text fields and textareas. The DOM for those is not
2483 // directly in the document DOM, so ensure that the range does not cross a
2484 // boundary of one of those.
2485 if (range->startContainer() != scope && !range->startContainer()->isDescendantOf(scope))
2486 return false;
2487 if (range->endContainer() != scope && !range->endContainer()->isDescendantOf(scope))
2488 return false;
2489
2490 RefPtr<Range> testRange = Range::create(scope->document(), scope, 0, range->startContainer(), range->startOffset());
2491 ASSERT(testRange->startContainer() == scope);
2492 location = TextIterator::rangeLength(testRange.get());
2493
2494 ExceptionCode ec;
2495 testRange->setEnd(range->endContainer(), range->endOffset(), ec);
2496 ASSERT(testRange->startContainer() == scope);
2497 length = TextIterator::rangeLength(testRange.get()) - location;
2498 return true;
2499 }
2500
2501 // --------
2502
plainTextToMallocAllocatedBuffer(const Range * r,unsigned & bufferLength,bool isDisplayString,TextIteratorBehavior defaultBehavior)2503 UChar* plainTextToMallocAllocatedBuffer(const Range* r, unsigned& bufferLength, bool isDisplayString, TextIteratorBehavior defaultBehavior)
2504 {
2505 UChar* result = 0;
2506
2507 // Do this in pieces to avoid massive reallocations if there is a large amount of text.
2508 // Use system malloc for buffers since they can consume lots of memory and current TCMalloc is unable return it back to OS.
2509 static const unsigned cMaxSegmentSize = 1 << 16;
2510 bufferLength = 0;
2511 typedef pair<UChar*, unsigned> TextSegment;
2512 OwnPtr<Vector<TextSegment> > textSegments;
2513 Vector<UChar> textBuffer;
2514 textBuffer.reserveInitialCapacity(cMaxSegmentSize);
2515 TextIteratorBehavior behavior = defaultBehavior;
2516 if (!isDisplayString)
2517 behavior = static_cast<TextIteratorBehavior>(behavior | TextIteratorEmitsTextsWithoutTranscoding);
2518
2519 for (TextIterator it(r, behavior); !it.atEnd(); it.advance()) {
2520 if (textBuffer.size() && textBuffer.size() + it.length() > cMaxSegmentSize) {
2521 UChar* newSegmentBuffer = static_cast<UChar*>(malloc(textBuffer.size() * sizeof(UChar)));
2522 if (!newSegmentBuffer)
2523 goto exit;
2524 memcpy(newSegmentBuffer, textBuffer.data(), textBuffer.size() * sizeof(UChar));
2525 if (!textSegments)
2526 textSegments = adoptPtr(new Vector<TextSegment>);
2527 textSegments->append(make_pair(newSegmentBuffer, (unsigned)textBuffer.size()));
2528 textBuffer.clear();
2529 }
2530 textBuffer.append(it.characters(), it.length());
2531 bufferLength += it.length();
2532 }
2533
2534 if (!bufferLength)
2535 return 0;
2536
2537 // Since we know the size now, we can make a single buffer out of the pieces with one big alloc
2538 result = static_cast<UChar*>(malloc(bufferLength * sizeof(UChar)));
2539 if (!result)
2540 goto exit;
2541
2542 {
2543 UChar* resultPos = result;
2544 if (textSegments) {
2545 unsigned size = textSegments->size();
2546 for (unsigned i = 0; i < size; ++i) {
2547 const TextSegment& segment = textSegments->at(i);
2548 memcpy(resultPos, segment.first, segment.second * sizeof(UChar));
2549 resultPos += segment.second;
2550 }
2551 }
2552 memcpy(resultPos, textBuffer.data(), textBuffer.size() * sizeof(UChar));
2553 }
2554
2555 exit:
2556 if (textSegments) {
2557 unsigned size = textSegments->size();
2558 for (unsigned i = 0; i < size; ++i)
2559 free(textSegments->at(i).first);
2560 }
2561
2562 if (isDisplayString && r->ownerDocument())
2563 r->ownerDocument()->displayBufferModifiedByEncoding(result, bufferLength);
2564
2565 return result;
2566 }
2567
plainText(const Range * r,TextIteratorBehavior defaultBehavior)2568 String plainText(const Range* r, TextIteratorBehavior defaultBehavior)
2569 {
2570 unsigned length;
2571 UChar* buf = plainTextToMallocAllocatedBuffer(r, length, false, defaultBehavior);
2572 if (!buf)
2573 return "";
2574 String result(buf, length);
2575 free(buf);
2576 return result;
2577 }
2578
isAllCollapsibleWhitespace(const String & string)2579 static inline bool isAllCollapsibleWhitespace(const String& string)
2580 {
2581 const UChar* characters = string.characters();
2582 unsigned length = string.length();
2583 for (unsigned i = 0; i < length; ++i) {
2584 if (!isCollapsibleWhitespace(characters[i]))
2585 return false;
2586 }
2587 return true;
2588 }
2589
collapsedToBoundary(const Range * range,bool forward)2590 static PassRefPtr<Range> collapsedToBoundary(const Range* range, bool forward)
2591 {
2592 ExceptionCode ec = 0;
2593 RefPtr<Range> result = range->cloneRange(ec);
2594 ASSERT(!ec);
2595 result->collapse(!forward, ec);
2596 ASSERT(!ec);
2597 return result.release();
2598 }
2599
findPlainText(CharacterIterator & it,const String & target,FindOptions options,size_t & matchStart)2600 static size_t findPlainText(CharacterIterator& it, const String& target, FindOptions options, size_t& matchStart)
2601 {
2602 matchStart = 0;
2603 size_t matchLength = 0;
2604
2605 SearchBuffer buffer(target, options);
2606
2607 if (buffer.needsMoreContext()) {
2608 RefPtr<Range> startRange = it.range();
2609 RefPtr<Range> beforeStartRange = startRange->ownerDocument()->createRange();
2610 ExceptionCode ec = 0;
2611 beforeStartRange->setEnd(startRange->startContainer(), startRange->startOffset(), ec);
2612 for (SimplifiedBackwardsTextIterator backwardsIterator(beforeStartRange.get()); !backwardsIterator.atEnd(); backwardsIterator.advance()) {
2613 buffer.prependContext(backwardsIterator.characters(), backwardsIterator.length());
2614 if (!buffer.needsMoreContext())
2615 break;
2616 }
2617 }
2618
2619 while (!it.atEnd()) {
2620 it.advance(buffer.append(it.characters(), it.length()));
2621 tryAgain:
2622 size_t matchStartOffset;
2623 if (size_t newMatchLength = buffer.search(matchStartOffset)) {
2624 // Note that we found a match, and where we found it.
2625 size_t lastCharacterInBufferOffset = it.characterOffset();
2626 ASSERT(lastCharacterInBufferOffset >= matchStartOffset);
2627 matchStart = lastCharacterInBufferOffset - matchStartOffset;
2628 matchLength = newMatchLength;
2629 // If searching forward, stop on the first match.
2630 // If searching backward, don't stop, so we end up with the last match.
2631 if (!(options & Backwards))
2632 break;
2633 goto tryAgain;
2634 }
2635 if (it.atBreak() && !buffer.atBreak()) {
2636 buffer.reachedBreak();
2637 goto tryAgain;
2638 }
2639 }
2640
2641 return matchLength;
2642 }
2643
findPlainText(const Range * range,const String & target,FindOptions options)2644 PassRefPtr<Range> findPlainText(const Range* range, const String& target, FindOptions options)
2645 {
2646 // CharacterIterator requires renderers to be up-to-date
2647 range->ownerDocument()->updateLayout();
2648
2649 // First, find the text.
2650 size_t matchStart;
2651 size_t matchLength;
2652 {
2653 CharacterIterator findIterator(range, TextIteratorEntersTextControls);
2654 matchLength = findPlainText(findIterator, target, options, matchStart);
2655 if (!matchLength)
2656 return collapsedToBoundary(range, !(options & Backwards));
2657 }
2658
2659 // Then, find the document position of the start and the end of the text.
2660 CharacterIterator computeRangeIterator(range, TextIteratorEntersTextControls);
2661 return characterSubrange(computeRangeIterator, matchStart, matchLength);
2662 }
2663
2664 }
2665