1 /****************************************************************************
2 **
3 ** Copyright (C) 2016 The Qt Company Ltd.
4 ** Contact: https://www.qt.io/licensing/
5 **
6 ** This file is part of the QtCore module of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** Commercial License Usage
10 ** Licensees holding valid commercial Qt licenses may use this file in
11 ** accordance with the commercial license agreement provided with the
12 ** Software or, alternatively, in accordance with the terms contained in
13 ** a written agreement between you and The Qt Company. For licensing terms
14 ** and conditions see https://www.qt.io/terms-conditions. For further
15 ** information use the contact form at https://www.qt.io/contact-us.
16 **
17 ** GNU Lesser General Public License Usage
18 ** Alternatively, this file may be used under the terms of the GNU Lesser
19 ** General Public License version 3 as published by the Free Software
20 ** Foundation and appearing in the file LICENSE.LGPL3 included in the
21 ** packaging of this file. Please review the following information to
22 ** ensure the GNU Lesser General Public License version 3 requirements
23 ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
24 **
25 ** GNU General Public License Usage
26 ** Alternatively, this file may be used under the terms of the GNU
27 ** General Public License version 2.0 or (at your option) the GNU General
28 ** Public license version 3 or any later version approved by the KDE Free
29 ** Qt Foundation. The licenses are as published by the Free Software
30 ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31 ** included in the packaging of this file. Please review the following
32 ** information to ensure the GNU General Public License requirements will
33 ** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34 ** https://www.gnu.org/licenses/gpl-3.0.html.
35 **
36 ** $QT_END_LICENSE$
37 **
38 ****************************************************************************/
39 #include <QtCore/qtextboundaryfinder.h>
40 #include <QtCore/qvarlengtharray.h>
41 
42 #include <private/qunicodetools_p.h>
43 
44 QT_BEGIN_NAMESPACE
45 
// Typed view onto the per-character attribute storage used by the finder.
// The constructors in this file never construct this class; they obtain a raw
// block of (length + 1) * sizeof(QCharAttributes) bytes (malloc'ed, or supplied
// by the caller) and address it through this type. attributes[] is therefore
// indexed from 0 up to and including length, even though it is declared with a
// single element.
class QTextBoundaryFinderPrivate
{
public:
    QCharAttributes attributes[1];
};
51 
init(QTextBoundaryFinder::BoundaryType type,const QChar * chars,int length,QCharAttributes * attributes)52 static void init(QTextBoundaryFinder::BoundaryType type, const QChar *chars, int length, QCharAttributes *attributes)
53 {
54     const ushort *string = reinterpret_cast<const ushort *>(chars);
55 
56     QVarLengthArray<QUnicodeTools::ScriptItem> scriptItems;
57     {
58         QVarLengthArray<uchar> scripts(length);
59 
60         QUnicodeTools::initScripts(string, length, scripts.data());
61 
62         int start = 0;
63         for (int i = start + 1; i <= length; ++i) {
64             if (i == length || scripts[i] != scripts[start]) {
65                 QUnicodeTools::ScriptItem item;
66                 item.position = start;
67                 item.script = scripts[start];
68                 scriptItems.append(item);
69                 start = i;
70             }
71         }
72     }
73 
74     QUnicodeTools::CharAttributeOptions options;
75     switch (type) {
76     case QTextBoundaryFinder::Grapheme: options |= QUnicodeTools::GraphemeBreaks; break;
77     case QTextBoundaryFinder::Word: options |= QUnicodeTools::WordBreaks; break;
78     case QTextBoundaryFinder::Sentence: options |= QUnicodeTools::SentenceBreaks; break;
79     case QTextBoundaryFinder::Line: options |= QUnicodeTools::LineBreaks; break;
80     default: break;
81     }
82     QUnicodeTools::initCharAttributes(string, length, scriptItems.data(), scriptItems.count(), attributes, options);
83 }
84 
85 /*!
86     \class QTextBoundaryFinder
87     \inmodule QtCore
88 
89     \brief The QTextBoundaryFinder class provides a way of finding Unicode text boundaries in a string.
90 
91     \since 4.4
92     \ingroup tools
93     \ingroup shared
94     \ingroup string-processing
95     \reentrant
96 
    QTextBoundaryFinder allows you to find Unicode text boundaries in a
    string, according to the Unicode text boundary specification (see
99     \l{http://www.unicode.org/reports/tr14/}{Unicode Standard Annex #14} and
100     \l{http://www.unicode.org/reports/tr29/}{Unicode Standard Annex #29}).
101 
102     QTextBoundaryFinder can operate on a QString in four possible
103     modes depending on the value of \a BoundaryType.
104 
105     Units of Unicode characters that make up what the user thinks of
106     as a character or basic unit of the language are here called
107     Grapheme clusters. The two unicode characters 'A' + diaeresis do
108     for example form one grapheme cluster as the user thinks of them
109     as one character, yet it is in this case represented by two
110     unicode code points
111     (see \l{http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries}).
112 
113     Word boundaries are there to locate the start and end of what a
114     language considers to be a word
115     (see \l{http://www.unicode.org/reports/tr29/#Word_Boundaries}).
116 
117     Line break boundaries give possible places where a line break
118     might happen and sentence boundaries will show the beginning and
119     end of whole sentences
120     (see \l{http://www.unicode.org/reports/tr29/#Sentence_Boundaries} and
121     \l{http://www.unicode.org/reports/tr14/}).
122 
123     The first position in a string is always a valid boundary and
124     refers to the position before the first character. The last
125     position at the length of the string is also valid and refers
126     to the position after the last character.
127 */
128 
129 /*!
130     \enum QTextBoundaryFinder::BoundaryType
131 
    \value Grapheme Finds a grapheme which is the smallest boundary. It
                    includes letters, punctuation marks, numerals and more.
134     \value Word Finds a word.
135     \value Line Finds possible positions for breaking the text into multiple
136     lines.
137     \value Sentence Finds sentence boundaries. These include periods, question
138     marks etc.
139 */
140 
141 /*!
142   \enum QTextBoundaryFinder::BoundaryReason
143 
144   \value NotAtBoundary  The boundary finder is not at a boundary position.
145   \value BreakOpportunity  The boundary finder is at a break opportunity position.
146                            Such a break opportunity might also be an item boundary
147                            (either StartOfItem, EndOfItem, or combination of both),
148                            a mandatory line break, or a soft hyphen.
149   \value StartOfItem  Since 5.0. The boundary finder is at the start of
150                       a grapheme, a word, a sentence, or a line.
151   \value EndOfItem  Since 5.0. The boundary finder is at the end of
152                     a grapheme, a word, a sentence, or a line.
153   \value MandatoryBreak  Since 5.0. The boundary finder is at the end of line
154                          (can occur for a Line boundary type only).
155   \value SoftHyphen  The boundary finder is at the soft hyphen
156                      (can occur for a Line boundary type only).
157 */
158 
159 /*!
160   Constructs an invalid QTextBoundaryFinder object.
161 */
QTextBoundaryFinder()162 QTextBoundaryFinder::QTextBoundaryFinder()
163     : t(Grapheme)
164     , chars(nullptr)
165     , length(0)
166     , freePrivate(true)
167     , d(nullptr)
168 {
169 }
170 
171 /*!
172   Copies the QTextBoundaryFinder object, \a other.
173 */
QTextBoundaryFinder::QTextBoundaryFinder(const QTextBoundaryFinder &other)
    : t(other.t)
    , s(other.s)
    , chars(other.chars)
    , length(other.length)
    , pos(other.pos)
    , freePrivate(true)
    , d(nullptr)
{
    // Nothing to duplicate when the source has no attribute storage.
    if (!other.d)
        return;

    Q_ASSERT(length > 0);

    // The copy always gets its own heap block, even if other uses a
    // caller-supplied buffer; attributes span indices 0..length inclusive.
    const size_t byteCount = (length + 1) * sizeof(QCharAttributes);
    void *copy = malloc(byteCount);
    Q_CHECK_PTR(copy);
    memcpy(copy, other.d, byteCount);
    d = static_cast<QTextBoundaryFinderPrivate *>(copy);
}
190 
191 /*!
192   Assigns the object, \a other, to another QTextBoundaryFinder object.
193 */
QTextBoundaryFinder &QTextBoundaryFinder::operator=(const QTextBoundaryFinder &other)
{
    if (this == &other)
        return *this;

    const bool hasAttributes = (other.d != nullptr);

    // Acquire storage first, so nothing has been modified yet if the
    // allocation check below fails.
    if (hasAttributes) {
        Q_ASSERT(other.length > 0);
        uint bytes = (other.length + 1) * sizeof(QCharAttributes);
        // A caller-supplied buffer (freePrivate == false) must not be handed
        // to realloc(); passing nullptr makes it behave like malloc().
        void *resized = realloc(freePrivate ? d : nullptr, bytes);
        Q_CHECK_PTR(resized);
        d = static_cast<QTextBoundaryFinderPrivate *>(resized);
        freePrivate = true;
    }

    t = other.t;
    s = other.s;
    chars = other.chars;
    length = other.length;
    pos = other.pos;

    if (!hasAttributes) {
        // The source is invalid: release our own block, if we own one.
        if (freePrivate)
            free(d);
        d = nullptr;
        return *this;
    }

    memcpy(d, other.d, (length + 1) * sizeof(QCharAttributes));
    return *this;
}
224 
225 /*!
226   Destructs the QTextBoundaryFinder object.
227 */
~QTextBoundaryFinder()228 QTextBoundaryFinder::~QTextBoundaryFinder()
229 {
230     Q_UNUSED(unused);
231     if (freePrivate)
232         free(d);
233 }
234 
235 /*!
236   Creates a QTextBoundaryFinder object of \a type operating on \a string.
237 */
QTextBoundaryFinder(BoundaryType type,const QString & string)238 QTextBoundaryFinder::QTextBoundaryFinder(BoundaryType type, const QString &string)
239     : t(type)
240     , s(string)
241     , chars(string.unicode())
242     , length(string.length())
243     , pos(0)
244     , freePrivate(true)
245     , d(nullptr)
246 {
247     if (length > 0) {
248         d = (QTextBoundaryFinderPrivate *) malloc((length + 1) * sizeof(QCharAttributes));
249         Q_CHECK_PTR(d);
250         init(t, chars, length, d->attributes);
251     }
252 }
253 
254 /*!
255   Creates a QTextBoundaryFinder object of \a type operating on \a chars
256   with \a length.
257 
258   \a buffer is an optional working buffer of size \a bufferSize you can pass to
259   the QTextBoundaryFinder. If the buffer is large enough to hold the working
260   data required (bufferSize >= length + 1), it will use this
261   instead of allocating its own buffer.
262 
263   \warning QTextBoundaryFinder does not create a copy of \a chars. It is the
264   application programmer's responsibility to ensure the array is allocated for
265   as long as the QTextBoundaryFinder object stays alive. The same applies to
266   \a buffer.
267 */
QTextBoundaryFinder(BoundaryType type,const QChar * chars,int length,unsigned char * buffer,int bufferSize)268 QTextBoundaryFinder::QTextBoundaryFinder(BoundaryType type, const QChar *chars, int length, unsigned char *buffer, int bufferSize)
269     : t(type)
270     , chars(chars)
271     , length(length)
272     , pos(0)
273     , freePrivate(true)
274     , d(nullptr)
275 {
276     if (!chars) {
277         length = 0;
278     } else if (length > 0) {
279         if (buffer && (uint)bufferSize >= (length + 1) * sizeof(QCharAttributes)) {
280             d = (QTextBoundaryFinderPrivate *)buffer;
281             freePrivate = false;
282         } else {
283             d = (QTextBoundaryFinderPrivate *) malloc((length + 1) * sizeof(QCharAttributes));
284             Q_CHECK_PTR(d);
285         }
286         init(t, chars, length, d->attributes);
287     }
288 }
289 
290 /*!
291   Moves the finder to the start of the string. This is equivalent to setPosition(0).
292 
293   \sa setPosition(), position()
294 */
toStart()295 void QTextBoundaryFinder::toStart()
296 {
297     pos = 0;
298 }
299 
300 /*!
301   Moves the finder to the end of the string. This is equivalent to setPosition(string.length()).
302 
303   \sa setPosition(), position()
304 */
toEnd()305 void QTextBoundaryFinder::toEnd()
306 {
307     pos = length;
308 }
309 
310 /*!
311   Returns the current position of the QTextBoundaryFinder.
312 
313   The range is from 0 (the beginning of the string) to the length of
314   the string inclusive.
315 
316   \sa setPosition()
317 */
position() const318 int QTextBoundaryFinder::position() const
319 {
320     return pos;
321 }
322 
323 /*!
324   Sets the current position of the QTextBoundaryFinder to \a position.
325 
326   If \a position is out of bounds, it will be bound to only valid
327   positions. In this case, valid positions are from 0 to the length of
328   the string inclusive.
329 
330   \sa position()
331 */
setPosition(int position)332 void QTextBoundaryFinder::setPosition(int position)
333 {
334     pos = qBound(0, position, length);
335 }
336 
337 /*! \fn QTextBoundaryFinder::BoundaryType QTextBoundaryFinder::type() const
338 
339   Returns the type of the QTextBoundaryFinder.
340 */
341 
342 /*! \fn bool QTextBoundaryFinder::isValid() const
343 
344    Returns \c true if the text boundary finder is valid; otherwise returns \c false.
345    A default QTextBoundaryFinder is invalid.
346 */
347 
348 /*!
  Returns the string the QTextBoundaryFinder object operates on.
350 */
string() const351 QString QTextBoundaryFinder::string() const
352 {
353     if (chars == s.unicode() && length == s.length())
354         return s;
355     return QString(chars, length);
356 }
357 
358 
359 /*!
360   Moves the QTextBoundaryFinder to the next boundary position and returns that position.
361 
362   Returns -1 if there is no next boundary.
363 */
toNextBoundary()364 int QTextBoundaryFinder::toNextBoundary()
365 {
366     if (!d || pos < 0 || pos >= length) {
367         pos = -1;
368         return pos;
369     }
370 
371     ++pos;
372     switch(t) {
373     case Grapheme:
374         while (pos < length && !d->attributes[pos].graphemeBoundary)
375             ++pos;
376         break;
377     case Word:
378         while (pos < length && !d->attributes[pos].wordBreak)
379             ++pos;
380         break;
381     case Sentence:
382         while (pos < length && !d->attributes[pos].sentenceBoundary)
383             ++pos;
384         break;
385     case Line:
386         while (pos < length && !d->attributes[pos].lineBreak)
387             ++pos;
388         break;
389     }
390 
391     return pos;
392 }
393 
394 /*!
395   Moves the QTextBoundaryFinder to the previous boundary position and returns that position.
396 
397   Returns -1 if there is no previous boundary.
398 */
toPreviousBoundary()399 int QTextBoundaryFinder::toPreviousBoundary()
400 {
401     if (!d || pos <= 0 || pos > length) {
402         pos = -1;
403         return pos;
404     }
405 
406     --pos;
407     switch(t) {
408     case Grapheme:
409         while (pos > 0 && !d->attributes[pos].graphemeBoundary)
410             --pos;
411         break;
412     case Word:
413         while (pos > 0 && !d->attributes[pos].wordBreak)
414             --pos;
415         break;
416     case Sentence:
417         while (pos > 0 && !d->attributes[pos].sentenceBoundary)
418             --pos;
419         break;
420     case Line:
421         while (pos > 0 && !d->attributes[pos].lineBreak)
422             --pos;
423         break;
424     }
425 
426     return pos;
427 }
428 
429 /*!
430   Returns \c true if the object's position() is currently at a valid text boundary.
431 */
isAtBoundary() const432 bool QTextBoundaryFinder::isAtBoundary() const
433 {
434     if (!d || pos < 0 || pos > length)
435         return false;
436 
437     switch(t) {
438     case Grapheme:
439         return d->attributes[pos].graphemeBoundary;
440     case Word:
441         return d->attributes[pos].wordBreak;
442     case Sentence:
443         return d->attributes[pos].sentenceBoundary;
444     case Line:
445         // ### TR#14 LB2 prohibits break at sot
446         return d->attributes[pos].lineBreak || pos == 0;
447     }
448     return false;
449 }
450 
451 /*!
452   Returns the reasons for the boundary finder to have chosen the current position as a boundary.
453 */
boundaryReasons() const454 QTextBoundaryFinder::BoundaryReasons QTextBoundaryFinder::boundaryReasons() const
455 {
456     BoundaryReasons reasons = NotAtBoundary;
457     if (!d || pos < 0 || pos > length)
458         return reasons;
459 
460     const QCharAttributes attr = d->attributes[pos];
461     switch (t) {
462     case Grapheme:
463         if (attr.graphemeBoundary) {
464             reasons |= BreakOpportunity | StartOfItem | EndOfItem;
465             if (pos == 0)
466                 reasons &= (~EndOfItem);
467             else if (pos == length)
468                 reasons &= (~StartOfItem);
469         }
470         break;
471     case Word:
472         if (attr.wordBreak) {
473             reasons |= BreakOpportunity;
474             if (attr.wordStart)
475                 reasons |= StartOfItem;
476             if (attr.wordEnd)
477                 reasons |= EndOfItem;
478         }
479         break;
480     case Sentence:
481         if (attr.sentenceBoundary) {
482             reasons |= BreakOpportunity | StartOfItem | EndOfItem;
483             if (pos == 0)
484                 reasons &= (~EndOfItem);
485             else if (pos == length)
486                 reasons &= (~StartOfItem);
487         }
488         break;
489     case Line:
490         // ### TR#14 LB2 prohibits break at sot
491         if (attr.lineBreak || pos == 0) {
492             reasons |= BreakOpportunity;
493             if (attr.mandatoryBreak || pos == 0) {
494                 reasons |= MandatoryBreak | StartOfItem | EndOfItem;
495                 if (pos == 0)
496                     reasons &= (~EndOfItem);
497                 else if (pos == length)
498                     reasons &= (~StartOfItem);
499             } else if (pos > 0 && chars[pos - 1].unicode() == QChar::SoftHyphen) {
500                 reasons |= SoftHyphen;
501             }
502         }
503         break;
504     default:
505         break;
506     }
507 
508     return reasons;
509 }
510 
511 QT_END_NAMESPACE
512