1 /****************************************************************************
2 **
3 ** Copyright (C) 2016 The Qt Company Ltd.
4 ** Contact: https://www.qt.io/licensing/
5 **
6 ** This file is part of the QtCore module of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** Commercial License Usage
10 ** Licensees holding valid commercial Qt licenses may use this file in
11 ** accordance with the commercial license agreement provided with the
12 ** Software or, alternatively, in accordance with the terms contained in
13 ** a written agreement between you and The Qt Company. For licensing terms
14 ** and conditions see https://www.qt.io/terms-conditions. For further
15 ** information use the contact form at https://www.qt.io/contact-us.
16 **
17 ** GNU Lesser General Public License Usage
18 ** Alternatively, this file may be used under the terms of the GNU Lesser
19 ** General Public License version 3 as published by the Free Software
20 ** Foundation and appearing in the file LICENSE.LGPL3 included in the
21 ** packaging of this file. Please review the following information to
22 ** ensure the GNU Lesser General Public License version 3 requirements
23 ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
24 **
25 ** GNU General Public License Usage
26 ** Alternatively, this file may be used under the terms of the GNU
27 ** General Public License version 2.0 or (at your option) the GNU General
28 ** Public license version 3 or any later version approved by the KDE Free
29 ** Qt Foundation. The licenses are as published by the Free Software
30 ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31 ** included in the packaging of this file. Please review the following
32 ** information to ensure the GNU General Public License requirements will
33 ** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34 ** https://www.gnu.org/licenses/gpl-3.0.html.
35 **
36 ** $QT_END_LICENSE$
37 **
38 ****************************************************************************/
39 
40 #include "qunicodetools_p.h"
41 
42 #include "qunicodetables_p.h"
43 #include "qvarlengtharray.h"
44 
45 #include "qharfbuzz_p.h"
46 
// FLAG(bit) evaluates to an int in which only bit number `bit` is set.
#define FLAG(bit) (1 << (bit))
48 
49 QT_BEGIN_NAMESPACE
50 
51 Q_AUTOTEST_EXPORT int qt_initcharattributes_default_algorithm_only = 0;
52 
53 namespace QUnicodeTools {
54 
55 // -----------------------------------------------------------------------------------------------------
56 //
57 // The text boundaries determination algorithm.
58 // See http://www.unicode.org/reports/tr29/tr29-31.html
59 //
60 // -----------------------------------------------------------------------------------------------------
61 
62 namespace GB {
63 
64 /*
65  * Most grapheme break rules can be implemented table driven, but rules GB10, GB12 and GB13 need a bit
66  * of special treatment.
67  */
68 enum State : uchar {
69     Break,
70     Inside,
71     GB10,
72     GB10_2,
73     GB10_3,
74     GB13, // also covers GB12
75 };
76 
77 static const State breakTable[QUnicodeTables::NumGraphemeBreakClasses][QUnicodeTables::NumGraphemeBreakClasses] = {
78 //    Any     CR      LF     Control  Extend  ZWJ     RI     Prepend  S-Mark  L       V       T       LV      LVT     E_B     E_M     GAZ     EBG
79     { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break  }, // Any
80     { Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break  }, // CR
81     { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break  }, // LF
82     { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break  }, // Control
83     { Break , Break , Break , Break , GB10_2, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , GB10_3, Break , Break  }, // Extend
84     { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Inside, Inside }, // ZWJ
85     { Break , Break , Break , Break , Inside, Inside, GB13  , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break  }, // RegionalIndicator
86     { Inside, Break , Break , Break , Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside, Inside }, // Prepend
87     { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break  }, // SpacingMark
88     { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Inside, Inside, Break , Inside, Inside, Break , Break , Break , Break  }, // L
89     { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Inside, Inside, Break , Break , Break , Break , Break , Break  }, // V
90     { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break  }, // T
91     { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Inside, Inside, Break , Break , Break , Break , Break , Break  }, // LV
92     { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break  }, // LVT
93     { Break , Break , Break , Break , GB10  , Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Inside, Break , Break  }, // E_B
94     { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break  }, // E_M
95     { Break , Break , Break , Break , Inside, Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Break , Break , Break  }, // GAZ
96     { Break , Break , Break , Break , GB10  , Inside, Break , Break , Inside, Break , Break , Break , Break , Break , Break , Inside, Break , Break  }, // EBG
97 };
98 
99 } // namespace GB
100 
getGraphemeBreaks(const ushort * string,quint32 len,QCharAttributes * attributes)101 static void getGraphemeBreaks(const ushort *string, quint32 len, QCharAttributes *attributes)
102 {
103     QUnicodeTables::GraphemeBreakClass lcls = QUnicodeTables::GraphemeBreak_LF; // to meet GB1
104     GB::State state = GB::Break; // only required to track some of the rules
105     for (quint32 i = 0; i != len; ++i) {
106         quint32 pos = i;
107         uint ucs4 = string[i];
108         if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
109             ushort low = string[i + 1];
110             if (QChar::isLowSurrogate(low)) {
111                 ucs4 = QChar::surrogateToUcs4(ucs4, low);
112                 ++i;
113             }
114         }
115 
116         const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
117         QUnicodeTables::GraphemeBreakClass cls = (QUnicodeTables::GraphemeBreakClass) prop->graphemeBreakClass;
118 
119         switch (GB::breakTable[lcls][cls]) {
120         case GB::Break:
121             attributes[pos].graphemeBoundary = true;
122             state = GB::Break;
123             break;
124         case GB::Inside:
125             state = GB::Break;
126             break;
127         case GB::GB10:
128             state = GB::GB10;
129             break;
130         case GB::GB10_2:
131             if (state == GB::GB10 || state == GB::GB10_2)
132                 state = GB::GB10_2;
133             else
134                 state = GB::Break;
135             break;
136         case GB::GB10_3:
137             if (state != GB::GB10 && state != GB::GB10_2)
138                 attributes[pos].graphemeBoundary = true;
139             state = GB::Break;
140             break;
141         case GB::GB13:
142             if (state != GB::GB13) {
143                 state = GB::GB13;
144             } else {
145                 attributes[pos].graphemeBoundary = true;
146                 state = GB::Break;
147             }
148         }
149 
150         lcls = cls;
151     }
152 
153     attributes[len].graphemeBoundary = true; // GB2
154 }
155 
156 
157 namespace WB {
158 
159 enum Action {
160     NoBreak,
161     Break,
162     Lookup,
163     LookupW
164 };
165 
166 static const uchar breakTable[QUnicodeTables::NumWordBreakClasses][QUnicodeTables::NumWordBreakClasses] = {
167 //    Any      CR       LF       Newline  Extend   ZWJ      Format    RI       Katakana HLetter  ALetter  SQuote   DQuote  MidNumLet MidLetter MidNum  Numeric ExtNumLet E_Base   E_Mod    GAZ      EBG      WSeg
168     { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak,  Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break }, // Any
169     { Break  , Break  , NoBreak, Break  , Break  , Break  , Break  ,  Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break }, // CR
170     { Break  , Break  , Break  , Break  , Break  , Break  , Break  ,  Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break }, // LF
171     { Break  , Break  , Break  , Break  , Break  , Break  , Break  ,  Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break }, // Newline
172     { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak,  Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break }, // Extend
173     { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak,  Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , NoBreak, NoBreak, Break }, // ZWJ
174     { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak,  Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break }, // Format
175     { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak,  NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break }, // RegionalIndicator
176     { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak,  Break  , NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , NoBreak, Break  , Break  , Break  , Break  , Break }, // Katakana
177     { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak,  Break  , Break  , NoBreak, NoBreak, LookupW, Lookup , LookupW, LookupW, Break  , NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break }, // HebrewLetter
178     { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak,  Break  , Break  , NoBreak, NoBreak, LookupW, Break  , LookupW, LookupW, Break  , NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break }, // ALetter
179     { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak,  Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break }, // SingleQuote
180     { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak,  Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break }, // DoubleQuote
181     { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak,  Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break }, // MidNumLet
182     { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak,  Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break }, // MidLetter
183     { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak,  Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break }, // MidNum
184     { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak,  Break  , Break  , NoBreak, NoBreak, Lookup , Break  , Lookup , Break  , Lookup , NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break }, // Numeric
185     { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak,  Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break  , NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break }, // ExtendNumLet
186     { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak,  Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , NoBreak, Break  , Break  , Break }, // E_Base
187     { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak,  Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break }, // E_Mod
188     { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak,  Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break }, // GAZ
189     { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak,  Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , NoBreak, Break  , Break  , Break }, // EBG
190     { Break  , Break  , Break  , Break  , Break  , Break  , Break  ,  Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break }, // WSeg
191 };
192 
193 } // namespace WB
194 
getWordBreaks(const ushort * string,quint32 len,QCharAttributes * attributes)195 static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *attributes)
196 {
197     enum WordType {
198         WordTypeNone, WordTypeAlphaNumeric, WordTypeHiraganaKatakana
199     } currentWordType = WordTypeNone;
200 
201     QUnicodeTables::WordBreakClass cls = QUnicodeTables::WordBreak_LF; // to meet WB1
202     for (quint32 i = 0; i != len; ++i) {
203         quint32 pos = i;
204         uint ucs4 = string[i];
205         if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
206             ushort low = string[i + 1];
207             if (QChar::isLowSurrogate(low)) {
208                 ucs4 = QChar::surrogateToUcs4(ucs4, low);
209                 ++i;
210             }
211         }
212 
213         const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
214         QUnicodeTables::WordBreakClass ncls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
215 #ifdef QT_BUILD_INTERNAL
216         if (qt_initcharattributes_default_algorithm_only) {
217             // as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
218             // which caused "hi.there" to be treated like if it were just a single word;
219             // we keep the pre-5.1 behavior by remapping these characters in the Unicode tables generator
220             // and this code is needed to pass the coverage tests; remove once the issue is fixed.
221             if (ucs4 == 0x002E) // FULL STOP
222                 ncls = QUnicodeTables::WordBreak_MidNumLet;
223             else if (ucs4 == 0x003A) // COLON
224                 ncls = QUnicodeTables::WordBreak_MidLetter;
225         }
226 #endif
227 
228         uchar action = WB::breakTable[cls][ncls];
229         switch (action) {
230         case WB::Break:
231             break;
232         case WB::NoBreak:
233             if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_Extend || ncls == QUnicodeTables::WordBreak_ZWJ || ncls == QUnicodeTables::WordBreak_Format)) {
234                 // WB4: X(Extend|Format)* -> X
235                 if (cls != QUnicodeTables::WordBreak_ZWJ) // WB3c
236                     continue;
237             }
238             if (Q_UNLIKELY(cls == QUnicodeTables::WordBreak_RegionalIndicator)) {
239                 // WB15/WB16: break between pairs of Regional indicator
240                 ncls = QUnicodeTables::WordBreak_Any;
241             }
242             break;
243         case WB::Lookup:
244         case WB::LookupW:
245             for (quint32 lookahead = i + 1; lookahead < len; ++lookahead) {
246                 ucs4 = string[lookahead];
247                 if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
248                     ushort low = string[lookahead + 1];
249                     if (QChar::isLowSurrogate(low)) {
250                         ucs4 = QChar::surrogateToUcs4(ucs4, low);
251                         ++lookahead;
252                     }
253                 }
254 
255                 prop = QUnicodeTables::properties(ucs4);
256                 QUnicodeTables::WordBreakClass tcls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
257 
258                 if (Q_UNLIKELY(tcls == QUnicodeTables::WordBreak_Extend || tcls == QUnicodeTables::WordBreak_ZWJ || tcls == QUnicodeTables::WordBreak_Format)) {
259                     // WB4: X(Extend|Format)* -> X
260                     continue;
261                 }
262 
263                 if (Q_LIKELY(tcls == cls || (action == WB::LookupW && (tcls == QUnicodeTables::WordBreak_HebrewLetter
264                                                                        || tcls == QUnicodeTables::WordBreak_ALetter)))) {
265                     i = lookahead;
266                     ncls = tcls;
267                     action = WB::NoBreak;
268                 }
269                 break;
270             }
271             if (action != WB::NoBreak) {
272                 action = WB::Break;
273                 if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_SingleQuote && cls == QUnicodeTables::WordBreak_HebrewLetter))
274                     action = WB::NoBreak; // WB7a
275             }
276             break;
277         }
278 
279         cls = ncls;
280         if (action == WB::Break) {
281             attributes[pos].wordBreak = true;
282             if (currentWordType != WordTypeNone)
283                 attributes[pos].wordEnd = true;
284             switch (cls) {
285             case QUnicodeTables::WordBreak_Katakana:
286                 currentWordType = WordTypeHiraganaKatakana;
287                 attributes[pos].wordStart = true;
288                 break;
289             case QUnicodeTables::WordBreak_HebrewLetter:
290             case QUnicodeTables::WordBreak_ALetter:
291             case QUnicodeTables::WordBreak_Numeric:
292                 currentWordType = WordTypeAlphaNumeric;
293                 attributes[pos].wordStart = true;
294                 break;
295             default:
296                 currentWordType = WordTypeNone;
297                 break;
298             }
299         }
300     }
301 
302     if (currentWordType != WordTypeNone)
303         attributes[len].wordEnd = true;
304     attributes[len].wordBreak = true; // WB2
305 }
306 
307 
308 namespace SB {
309 
310 enum State {
311     Initial,
312     Lower,
313     Upper,
314     LUATerm,
315     ATerm,
316     ATermC,
317     ACS,
318     STerm,
319     STermC,
320     SCS,
321     BAfterC,
322     BAfter,
323     Break,
324     Lookup
325 };
326 
327 static const uchar breakTable[BAfter + 1][QUnicodeTables::NumSentenceBreakClasses] = {
328 //    Any      CR       LF       Sep      Extend   Sp       Lower    Upper    OLetter  Numeric  ATerm   SContinue STerm    Close
329     { Initial, BAfterC, BAfter , BAfter , Initial, Initial, Lower  , Upper  , Initial, Initial, ATerm  , Initial, STerm  , Initial }, // Initial
330     { Initial, BAfterC, BAfter , BAfter , Lower  , Initial, Initial, Initial, Initial, Initial, LUATerm, Initial, STerm  , Initial }, // Lower
331     { Initial, BAfterC, BAfter , BAfter , Upper  , Initial, Initial, Upper  , Initial, Initial, LUATerm, Initial, STerm  , Initial }, // Upper
332 
333     { Lookup , BAfterC, BAfter , BAfter , LUATerm, ACS    , Initial, Upper  , Break  , Initial, ATerm  , STerm  , STerm  , ATermC  }, // LUATerm
334     { Lookup , BAfterC, BAfter , BAfter , ATerm  , ACS    , Initial, Break  , Break  , Initial, ATerm  , STerm  , STerm  , ATermC  }, // ATerm
335     { Lookup , BAfterC, BAfter , BAfter , ATermC , ACS    , Initial, Break  , Break  , Lookup , ATerm  , STerm  , STerm  , ATermC  }, // ATermC
336     { Lookup , BAfterC, BAfter , BAfter , ACS    , ACS    , Initial, Break  , Break  , Lookup , ATerm  , STerm  , STerm  , Lookup  }, // ACS
337 
338     { Break  , BAfterC, BAfter , BAfter , STerm  , SCS    , Break  , Break  , Break  , Break  , ATerm  , STerm  , STerm  , STermC  }, // STerm,
339     { Break  , BAfterC, BAfter , BAfter , STermC , SCS    , Break  , Break  , Break  , Break  , ATerm  , STerm  , STerm  , STermC  }, // STermC
340     { Break  , BAfterC, BAfter , BAfter , SCS    , SCS    , Break  , Break  , Break  , Break  , ATerm  , STerm  , STerm  , Break   }, // SCS
341     { Break  , Break  , BAfter , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // BAfterC
342     { Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // BAfter
343 };
344 
345 } // namespace SB
346 
getSentenceBreaks(const ushort * string,quint32 len,QCharAttributes * attributes)347 static void getSentenceBreaks(const ushort *string, quint32 len, QCharAttributes *attributes)
348 {
349     uchar state = SB::BAfter; // to meet SB1
350     for (quint32 i = 0; i != len; ++i) {
351         quint32 pos = i;
352         uint ucs4 = string[i];
353         if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
354             ushort low = string[i + 1];
355             if (QChar::isLowSurrogate(low)) {
356                 ucs4 = QChar::surrogateToUcs4(ucs4, low);
357                 ++i;
358             }
359         }
360 
361         const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
362         QUnicodeTables::SentenceBreakClass ncls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
363 
364         Q_ASSERT(state <= SB::BAfter);
365         state = SB::breakTable[state][ncls];
366         if (Q_UNLIKELY(state == SB::Lookup)) { // SB8
367             state = SB::Break;
368             for (quint32 lookahead = i + 1; lookahead < len; ++lookahead) {
369                 ucs4 = string[lookahead];
370                 if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
371                     ushort low = string[lookahead + 1];
372                     if (QChar::isLowSurrogate(low)) {
373                         ucs4 = QChar::surrogateToUcs4(ucs4, low);
374                         ++lookahead;
375                     }
376                 }
377 
378                 prop = QUnicodeTables::properties(ucs4);
379                 QUnicodeTables::SentenceBreakClass tcls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
380                 switch (tcls) {
381                 case QUnicodeTables::SentenceBreak_Any:
382                 case QUnicodeTables::SentenceBreak_Extend:
383                 case QUnicodeTables::SentenceBreak_Sp:
384                 case QUnicodeTables::SentenceBreak_Numeric:
385                 case QUnicodeTables::SentenceBreak_SContinue:
386                 case QUnicodeTables::SentenceBreak_Close:
387                     continue;
388                 case QUnicodeTables::SentenceBreak_Lower:
389                     i = lookahead;
390                     state = SB::Initial;
391                     break;
392                 default:
393                     break;
394                 }
395                 break;
396             }
397         }
398         if (Q_UNLIKELY(state == SB::Break)) {
399             attributes[pos].sentenceBoundary = true;
400             state = SB::breakTable[SB::Initial][ncls];
401         }
402     }
403 
404     attributes[len].sentenceBoundary = true; // SB2
405 }
406 
407 
408 // -----------------------------------------------------------------------------------------------------
409 //
410 // The line breaking algorithm.
411 // See http://www.unicode.org/reports/tr14/tr14-39.html
412 //
413 // -----------------------------------------------------------------------------------------------------
414 
415 namespace LB {
416 
417 namespace NS { // Number Sequence
418 
419 // LB25 recommends to not break lines inside numbers of the form
420 // described by the following regular expression:
421 //  (PR|PO)?(OP|HY)?NU(NU|SY|IS)*(CL|CP)?(PR|PO)?
422 
423 enum Action {
424     None,
425     Start,
426     Continue,
427     Break
428 };
429 
430 enum Class {
431     XX,
432     PRPO,
433     OPHY,
434     NU,
435     SYIS,
436     CLCP
437 };
438 
439 static const uchar actionTable[CLCP + 1][CLCP + 1] = {
440 //     XX       PRPO      OPHY       NU       SYIS      CLCP
441     { None    , Start   , Start   , Start   , None    , None     }, // XX
442     { None    , Start   , Continue, Continue, None    , None     }, // PRPO
443     { None    , Start   , Start   , Continue, None    , None     }, // OPHY
444     { Break   , Break   , Break   , Continue, Continue, Continue }, // NU
445     { Break   , Break   , Break   , Continue, Continue, Continue }, // SYIS
446     { Break   , Continue, Break   , Break   , Break   , Break    }, // CLCP
447 };
448 
toClass(QUnicodeTables::LineBreakClass lbc,QChar::Category category)449 inline Class toClass(QUnicodeTables::LineBreakClass lbc, QChar::Category category)
450 {
451     switch (lbc) {
452     case QUnicodeTables::LineBreak_AL:// case QUnicodeTables::LineBreak_AI:
453         // resolve AI math symbols in numerical context to IS
454         if (category == QChar::Symbol_Math)
455             return SYIS;
456         break;
457     case QUnicodeTables::LineBreak_PR: case QUnicodeTables::LineBreak_PO:
458         return PRPO;
459     case QUnicodeTables::LineBreak_OP: case QUnicodeTables::LineBreak_HY:
460         return OPHY;
461     case QUnicodeTables::LineBreak_NU:
462         return NU;
463     case QUnicodeTables::LineBreak_SY: case QUnicodeTables::LineBreak_IS:
464         return SYIS;
465     case QUnicodeTables::LineBreak_CL: case QUnicodeTables::LineBreak_CP:
466         return CLCP;
467     default:
468         break;
469     }
470     return XX;
471 }
472 
473 } // namespace NS
474 
475 /* In order to support the tailored implementation of LB25 properly
476    the following changes were made in the pair table to allow breaks
477    where the numeric expression doesn't match the template (i.e. [^NU](IS|SY)NU):
478    (CL)(PO) from IB to DB
479    (CP)(PO) from IB to DB
480    (CL)(PR) from IB to DB
481    (CP)(PR) from IB to DB
482    (PO)(OP) from IB to DB
483    (PR)(OP) from IB to DB
484    (IS)(NU) from IB to DB
485    (SY)(NU) from IB to DB
486 */
487 
/* In order to implement LB21a properly a special rule HH has been introduced and
489    the following changes were made in the pair table to disallow breaks after Hebrew + Hyphen:
490    (HL)(HY|BA) from IB to CI
491    (HY|BA)(!CB) from DB to HH
492 */
493 
// Entries of the pair table below; each action also has a two-letter alias
// so that the table rows stay readable.
enum Action {
    ProhibitedBreak,
    PB = ProhibitedBreak,
    DirectBreak,
    DB = DirectBreak,
    IndirectBreak,
    IB = IndirectBreak,
    CombiningIndirectBreak,
    CI = CombiningIndirectBreak,
    CombiningProhibitedBreak,
    CP = CombiningProhibitedBreak,
    ProhibitedBreakAfterHebrewPlusHyphen,
    HH = ProhibitedBreakAfterHebrewPlusHyphen
};
502 
503 static const uchar breakTable[QUnicodeTables::LineBreak_SA][QUnicodeTables::LineBreak_SA] = {
504 /*         OP  CL  CP  QU  GL  NS  EX  SY  IS  PR  PO  NU  AL  HL  ID  IN  HY  BA  BB  B2  ZW  CM  WJ  H2  H3  JL  JV  JT  RI  CB  EB  EM  ZWJ*/
505 /* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
506 /* CL */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
507 /* CP */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
508 /* QU */ { PB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
509 /* GL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
510 /* NS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
511 /* EX */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
512 /* SY */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
513 /* IS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
514 /* PR */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, DB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB, IB },
515 /* PO */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
516 /* NU */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
517 /* AL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
518 /* HL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
519 /* ID */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
520 /* IN */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
521 /* HY */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, HH, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, IB },
522 /* BA */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, HH, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, IB },
523 /* BB */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB, IB, IB, IB },
524 /* B2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
525 /* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
526 /* CM */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
527 /* WJ */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
528 /* H2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB, IB },
529 /* H3 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB, IB },
530 /* JL */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB, DB, DB, DB, DB, IB },
531 /* JV */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB, IB },
532 /* JT */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB, IB },
533 /* RI */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, IB, DB, DB, DB, IB },
534 /* CB */ { DB, PB, PB, IB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
535 /* EB */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, IB, IB },
536 /* EM */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
537 /* ZWJ*/ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, IB }
538 };
539 
540 // The following line break classes are not treated by the pair table
541 // and must be resolved outside:
542 //  AI, BK, CB, CJ, CR, LF, NL, SA, SG, SP, XX
543 
544 } // namespace LB
545 
// Computes the line break opportunities for the UTF-16 text \a string of
// \a len code units and stores them in the lineBreak / mandatoryBreak flags
// of \a attributes. The "LBn" tags in the comments below name the rules of
// the Unicode Line Breaking Algorithm (UAX #14) a piece of code implements.
//
// \a attributes is written at indices 0..len inclusive, so it needs at least
// len + 1 entries, and \a len must be non-zero for the final LB2/LB3 writes
// to make sense. The flags of a surrogate pair are stored at the index of
// its high surrogate. Apart from LB25 (numeric expressions) and LB2 (index 0)
// flags are only ever set, never cleared; presumably the caller has zeroed
// the array beforehand -- verify against the caller.
//
// \a options is only consulted for HangulLineBreakTailoring.
static void getLineBreaks(const ushort *string, quint32 len, QCharAttributes *attributes, QUnicodeTools::CharAttributeOptions options)
{
    // State of the LB25 "numeric expression" recognizer: nestart is the index
    // recorded when the action table reported Start, nelast is the recognizer
    // class of the previously processed character (XX == not inside one).
    quint32 nestart = 0;
    LB::NS::Class nelast = LB::NS::XX;

    // lcls is the class of the immediately preceding character; cls is the
    // class used as the "before" side of the pair table lookup. They differ
    // whenever an iteration leaves through next_no_cls_update (spaces,
    // attached combining marks, ...), which updates lcls only.
    QUnicodeTables::LineBreakClass lcls = QUnicodeTables::LineBreak_LF; // to meet LB10
    QUnicodeTables::LineBreakClass cls = lcls;
    for (quint32 i = 0; i != len; ++i) {
        // pos keeps pointing at the first code unit of the current code point;
        // i may be advanced onto the low surrogate below.
        quint32 pos = i;
        uint ucs4 = string[i];
        if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
            ushort low = string[i + 1];
            if (QChar::isLowSurrogate(low)) {
                ucs4 = QChar::surrogateToUcs4(ucs4, low);
                ++i;
            }
        }

        const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
        // ncls: class of the current character, possibly remapped below
        QUnicodeTables::LineBreakClass ncls = (QUnicodeTables::LineBreakClass) prop->lineBreakClass;
        // tcls: cls after the LB10 substitution, used only for the table lookup
        QUnicodeTables::LineBreakClass tcls;

        if (options & QUnicodeTools::HangulLineBreakTailoring) {
            if (Q_UNLIKELY((ncls >= QUnicodeTables::LineBreak_H2
                        &&  ncls <= QUnicodeTables::LineBreak_JT)
                        || (ucs4 >= 0x3130 && ucs4 <= 0x318F && ncls == QUnicodeTables::LineBreak_ID))
                    ) {
                // LB27: use SPACE for line breaking
                // "When Korean uses SPACE for line breaking, the classes in rule LB26,
                // as well as characters of class ID, are often tailored to AL; see Section 8, Customization."
                // In case of Korean syllables: "3130..318F  HANGUL COMPATIBILITY JAMO"
                ncls = QUnicodeTables::LineBreak_AL;
            } else {
                if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) {
                    // LB1: resolve SA to AL, except for those with category Mn or Mc, which are resolved to CM
                    static const int test = FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining);
                    if (FLAG(prop->category) & test)
                        ncls = QUnicodeTables::LineBreak_CM;
                }
                if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM)) {
                    // LB10: treat CM that follows SP, BK, CR, LF, NL, or ZW as AL
                    // (the ">= SP" test relies on the ordering of the LineBreakClass
                    // enum, which is declared outside this function)
                    if (lcls == QUnicodeTables::LineBreak_ZW || lcls >= QUnicodeTables::LineBreak_SP)
                        ncls = QUnicodeTables::LineBreak_AL;
                }
            }
        }

        // Performed regardless of the tailoring option (and therefore a second
        // time for untailored characters when the option is set).
        if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) {
            // LB1: resolve SA to AL, except for those with category Mn or Mc, which are resolved to CM
            static const int test = FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining);
            if (FLAG(prop->category) & test)
                ncls = QUnicodeTables::LineBreak_CM;
        }

        // Previous character was a hard line terminator (class CR or any class
        // ordered after it in the enum).
        if (Q_UNLIKELY(lcls >= QUnicodeTables::LineBreak_CR)) {
            // LB4: BK!, LB5: (CRxLF|CR|LF|NL)!
            // i.e. always a mandatory break, except between CR and a following LF
            if (lcls > QUnicodeTables::LineBreak_CR || ncls != QUnicodeTables::LineBreak_LF)
                attributes[pos].lineBreak = attributes[pos].mandatoryBreak = true;
            if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM || ncls == QUnicodeTables::LineBreak_ZWJ)) {
                // a CM/ZWJ opening a line has no base to attach to: it acts as AL
                cls = QUnicodeTables::LineBreak_AL;
                goto next_no_cls_update;
            }
            goto next;
        }

        // Current character is SP or a hard line terminator: never break before it.
        if (Q_UNLIKELY(ncls >= QUnicodeTables::LineBreak_SP)) {
            if (ncls > QUnicodeTables::LineBreak_SP)
                goto next; // LB6: x(BK|CR|LF|NL)
            // a space leaves cls untouched, so the pair lookup after the run of
            // spaces still sees the class that preceded them
            goto next_no_cls_update; // LB7: xSP
        }

        if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM || ncls == QUnicodeTables::LineBreak_ZWJ)) {
            // LB9: treat CM that doesn't follow SP, BK, CR, LF, NL, or ZW as X
            if (lcls != QUnicodeTables::LineBreak_ZW && lcls < QUnicodeTables::LineBreak_SP)
                // don't update anything
                goto next_no_cls_update;
        }

        if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_ZWJ)) {
            // LB8a: ZWJ x (ID | EB | EM)
            if (ncls == QUnicodeTables::LineBreak_ID || ncls == QUnicodeTables::LineBreak_EB || ncls == QUnicodeTables::LineBreak_EM)
                goto next;
        }

        // LB25: do not break lines inside numbers
        {
            // feed the current character to the numeric expression recognizer
            LB::NS::Class necur = LB::NS::toClass(ncls, (QChar::Category)prop->category);
            switch (LB::NS::actionTable[nelast][necur]) {
            case LB::NS::Break:
                // the expression ended just before pos: retract the breaks
                // that were set inside it
                // do not change breaks before and after the expression
                for (quint32 j = nestart + 1; j < pos; ++j)
                    attributes[j].lineBreak = false;
                Q_FALLTHROUGH();
            case LB::NS::None:
                nelast = LB::NS::XX; // reset state
                break;
            case LB::NS::Start:
                // note: this records i, which is the low surrogate's index
                // when the current code point is a surrogate pair
                nestart = i;
                Q_FALLTHROUGH();
            default:
                nelast = necur;
                break;
            }
        }

        if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_RI && lcls == QUnicodeTables::LineBreak_RI)) {
            // LB30a
            // No break is set between the two regional indicators. The second
            // one is recorded as SP in cls/lcls, so a third consecutive RI does
            // not satisfy the lcls == RI test above again.
            ncls = QUnicodeTables::LineBreak_SP;
            goto next;
        }

        // for South East Asian chars that require a complex analysis, the Unicode
        // standard recommends to treat them as AL. tailoring that do dictionary analysis can override
        // (this also maps every other class ordered at or after SA, e.g. the SP
        // stored by LB30a, to AL before the table lookup)
        if (Q_UNLIKELY(cls >= QUnicodeTables::LineBreak_SA))
            cls = QUnicodeTables::LineBreak_AL;

        tcls = cls;
        if (tcls == QUnicodeTables::LineBreak_CM)
            // LB10
            tcls = QUnicodeTables::LineBreak_AL;
        // Pair table lookup: row is the class before, column the class after
        // the candidate break position; column classes from SA on are looked
        // up as AL.
        switch (LB::breakTable[tcls][ncls < QUnicodeTables::LineBreak_SA ? ncls : QUnicodeTables::LineBreak_AL]) {
        case LB::DirectBreak:
            // break allowed unconditionally
            attributes[pos].lineBreak = true;
            break;
        case LB::IndirectBreak:
            // break allowed only if the characters are separated by a space
            if (lcls == QUnicodeTables::LineBreak_SP)
                attributes[pos].lineBreak = true;
            break;
        case LB::CombiningIndirectBreak:
            // directly adjacent: no break, and cls keeps the preceding class;
            // after a space: break allowed
            if (lcls != QUnicodeTables::LineBreak_SP)
                goto next_no_cls_update;
            attributes[pos].lineBreak = true;
            break;
        case LB::CombiningProhibitedBreak:
            // directly adjacent: cls keeps the preceding class; in neither case
            // is a break set
            if (lcls != QUnicodeTables::LineBreak_SP)
                goto next_no_cls_update;
            break;
        case LB::ProhibitedBreakAfterHebrewPlusHyphen:
            // break allowed unless the character right before the current one
            // is of class HL
            if (lcls != QUnicodeTables::LineBreak_HL)
                attributes[pos].lineBreak = true;
            break;
        case LB::ProhibitedBreak:
            // nothing to do
        default:
            break;
        }

    next:
        // regular exit: the current class becomes both the pair-table class
        // and the previous class
    next:
        cls = ncls;
    next_no_cls_update:
        // exit for characters that must not change the pair-table class
        lcls = ncls;
    }

    // Flush the numeric expression recognizer as if a non-numeric character
    // followed the end of the text.
    if (Q_UNLIKELY(LB::NS::actionTable[nelast][LB::NS::XX] == LB::NS::Break)) {
        // LB25: do not break lines inside numbers
        for (quint32 j = nestart + 1; j < len; ++j)
            attributes[j].lineBreak = false;
    }

    // Never break at the start of the text; always break at its end. The
    // first loop iteration runs with lcls == LF and therefore marks index 0
    // as a mandatory break, which is undone here.
    attributes[0].lineBreak = attributes[0].mandatoryBreak = false; // LB2
    attributes[len].lineBreak = attributes[len].mandatoryBreak = true; // LB3
}
708 
709 
getWhiteSpaces(const ushort * string,quint32 len,QCharAttributes * attributes)710 static void getWhiteSpaces(const ushort *string, quint32 len, QCharAttributes *attributes)
711 {
712     for (quint32 i = 0; i != len; ++i) {
713         uint ucs4 = string[i];
714         if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
715             ushort low = string[i + 1];
716             if (QChar::isLowSurrogate(low)) {
717                 ucs4 = QChar::surrogateToUcs4(ucs4, low);
718                 ++i;
719             }
720         }
721 
722         if (Q_UNLIKELY(QChar::isSpace(ucs4)))
723             attributes[i].whiteSpace = true;
724     }
725 }
726 
727 
initCharAttributes(const ushort * string,int length,const ScriptItem * items,int numItems,QCharAttributes * attributes,CharAttributeOptions options)728 Q_CORE_EXPORT void initCharAttributes(const ushort *string, int length,
729                                       const ScriptItem *items, int numItems,
730                                       QCharAttributes *attributes, CharAttributeOptions options)
731 {
732     if (length <= 0)
733         return;
734 
735     if (!(options & DontClearAttributes))
736         ::memset(attributes, 0, (length + 1) * sizeof(QCharAttributes));
737 
738     if (options & GraphemeBreaks)
739         getGraphemeBreaks(string, length, attributes);
740     if (options & WordBreaks)
741         getWordBreaks(string, length, attributes);
742     if (options & SentenceBreaks)
743         getSentenceBreaks(string, length, attributes);
744     if (options & LineBreaks)
745         getLineBreaks(string, length, attributes, options);
746     if (options & WhiteSpaces)
747         getWhiteSpaces(string, length, attributes);
748 
749     if (!qt_initcharattributes_default_algorithm_only) {
750         if (!items || numItems <= 0)
751             return;
752 
753         QVarLengthArray<HB_ScriptItem, 64> scriptItems;
754         scriptItems.reserve(numItems);
755         int start = 0;
756         HB_Script startScript = script_to_hbscript(items[start].script);
757         if (Q_UNLIKELY(startScript == HB_Script_Inherited))
758             startScript = HB_Script_Common;
759         for (int i = start + 1; i < numItems; ++i) {
760             HB_Script script = script_to_hbscript(items[i].script);
761             if (Q_LIKELY(script == startScript || script == HB_Script_Inherited))
762                 continue;
763             Q_ASSERT(items[i].position > items[start].position);
764             HB_ScriptItem item;
765             item.pos = items[start].position;
766             item.length = items[i].position - items[start].position;
767             item.script = startScript;
768             item.bidiLevel = 0; // unused
769             scriptItems.append(item);
770             start = i;
771             startScript = script;
772         }
773         if (items[start].position + 1 < length) {
774             HB_ScriptItem item;
775             item.pos = items[start].position;
776             item.length = length - items[start].position;
777             item.script = startScript;
778             item.bidiLevel = 0; // unused
779             scriptItems.append(item);
780         }
781         Q_STATIC_ASSERT(sizeof(QCharAttributes) == sizeof(HB_CharAttributes));
782         HB_GetTailoredCharAttributes(string, length,
783                                      scriptItems.constData(), scriptItems.size(),
784                                      reinterpret_cast<HB_CharAttributes *>(attributes));
785     }
786 }
787 
788 
789 // ----------------------------------------------------------------------------
790 //
791 // The Unicode script property. See http://www.unicode.org/reports/tr24/tr24-24.html
792 //
793 // ----------------------------------------------------------------------------
794 
initScripts(const ushort * string,int length,uchar * scripts)795 Q_CORE_EXPORT void initScripts(const ushort *string, int length, uchar *scripts)
796 {
797     int sor = 0;
798     int eor = 0;
799     uchar script = QChar::Script_Common;
800 
801     for (int i = 0; i < length; ++i, eor = i) {
802         uint ucs4 = string[i];
803         if (QChar::isHighSurrogate(ucs4) && i + 1 < length) {
804             ushort low = string[i + 1];
805             if (QChar::isLowSurrogate(low)) {
806                 ucs4 = QChar::surrogateToUcs4(ucs4, low);
807                 ++i;
808             }
809         }
810 
811         const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
812 
813         uchar nscript = prop->script;
814 
815         if (Q_LIKELY(nscript == script || nscript <= QChar::Script_Common))
816             continue;
817 
818         // inherit preceding Common-s
819         if (Q_UNLIKELY(script <= QChar::Script_Common)) {
820             // also covers a case where the base character of Common script followed
821             // by one or more combining marks of non-Inherited, non-Common script
822             script = nscript;
823             continue;
824         }
825 
826         // Never break between a combining mark (gc= Mc, Mn or Me) and its base character.
827         // Thus, a combining mark - whatever its script property value is - should inherit
828         // the script property value of its base character.
829         static const int test = (FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining) | FLAG(QChar::Mark_Enclosing));
830         if (Q_UNLIKELY(FLAG(prop->category) & test))
831             continue;
832 
833         Q_ASSERT(script > QChar::Script_Common);
834         Q_ASSERT(sor < eor);
835         ::memset(scripts + sor, script, (eor - sor) * sizeof(uchar));
836         sor = eor;
837 
838         script = nscript;
839     }
840 
841     Q_ASSERT(script >= QChar::Script_Common);
842     Q_ASSERT(eor == length);
843     ::memset(scripts + sor, script, (eor - sor) * sizeof(uchar));
844 }
845 
846 } // namespace QUnicodeTools
847 
848 QT_END_NAMESPACE
849