1 /****************************************************************************
2 **
3 ** Copyright (C) 2019 The Qt Company Ltd.
4 ** Contact: https://www.qt.io/licensing/
5 **
6 ** This file is part of the QtCore module of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** Commercial License Usage
10 ** Licensees holding valid commercial Qt licenses may use this file in
11 ** accordance with the commercial license agreement provided with the
12 ** Software or, alternatively, in accordance with the terms contained in
13 ** a written agreement between you and The Qt Company. For licensing terms
14 ** and conditions see https://www.qt.io/terms-conditions. For further
15 ** information use the contact form at https://www.qt.io/contact-us.
16 **
17 ** GNU Lesser General Public License Usage
18 ** Alternatively, this file may be used under the terms of the GNU Lesser
19 ** General Public License version 3 as published by the Free Software
20 ** Foundation and appearing in the file LICENSE.LGPL3 included in the
21 ** packaging of this file. Please review the following information to
22 ** ensure the GNU Lesser General Public License version 3 requirements
23 ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
24 **
25 ** GNU General Public License Usage
26 ** Alternatively, this file may be used under the terms of the GNU
27 ** General Public License version 2.0 or (at your option) the GNU General
28 ** Public license version 3 or any later version approved by the KDE Free
29 ** Qt Foundation. The licenses are as published by the Free Software
30 ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31 ** included in the packaging of this file. Please review the following
32 ** information to ensure the GNU General Public License requirements will
33 ** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34 ** https://www.gnu.org/licenses/gpl-3.0.html.
35 **
36 ** $QT_END_LICENSE$
37 **
38 ****************************************************************************/
39 
40 // Don't define it while compiling this module, or USERS of Qt will
41 // not be able to link.
42 #ifdef QT_NO_CAST_FROM_ASCII
43 #  undef QT_NO_CAST_FROM_ASCII
44 #endif
45 #ifdef QT_NO_CAST_TO_ASCII
46 #  undef QT_NO_CAST_TO_ASCII
47 #endif
48 #include "qchar.h"
49 
50 #include "qdatastream.h"
51 
52 #include "qunicodetables_p.h"
53 #include "qunicodetables.cpp"
54 
55 #include <algorithm>
56 
57 QT_BEGIN_NAMESPACE
58 
// FLAG(x) expands to an int with only bit number x set (1 << x). The functions
// below use it to turn a category value into a one-bit mask and to OR several
// categories together into a set that can be tested with a single '&'.
59 #define FLAG(x) (1 << (x))
60 
61 /*!
62     \class QLatin1Char
63     \inmodule QtCore
64     \reentrant
65     \brief The QLatin1Char class provides an 8-bit ASCII/Latin-1 character.
66 
67     \ingroup string-processing
68 
69     This class is only useful for constructing a QChar from an 8-bit character.
70 
71     \sa QChar, QLatin1String, QString
72 */
73 
74 /*!
75     \fn const char QLatin1Char::toLatin1() const
76 
77     Converts a Latin-1 character to an 8-bit ASCII representation of the character.
78 */
79 
80 /*!
81     \fn const ushort QLatin1Char::unicode() const
82 
83     Converts a Latin-1 character to a 16-bit-encoded Unicode representation
84     of the character.
85 */
86 
87 /*!
88     \fn QLatin1Char::QLatin1Char(char c)
89 
90     Constructs a Latin-1 character for \a c. This constructor should be
91     used when the encoding of the input character is known to be Latin-1.
92 */
93 
94 /*!
95     \class QChar
96     \inmodule QtCore
97     \brief The QChar class provides a 16-bit Unicode character.
98 
99     \ingroup string-processing
100     \reentrant
101 
102     In Qt, Unicode characters are 16-bit entities without any markup
103     or structure. This class represents such an entity. It is
104     lightweight, so it can be used everywhere. Most compilers treat
105     it like an \c{unsigned short}.
106 
107     QChar provides a full complement of testing/classification
108     functions, converting to and from other formats, converting from
109     composed to decomposed Unicode, and trying to compare and
110     case-convert if you ask it to.
111 
112     The classification functions include functions like those in the
113     standard C++ header \<cctype\> (formerly \<ctype.h\>), but
114     operating on the full range of Unicode characters, not just for the ASCII
115     range. They all return true if the character is a certain type of character;
116     otherwise they return false. These classification functions are
117     isNull() (returns \c true if the character is '\\0'), isPrint()
118     (true if the character is any sort of printable character,
119     including whitespace), isPunct() (any sort of punctuation),
120     isMark() (Unicode Mark), isLetter() (a letter), isNumber() (any
121     sort of numeric character, not just 0-9), isLetterOrNumber(), and
122     isDigit() (decimal digits). All of these are wrappers around
123     category(), which returns the Unicode-defined category of each
124     character. Some of these also calculate the derived properties
125     (for example isSpace() returns \c true if the character is of category
126     Separator_* or an exceptional code point from Other_Control category).
127 
128     QChar also provides direction(), which indicates the "natural"
129     writing direction of this character. The joiningType() function
130     indicates how the character joins with its neighbors (needed
131     mostly for Arabic or Syriac) and finally hasMirrored(), which indicates
132     whether the character needs to be mirrored when it is printed in
133     its "unnatural" writing direction.
134 
135     Composed Unicode characters (like \a ring) can be converted to
136     decomposed Unicode ("a" followed by "ring above") by using decomposition().
137 
138     In Unicode, comparison is not necessarily possible and case
139     conversion is very difficult at best. Unicode, covering the
140     "entire" world, also includes most of the world's case and
141     sorting problems. operator==() and friends will do comparison
142     based purely on the numeric Unicode value (code point) of the
143     characters, and toUpper() and toLower() will do case changes when
144     the character has a well-defined uppercase/lowercase equivalent.
145     For locale-dependent comparisons, use QString::localeAwareCompare().
146 
147     The conversion functions include unicode() (to a scalar),
148     toLatin1() (to scalar, but converts all non-Latin-1 characters to
149     0), row() (gives the Unicode row), cell() (gives the Unicode
150     cell), digitValue() (gives the integer value of any of the
151     numerous digit characters), and a host of constructors.
152 
153     QChar provides constructors and cast operators that make it easy
154     to convert to and from traditional 8-bit \c{char}s. If you
155     defined \c QT_NO_CAST_FROM_ASCII and \c QT_NO_CAST_TO_ASCII, as
156     explained in the QString documentation, you will need to
157     explicitly call fromLatin1(), or use QLatin1Char,
158     to construct a QChar from an 8-bit \c char, and you will need to
159     call toLatin1() to get the 8-bit value back.
160 
161     For more information see
162     \l{http://www.unicode.org/ucd/}{"About the Unicode Character Database"}.
163 
164     \sa Unicode, QString, QLatin1Char
165 */
166 
167 /*!
168     \enum QChar::UnicodeVersion
169 
170     Specifies which version of the \l{http://www.unicode.org/}{Unicode standard}
171     introduced a certain character.
172 
173     \value Unicode_1_1  Version 1.1
174     \value Unicode_2_0  Version 2.0
175     \value Unicode_2_1_2  Version 2.1.2
176     \value Unicode_3_0  Version 3.0
177     \value Unicode_3_1  Version 3.1
178     \value Unicode_3_2  Version 3.2
179     \value Unicode_4_0  Version 4.0
180     \value Unicode_4_1  Version 4.1
181     \value Unicode_5_0  Version 5.0
182     \value Unicode_5_1  Version 5.1
183     \value Unicode_5_2  Version 5.2
184     \value Unicode_6_0  Version 6.0
185     \value Unicode_6_1  Version 6.1
186     \value Unicode_6_2  Version 6.2
187     \value Unicode_6_3  Version 6.3  Since Qt 5.3
188     \value Unicode_7_0  Version 7.0  Since Qt 5.5
189     \value Unicode_8_0  Version 8.0  Since Qt 5.6
190     \value Unicode_9_0  Version 9.0  Since Qt 5.11
191     \value Unicode_10_0 Version 10.0 Since Qt 5.11
192     \value Unicode_11_0 Version 11.0 Since Qt 5.15
193     \value Unicode_12_0 Version 12.0 Since Qt 5.15
194     \value Unicode_12_1 Version 12.1 Since Qt 5.15
195     \value Unicode_13_0 Version 13.0 Since Qt 5.15
196     \value Unicode_Unassigned  The value is not assigned to any character
197                                in version 13.0 of Unicode.
198 
199     \sa unicodeVersion(), currentUnicodeVersion()
200 */
201 
202 /*!
203     \enum QChar::Category
204 
205     This enum maps the Unicode character categories.
206 
207     The following categories are normative in Unicode:
208 
209     \value Mark_NonSpacing  Unicode class name Mn
210 
211     \value Mark_SpacingCombining  Unicode class name Mc
212 
213     \value Mark_Enclosing  Unicode class name Me
214 
215     \value Number_DecimalDigit  Unicode class name Nd
216 
217     \value Number_Letter  Unicode class name Nl
218 
219     \value Number_Other  Unicode class name No
220 
221     \value Separator_Space  Unicode class name Zs
222 
223     \value Separator_Line  Unicode class name Zl
224 
225     \value Separator_Paragraph  Unicode class name Zp
226 
227     \value Other_Control  Unicode class name Cc
228 
229     \value Other_Format  Unicode class name Cf
230 
231     \value Other_Surrogate  Unicode class name Cs
232 
233     \value Other_PrivateUse  Unicode class name Co
234 
235     \value Other_NotAssigned  Unicode class name Cn
236 
237 
238     The following categories are informative in Unicode:
239 
240     \value Letter_Uppercase  Unicode class name Lu
241 
242     \value Letter_Lowercase  Unicode class name Ll
243 
244     \value Letter_Titlecase  Unicode class name Lt
245 
246     \value Letter_Modifier  Unicode class name Lm
247 
248     \value Letter_Other Unicode class name Lo
249 
250     \value Punctuation_Connector  Unicode class name Pc
251 
252     \value Punctuation_Dash  Unicode class name Pd
253 
254     \value Punctuation_Open  Unicode class name Ps
255 
256     \value Punctuation_Close  Unicode class name Pe
257 
258     \value Punctuation_InitialQuote  Unicode class name Pi
259 
260     \value Punctuation_FinalQuote  Unicode class name Pf
261 
262     \value Punctuation_Other  Unicode class name Po
263 
264     \value Symbol_Math  Unicode class name Sm
265 
266     \value Symbol_Currency  Unicode class name Sc
267 
268     \value Symbol_Modifier  Unicode class name Sk
269 
270     \value Symbol_Other  Unicode class name So
271 
272     \sa category()
273 */
274 
275 /*!
276     \enum QChar::Script
277     \since 5.1
278 
279     This enum type defines the Unicode script property values.
280 
281     For details about the Unicode script property values see
282     \l{http://www.unicode.org/reports/tr24/}{Unicode Standard Annex #24}.
283 
284     In order to conform to C/C++ naming conventions "Script_" is prepended
285     to the codes used in the Unicode Standard.
286 
287     \value Script_Unknown    For unassigned, private-use, noncharacter, and surrogate code points.
288     \value Script_Inherited  For characters that may be used with multiple scripts
289                              and that inherit their script from the preceding characters.
290                              These include nonspacing marks, enclosing marks,
291                              and zero width joiner/non-joiner characters.
292     \value Script_Common     For characters that may be used with multiple scripts
293                              and that do not inherit their script from the preceding characters.
294 
295     \value Script_Adlam Since Qt 5.11
296     \value Script_Ahom Since Qt 5.6
297     \value Script_AnatolianHieroglyphs Since Qt 5.6
298     \value Script_Arabic
299     \value Script_Armenian
300     \value Script_Avestan
301     \value Script_Balinese
302     \value Script_Bamum
303     \value Script_BassaVah Since Qt 5.5
304     \value Script_Batak
305     \value Script_Bengali
306     \value Script_Bhaiksuki Since Qt 5.11
307     \value Script_Bopomofo
308     \value Script_Brahmi
309     \value Script_Braille
310     \value Script_Buginese
311     \value Script_Buhid
312     \value Script_CanadianAboriginal
313     \value Script_Carian
314     \value Script_CaucasianAlbanian Since Qt 5.5
315     \value Script_Chakma
316     \value Script_Cham
317     \value Script_Cherokee
318     \value Script_Chorasmian Since Qt 5.15
319     \value Script_Coptic
320     \value Script_Cuneiform
321     \value Script_Cypriot
322     \value Script_Cyrillic
323     \value Script_Deseret
324     \value Script_Devanagari
325     \value Script_DivesAkuru Since Qt 5.15
326     \value Script_Dogra Since Qt 5.15
327     \value Script_Duployan Since Qt 5.5
328     \value Script_EgyptianHieroglyphs
329     \value Script_Elbasan Since Qt 5.5
330     \value Script_Elymaic Since Qt 5.15
331     \value Script_Ethiopic
332     \value Script_Georgian
333     \value Script_Glagolitic
334     \value Script_Gothic
335     \value Script_Grantha Since Qt 5.5
336     \value Script_Greek
337     \value Script_Gujarati
338     \value Script_GunjalaGondi Since Qt 5.15
339     \value Script_Gurmukhi
340     \value Script_Han
341     \value Script_Hangul
342     \value Script_HanifiRohingya Since Qt 5.15
343     \value Script_Hanunoo
344     \value Script_Hatran Since Qt 5.6
345     \value Script_Hebrew
346     \value Script_Hiragana
347     \value Script_ImperialAramaic
348     \value Script_InscriptionalPahlavi
349     \value Script_InscriptionalParthian
350     \value Script_Javanese
351     \value Script_Kaithi
352     \value Script_Kannada
353     \value Script_Katakana
354     \value Script_KayahLi
355     \value Script_Kharoshthi
356     \value Script_KhitanSmallScript Since Qt 5.15
357     \value Script_Khmer
358     \value Script_Khojki Since Qt 5.5
359     \value Script_Khudawadi Since Qt 5.5
360     \value Script_Lao
361     \value Script_Latin
362     \value Script_Lepcha
363     \value Script_Limbu
364     \value Script_LinearA Since Qt 5.5
365     \value Script_LinearB
366     \value Script_Lisu
367     \value Script_Lycian
368     \value Script_Lydian
369     \value Script_Mahajani Since Qt 5.5
370     \value Script_Makasar Since Qt 5.15
371     \value Script_Malayalam
372     \value Script_Mandaic
373     \value Script_Manichaean Since Qt 5.5
374     \value Script_Marchen Since Qt 5.11
375     \value Script_MasaramGondi Since Qt 5.11
376     \value Script_Medefaidrin Since Qt 5.15
377     \value Script_MeeteiMayek
378     \value Script_MendeKikakui Since Qt 5.5
379     \value Script_MeroiticCursive
380     \value Script_MeroiticHieroglyphs
381     \value Script_Miao
382     \value Script_Modi Since Qt 5.5
383     \value Script_Mongolian
384     \value Script_Mro Since Qt 5.5
385     \value Script_Multani Since Qt 5.6
386     \value Script_Myanmar
387     \value Script_Nabataean Since Qt 5.5
388     \value Script_Nandinagari Since Qt 5.15
389     \value Script_Newa Since Qt 5.11
390     \value Script_NewTaiLue
391     \value Script_Nko
392     \value Script_Nushu Since Qt 5.11
393     \value Script_NyiakengPuachueHmong Since Qt 5.15
394     \value Script_Ogham
395     \value Script_OlChiki
396     \value Script_OldHungarian Since Qt 5.6
397     \value Script_OldItalic
398     \value Script_OldNorthArabian Since Qt 5.5
399     \value Script_OldPermic Since Qt 5.5
400     \value Script_OldPersian
401     \value Script_OldSogdian Since Qt 5.15
402     \value Script_OldSouthArabian
403     \value Script_OldTurkic
404     \value Script_Oriya
405     \value Script_Osage Since Qt 5.11
406     \value Script_Osmanya
407     \value Script_PahawhHmong Since Qt 5.5
408     \value Script_Palmyrene Since Qt 5.5
409     \value Script_PauCinHau Since Qt 5.5
410     \value Script_PhagsPa
411     \value Script_Phoenician
412     \value Script_PsalterPahlavi Since Qt 5.5
413     \value Script_Rejang
414     \value Script_Runic
415     \value Script_Samaritan
416     \value Script_Saurashtra
417     \value Script_Sharada
418     \value Script_Shavian
419     \value Script_Siddham Since Qt 5.5
420     \value Script_SignWriting Since Qt 5.6
421     \value Script_Sinhala
422     \value Script_Sogdian Since Qt 5.15
423     \value Script_SoraSompeng
424     \value Script_Soyombo Since Qt 5.11
425     \value Script_Sundanese
426     \value Script_SylotiNagri
427     \value Script_Syriac
428     \value Script_Tagalog
429     \value Script_Tagbanwa
430     \value Script_TaiLe
431     \value Script_TaiTham
432     \value Script_TaiViet
433     \value Script_Takri
434     \value Script_Tamil
435     \value Script_Tangut Since Qt 5.11
436     \value Script_Telugu
437     \value Script_Thaana
438     \value Script_Thai
439     \value Script_Tibetan
440     \value Script_Tifinagh
441     \value Script_Tirhuta Since Qt 5.5
442     \value Script_Ugaritic
443     \value Script_Vai
444     \value Script_Wancho Since Qt 5.15
445     \value Script_WarangCiti Since Qt 5.5
446     \value Script_Yezidi Since Qt 5.15
447     \value Script_Yi
448     \value Script_ZanabazarSquare Since Qt 5.11
449 
450     \omitvalue ScriptCount
451 
452     \sa script()
453 */
454 
455 /*!
456     \enum QChar::Direction
457 
458     This enum type defines the Unicode direction attributes. See the
459     \l{http://www.unicode.org/reports/tr9/tr9-35.html#Table_Bidirectional_Character_Types}{Unicode Standard} for a description
460     of the values.
461 
462     In order to conform to C/C++ naming conventions "Dir" is prepended
463     to the codes used in the Unicode Standard.
464 
465     \value DirAL
466     \value DirAN
467     \value DirB
468     \value DirBN
469     \value DirCS
470     \value DirEN
471     \value DirES
472     \value DirET
473     \value DirFSI Since Qt 5.3
474     \value DirL
475     \value DirLRE
476     \value DirLRI Since Qt 5.3
477     \value DirLRO
478     \value DirNSM
479     \value DirON
480     \value DirPDF
481     \value DirPDI Since Qt 5.3
482     \value DirR
483     \value DirRLE
484     \value DirRLI Since Qt 5.3
485     \value DirRLO
486     \value DirS
487     \value DirWS
488 
489     \sa direction()
490 */
491 
492 /*!
493     \enum QChar::Decomposition
494 
495     This enum type defines the Unicode decomposition attributes. See
496     the \l{http://www.unicode.org/}{Unicode Standard} for a
497     description of the values.
498 
499     \value NoDecomposition
500     \value Canonical
501     \value Circle
502     \value Compat
503     \value Final
504     \value Font
505     \value Fraction
506     \value Initial
507     \value Isolated
508     \value Medial
509     \value Narrow
510     \value NoBreak
511     \value Small
512     \value Square
513     \value Sub
514     \value Super
515     \value Vertical
516     \value Wide
517 
518     \sa decomposition()
519 */
520 
521 /*!
522     \enum QChar::JoiningType
523     \since 5.3
524 
525     This enum type defines the Unicode joining type attributes. See the
526     \l{http://www.unicode.org/}{Unicode Standard} for a description of the values.
527 
528     In order to conform to C/C++ naming conventions "Joining_" is prepended
529     to the codes used in the Unicode Standard.
530 
531     \value Joining_None
532     \value Joining_Causing
533     \value Joining_Dual
534     \value Joining_Right
535     \value Joining_Left
536     \value Joining_Transparent
537 
538     \sa joiningType()
539 */
540 
541 #if QT_DEPRECATED_SINCE(5, 3)
542 /*!
543     \enum QChar::Joining
544     \deprecated in 5.3, use JoiningType instead.
545 
546     This enum type defines the Unicode joining attributes. See the
547     \l{http://www.unicode.org/}{Unicode Standard} for a description
548     of the values.
549 
550     \value Center
551     \value Dual
552     \value OtherJoining
553     \value Right
554 
555     \sa joining()
556 */
557 #endif
558 
559 /*!
560     \enum QChar::CombiningClass
561 
562     \internal
563 
564     This enum type defines names for some of the Unicode combining
565     classes. See the \l{http://www.unicode.org/}{Unicode Standard}
566     for a description of the values.
567 
568     \value Combining_Above
569     \value Combining_AboveAttached
570     \value Combining_AboveLeft
571     \value Combining_AboveLeftAttached
572     \value Combining_AboveRight
573     \value Combining_AboveRightAttached
574     \value Combining_Below
575     \value Combining_BelowAttached
576     \value Combining_BelowLeft
577     \value Combining_BelowLeftAttached
578     \value Combining_BelowRight
579     \value Combining_BelowRightAttached
580     \value Combining_DoubleAbove
581     \value Combining_DoubleBelow
582     \value Combining_IotaSubscript
583     \value Combining_Left
584     \value Combining_LeftAttached
585     \value Combining_Right
586     \value Combining_RightAttached
587 */
588 
589 /*!
590     \enum QChar::SpecialCharacter
591 
592     \value Null A QChar with this value isNull().
593     \value Tabulation Character tabulation.
594     \value LineFeed
595     \value FormFeed
596     \value CarriageReturn
597     \value Space
598     \value Nbsp Non-breaking space.
599     \value SoftHyphen
600     \value ReplacementCharacter The character shown when a font has no glyph
601            for a certain codepoint. A special question mark character is often
602            used. Codecs use this codepoint when input data cannot be
603            represented in Unicode.
604     \value ObjectReplacementCharacter Used to represent an object such as an
605            image when such objects cannot be presented.
606     \value ByteOrderMark
607     \value ByteOrderSwapped
608     \value ParagraphSeparator
609     \value LineSeparator
610     \value LastValidCodePoint
611 */
612 
613 /*!
614     \fn void QChar::setCell(uchar cell)
615     \internal
616 */
617 
618 /*!
619     \fn void QChar::setRow(uchar row)
620     \internal
621 */
622 
623 /*!
624     \fn QChar::QChar()
625 
626     Constructs a null QChar ('\\0').
627 
628     \sa isNull()
629 */
630 
631 /*!
632     \fn QChar::QChar(QLatin1Char ch)
633 
634     Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
635 */
636 
637 /*!
638     \fn QChar::QChar(SpecialCharacter ch)
639 
640     Constructs a QChar for the predefined character value \a ch.
641 */
642 
643 /*!
644     \fn QChar::QChar(char16_t ch)
645     \since 5.10
646 
647     Constructs a QChar corresponding to the UTF-16 character \a ch.
648 */
649 
650 /*!
651     \fn QChar::QChar(wchar_t ch)
652     \since 5.10
653 
654     Constructs a QChar corresponding to the wide character \a ch.
655 
656     \note This constructor is only available on Windows.
657 */
658 
659 /*!
660     \fn QChar::QChar(char ch)
661 
662     Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
663 
664     \note This constructor is not available when \c QT_NO_CAST_FROM_ASCII
665     is defined.
666 
667     \sa QT_NO_CAST_FROM_ASCII
668 */
669 
670 /*!
671     \fn QChar::QChar(uchar ch)
672 
673     Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
674 
675     \note This constructor is not available when \c QT_NO_CAST_FROM_ASCII
676     or \c QT_RESTRICTED_CAST_FROM_ASCII is defined.
677 
678     \sa QT_NO_CAST_FROM_ASCII, QT_RESTRICTED_CAST_FROM_ASCII
679 */
680 
681 /*!
682     \fn QChar::QChar(uchar cell, uchar row)
683 
684     Constructs a QChar for Unicode cell \a cell in row \a row.
685 
686     \sa cell(), row()
687 */
688 
689 /*!
690     \fn QChar::QChar(ushort code)
691 
692     Constructs a QChar for the character with Unicode code point \a code.
693 */
694 
695 /*!
696     \fn QChar::QChar(short code)
697 
698     Constructs a QChar for the character with Unicode code point \a code.
699 */
700 
701 /*!
702     \fn QChar::QChar(uint code)
703 
704     Constructs a QChar for the character with Unicode code point \a code.
705 */
706 
707 /*!
708     \fn QChar::QChar(int code)
709 
710     Constructs a QChar for the character with Unicode code point \a code.
711 */
712 
713 /*!
714     \fn bool QChar::isNull() const
715 
716     Returns \c true if the character is the Unicode character 0x0000
717     ('\\0'); otherwise returns \c false.
718 */
719 
720 /*!
721     \fn uchar QChar::cell() const
722 
723     Returns the cell (least significant byte) of the Unicode character.
724 
725     \sa row()
726 */
727 
728 /*!
729     \fn uchar QChar::row() const
730 
731     Returns the row (most significant byte) of the Unicode character.
732 
733     \sa cell()
734 */
735 
736 /*!
737     \fn bool QChar::isPrint() const
738 
739     Returns \c true if the character is a printable character; otherwise
740     returns \c false. This is any character not of category Other_*.
741 
742     Note that this gives no indication of whether the character is
743     available in a particular font.
744 */
745 
746 /*!
747     \overload
748     \since 5.0
749 
750     Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
751     a printable character; otherwise returns \c false.
752     This is any character not of category Other_*.
753 
754     Note that this gives no indication of whether the character is
755     available in a particular font.
756 */
isPrint(uint ucs4)757 bool QChar::isPrint(uint ucs4) noexcept
758 {
759     if (ucs4 > LastValidCodePoint)
760         return false;
761     const int test = FLAG(Other_Control) |
762                      FLAG(Other_Format) |
763                      FLAG(Other_Surrogate) |
764                      FLAG(Other_PrivateUse) |
765                      FLAG(Other_NotAssigned);
766     return !(FLAG(qGetProp(ucs4)->category) & test);
767 }
768 
769 /*!
770     \fn bool QChar::isSpace() const
771 
772     Returns \c true if the character is a separator character
773     (Separator_* categories or certain code points from Other_Control category);
774     otherwise returns \c false.
775 */
776 
777 /*!
778     \fn bool QChar::isSpace(uint ucs4)
779     \overload
780     \since 5.0
781 
782     Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
783     a separator character (Separator_* categories or certain code points
784     from Other_Control category); otherwise returns \c false.
785 */
786 
787 /*!
788     \internal
789 */
isSpace_helper(uint ucs4)790 bool QT_FASTCALL QChar::isSpace_helper(uint ucs4) noexcept
791 {
792     if (ucs4 > LastValidCodePoint)
793         return false;
794     const int test = FLAG(Separator_Space) |
795                      FLAG(Separator_Line) |
796                      FLAG(Separator_Paragraph);
797     return FLAG(qGetProp(ucs4)->category) & test;
798 }
799 
800 /*!
801     \fn bool QChar::isMark() const
802 
803     Returns \c true if the character is a mark (Mark_* categories);
804     otherwise returns \c false.
805 
806     See QChar::Category for more information regarding marks.
807 */
808 
809 /*!
810     \overload
811     \since 5.0
812 
813     Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
814     a mark (Mark_* categories); otherwise returns \c false.
815 */
isMark(uint ucs4)816 bool QChar::isMark(uint ucs4) noexcept
817 {
818     if (ucs4 > LastValidCodePoint)
819         return false;
820     const int test = FLAG(Mark_NonSpacing) |
821                      FLAG(Mark_SpacingCombining) |
822                      FLAG(Mark_Enclosing);
823     return FLAG(qGetProp(ucs4)->category) & test;
824 }
825 
826 /*!
827     \fn bool QChar::isPunct() const
828 
829     Returns \c true if the character is a punctuation mark (Punctuation_*
830     categories); otherwise returns \c false.
831 */
832 
833 /*!
834     \overload
835     \since 5.0
836 
837     Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
838     a punctuation mark (Punctuation_* categories); otherwise returns \c false.
839 */
isPunct(uint ucs4)840 bool QChar::isPunct(uint ucs4) noexcept
841 {
842     if (ucs4 > LastValidCodePoint)
843         return false;
844     const int test = FLAG(Punctuation_Connector) |
845                      FLAG(Punctuation_Dash) |
846                      FLAG(Punctuation_Open) |
847                      FLAG(Punctuation_Close) |
848                      FLAG(Punctuation_InitialQuote) |
849                      FLAG(Punctuation_FinalQuote) |
850                      FLAG(Punctuation_Other);
851     return FLAG(qGetProp(ucs4)->category) & test;
852 }
853 
854 /*!
855     \fn bool QChar::isSymbol() const
856 
857     Returns \c true if the character is a symbol (Symbol_* categories);
858     otherwise returns \c false.
859 */
860 
861 /*!
862     \overload
863     \since 5.0
864 
865     Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
866     a symbol (Symbol_* categories); otherwise returns \c false.
867 */
isSymbol(uint ucs4)868 bool QChar::isSymbol(uint ucs4) noexcept
869 {
870     if (ucs4 > LastValidCodePoint)
871         return false;
872     const int test = FLAG(Symbol_Math) |
873                      FLAG(Symbol_Currency) |
874                      FLAG(Symbol_Modifier) |
875                      FLAG(Symbol_Other);
876     return FLAG(qGetProp(ucs4)->category) & test;
877 }
878 
879 /*!
880     \fn bool QChar::isLetter() const
881 
882     Returns \c true if the character is a letter (Letter_* categories);
883     otherwise returns \c false.
884 */
885 
886 /*!
887     \fn bool QChar::isLetter(uint ucs4)
888     \overload
889     \since 5.0
890 
891     Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
892     a letter (Letter_* categories); otherwise returns \c false.
893 */
894 
895 /*!
896     \internal
897 */
isLetter_helper(uint ucs4)898 bool QT_FASTCALL QChar::isLetter_helper(uint ucs4) noexcept
899 {
900     if (ucs4 > LastValidCodePoint)
901         return false;
902     const int test = FLAG(Letter_Uppercase) |
903                      FLAG(Letter_Lowercase) |
904                      FLAG(Letter_Titlecase) |
905                      FLAG(Letter_Modifier) |
906                      FLAG(Letter_Other);
907     return FLAG(qGetProp(ucs4)->category) & test;
908 }
909 
910 /*!
911     \fn bool QChar::isNumber() const
912 
913     Returns \c true if the character is a number (Number_* categories,
914     not just 0-9); otherwise returns \c false.
915 
916     \sa isDigit()
917 */
918 
919 /*!
920     \fn bool QChar::isNumber(uint ucs4)
921     \overload
922     \since 5.0
923 
924     Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
925     a number (Number_* categories, not just 0-9); otherwise returns \c false.
926 
927     \sa isDigit()
928 */
929 
930 /*!
931     \internal
932 */
isNumber_helper(uint ucs4)933 bool QT_FASTCALL QChar::isNumber_helper(uint ucs4) noexcept
934 {
935     if (ucs4 > LastValidCodePoint)
936         return false;
937     const int test = FLAG(Number_DecimalDigit) |
938                      FLAG(Number_Letter) |
939                      FLAG(Number_Other);
940     return FLAG(qGetProp(ucs4)->category) & test;
941 }
942 
943 /*!
944     \fn bool QChar::isLetterOrNumber() const
945 
946     Returns \c true if the character is a letter or number (Letter_* or
947     Number_* categories); otherwise returns \c false.
948 */
949 
950 /*!
951     \fn bool QChar::isLetterOrNumber(uint ucs4)
952     \overload
953     \since 5.0
954 
955     Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
956     a letter or number (Letter_* or Number_* categories); otherwise returns \c false.
957 */
958 
959 /*!
960     \internal
961 */
isLetterOrNumber_helper(uint ucs4)962 bool QT_FASTCALL QChar::isLetterOrNumber_helper(uint ucs4) noexcept
963 {
964     if (ucs4 > LastValidCodePoint)
965         return false;
966     const int test = FLAG(Letter_Uppercase) |
967                      FLAG(Letter_Lowercase) |
968                      FLAG(Letter_Titlecase) |
969                      FLAG(Letter_Modifier) |
970                      FLAG(Letter_Other) |
971                      FLAG(Number_DecimalDigit) |
972                      FLAG(Number_Letter) |
973                      FLAG(Number_Other);
974     return FLAG(qGetProp(ucs4)->category) & test;
975 }
976 
977 /*!
978     \fn bool QChar::isDigit() const
979 
980     Returns \c true if the character is a decimal digit
981     (Number_DecimalDigit); otherwise returns \c false.
982 
983     \sa isNumber()
984 */
985 
986 /*!
987     \fn bool QChar::isDigit(uint ucs4)
988     \overload
989     \since 5.0
990 
991     Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
992     a decimal digit (Number_DecimalDigit); otherwise returns \c false.
993 
994     \sa isNumber()
995 */
996 
997 /*!
998     \fn bool QChar::isNonCharacter() const
999     \since 5.0
1000 
1001     Returns \c true if the QChar is a non-character; false otherwise.
1002 
1003     Unicode has a certain number of code points that are classified
1004     as "non-characters:" that is, they can be used for internal purposes
1005     in applications but cannot be used for text interchange.
    Those are the last two entries of each Unicode Plane ([0xfffe..0xffff],
1007     [0x1fffe..0x1ffff], etc.) as well as the entries in range [0xfdd0..0xfdef].
1008 */
1009 
1010 /*!
1011     \fn bool QChar::isHighSurrogate() const
1012 
1013     Returns \c true if the QChar is the high part of a UTF16 surrogate
1014     (for example if its code point is in range [0xd800..0xdbff]); false otherwise.
1015 */
1016 
1017 /*!
1018     \fn bool QChar::isLowSurrogate() const
1019 
1020     Returns \c true if the QChar is the low part of a UTF16 surrogate
1021     (for example if its code point is in range [0xdc00..0xdfff]); false otherwise.
1022 */
1023 
1024 /*!
1025     \fn bool QChar::isSurrogate() const
1026     \since 5.0
1027 
1028     Returns \c true if the QChar contains a code point that is in either
1029     the high or the low part of the UTF-16 surrogate range
1030     (for example if its code point is in range [0xd800..0xdfff]); false otherwise.
1031 */
1032 
1033 /*!
1034     \fn static bool QChar::isNonCharacter(uint ucs4)
1035     \overload
1036     \since 5.0
1037 
1038     Returns \c true if the UCS-4-encoded character specified by \a ucs4
1039     is a non-character; false otherwise.
1040 
1041     Unicode has a certain number of code points that are classified
1042     as "non-characters:" that is, they can be used for internal purposes
1043     in applications but cannot be used for text interchange.
    Those are the last two entries of each Unicode Plane ([0xfffe..0xffff],
1045     [0x1fffe..0x1ffff], etc.) as well as the entries in range [0xfdd0..0xfdef].
1046 */
1047 
1048 /*!
1049     \fn static bool QChar::isHighSurrogate(uint ucs4)
1050     \overload
1051 
1052     Returns \c true if the UCS-4-encoded character specified by \a ucs4
1053     is the high part of a UTF16 surrogate
1054     (for example if its code point is in range [0xd800..0xdbff]); false otherwise.
1055 */
1056 
1057 /*!
1058     \fn static bool QChar::isLowSurrogate(uint ucs4)
1059     \overload
1060 
1061     Returns \c true if the UCS-4-encoded character specified by \a ucs4
1062     is the low part of a UTF16 surrogate
1063     (for example if its code point is in range [0xdc00..0xdfff]); false otherwise.
1064 */
1065 
1066 /*!
1067     \fn static bool QChar::isSurrogate(uint ucs4)
1068     \overload
1069     \since 5.0
1070 
1071     Returns \c true if the UCS-4-encoded character specified by \a ucs4
1072     contains a code point that is in either the high or the low part of the
1073     UTF-16 surrogate range (for example if its code point is in range [0xd800..0xdfff]);
1074     false otherwise.
1075 */
1076 
1077 /*!
1078     \fn static bool QChar::requiresSurrogates(uint ucs4)
1079 
1080     Returns \c true if the UCS-4-encoded character specified by \a ucs4
1081     can be split into the high and low parts of a UTF16 surrogate
    (for example if its code point is greater than or equal to 0x10000);
1083     false otherwise.
1084 */
1085 
1086 /*!
1087     \fn static uint QChar::surrogateToUcs4(ushort high, ushort low)
1088 
1089     Converts a UTF16 surrogate pair with the given \a high and \a low values
    to its UCS-4-encoded code point.
1091 */
1092 
1093 /*!
1094     \fn static uint QChar::surrogateToUcs4(QChar high, QChar low)
1095     \overload
1096 
    Converts a UTF16 surrogate pair (\a high, \a low) to its UCS-4-encoded code point.
1098 */
1099 
1100 /*!
1101     \fn static ushort QChar::highSurrogate(uint ucs4)
1102 
1103     Returns the high surrogate part of a UCS-4-encoded code point.
1104     The returned result is undefined if \a ucs4 is smaller than 0x10000.
1105 */
1106 
1107 /*!
1108     \fn static ushort QChar::lowSurrogate(uint ucs4)
1109 
1110     Returns the low surrogate part of a UCS-4-encoded code point.
1111     The returned result is undefined if \a ucs4 is smaller than 0x10000.
1112 */
1113 
1114 /*!
1115     \fn int QChar::digitValue() const
1116 
1117     Returns the numeric value of the digit, or -1 if the character is not a digit.
1118 */
1119 
1120 /*!
1121     \overload
1122     Returns the numeric value of the digit specified by the UCS-4-encoded
1123     character, \a ucs4, or -1 if the character is not a digit.
1124 */
digitValue(uint ucs4)1125 int QChar::digitValue(uint ucs4) noexcept
1126 {
1127     if (ucs4 > LastValidCodePoint)
1128         return -1;
1129     return qGetProp(ucs4)->digitValue;
1130 }
1131 
1132 /*!
1133     \fn QChar::Category QChar::category() const
1134 
1135     Returns the character's category.
1136 */
1137 
1138 /*!
1139     \overload
1140     Returns the category of the UCS-4-encoded character specified by \a ucs4.
1141 */
category(uint ucs4)1142 QChar::Category QChar::category(uint ucs4) noexcept
1143 {
1144     if (ucs4 > LastValidCodePoint)
1145         return QChar::Other_NotAssigned;
1146     return (QChar::Category) qGetProp(ucs4)->category;
1147 }
1148 
1149 /*!
1150     \fn QChar::Direction QChar::direction() const
1151 
1152     Returns the character's direction.
1153 */
1154 
1155 /*!
1156     \overload
1157     Returns the direction of the UCS-4-encoded character specified by \a ucs4.
1158 */
direction(uint ucs4)1159 QChar::Direction QChar::direction(uint ucs4) noexcept
1160 {
1161     if (ucs4 > LastValidCodePoint)
1162         return QChar::DirL;
1163     return (QChar::Direction) qGetProp(ucs4)->direction;
1164 }
1165 
1166 /*!
1167     \fn QChar::JoiningType QChar::joiningType() const
1168     \since 5.3
1169 
1170     Returns information about the joining type attributes of the character
1171     (needed for certain languages such as Arabic or Syriac).
1172 */
1173 
1174 /*!
1175     \overload
1176     \since 5.3
1177 
1178     Returns information about the joining type attributes of the UCS-4-encoded
1179     character specified by \a ucs4
1180     (needed for certain languages such as Arabic or Syriac).
1181 */
joiningType(uint ucs4)1182 QChar::JoiningType QChar::joiningType(uint ucs4) noexcept
1183 {
1184     if (ucs4 > LastValidCodePoint)
1185         return QChar::Joining_None;
1186     return QChar::JoiningType(qGetProp(ucs4)->joining);
1187 }
1188 
1189 #if QT_DEPRECATED_SINCE(5, 3)
1190 /*!
1191     \fn QChar::Joining QChar::joining() const
1192     \deprecated in 5.3, use joiningType() instead.
1193 
1194     Returns information about the joining properties of the character
1195     (needed for certain languages such as Arabic).
1196 */
1197 
1198 /*!
1199     \overload
1200     \deprecated in 5.3, use joiningType() instead.
1201 
1202     Returns information about the joining properties of the UCS-4-encoded
1203     character specified by \a ucs4 (needed for certain languages such as Arabic).
1204 */
joining(uint ucs4)1205 QChar::Joining QChar::joining(uint ucs4) noexcept
1206 {
1207     if (ucs4 > LastValidCodePoint)
1208         return QChar::OtherJoining;
1209     switch (qGetProp(ucs4)->joining) {
1210     case QChar::Joining_Causing: return QChar::Center;
1211     case QChar::Joining_Dual: return QChar::Dual;
1212     case QChar::Joining_Right: return QChar::Right;
1213     default: break;
1214     }
1215     return QChar::OtherJoining;
1216 }
1217 #endif
1218 
1219 /*!
1220     \fn bool QChar::hasMirrored() const
1221 
1222     Returns \c true if the character should be reversed if the text
1223     direction is reversed; otherwise returns \c false.
1224 
1225     A bit faster equivalent of (ch.mirroredChar() != ch).
1226 
1227     \sa mirroredChar()
1228 */
1229 
1230 /*!
1231     \overload
1232     \since 5.0
1233 
1234     Returns \c true if the UCS-4-encoded character specified by \a ucs4
1235     should be reversed if the text direction is reversed; otherwise returns \c false.
1236 
1237     A bit faster equivalent of (QChar::mirroredChar(ucs4) != ucs4).
1238 
1239     \sa mirroredChar()
1240 */
hasMirrored(uint ucs4)1241 bool QChar::hasMirrored(uint ucs4) noexcept
1242 {
1243     if (ucs4 > LastValidCodePoint)
1244         return false;
1245     return qGetProp(ucs4)->mirrorDiff != 0;
1246 }
1247 
1248 /*!
1249     \fn bool QChar::isLower() const
1250 
1251     Returns \c true if the character is a lowercase letter, for example
1252     category() is Letter_Lowercase.
1253 
1254     \sa isUpper(), toLower(), toUpper()
1255 */
1256 
1257 /*!
1258     \fn static bool QChar::isLower(uint ucs4)
1259     \overload
1260     \since 5.0
1261 
1262     Returns \c true if the UCS-4-encoded character specified by \a ucs4
1263     is a lowercase letter, for example category() is Letter_Lowercase.
1264 
1265     \sa isUpper(), toLower(), toUpper()
1266 */
1267 
1268 /*!
1269     \fn bool QChar::isUpper() const
1270 
1271     Returns \c true if the character is an uppercase letter, for example
1272     category() is Letter_Uppercase.
1273 
1274     \sa isLower(), toUpper(), toLower()
1275 */
1276 
1277 /*!
1278     \fn static bool QChar::isUpper(uint ucs4)
1279     \overload
1280     \since 5.0
1281 
1282     Returns \c true if the UCS-4-encoded character specified by \a ucs4
1283     is an uppercase letter, for example category() is Letter_Uppercase.
1284 
1285     \sa isLower(), toUpper(), toLower()
1286 */
1287 
1288 /*!
1289     \fn bool QChar::isTitleCase() const
1290 
1291     Returns \c true if the character is a titlecase letter, for example
1292     category() is Letter_Titlecase.
1293 
1294     \sa isLower(), toUpper(), toLower(), toTitleCase()
1295 */
1296 
1297 /*!
1298     \fn static bool QChar::isTitleCase(uint ucs4)
1299     \overload
1300     \since 5.0
1301 
1302     Returns \c true if the UCS-4-encoded character specified by \a ucs4
1303     is a titlecase letter, for example category() is Letter_Titlecase.
1304 
1305     \sa isLower(), toUpper(), toLower(), toTitleCase()
1306 */
1307 /*!
1308     \fn QChar QChar::mirroredChar() const
1309 
1310     Returns the mirrored character if this character is a mirrored
1311     character; otherwise returns the character itself.
1312 
1313     \sa hasMirrored()
1314 */
1315 
1316 /*!
1317     \overload
1318     Returns the mirrored character if the UCS-4-encoded character specified
1319     by \a ucs4 is a mirrored character; otherwise returns the character itself.
1320 
1321     \sa hasMirrored()
1322 */
mirroredChar(uint ucs4)1323 uint QChar::mirroredChar(uint ucs4) noexcept
1324 {
1325     if (ucs4 > LastValidCodePoint)
1326         return ucs4;
1327     return ucs4 + qGetProp(ucs4)->mirrorDiff;
1328 }
1329 
1330 
1331 // constants for Hangul (de)composition, see UAX #15
enum {
    // number of L, V and T values used by the arithmetic (de)composition
    Hangul_LCount = 19,
    Hangul_VCount = 21,
    Hangul_TCount = 28,
    // syllables per L value, and total number of precomposed syllables
    Hangul_NCount = Hangul_VCount * Hangul_TCount,
    Hangul_SCount = Hangul_LCount * Hangul_NCount,

    // first code point of the syllable block and bases of the L, V, T ranges
    Hangul_SBase = 0xAC00,
    Hangul_LBase = 0x1100,
    Hangul_VBase = 0x1161,
    Hangul_TBase = 0x11A7
};
1343 
1344 // buffer has to have a length of 3. It's needed for Hangul decomposition
decompositionHelper(uint ucs4,int * length,int * tag,unsigned short * buffer)1345 static const unsigned short * QT_FASTCALL decompositionHelper
1346     (uint ucs4, int *length, int *tag, unsigned short *buffer)
1347 {
1348     if (ucs4 >= Hangul_SBase && ucs4 < Hangul_SBase + Hangul_SCount) {
1349         // compute Hangul syllable decomposition as per UAX #15
1350         const uint SIndex = ucs4 - Hangul_SBase;
1351         buffer[0] = Hangul_LBase + SIndex / Hangul_NCount; // L
1352         buffer[1] = Hangul_VBase + (SIndex % Hangul_NCount) / Hangul_TCount; // V
1353         buffer[2] = Hangul_TBase + SIndex % Hangul_TCount; // T
1354         *length = buffer[2] == Hangul_TBase ? 2 : 3;
1355         *tag = QChar::Canonical;
1356         return buffer;
1357     }
1358 
1359     const unsigned short index = GET_DECOMPOSITION_INDEX(ucs4);
1360     if (index == 0xffff) {
1361         *length = 0;
1362         *tag = QChar::NoDecomposition;
1363         return nullptr;
1364     }
1365 
1366     const unsigned short *decomposition = uc_decomposition_map+index;
1367     *tag = (*decomposition) & 0xff;
1368     *length = (*decomposition) >> 8;
1369     return decomposition+1;
1370 }
1371 
1372 /*!
    Decomposes a character into its constituent parts. Returns an empty string
1374     if no decomposition exists.
1375 */
decomposition() const1376 QString QChar::decomposition() const
1377 {
1378     return QChar::decomposition(ucs);
1379 }
1380 
1381 /*!
1382     \overload
    Decomposes the UCS-4-encoded character specified by \a ucs4 into its
1384     constituent parts. Returns an empty string if no decomposition exists.
1385 */
decomposition(uint ucs4)1386 QString QChar::decomposition(uint ucs4)
1387 {
1388     unsigned short buffer[3];
1389     int length;
1390     int tag;
1391     const unsigned short *d = decompositionHelper(ucs4, &length, &tag, buffer);
1392     return QString(reinterpret_cast<const QChar *>(d), length);
1393 }
1394 
1395 /*!
1396     \fn QChar::Decomposition QChar::decompositionTag() const
1397 
    Returns the tag defining the decomposition of the character. Returns
1399     QChar::NoDecomposition if no decomposition exists.
1400 */
1401 
1402 /*!
1403     \overload
    Returns the tag defining the decomposition of the UCS-4-encoded character
1405     specified by \a ucs4. Returns QChar::NoDecomposition if no decomposition exists.
1406 */
decompositionTag(uint ucs4)1407 QChar::Decomposition QChar::decompositionTag(uint ucs4) noexcept
1408 {
1409     if (ucs4 >= Hangul_SBase && ucs4 < Hangul_SBase + Hangul_SCount)
1410         return QChar::Canonical;
1411     const unsigned short index = GET_DECOMPOSITION_INDEX(ucs4);
1412     if (index == 0xffff)
1413         return QChar::NoDecomposition;
1414     return (QChar::Decomposition)(uc_decomposition_map[index] & 0xff);
1415 }
1416 
1417 /*!
1418     \fn unsigned char QChar::combiningClass() const
1419 
1420     Returns the combining class for the character as defined in the
1421     Unicode standard. This is mainly useful as a positioning hint for
1422     marks attached to a base character.
1423 
1424     The Qt text rendering engine uses this information to correctly
1425     position non-spacing marks around a base character.
1426 */
1427 
1428 /*!
1429     \overload
1430     Returns the combining class for the UCS-4-encoded character specified by
1431     \a ucs4, as defined in the Unicode standard.
1432 */
combiningClass(uint ucs4)1433 unsigned char QChar::combiningClass(uint ucs4) noexcept
1434 {
1435     if (ucs4 > LastValidCodePoint)
1436         return 0;
1437     return (unsigned char) qGetProp(ucs4)->combiningClass;
1438 }
1439 
1440 /*!
1441     \fn QChar::Script QChar::script() const
1442     \since 5.1
1443 
1444     Returns the Unicode script property value for this character.
1445 */
1446 
1447 /*!
1448     \overload
1449     \since 5.1
1450 
1451     Returns the Unicode script property value for the character specified in
1452     its UCS-4-encoded form as \a ucs4.
1453 */
script(uint ucs4)1454 QChar::Script QChar::script(uint ucs4) noexcept
1455 {
1456     if (ucs4 > LastValidCodePoint)
1457         return QChar::Script_Unknown;
1458     return (QChar::Script) qGetProp(ucs4)->script;
1459 }
1460 
1461 /*!
1462     \fn QChar::UnicodeVersion QChar::unicodeVersion() const
1463 
1464     Returns the Unicode version that introduced this character.
1465 */
1466 
1467 /*!
1468     \overload
1469     Returns the Unicode version that introduced the character specified in
1470     its UCS-4-encoded form as \a ucs4.
1471 */
unicodeVersion(uint ucs4)1472 QChar::UnicodeVersion QChar::unicodeVersion(uint ucs4) noexcept
1473 {
1474     if (ucs4 > LastValidCodePoint)
1475         return QChar::Unicode_Unassigned;
1476     return (QChar::UnicodeVersion) qGetProp(ucs4)->unicodeVersion;
1477 }
1478 
1479 /*!
1480     Returns the most recent supported Unicode version.
1481 */
currentUnicodeVersion()1482 QChar::UnicodeVersion QChar::currentUnicodeVersion() noexcept
1483 {
1484     return UNICODE_DATA_VERSION;
1485 }
1486 
1487 
1488 template <typename T>
convertCase_helper(T uc,QUnicodeTables::Case which)1489 Q_DECL_CONST_FUNCTION static inline T convertCase_helper(T uc, QUnicodeTables::Case which) noexcept
1490 {
1491     const auto fold = qGetProp(uc)->cases[which];
1492 
1493     if (Q_UNLIKELY(fold.special)) {
1494         const ushort *specialCase = specialCaseMap + fold.diff;
1495         // so far, there are no special cases beyond BMP (guaranteed by the qunicodetables generator)
1496         return *specialCase == 1 ? specialCase[1] : uc;
1497     }
1498 
1499     return uc + fold.diff;
1500 }
1501 
1502 /*!
1503     \fn QChar QChar::toLower() const
1504 
1505     Returns the lowercase equivalent if the character is uppercase or titlecase;
1506     otherwise returns the character itself.
1507 */
1508 
1509 /*!
1510     \overload
1511     Returns the lowercase equivalent of the UCS-4-encoded character specified
1512     by \a ucs4 if the character is uppercase or titlecase; otherwise returns
1513     the character itself.
1514 */
toLower(uint ucs4)1515 uint QChar::toLower(uint ucs4) noexcept
1516 {
1517     if (ucs4 > LastValidCodePoint)
1518         return ucs4;
1519     return convertCase_helper(ucs4, QUnicodeTables::LowerCase);
1520 }
1521 
1522 /*!
1523     \fn QChar QChar::toUpper() const
1524 
1525     Returns the uppercase equivalent if the character is lowercase or titlecase;
1526     otherwise returns the character itself.
1527 */
1528 
1529 /*!
1530     \overload
1531     Returns the uppercase equivalent of the UCS-4-encoded character specified
1532     by \a ucs4 if the character is lowercase or titlecase; otherwise returns
1533     the character itself.
1534 */
toUpper(uint ucs4)1535 uint QChar::toUpper(uint ucs4) noexcept
1536 {
1537     if (ucs4 > LastValidCodePoint)
1538         return ucs4;
1539     return convertCase_helper(ucs4, QUnicodeTables::UpperCase);
1540 }
1541 
1542 /*!
1543     \fn QChar QChar::toTitleCase() const
1544 
1545     Returns the title case equivalent if the character is lowercase or uppercase;
1546     otherwise returns the character itself.
1547 */
1548 
1549 /*!
1550     \overload
1551     Returns the title case equivalent of the UCS-4-encoded character specified
1552     by \a ucs4 if the character is lowercase or uppercase; otherwise returns
1553     the character itself.
1554 */
toTitleCase(uint ucs4)1555 uint QChar::toTitleCase(uint ucs4) noexcept
1556 {
1557     if (ucs4 > LastValidCodePoint)
1558         return ucs4;
1559     return convertCase_helper(ucs4, QUnicodeTables::TitleCase);
1560 }
1561 
foldCase(const ushort * ch,const ushort * start)1562 static inline uint foldCase(const ushort *ch, const ushort *start)
1563 {
1564     uint ucs4 = *ch;
1565     if (QChar::isLowSurrogate(ucs4) && ch > start && QChar::isHighSurrogate(*(ch - 1)))
1566         ucs4 = QChar::surrogateToUcs4(*(ch - 1), ucs4);
1567     return convertCase_helper(ucs4, QUnicodeTables::CaseFold);
1568 }
1569 
foldCase(uint ch,uint & last)1570 static inline uint foldCase(uint ch, uint &last) noexcept
1571 {
1572     uint ucs4 = ch;
1573     if (QChar::isLowSurrogate(ucs4) && QChar::isHighSurrogate(last))
1574         ucs4 = QChar::surrogateToUcs4(last, ucs4);
1575     last = ch;
1576     return convertCase_helper(ucs4, QUnicodeTables::CaseFold);
1577 }
1578 
foldCase(ushort ch)1579 static inline ushort foldCase(ushort ch) noexcept
1580 {
1581     return convertCase_helper(ch, QUnicodeTables::CaseFold);
1582 }
1583 
foldCase(QChar ch)1584 static inline QChar foldCase(QChar ch) noexcept
1585 {
1586     return QChar(foldCase(ch.unicode()));
1587 }
1588 
1589 /*!
1590     \fn QChar QChar::toCaseFolded() const
1591 
1592     Returns the case folded equivalent of the character.
1593     For most Unicode characters this is the same as toLower().
1594 */
1595 
1596 /*!
1597     \overload
1598     Returns the case folded equivalent of the UCS-4-encoded character specified
1599     by \a ucs4. For most Unicode characters this is the same as toLower().
1600 */
toCaseFolded(uint ucs4)1601 uint QChar::toCaseFolded(uint ucs4) noexcept
1602 {
1603     if (ucs4 > LastValidCodePoint)
1604         return ucs4;
1605     return convertCase_helper(ucs4, QUnicodeTables::CaseFold);
1606 }
1607 
1608 /*!
1609     \fn char QChar::toLatin1() const
1610 
1611     Returns the Latin-1 character equivalent to the QChar, or 0. This
1612     is mainly useful for non-internationalized software.
1613 
1614     \note It is not possible to distinguish a non-Latin-1 character from a Latin-1 0
1615     (NUL) character. Prefer to use unicode(), which does not have this ambiguity.
1616 
1617     \sa unicode()
1618 */
1619 
1620 /*!
    \fn QChar QChar::fromLatin1(char c)
1622 
1623     Converts the Latin-1 character \a c to its equivalent QChar. This
1624     is mainly useful for non-internationalized software.
1625 
1626     An alternative is to use QLatin1Char.
1627 
1628     \sa toLatin1(), unicode()
1629 */
1630 
1631 /*!
1632     \fn char QChar::toAscii() const
1633     \deprecated
1634 
1635     Returns the Latin-1 character value of the QChar, or 0 if the character is not
1636     representable.
1637 
1638     The main purpose of this function is to preserve ASCII characters used
1639     in C strings. This is mainly useful for developers of non-internationalized
1640     software.
1641 
1642     \note It is not possible to distinguish a non-Latin 1 character from an ASCII 0
1643     (NUL) character. Prefer to use unicode(), which does not have this ambiguity.
1644 
1645     \note This function does not check whether the character value is inside
1646     the valid range of US-ASCII.
1647 
1648     \sa toLatin1(), unicode()
1649 */
1650 
1651 /*!
    \fn QChar QChar::fromAscii(char c)
1653     \deprecated
1654 
    Converts the ASCII character \a c to its equivalent QChar. This
1656     is mainly useful for non-internationalized software.
1657 
1658     An alternative is to use QLatin1Char.
1659 
1660     \sa fromLatin1(), unicode()
1661 */
1662 
1663 #ifndef QT_NO_DATASTREAM
1664 /*!
1665     \relates QChar
1666 
1667     Writes the char \a chr to the stream \a out.
1668 
1669     \sa {Serializing Qt Data Types}
1670 */
operator <<(QDataStream & out,QChar chr)1671 QDataStream &operator<<(QDataStream &out, QChar chr)
1672 {
1673     out << quint16(chr.unicode());
1674     return out;
1675 }
1676 
1677 /*!
1678     \relates QChar
1679 
1680     Reads a char from the stream \a in into char \a chr.
1681 
1682     \sa {Serializing Qt Data Types}
1683 */
operator >>(QDataStream & in,QChar & chr)1684 QDataStream &operator>>(QDataStream &in, QChar &chr)
1685 {
1686     quint16 u;
1687     in >> u;
1688     chr.unicode() = ushort(u);
1689     return in;
1690 }
1691 #endif // QT_NO_DATASTREAM
1692 
1693 /*!
1694     \fn ushort & QChar::unicode()
1695 
1696     Returns a reference to the numeric Unicode value of the QChar.
1697 */
1698 
1699 /*!
1700     \fn ushort QChar::unicode() const
1701 
1702     Returns the numeric Unicode value of the QChar.
1703 */
1704 
1705 /*****************************************************************************
1706   Documentation of QChar related functions
1707  *****************************************************************************/
1708 
1709 /*!
1710     \fn bool operator==(QChar c1, QChar c2)
1711 
1712     \relates QChar
1713 
1714     Returns \c true if \a c1 and \a c2 are the same Unicode character;
1715     otherwise returns \c false.
1716 */
1717 
1718 /*!
1719     \fn int operator!=(QChar c1, QChar c2)
1720 
1721     \relates QChar
1722 
1723     Returns \c true if \a c1 and \a c2 are not the same Unicode
1724     character; otherwise returns \c false.
1725 */
1726 
1727 /*!
1728     \fn int operator<=(QChar c1, QChar c2)
1729 
1730     \relates QChar
1731 
1732     Returns \c true if the numeric Unicode value of \a c1 is less than
1733     or equal to that of \a c2; otherwise returns \c false.
1734 */
1735 
1736 /*!
1737     \fn int operator>=(QChar c1, QChar c2)
1738 
1739     \relates QChar
1740 
1741     Returns \c true if the numeric Unicode value of \a c1 is greater than
1742     or equal to that of \a c2; otherwise returns \c false.
1743 */
1744 
1745 /*!
1746     \fn int operator<(QChar c1, QChar c2)
1747 
1748     \relates QChar
1749 
1750     Returns \c true if the numeric Unicode value of \a c1 is less than
1751     that of \a c2; otherwise returns \c false.
1752 */
1753 
1754 /*!
1755     \fn int operator>(QChar c1, QChar c2)
1756 
1757     \relates QChar
1758 
1759     Returns \c true if the numeric Unicode value of \a c1 is greater than
1760     that of \a c2; otherwise returns \c false.
1761 */
1762 
1763 
1764 // ---------------------------------------------------------------------------
1765 
1766 
decomposeHelper(QString * str,bool canonical,QChar::UnicodeVersion version,int from)1767 static void decomposeHelper(QString *str, bool canonical, QChar::UnicodeVersion version, int from)
1768 {
1769     int length;
1770     int tag;
1771     unsigned short buffer[3];
1772 
1773     QString &s = *str;
1774 
1775     const unsigned short *utf16 = reinterpret_cast<unsigned short *>(s.data());
1776     const unsigned short *uc = utf16 + s.length();
1777     while (uc != utf16 + from) {
1778         uint ucs4 = *(--uc);
1779         if (QChar(ucs4).isLowSurrogate() && uc != utf16) {
1780             ushort high = *(uc - 1);
1781             if (QChar(high).isHighSurrogate()) {
1782                 --uc;
1783                 ucs4 = QChar::surrogateToUcs4(high, ucs4);
1784             }
1785         }
1786 
1787         if (QChar::unicodeVersion(ucs4) > version)
1788             continue;
1789 
1790         const unsigned short *d = decompositionHelper(ucs4, &length, &tag, buffer);
1791         if (!d || (canonical && tag != QChar::Canonical))
1792             continue;
1793 
1794         int pos = uc - utf16;
1795         s.replace(pos, QChar::requiresSurrogates(ucs4) ? 2 : 1, reinterpret_cast<const QChar *>(d), length);
1796         // since the replace invalidates the pointers and we do decomposition recursive
1797         utf16 = reinterpret_cast<unsigned short *>(s.data());
1798         uc = utf16 + pos + length;
1799     }
1800 }
1801 
1802 
1803 struct UCS2Pair {
1804     ushort u1;
1805     ushort u2;
1806 };
1807 
operator <(const UCS2Pair & ligature1,const UCS2Pair & ligature2)1808 inline bool operator<(const UCS2Pair &ligature1, const UCS2Pair &ligature2)
1809 { return ligature1.u1 < ligature2.u1; }
operator <(ushort u1,const UCS2Pair & ligature)1810 inline bool operator<(ushort u1, const UCS2Pair &ligature)
1811 { return u1 < ligature.u1; }
operator <(const UCS2Pair & ligature,ushort u1)1812 inline bool operator<(const UCS2Pair &ligature, ushort u1)
1813 { return ligature.u1 < u1; }
1814 
1815 struct UCS2SurrogatePair {
1816     UCS2Pair p1;
1817     UCS2Pair p2;
1818 };
1819 
operator <(const UCS2SurrogatePair & ligature1,const UCS2SurrogatePair & ligature2)1820 inline bool operator<(const UCS2SurrogatePair &ligature1, const UCS2SurrogatePair &ligature2)
1821 { return QChar::surrogateToUcs4(ligature1.p1.u1, ligature1.p1.u2) < QChar::surrogateToUcs4(ligature2.p1.u1, ligature2.p1.u2); }
operator <(uint u1,const UCS2SurrogatePair & ligature)1822 inline bool operator<(uint u1, const UCS2SurrogatePair &ligature)
1823 { return u1 < QChar::surrogateToUcs4(ligature.p1.u1, ligature.p1.u2); }
operator <(const UCS2SurrogatePair & ligature,uint u1)1824 inline bool operator<(const UCS2SurrogatePair &ligature, uint u1)
1825 { return QChar::surrogateToUcs4(ligature.p1.u1, ligature.p1.u2) < u1; }
1826 
ligatureHelper(uint u1,uint u2)1827 static uint inline ligatureHelper(uint u1, uint u2)
1828 {
1829     if (u1 >= Hangul_LBase && u1 <= Hangul_SBase + Hangul_SCount) {
1830         // compute Hangul syllable composition as per UAX #15
1831         // hangul L-V pair
1832         const uint LIndex = u1 - Hangul_LBase;
1833         if (LIndex < Hangul_LCount) {
1834             const uint VIndex = u2 - Hangul_VBase;
1835             if (VIndex < Hangul_VCount)
1836                 return Hangul_SBase + (LIndex * Hangul_VCount + VIndex) * Hangul_TCount;
1837         }
1838         // hangul LV-T pair
1839         const uint SIndex = u1 - Hangul_SBase;
1840         if (SIndex < Hangul_SCount && (SIndex % Hangul_TCount) == 0) {
1841             const uint TIndex = u2 - Hangul_TBase;
1842             if (TIndex <= Hangul_TCount)
1843                 return u1 + TIndex;
1844         }
1845     }
1846 
1847     const unsigned short index = GET_LIGATURE_INDEX(u2);
1848     if (index == 0xffff)
1849         return 0;
1850     const unsigned short *ligatures = uc_ligature_map+index;
1851     ushort length = *ligatures++;
1852     if (QChar::requiresSurrogates(u1)) {
1853         const UCS2SurrogatePair *data = reinterpret_cast<const UCS2SurrogatePair *>(ligatures);
1854         const UCS2SurrogatePair *r = std::lower_bound(data, data + length, u1);
1855         if (r != data + length && QChar::surrogateToUcs4(r->p1.u1, r->p1.u2) == u1)
1856             return QChar::surrogateToUcs4(r->p2.u1, r->p2.u2);
1857     } else {
1858         const UCS2Pair *data = reinterpret_cast<const UCS2Pair *>(ligatures);
1859         const UCS2Pair *r = std::lower_bound(data, data + length, ushort(u1));
1860         if (r != data + length && r->u1 == ushort(u1))
1861             return r->u2;
1862     }
1863 
1864     return 0;
1865 }
1866 
composeHelper(QString * str,QChar::UnicodeVersion version,int from)1867 static void composeHelper(QString *str, QChar::UnicodeVersion version, int from)
1868 {
1869     QString &s = *str;
1870 
1871     if (from < 0 || s.length() - from < 2)
1872         return;
1873 
1874     uint stcode = 0; // starter code point
1875     int starter = -1; // starter position
1876     int next = -1; // to prevent i == next
1877     int lastCombining = 255; // to prevent combining > lastCombining
1878 
1879     int pos = from;
1880     while (pos < s.length()) {
1881         int i = pos;
1882         uint uc = s.at(pos).unicode();
1883         if (QChar(uc).isHighSurrogate() && pos < s.length()-1) {
1884             ushort low = s.at(pos+1).unicode();
1885             if (QChar(low).isLowSurrogate()) {
1886                 uc = QChar::surrogateToUcs4(uc, low);
1887                 ++pos;
1888             }
1889         }
1890 
1891         const QUnicodeTables::Properties *p = qGetProp(uc);
1892         if (p->unicodeVersion > version) {
1893             starter = -1;
1894             next = -1; // to prevent i == next
1895             lastCombining = 255; // to prevent combining > lastCombining
1896             ++pos;
1897             continue;
1898         }
1899 
1900         int combining = p->combiningClass;
1901         if ((i == next || combining > lastCombining) && starter >= from) {
1902             // allowed to form ligature with S
1903             uint ligature = ligatureHelper(stcode, uc);
1904             if (ligature) {
1905                 stcode = ligature;
1906                 QChar *d = s.data();
1907                 // ligatureHelper() never changes planes
1908                 if (QChar::requiresSurrogates(ligature)) {
1909                     d[starter] = QChar(QChar::highSurrogate(ligature));
1910                     d[starter + 1] = QChar(QChar::lowSurrogate(ligature));
1911                     s.remove(i, 2);
1912                 } else {
1913                     d[starter] = QChar(ligature);
1914                     s.remove(i, 1);
1915                 }
1916                 continue;
1917             }
1918         }
1919         if (combining == 0) {
1920             starter = i;
1921             stcode = uc;
1922             next = pos + 1;
1923         }
1924         lastCombining = combining;
1925 
1926         ++pos;
1927     }
1928 }
1929 
1930 
canonicalOrderHelper(QString * str,QChar::UnicodeVersion version,int from)1931 static void canonicalOrderHelper(QString *str, QChar::UnicodeVersion version, int from)
1932 {
1933     QString &s = *str;
1934     const int l = s.length()-1;
1935 
1936     uint u1, u2;
1937     ushort c1, c2;
1938 
1939     int pos = from;
1940     while (pos < l) {
1941         int p2 = pos+1;
1942         u1 = s.at(pos).unicode();
1943         if (QChar(u1).isHighSurrogate()) {
1944             ushort low = s.at(p2).unicode();
1945             if (QChar(low).isLowSurrogate()) {
1946                 u1 = QChar::surrogateToUcs4(u1, low);
1947                 if (p2 >= l)
1948                     break;
1949                 ++p2;
1950             }
1951         }
1952         c1 = 0;
1953 
1954     advance:
1955         u2 = s.at(p2).unicode();
1956         if (QChar(u2).isHighSurrogate() && p2 < l) {
1957             ushort low = s.at(p2+1).unicode();
1958             if (QChar(low).isLowSurrogate()) {
1959                 u2 = QChar::surrogateToUcs4(u2, low);
1960                 ++p2;
1961             }
1962         }
1963 
1964         c2 = 0;
1965         {
1966             const QUnicodeTables::Properties *p = qGetProp(u2);
1967             if (p->unicodeVersion <= version)
1968                 c2 = p->combiningClass;
1969         }
1970         if (c2 == 0) {
1971             pos = p2+1;
1972             continue;
1973         }
1974 
1975         if (c1 == 0) {
1976             const QUnicodeTables::Properties *p = qGetProp(u1);
1977             if (p->unicodeVersion <= version)
1978                 c1 = p->combiningClass;
1979         }
1980 
1981         if (c1 > c2) {
1982             QChar *uc = s.data();
1983             int p = pos;
1984             // exchange characters
1985             if (!QChar::requiresSurrogates(u2)) {
1986                 uc[p++] = QChar(u2);
1987             } else {
1988                 uc[p++] = QChar(QChar::highSurrogate(u2));
1989                 uc[p++] = QChar(QChar::lowSurrogate(u2));
1990             }
1991             if (!QChar::requiresSurrogates(u1)) {
1992                 uc[p++] = QChar(u1);
1993             } else {
1994                 uc[p++] = QChar(QChar::highSurrogate(u1));
1995                 uc[p++] = QChar(QChar::lowSurrogate(u1));
1996             }
1997             if (pos > 0)
1998                 --pos;
1999             if (pos > 0 && s.at(pos).isLowSurrogate())
2000                 --pos;
2001         } else {
2002             ++pos;
2003             if (QChar::requiresSurrogates(u1))
2004                 ++pos;
2005 
2006             u1 = u2;
2007             c1 = c2; // != 0
2008             p2 = pos + 1;
2009             if (QChar::requiresSurrogates(u1))
2010                 ++p2;
2011             if (p2 > l)
2012                 break;
2013 
2014             goto advance;
2015         }
2016     }
2017 }
2018 
2019 // returns true if the text is in a desired Normalization Form already; false otherwise.
2020 // sets lastStable to the position of the last stable code point
normalizationQuickCheckHelper(QString * str,QString::NormalizationForm mode,int from,int * lastStable)2021 static bool normalizationQuickCheckHelper(QString *str, QString::NormalizationForm mode, int from, int *lastStable)
2022 {
2023     Q_STATIC_ASSERT(QString::NormalizationForm_D == 0);
2024     Q_STATIC_ASSERT(QString::NormalizationForm_C == 1);
2025     Q_STATIC_ASSERT(QString::NormalizationForm_KD == 2);
2026     Q_STATIC_ASSERT(QString::NormalizationForm_KC == 3);
2027 
2028     enum { NFQC_YES = 0, NFQC_NO = 1, NFQC_MAYBE = 3 };
2029 
2030     const ushort *string = reinterpret_cast<const ushort *>(str->constData());
2031     int length = str->length();
2032 
2033     // this avoids one out of bounds check in the loop
2034     while (length > from && QChar::isHighSurrogate(string[length - 1]))
2035         --length;
2036 
2037     uchar lastCombining = 0;
2038     for (int i = from; i < length; ++i) {
2039         int pos = i;
2040         uint uc = string[i];
2041         if (uc < 0x80) {
2042             // ASCII characters are stable code points
2043             lastCombining = 0;
2044             *lastStable = pos;
2045             continue;
2046         }
2047 
2048         if (QChar::isHighSurrogate(uc)) {
2049             ushort low = string[i + 1];
2050             if (!QChar::isLowSurrogate(low)) {
2051                 // treat surrogate like stable code point
2052                 lastCombining = 0;
2053                 *lastStable = pos;
2054                 continue;
2055             }
2056             ++i;
2057             uc = QChar::surrogateToUcs4(uc, low);
2058         }
2059 
2060         const QUnicodeTables::Properties *p = qGetProp(uc);
2061 
2062         if (p->combiningClass < lastCombining && p->combiningClass > 0)
2063             return false;
2064 
2065         const uchar check = (p->nfQuickCheck >> (mode << 1)) & 0x03;
2066         if (check != NFQC_YES)
2067             return false; // ### can we quick check NFQC_MAYBE ?
2068 
2069         lastCombining = p->combiningClass;
2070         if (lastCombining == 0)
2071             *lastStable = pos;
2072     }
2073 
2074     if (length != str->length()) // low surrogate parts at the end of text
2075         *lastStable = str->length() - 1;
2076 
2077     return true;
2078 }
2079 
2080 QT_END_NAMESPACE
2081