1 /****************************************************************************
2 **
3 ** Copyright (C) 2019 The Qt Company Ltd.
4 ** Contact: https://www.qt.io/licensing/
5 **
6 ** This file is part of the QtCore module of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** Commercial License Usage
10 ** Licensees holding valid commercial Qt licenses may use this file in
11 ** accordance with the commercial license agreement provided with the
12 ** Software or, alternatively, in accordance with the terms contained in
13 ** a written agreement between you and The Qt Company. For licensing terms
14 ** and conditions see https://www.qt.io/terms-conditions. For further
15 ** information use the contact form at https://www.qt.io/contact-us.
16 **
17 ** GNU Lesser General Public License Usage
18 ** Alternatively, this file may be used under the terms of the GNU Lesser
19 ** General Public License version 3 as published by the Free Software
20 ** Foundation and appearing in the file LICENSE.LGPL3 included in the
21 ** packaging of this file. Please review the following information to
22 ** ensure the GNU Lesser General Public License version 3 requirements
23 ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
24 **
25 ** GNU General Public License Usage
26 ** Alternatively, this file may be used under the terms of the GNU
27 ** General Public License version 2.0 or (at your option) the GNU General
28 ** Public license version 3 or any later version approved by the KDE Free
29 ** Qt Foundation. The licenses are as published by the Free Software
30 ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31 ** included in the packaging of this file. Please review the following
32 ** information to ensure the GNU General Public License requirements will
33 ** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34 ** https://www.gnu.org/licenses/gpl-3.0.html.
35 **
36 ** $QT_END_LICENSE$
37 **
38 ****************************************************************************/
39
40 // Don't define it while compiling this module, or USERS of Qt will
41 // not be able to link.
42 #ifdef QT_NO_CAST_FROM_ASCII
43 # undef QT_NO_CAST_FROM_ASCII
44 #endif
45 #ifdef QT_NO_CAST_TO_ASCII
46 # undef QT_NO_CAST_TO_ASCII
47 #endif
48 #include "qchar.h"
49
50 #include "qdatastream.h"
51
52 #include "qunicodetables_p.h"
53 #include "qunicodetables.cpp"
54
55 #include <algorithm>
56
57 QT_BEGIN_NAMESPACE
58
// Expands to an int in which only bit number `bit` is set. The classification
// functions below use it to turn a category value into a one-bit mask, so that
// membership in a whole group of categories can be checked with a single AND.
#define FLAG(bit) (1 << (bit))
60
61 /*!
62 \class QLatin1Char
63 \inmodule QtCore
64 \reentrant
65 \brief The QLatin1Char class provides an 8-bit ASCII/Latin-1 character.
66
67 \ingroup string-processing
68
This class is only useful for constructing a QChar from an 8-bit character.
70
71 \sa QChar, QLatin1String, QString
72 */
73
74 /*!
75 \fn const char QLatin1Char::toLatin1() const
76
77 Converts a Latin-1 character to an 8-bit ASCII representation of the character.
78 */
79
80 /*!
81 \fn const ushort QLatin1Char::unicode() const
82
Converts a Latin-1 character to a 16-bit-encoded Unicode representation
84 of the character.
85 */
86
87 /*!
88 \fn QLatin1Char::QLatin1Char(char c)
89
90 Constructs a Latin-1 character for \a c. This constructor should be
91 used when the encoding of the input character is known to be Latin-1.
92 */
93
94 /*!
95 \class QChar
96 \inmodule QtCore
97 \brief The QChar class provides a 16-bit Unicode character.
98
99 \ingroup string-processing
100 \reentrant
101
102 In Qt, Unicode characters are 16-bit entities without any markup
103 or structure. This class represents such an entity. It is
104 lightweight, so it can be used everywhere. Most compilers treat
105 it like an \c{unsigned short}.
106
107 QChar provides a full complement of testing/classification
108 functions, converting to and from other formats, converting from
109 composed to decomposed Unicode, and trying to compare and
110 case-convert if you ask it to.
111
112 The classification functions include functions like those in the
113 standard C++ header \<cctype\> (formerly \<ctype.h\>), but
114 operating on the full range of Unicode characters, not just for the ASCII
115 range. They all return true if the character is a certain type of character;
116 otherwise they return false. These classification functions are
117 isNull() (returns \c true if the character is '\\0'), isPrint()
118 (true if the character is any sort of printable character,
including whitespace), isPunct() (any sort of punctuation),
120 isMark() (Unicode Mark), isLetter() (a letter), isNumber() (any
121 sort of numeric character, not just 0-9), isLetterOrNumber(), and
122 isDigit() (decimal digits). All of these are wrappers around
category() which returns the Unicode-defined category of each
124 character. Some of these also calculate the derived properties
125 (for example isSpace() returns \c true if the character is of category
126 Separator_* or an exceptional code point from Other_Control category).
127
128 QChar also provides direction(), which indicates the "natural"
129 writing direction of this character. The joiningType() function
indicates how the character joins with its neighbors (needed
131 mostly for Arabic or Syriac) and finally hasMirrored(), which indicates
132 whether the character needs to be mirrored when it is printed in
its "unnatural" writing direction.
134
135 Composed Unicode characters (like \a ring) can be converted to
136 decomposed Unicode ("a" followed by "ring above") by using decomposition().
137
138 In Unicode, comparison is not necessarily possible and case
139 conversion is very difficult at best. Unicode, covering the
140 "entire" world, also includes most of the world's case and
141 sorting problems. operator==() and friends will do comparison
142 based purely on the numeric Unicode value (code point) of the
143 characters, and toUpper() and toLower() will do case changes when
144 the character has a well-defined uppercase/lowercase equivalent.
145 For locale-dependent comparisons, use QString::localeAwareCompare().
146
147 The conversion functions include unicode() (to a scalar),
148 toLatin1() (to scalar, but converts all non-Latin-1 characters to
149 0), row() (gives the Unicode row), cell() (gives the Unicode
150 cell), digitValue() (gives the integer value of any of the
151 numerous digit characters), and a host of constructors.
152
153 QChar provides constructors and cast operators that make it easy
154 to convert to and from traditional 8-bit \c{char}s. If you
155 defined \c QT_NO_CAST_FROM_ASCII and \c QT_NO_CAST_TO_ASCII, as
156 explained in the QString documentation, you will need to
157 explicitly call fromLatin1(), or use QLatin1Char,
158 to construct a QChar from an 8-bit \c char, and you will need to
159 call toLatin1() to get the 8-bit value back.
160
161 For more information see
162 \l{http://www.unicode.org/ucd/}{"About the Unicode Character Database"}.
163
164 \sa Unicode, QString, QLatin1Char
165 */
166
167 /*!
168 \enum QChar::UnicodeVersion
169
170 Specifies which version of the \l{http://www.unicode.org/}{Unicode standard}
171 introduced a certain character.
172
173 \value Unicode_1_1 Version 1.1
174 \value Unicode_2_0 Version 2.0
175 \value Unicode_2_1_2 Version 2.1.2
176 \value Unicode_3_0 Version 3.0
177 \value Unicode_3_1 Version 3.1
178 \value Unicode_3_2 Version 3.2
179 \value Unicode_4_0 Version 4.0
180 \value Unicode_4_1 Version 4.1
181 \value Unicode_5_0 Version 5.0
182 \value Unicode_5_1 Version 5.1
183 \value Unicode_5_2 Version 5.2
184 \value Unicode_6_0 Version 6.0
185 \value Unicode_6_1 Version 6.1
186 \value Unicode_6_2 Version 6.2
187 \value Unicode_6_3 Version 6.3 Since Qt 5.3
188 \value Unicode_7_0 Version 7.0 Since Qt 5.5
189 \value Unicode_8_0 Version 8.0 Since Qt 5.6
190 \value Unicode_9_0 Version 9.0 Since Qt 5.11
191 \value Unicode_10_0 Version 10.0 Since Qt 5.11
192 \value Unicode_11_0 Version 11.0 Since Qt 5.15
193 \value Unicode_12_0 Version 12.0 Since Qt 5.15
194 \value Unicode_12_1 Version 12.1 Since Qt 5.15
195 \value Unicode_13_0 Version 13.0 Since Qt 5.15
\value Unicode_Unassigned The value is not assigned to any character
in version 13.0 of Unicode.
198
199 \sa unicodeVersion(), currentUnicodeVersion()
200 */
201
202 /*!
203 \enum QChar::Category
204
205 This enum maps the Unicode character categories.
206
The following categories are normative in Unicode:
208
209 \value Mark_NonSpacing Unicode class name Mn
210
211 \value Mark_SpacingCombining Unicode class name Mc
212
213 \value Mark_Enclosing Unicode class name Me
214
215 \value Number_DecimalDigit Unicode class name Nd
216
217 \value Number_Letter Unicode class name Nl
218
219 \value Number_Other Unicode class name No
220
221 \value Separator_Space Unicode class name Zs
222
223 \value Separator_Line Unicode class name Zl
224
225 \value Separator_Paragraph Unicode class name Zp
226
227 \value Other_Control Unicode class name Cc
228
229 \value Other_Format Unicode class name Cf
230
231 \value Other_Surrogate Unicode class name Cs
232
233 \value Other_PrivateUse Unicode class name Co
234
235 \value Other_NotAssigned Unicode class name Cn
236
237
238 The following categories are informative in Unicode:
239
240 \value Letter_Uppercase Unicode class name Lu
241
242 \value Letter_Lowercase Unicode class name Ll
243
244 \value Letter_Titlecase Unicode class name Lt
245
246 \value Letter_Modifier Unicode class name Lm
247
248 \value Letter_Other Unicode class name Lo
249
250 \value Punctuation_Connector Unicode class name Pc
251
252 \value Punctuation_Dash Unicode class name Pd
253
254 \value Punctuation_Open Unicode class name Ps
255
256 \value Punctuation_Close Unicode class name Pe
257
258 \value Punctuation_InitialQuote Unicode class name Pi
259
260 \value Punctuation_FinalQuote Unicode class name Pf
261
262 \value Punctuation_Other Unicode class name Po
263
264 \value Symbol_Math Unicode class name Sm
265
266 \value Symbol_Currency Unicode class name Sc
267
268 \value Symbol_Modifier Unicode class name Sk
269
270 \value Symbol_Other Unicode class name So
271
272 \sa category()
273 */
274
275 /*!
276 \enum QChar::Script
277 \since 5.1
278
279 This enum type defines the Unicode script property values.
280
281 For details about the Unicode script property values see
282 \l{http://www.unicode.org/reports/tr24/}{Unicode Standard Annex #24}.
283
284 In order to conform to C/C++ naming conventions "Script_" is prepended
285 to the codes used in the Unicode Standard.
286
287 \value Script_Unknown For unassigned, private-use, noncharacter, and surrogate code points.
288 \value Script_Inherited For characters that may be used with multiple scripts
289 and that inherit their script from the preceding characters.
290 These include nonspacing marks, enclosing marks,
291 and zero width joiner/non-joiner characters.
292 \value Script_Common For characters that may be used with multiple scripts
293 and that do not inherit their script from the preceding characters.
294
295 \value Script_Adlam Since Qt 5.11
296 \value Script_Ahom Since Qt 5.6
297 \value Script_AnatolianHieroglyphs Since Qt 5.6
298 \value Script_Arabic
299 \value Script_Armenian
300 \value Script_Avestan
301 \value Script_Balinese
302 \value Script_Bamum
303 \value Script_BassaVah Since Qt 5.5
304 \value Script_Batak
305 \value Script_Bengali
306 \value Script_Bhaiksuki Since Qt 5.11
307 \value Script_Bopomofo
308 \value Script_Brahmi
309 \value Script_Braille
310 \value Script_Buginese
311 \value Script_Buhid
312 \value Script_CanadianAboriginal
313 \value Script_Carian
314 \value Script_CaucasianAlbanian Since Qt 5.5
315 \value Script_Chakma
316 \value Script_Cham
317 \value Script_Cherokee
318 \value Script_Chorasmian Since Qt 5.15
319 \value Script_Coptic
320 \value Script_Cuneiform
321 \value Script_Cypriot
322 \value Script_Cyrillic
323 \value Script_Deseret
324 \value Script_Devanagari
325 \value Script_DivesAkuru Since Qt 5.15
326 \value Script_Dogra Since Qt 5.15
327 \value Script_Duployan Since Qt 5.5
328 \value Script_EgyptianHieroglyphs
329 \value Script_Elbasan Since Qt 5.5
330 \value Script_Elymaic Since Qt 5.15
331 \value Script_Ethiopic
332 \value Script_Georgian
333 \value Script_Glagolitic
334 \value Script_Gothic
335 \value Script_Grantha Since Qt 5.5
336 \value Script_Greek
337 \value Script_Gujarati
338 \value Script_GunjalaGondi Since Qt 5.15
339 \value Script_Gurmukhi
340 \value Script_Han
341 \value Script_Hangul
342 \value Script_HanifiRohingya Since Qt 5.15
343 \value Script_Hanunoo
344 \value Script_Hatran Since Qt 5.6
345 \value Script_Hebrew
346 \value Script_Hiragana
347 \value Script_ImperialAramaic
348 \value Script_InscriptionalPahlavi
349 \value Script_InscriptionalParthian
350 \value Script_Javanese
351 \value Script_Kaithi
352 \value Script_Kannada
353 \value Script_Katakana
354 \value Script_KayahLi
355 \value Script_Kharoshthi
356 \value Script_KhitanSmallScript Since Qt 5.15
357 \value Script_Khmer
358 \value Script_Khojki Since Qt 5.5
359 \value Script_Khudawadi Since Qt 5.5
360 \value Script_Lao
361 \value Script_Latin
362 \value Script_Lepcha
363 \value Script_Limbu
364 \value Script_LinearA Since Qt 5.5
365 \value Script_LinearB
366 \value Script_Lisu
367 \value Script_Lycian
368 \value Script_Lydian
369 \value Script_Mahajani Since Qt 5.5
370 \value Script_Makasar Since Qt 5.15
371 \value Script_Malayalam
372 \value Script_Mandaic
373 \value Script_Manichaean Since Qt 5.5
374 \value Script_Marchen Since Qt 5.11
375 \value Script_MasaramGondi Since Qt 5.11
376 \value Script_Medefaidrin Since Qt 5.15
377 \value Script_MeeteiMayek
378 \value Script_MendeKikakui Since Qt 5.5
379 \value Script_MeroiticCursive
380 \value Script_MeroiticHieroglyphs
381 \value Script_Miao
382 \value Script_Modi Since Qt 5.5
383 \value Script_Mongolian
384 \value Script_Mro Since Qt 5.5
385 \value Script_Multani Since Qt 5.6
386 \value Script_Myanmar
387 \value Script_Nabataean Since Qt 5.5
388 \value Script_Nandinagari Since Qt 5.15
389 \value Script_Newa Since Qt 5.11
390 \value Script_NewTaiLue
391 \value Script_Nko
392 \value Script_Nushu Since Qt 5.11
393 \value Script_NyiakengPuachueHmong Since Qt 5.15
394 \value Script_Ogham
395 \value Script_OlChiki
396 \value Script_OldHungarian Since Qt 5.6
397 \value Script_OldItalic
398 \value Script_OldNorthArabian Since Qt 5.5
399 \value Script_OldPermic Since Qt 5.5
400 \value Script_OldPersian
401 \value Script_OldSogdian Since Qt 5.15
402 \value Script_OldSouthArabian
403 \value Script_OldTurkic
404 \value Script_Oriya
405 \value Script_Osage Since Qt 5.11
406 \value Script_Osmanya
407 \value Script_PahawhHmong Since Qt 5.5
408 \value Script_Palmyrene Since Qt 5.5
409 \value Script_PauCinHau Since Qt 5.5
410 \value Script_PhagsPa
411 \value Script_Phoenician
412 \value Script_PsalterPahlavi Since Qt 5.5
413 \value Script_Rejang
414 \value Script_Runic
415 \value Script_Samaritan
416 \value Script_Saurashtra
417 \value Script_Sharada
418 \value Script_Shavian
419 \value Script_Siddham Since Qt 5.5
420 \value Script_SignWriting Since Qt 5.6
421 \value Script_Sinhala
422 \value Script_Sogdian Since Qt 5.15
423 \value Script_SoraSompeng
424 \value Script_Soyombo Since Qt 5.11
425 \value Script_Sundanese
426 \value Script_SylotiNagri
427 \value Script_Syriac
428 \value Script_Tagalog
429 \value Script_Tagbanwa
430 \value Script_TaiLe
431 \value Script_TaiTham
432 \value Script_TaiViet
433 \value Script_Takri
434 \value Script_Tamil
435 \value Script_Tangut Since Qt 5.11
436 \value Script_Telugu
437 \value Script_Thaana
438 \value Script_Thai
439 \value Script_Tibetan
440 \value Script_Tifinagh
441 \value Script_Tirhuta Since Qt 5.5
442 \value Script_Ugaritic
443 \value Script_Vai
444 \value Script_Wancho Since Qt 5.15
445 \value Script_WarangCiti Since Qt 5.5
446 \value Script_Yezidi Since Qt 5.15
447 \value Script_Yi
448 \value Script_ZanabazarSquare Since Qt 5.11
449
450 \omitvalue ScriptCount
451
452 \sa script()
453 */
454
455 /*!
456 \enum QChar::Direction
457
458 This enum type defines the Unicode direction attributes. See the
459 \l{http://www.unicode.org/reports/tr9/tr9-35.html#Table_Bidirectional_Character_Types}{Unicode Standard} for a description
460 of the values.
461
462 In order to conform to C/C++ naming conventions "Dir" is prepended
463 to the codes used in the Unicode Standard.
464
465 \value DirAL
466 \value DirAN
467 \value DirB
468 \value DirBN
469 \value DirCS
470 \value DirEN
471 \value DirES
472 \value DirET
473 \value DirFSI Since Qt 5.3
474 \value DirL
475 \value DirLRE
476 \value DirLRI Since Qt 5.3
477 \value DirLRO
478 \value DirNSM
479 \value DirON
480 \value DirPDF
481 \value DirPDI Since Qt 5.3
482 \value DirR
483 \value DirRLE
484 \value DirRLI Since Qt 5.3
485 \value DirRLO
486 \value DirS
487 \value DirWS
488
489 \sa direction()
490 */
491
492 /*!
493 \enum QChar::Decomposition
494
495 This enum type defines the Unicode decomposition attributes. See
496 the \l{http://www.unicode.org/}{Unicode Standard} for a
497 description of the values.
498
499 \value NoDecomposition
500 \value Canonical
501 \value Circle
502 \value Compat
503 \value Final
504 \value Font
505 \value Fraction
506 \value Initial
507 \value Isolated
508 \value Medial
509 \value Narrow
510 \value NoBreak
511 \value Small
512 \value Square
513 \value Sub
514 \value Super
515 \value Vertical
516 \value Wide
517
518 \sa decomposition()
519 */
520
521 /*!
522 \enum QChar::JoiningType
\since 5.3
524
525 This enum type defines the Unicode joining type attributes. See the
526 \l{http://www.unicode.org/}{Unicode Standard} for a description of the values.
527
528 In order to conform to C/C++ naming conventions "Joining_" is prepended
529 to the codes used in the Unicode Standard.
530
531 \value Joining_None
532 \value Joining_Causing
533 \value Joining_Dual
534 \value Joining_Right
535 \value Joining_Left
536 \value Joining_Transparent
537
538 \sa joiningType()
539 */
540
541 #if QT_DEPRECATED_SINCE(5, 3)
542 /*!
543 \enum QChar::Joining
544 \deprecated in 5.3, use JoiningType instead.
545
546 This enum type defines the Unicode joining attributes. See the
547 \l{http://www.unicode.org/}{Unicode Standard} for a description
548 of the values.
549
550 \value Center
551 \value Dual
552 \value OtherJoining
553 \value Right
554
555 \sa joining()
556 */
557 #endif
558
559 /*!
560 \enum QChar::CombiningClass
561
562 \internal
563
564 This enum type defines names for some of the Unicode combining
565 classes. See the \l{http://www.unicode.org/}{Unicode Standard}
566 for a description of the values.
567
568 \value Combining_Above
569 \value Combining_AboveAttached
570 \value Combining_AboveLeft
571 \value Combining_AboveLeftAttached
572 \value Combining_AboveRight
573 \value Combining_AboveRightAttached
574 \value Combining_Below
575 \value Combining_BelowAttached
576 \value Combining_BelowLeft
577 \value Combining_BelowLeftAttached
578 \value Combining_BelowRight
579 \value Combining_BelowRightAttached
580 \value Combining_DoubleAbove
581 \value Combining_DoubleBelow
582 \value Combining_IotaSubscript
583 \value Combining_Left
584 \value Combining_LeftAttached
585 \value Combining_Right
586 \value Combining_RightAttached
587 */
588
589 /*!
590 \enum QChar::SpecialCharacter
591
592 \value Null A QChar with this value isNull().
593 \value Tabulation Character tabulation.
594 \value LineFeed
595 \value FormFeed
596 \value CarriageReturn
597 \value Space
598 \value Nbsp Non-breaking space.
599 \value SoftHyphen
600 \value ReplacementCharacter The character shown when a font has no glyph
601 for a certain codepoint. A special question mark character is often
602 used. Codecs use this codepoint when input data cannot be
603 represented in Unicode.
604 \value ObjectReplacementCharacter Used to represent an object such as an
605 image when such objects cannot be presented.
606 \value ByteOrderMark
607 \value ByteOrderSwapped
608 \value ParagraphSeparator
609 \value LineSeparator
610 \value LastValidCodePoint
611 */
612
613 /*!
614 \fn void QChar::setCell(uchar cell)
615 \internal
616 */
617
618 /*!
619 \fn void QChar::setRow(uchar row)
620 \internal
621 */
622
623 /*!
624 \fn QChar::QChar()
625
626 Constructs a null QChar ('\\0').
627
628 \sa isNull()
629 */
630
631 /*!
632 \fn QChar::QChar(QLatin1Char ch)
633
634 Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
635 */
636
637 /*!
638 \fn QChar::QChar(SpecialCharacter ch)
639
640 Constructs a QChar for the predefined character value \a ch.
641 */
642
643 /*!
644 \fn QChar::QChar(char16_t ch)
645 \since 5.10
646
647 Constructs a QChar corresponding to the UTF-16 character \a ch.
648 */
649
650 /*!
651 \fn QChar::QChar(wchar_t ch)
652 \since 5.10
653
654 Constructs a QChar corresponding to the wide character \a ch.
655
656 \note This constructor is only available on Windows.
657 */
658
659 /*!
660 \fn QChar::QChar(char ch)
661
662 Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
663
664 \note This constructor is not available when \c QT_NO_CAST_FROM_ASCII
665 is defined.
666
667 \sa QT_NO_CAST_FROM_ASCII
668 */
669
670 /*!
671 \fn QChar::QChar(uchar ch)
672
673 Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
674
675 \note This constructor is not available when \c QT_NO_CAST_FROM_ASCII
676 or \c QT_RESTRICTED_CAST_FROM_ASCII is defined.
677
678 \sa QT_NO_CAST_FROM_ASCII, QT_RESTRICTED_CAST_FROM_ASCII
679 */
680
681 /*!
682 \fn QChar::QChar(uchar cell, uchar row)
683
684 Constructs a QChar for Unicode cell \a cell in row \a row.
685
686 \sa cell(), row()
687 */
688
689 /*!
690 \fn QChar::QChar(ushort code)
691
692 Constructs a QChar for the character with Unicode code point \a code.
693 */
694
695 /*!
696 \fn QChar::QChar(short code)
697
698 Constructs a QChar for the character with Unicode code point \a code.
699 */
700
701 /*!
702 \fn QChar::QChar(uint code)
703
704 Constructs a QChar for the character with Unicode code point \a code.
705 */
706
707 /*!
708 \fn QChar::QChar(int code)
709
710 Constructs a QChar for the character with Unicode code point \a code.
711 */
712
713 /*!
714 \fn bool QChar::isNull() const
715
716 Returns \c true if the character is the Unicode character 0x0000
717 ('\\0'); otherwise returns \c false.
718 */
719
720 /*!
721 \fn uchar QChar::cell() const
722
723 Returns the cell (least significant byte) of the Unicode character.
724
725 \sa row()
726 */
727
728 /*!
729 \fn uchar QChar::row() const
730
731 Returns the row (most significant byte) of the Unicode character.
732
733 \sa cell()
734 */
735
736 /*!
737 \fn bool QChar::isPrint() const
738
739 Returns \c true if the character is a printable character; otherwise
740 returns \c false. This is any character not of category Other_*.
741
742 Note that this gives no indication of whether the character is
743 available in a particular font.
744 */
745
746 /*!
747 \overload
748 \since 5.0
749
750 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
751 a printable character; otherwise returns \c false.
752 This is any character not of category Other_*.
753
754 Note that this gives no indication of whether the character is
755 available in a particular font.
756 */
isPrint(uint ucs4)757 bool QChar::isPrint(uint ucs4) noexcept
758 {
759 if (ucs4 > LastValidCodePoint)
760 return false;
761 const int test = FLAG(Other_Control) |
762 FLAG(Other_Format) |
763 FLAG(Other_Surrogate) |
764 FLAG(Other_PrivateUse) |
765 FLAG(Other_NotAssigned);
766 return !(FLAG(qGetProp(ucs4)->category) & test);
767 }
768
769 /*!
770 \fn bool QChar::isSpace() const
771
772 Returns \c true if the character is a separator character
773 (Separator_* categories or certain code points from Other_Control category);
774 otherwise returns \c false.
775 */
776
777 /*!
778 \fn bool QChar::isSpace(uint ucs4)
779 \overload
780 \since 5.0
781
782 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
783 a separator character (Separator_* categories or certain code points
784 from Other_Control category); otherwise returns \c false.
785 */
786
787 /*!
788 \internal
789 */
isSpace_helper(uint ucs4)790 bool QT_FASTCALL QChar::isSpace_helper(uint ucs4) noexcept
791 {
792 if (ucs4 > LastValidCodePoint)
793 return false;
794 const int test = FLAG(Separator_Space) |
795 FLAG(Separator_Line) |
796 FLAG(Separator_Paragraph);
797 return FLAG(qGetProp(ucs4)->category) & test;
798 }
799
800 /*!
801 \fn bool QChar::isMark() const
802
803 Returns \c true if the character is a mark (Mark_* categories);
804 otherwise returns \c false.
805
806 See QChar::Category for more information regarding marks.
807 */
808
809 /*!
810 \overload
811 \since 5.0
812
813 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
814 a mark (Mark_* categories); otherwise returns \c false.
815 */
isMark(uint ucs4)816 bool QChar::isMark(uint ucs4) noexcept
817 {
818 if (ucs4 > LastValidCodePoint)
819 return false;
820 const int test = FLAG(Mark_NonSpacing) |
821 FLAG(Mark_SpacingCombining) |
822 FLAG(Mark_Enclosing);
823 return FLAG(qGetProp(ucs4)->category) & test;
824 }
825
826 /*!
827 \fn bool QChar::isPunct() const
828
829 Returns \c true if the character is a punctuation mark (Punctuation_*
830 categories); otherwise returns \c false.
831 */
832
833 /*!
834 \overload
835 \since 5.0
836
837 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
838 a punctuation mark (Punctuation_* categories); otherwise returns \c false.
839 */
isPunct(uint ucs4)840 bool QChar::isPunct(uint ucs4) noexcept
841 {
842 if (ucs4 > LastValidCodePoint)
843 return false;
844 const int test = FLAG(Punctuation_Connector) |
845 FLAG(Punctuation_Dash) |
846 FLAG(Punctuation_Open) |
847 FLAG(Punctuation_Close) |
848 FLAG(Punctuation_InitialQuote) |
849 FLAG(Punctuation_FinalQuote) |
850 FLAG(Punctuation_Other);
851 return FLAG(qGetProp(ucs4)->category) & test;
852 }
853
854 /*!
855 \fn bool QChar::isSymbol() const
856
857 Returns \c true if the character is a symbol (Symbol_* categories);
858 otherwise returns \c false.
859 */
860
861 /*!
862 \overload
863 \since 5.0
864
865 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
866 a symbol (Symbol_* categories); otherwise returns \c false.
867 */
isSymbol(uint ucs4)868 bool QChar::isSymbol(uint ucs4) noexcept
869 {
870 if (ucs4 > LastValidCodePoint)
871 return false;
872 const int test = FLAG(Symbol_Math) |
873 FLAG(Symbol_Currency) |
874 FLAG(Symbol_Modifier) |
875 FLAG(Symbol_Other);
876 return FLAG(qGetProp(ucs4)->category) & test;
877 }
878
879 /*!
880 \fn bool QChar::isLetter() const
881
882 Returns \c true if the character is a letter (Letter_* categories);
883 otherwise returns \c false.
884 */
885
886 /*!
887 \fn bool QChar::isLetter(uint ucs4)
888 \overload
889 \since 5.0
890
891 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
892 a letter (Letter_* categories); otherwise returns \c false.
893 */
894
895 /*!
896 \internal
897 */
isLetter_helper(uint ucs4)898 bool QT_FASTCALL QChar::isLetter_helper(uint ucs4) noexcept
899 {
900 if (ucs4 > LastValidCodePoint)
901 return false;
902 const int test = FLAG(Letter_Uppercase) |
903 FLAG(Letter_Lowercase) |
904 FLAG(Letter_Titlecase) |
905 FLAG(Letter_Modifier) |
906 FLAG(Letter_Other);
907 return FLAG(qGetProp(ucs4)->category) & test;
908 }
909
910 /*!
911 \fn bool QChar::isNumber() const
912
913 Returns \c true if the character is a number (Number_* categories,
914 not just 0-9); otherwise returns \c false.
915
916 \sa isDigit()
917 */
918
919 /*!
920 \fn bool QChar::isNumber(uint ucs4)
921 \overload
922 \since 5.0
923
924 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
925 a number (Number_* categories, not just 0-9); otherwise returns \c false.
926
927 \sa isDigit()
928 */
929
930 /*!
931 \internal
932 */
isNumber_helper(uint ucs4)933 bool QT_FASTCALL QChar::isNumber_helper(uint ucs4) noexcept
934 {
935 if (ucs4 > LastValidCodePoint)
936 return false;
937 const int test = FLAG(Number_DecimalDigit) |
938 FLAG(Number_Letter) |
939 FLAG(Number_Other);
940 return FLAG(qGetProp(ucs4)->category) & test;
941 }
942
943 /*!
944 \fn bool QChar::isLetterOrNumber() const
945
946 Returns \c true if the character is a letter or number (Letter_* or
947 Number_* categories); otherwise returns \c false.
948 */
949
950 /*!
951 \fn bool QChar::isLetterOrNumber(uint ucs4)
952 \overload
953 \since 5.0
954
955 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
956 a letter or number (Letter_* or Number_* categories); otherwise returns \c false.
957 */
958
959 /*!
960 \internal
961 */
isLetterOrNumber_helper(uint ucs4)962 bool QT_FASTCALL QChar::isLetterOrNumber_helper(uint ucs4) noexcept
963 {
964 if (ucs4 > LastValidCodePoint)
965 return false;
966 const int test = FLAG(Letter_Uppercase) |
967 FLAG(Letter_Lowercase) |
968 FLAG(Letter_Titlecase) |
969 FLAG(Letter_Modifier) |
970 FLAG(Letter_Other) |
971 FLAG(Number_DecimalDigit) |
972 FLAG(Number_Letter) |
973 FLAG(Number_Other);
974 return FLAG(qGetProp(ucs4)->category) & test;
975 }
976
977 /*!
978 \fn bool QChar::isDigit() const
979
980 Returns \c true if the character is a decimal digit
981 (Number_DecimalDigit); otherwise returns \c false.
982
983 \sa isNumber()
984 */
985
986 /*!
987 \fn bool QChar::isDigit(uint ucs4)
988 \overload
989 \since 5.0
990
991 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
992 a decimal digit (Number_DecimalDigit); otherwise returns \c false.
993
994 \sa isNumber()
995 */
996
997 /*!
998 \fn bool QChar::isNonCharacter() const
999 \since 5.0
1000
1001 Returns \c true if the QChar is a non-character; false otherwise.
1002
1003 Unicode has a certain number of code points that are classified
1004 as "non-characters:" that is, they can be used for internal purposes
1005 in applications but cannot be used for text interchange.
Those are the last two entries of each Unicode Plane ([0xfffe..0xffff],
1007 [0x1fffe..0x1ffff], etc.) as well as the entries in range [0xfdd0..0xfdef].
1008 */
1009
1010 /*!
1011 \fn bool QChar::isHighSurrogate() const
1012
1013 Returns \c true if the QChar is the high part of a UTF16 surrogate
1014 (for example if its code point is in range [0xd800..0xdbff]); false otherwise.
1015 */
1016
1017 /*!
1018 \fn bool QChar::isLowSurrogate() const
1019
1020 Returns \c true if the QChar is the low part of a UTF16 surrogate
1021 (for example if its code point is in range [0xdc00..0xdfff]); false otherwise.
1022 */
1023
1024 /*!
1025 \fn bool QChar::isSurrogate() const
1026 \since 5.0
1027
1028 Returns \c true if the QChar contains a code point that is in either
1029 the high or the low part of the UTF-16 surrogate range
1030 (for example if its code point is in range [0xd800..0xdfff]); false otherwise.
1031 */
1032
1033 /*!
1034 \fn static bool QChar::isNonCharacter(uint ucs4)
1035 \overload
1036 \since 5.0
1037
1038 Returns \c true if the UCS-4-encoded character specified by \a ucs4
1039 is a non-character; false otherwise.
1040
1041 Unicode has a certain number of code points that are classified
1042 as "non-characters:" that is, they can be used for internal purposes
1043 in applications but cannot be used for text interchange.
Those are the last two entries of each Unicode Plane ([0xfffe..0xffff],
1045 [0x1fffe..0x1ffff], etc.) as well as the entries in range [0xfdd0..0xfdef].
1046 */
1047
1048 /*!
1049 \fn static bool QChar::isHighSurrogate(uint ucs4)
1050 \overload
1051
1052 Returns \c true if the UCS-4-encoded character specified by \a ucs4
1053 is the high part of a UTF16 surrogate
1054 (for example if its code point is in range [0xd800..0xdbff]); false otherwise.
1055 */
1056
1057 /*!
1058 \fn static bool QChar::isLowSurrogate(uint ucs4)
1059 \overload
1060
1061 Returns \c true if the UCS-4-encoded character specified by \a ucs4
1062 is the low part of a UTF16 surrogate
1063 (for example if its code point is in range [0xdc00..0xdfff]); false otherwise.
1064 */
1065
1066 /*!
1067 \fn static bool QChar::isSurrogate(uint ucs4)
1068 \overload
1069 \since 5.0
1070
    Returns \c true if the UCS-4-encoded character specified by \a ucs4
    contains a code point that is in either the high or the low part of the
    UTF-16 surrogate range (that is, if its code point is in range [0xd800..0xdfff]);
    false otherwise.
1075 */
1076
1077 /*!
1078 \fn static bool QChar::requiresSurrogates(uint ucs4)
1079
    Returns \c true if the UCS-4-encoded character specified by \a ucs4
    can be split into the high and low parts of a UTF-16 surrogate pair
    (that is, if its code point is greater than or equal to 0x10000);
    false otherwise.
1084 */
1085
1086 /*!
1087 \fn static uint QChar::surrogateToUcs4(ushort high, ushort low)
1088
    Converts a UTF-16 surrogate pair with the given \a high and \a low values
    to its UCS-4-encoded code point.
1091 */
1092
1093 /*!
1094 \fn static uint QChar::surrogateToUcs4(QChar high, QChar low)
1095 \overload
1096
    Converts a UTF-16 surrogate pair (\a high, \a low) to its UCS-4-encoded code point.
1098 */
1099
1100 /*!
1101 \fn static ushort QChar::highSurrogate(uint ucs4)
1102
1103 Returns the high surrogate part of a UCS-4-encoded code point.
1104 The returned result is undefined if \a ucs4 is smaller than 0x10000.
1105 */
1106
1107 /*!
1108 \fn static ushort QChar::lowSurrogate(uint ucs4)
1109
1110 Returns the low surrogate part of a UCS-4-encoded code point.
1111 The returned result is undefined if \a ucs4 is smaller than 0x10000.
1112 */
1113
1114 /*!
1115 \fn int QChar::digitValue() const
1116
1117 Returns the numeric value of the digit, or -1 if the character is not a digit.
1118 */
1119
1120 /*!
1121 \overload
1122 Returns the numeric value of the digit specified by the UCS-4-encoded
1123 character, \a ucs4, or -1 if the character is not a digit.
1124 */
digitValue(uint ucs4)1125 int QChar::digitValue(uint ucs4) noexcept
1126 {
1127 if (ucs4 > LastValidCodePoint)
1128 return -1;
1129 return qGetProp(ucs4)->digitValue;
1130 }
1131
1132 /*!
1133 \fn QChar::Category QChar::category() const
1134
1135 Returns the character's category.
1136 */
1137
1138 /*!
1139 \overload
1140 Returns the category of the UCS-4-encoded character specified by \a ucs4.
1141 */
category(uint ucs4)1142 QChar::Category QChar::category(uint ucs4) noexcept
1143 {
1144 if (ucs4 > LastValidCodePoint)
1145 return QChar::Other_NotAssigned;
1146 return (QChar::Category) qGetProp(ucs4)->category;
1147 }
1148
1149 /*!
1150 \fn QChar::Direction QChar::direction() const
1151
1152 Returns the character's direction.
1153 */
1154
1155 /*!
1156 \overload
1157 Returns the direction of the UCS-4-encoded character specified by \a ucs4.
1158 */
direction(uint ucs4)1159 QChar::Direction QChar::direction(uint ucs4) noexcept
1160 {
1161 if (ucs4 > LastValidCodePoint)
1162 return QChar::DirL;
1163 return (QChar::Direction) qGetProp(ucs4)->direction;
1164 }
1165
1166 /*!
1167 \fn QChar::JoiningType QChar::joiningType() const
1168 \since 5.3
1169
1170 Returns information about the joining type attributes of the character
1171 (needed for certain languages such as Arabic or Syriac).
1172 */
1173
1174 /*!
1175 \overload
1176 \since 5.3
1177
1178 Returns information about the joining type attributes of the UCS-4-encoded
1179 character specified by \a ucs4
1180 (needed for certain languages such as Arabic or Syriac).
1181 */
joiningType(uint ucs4)1182 QChar::JoiningType QChar::joiningType(uint ucs4) noexcept
1183 {
1184 if (ucs4 > LastValidCodePoint)
1185 return QChar::Joining_None;
1186 return QChar::JoiningType(qGetProp(ucs4)->joining);
1187 }
1188
1189 #if QT_DEPRECATED_SINCE(5, 3)
1190 /*!
1191 \fn QChar::Joining QChar::joining() const
1192 \deprecated in 5.3, use joiningType() instead.
1193
1194 Returns information about the joining properties of the character
1195 (needed for certain languages such as Arabic).
1196 */
1197
1198 /*!
1199 \overload
1200 \deprecated in 5.3, use joiningType() instead.
1201
1202 Returns information about the joining properties of the UCS-4-encoded
1203 character specified by \a ucs4 (needed for certain languages such as Arabic).
1204 */
joining(uint ucs4)1205 QChar::Joining QChar::joining(uint ucs4) noexcept
1206 {
1207 if (ucs4 > LastValidCodePoint)
1208 return QChar::OtherJoining;
1209 switch (qGetProp(ucs4)->joining) {
1210 case QChar::Joining_Causing: return QChar::Center;
1211 case QChar::Joining_Dual: return QChar::Dual;
1212 case QChar::Joining_Right: return QChar::Right;
1213 default: break;
1214 }
1215 return QChar::OtherJoining;
1216 }
1217 #endif
1218
1219 /*!
1220 \fn bool QChar::hasMirrored() const
1221
1222 Returns \c true if the character should be reversed if the text
1223 direction is reversed; otherwise returns \c false.
1224
1225 A bit faster equivalent of (ch.mirroredChar() != ch).
1226
1227 \sa mirroredChar()
1228 */
1229
1230 /*!
1231 \overload
1232 \since 5.0
1233
1234 Returns \c true if the UCS-4-encoded character specified by \a ucs4
1235 should be reversed if the text direction is reversed; otherwise returns \c false.
1236
1237 A bit faster equivalent of (QChar::mirroredChar(ucs4) != ucs4).
1238
1239 \sa mirroredChar()
1240 */
hasMirrored(uint ucs4)1241 bool QChar::hasMirrored(uint ucs4) noexcept
1242 {
1243 if (ucs4 > LastValidCodePoint)
1244 return false;
1245 return qGetProp(ucs4)->mirrorDiff != 0;
1246 }
1247
1248 /*!
1249 \fn bool QChar::isLower() const
1250
    Returns \c true if the character is a lowercase letter, that is,
    category() is Letter_Lowercase.
1253
1254 \sa isUpper(), toLower(), toUpper()
1255 */
1256
1257 /*!
1258 \fn static bool QChar::isLower(uint ucs4)
1259 \overload
1260 \since 5.0
1261
    Returns \c true if the UCS-4-encoded character specified by \a ucs4
    is a lowercase letter, that is, category() is Letter_Lowercase.
1264
1265 \sa isUpper(), toLower(), toUpper()
1266 */
1267
1268 /*!
1269 \fn bool QChar::isUpper() const
1270
    Returns \c true if the character is an uppercase letter, that is,
    category() is Letter_Uppercase.
1273
1274 \sa isLower(), toUpper(), toLower()
1275 */
1276
1277 /*!
1278 \fn static bool QChar::isUpper(uint ucs4)
1279 \overload
1280 \since 5.0
1281
    Returns \c true if the UCS-4-encoded character specified by \a ucs4
    is an uppercase letter, that is, category() is Letter_Uppercase.
1284
1285 \sa isLower(), toUpper(), toLower()
1286 */
1287
1288 /*!
1289 \fn bool QChar::isTitleCase() const
1290
    Returns \c true if the character is a titlecase letter, that is,
    category() is Letter_Titlecase.
1293
1294 \sa isLower(), toUpper(), toLower(), toTitleCase()
1295 */
1296
1297 /*!
1298 \fn static bool QChar::isTitleCase(uint ucs4)
1299 \overload
1300 \since 5.0
1301
    Returns \c true if the UCS-4-encoded character specified by \a ucs4
    is a titlecase letter, that is, category() is Letter_Titlecase.
1304
1305 \sa isLower(), toUpper(), toLower(), toTitleCase()
1306 */
1307 /*!
1308 \fn QChar QChar::mirroredChar() const
1309
1310 Returns the mirrored character if this character is a mirrored
1311 character; otherwise returns the character itself.
1312
1313 \sa hasMirrored()
1314 */
1315
1316 /*!
1317 \overload
1318 Returns the mirrored character if the UCS-4-encoded character specified
1319 by \a ucs4 is a mirrored character; otherwise returns the character itself.
1320
1321 \sa hasMirrored()
1322 */
mirroredChar(uint ucs4)1323 uint QChar::mirroredChar(uint ucs4) noexcept
1324 {
1325 if (ucs4 > LastValidCodePoint)
1326 return ucs4;
1327 return ucs4 + qGetProp(ucs4)->mirrorDiff;
1328 }
1329
1330
1331 // constants for Hangul (de)composition, see UAX #15
enum {
    // Base code points of the precomposed syllable block and of the
    // L (leading), V (vowel) and T (trailing) jamo ranges.
    Hangul_SBase = 0xAC00,
    Hangul_LBase = 0x1100,
    Hangul_VBase = 0x1161,
    Hangul_TBase = 0x11A7,

    // Number of L, V and T values; T index 0 stands for "no trailing jamo".
    Hangul_LCount = 19,
    Hangul_VCount = 21,
    Hangul_TCount = 28,

    // Derived sizes: syllables per leading jamo, and syllables in total.
    Hangul_NCount = Hangul_TCount * Hangul_VCount,
    Hangul_SCount = Hangul_NCount * Hangul_LCount
};
1343
1344 // buffer has to have a length of 3. It's needed for Hangul decomposition
decompositionHelper(uint ucs4,int * length,int * tag,unsigned short * buffer)1345 static const unsigned short * QT_FASTCALL decompositionHelper
1346 (uint ucs4, int *length, int *tag, unsigned short *buffer)
1347 {
1348 if (ucs4 >= Hangul_SBase && ucs4 < Hangul_SBase + Hangul_SCount) {
1349 // compute Hangul syllable decomposition as per UAX #15
1350 const uint SIndex = ucs4 - Hangul_SBase;
1351 buffer[0] = Hangul_LBase + SIndex / Hangul_NCount; // L
1352 buffer[1] = Hangul_VBase + (SIndex % Hangul_NCount) / Hangul_TCount; // V
1353 buffer[2] = Hangul_TBase + SIndex % Hangul_TCount; // T
1354 *length = buffer[2] == Hangul_TBase ? 2 : 3;
1355 *tag = QChar::Canonical;
1356 return buffer;
1357 }
1358
1359 const unsigned short index = GET_DECOMPOSITION_INDEX(ucs4);
1360 if (index == 0xffff) {
1361 *length = 0;
1362 *tag = QChar::NoDecomposition;
1363 return nullptr;
1364 }
1365
1366 const unsigned short *decomposition = uc_decomposition_map+index;
1367 *tag = (*decomposition) & 0xff;
1368 *length = (*decomposition) >> 8;
1369 return decomposition+1;
1370 }
1371
1372 /*!
    Decomposes a character into its constituent parts. Returns an empty string
    if no decomposition exists.
1375 */
decomposition() const1376 QString QChar::decomposition() const
1377 {
1378 return QChar::decomposition(ucs);
1379 }
1380
1381 /*!
1382 \overload
    Decomposes the UCS-4-encoded character specified by \a ucs4 into its
    constituent parts. Returns an empty string if no decomposition exists.
1385 */
decomposition(uint ucs4)1386 QString QChar::decomposition(uint ucs4)
1387 {
1388 unsigned short buffer[3];
1389 int length;
1390 int tag;
1391 const unsigned short *d = decompositionHelper(ucs4, &length, &tag, buffer);
1392 return QString(reinterpret_cast<const QChar *>(d), length);
1393 }
1394
1395 /*!
1396 \fn QChar::Decomposition QChar::decompositionTag() const
1397
    Returns the tag defining the decomposition of the character. Returns
    QChar::NoDecomposition if no decomposition exists.
1400 */
1401
1402 /*!
1403 \overload
    Returns the tag defining the decomposition of the UCS-4-encoded character
    specified by \a ucs4. Returns QChar::NoDecomposition if no decomposition exists.
1406 */
decompositionTag(uint ucs4)1407 QChar::Decomposition QChar::decompositionTag(uint ucs4) noexcept
1408 {
1409 if (ucs4 >= Hangul_SBase && ucs4 < Hangul_SBase + Hangul_SCount)
1410 return QChar::Canonical;
1411 const unsigned short index = GET_DECOMPOSITION_INDEX(ucs4);
1412 if (index == 0xffff)
1413 return QChar::NoDecomposition;
1414 return (QChar::Decomposition)(uc_decomposition_map[index] & 0xff);
1415 }
1416
1417 /*!
1418 \fn unsigned char QChar::combiningClass() const
1419
1420 Returns the combining class for the character as defined in the
1421 Unicode standard. This is mainly useful as a positioning hint for
1422 marks attached to a base character.
1423
1424 The Qt text rendering engine uses this information to correctly
1425 position non-spacing marks around a base character.
1426 */
1427
1428 /*!
1429 \overload
1430 Returns the combining class for the UCS-4-encoded character specified by
1431 \a ucs4, as defined in the Unicode standard.
1432 */
combiningClass(uint ucs4)1433 unsigned char QChar::combiningClass(uint ucs4) noexcept
1434 {
1435 if (ucs4 > LastValidCodePoint)
1436 return 0;
1437 return (unsigned char) qGetProp(ucs4)->combiningClass;
1438 }
1439
1440 /*!
1441 \fn QChar::Script QChar::script() const
1442 \since 5.1
1443
1444 Returns the Unicode script property value for this character.
1445 */
1446
1447 /*!
1448 \overload
1449 \since 5.1
1450
1451 Returns the Unicode script property value for the character specified in
1452 its UCS-4-encoded form as \a ucs4.
1453 */
script(uint ucs4)1454 QChar::Script QChar::script(uint ucs4) noexcept
1455 {
1456 if (ucs4 > LastValidCodePoint)
1457 return QChar::Script_Unknown;
1458 return (QChar::Script) qGetProp(ucs4)->script;
1459 }
1460
1461 /*!
1462 \fn QChar::UnicodeVersion QChar::unicodeVersion() const
1463
1464 Returns the Unicode version that introduced this character.
1465 */
1466
1467 /*!
1468 \overload
1469 Returns the Unicode version that introduced the character specified in
1470 its UCS-4-encoded form as \a ucs4.
1471 */
unicodeVersion(uint ucs4)1472 QChar::UnicodeVersion QChar::unicodeVersion(uint ucs4) noexcept
1473 {
1474 if (ucs4 > LastValidCodePoint)
1475 return QChar::Unicode_Unassigned;
1476 return (QChar::UnicodeVersion) qGetProp(ucs4)->unicodeVersion;
1477 }
1478
1479 /*!
1480 Returns the most recent supported Unicode version.
1481 */
currentUnicodeVersion()1482 QChar::UnicodeVersion QChar::currentUnicodeVersion() noexcept
1483 {
1484 return UNICODE_DATA_VERSION;
1485 }
1486
1487
1488 template <typename T>
convertCase_helper(T uc,QUnicodeTables::Case which)1489 Q_DECL_CONST_FUNCTION static inline T convertCase_helper(T uc, QUnicodeTables::Case which) noexcept
1490 {
1491 const auto fold = qGetProp(uc)->cases[which];
1492
1493 if (Q_UNLIKELY(fold.special)) {
1494 const ushort *specialCase = specialCaseMap + fold.diff;
1495 // so far, there are no special cases beyond BMP (guaranteed by the qunicodetables generator)
1496 return *specialCase == 1 ? specialCase[1] : uc;
1497 }
1498
1499 return uc + fold.diff;
1500 }
1501
1502 /*!
1503 \fn QChar QChar::toLower() const
1504
1505 Returns the lowercase equivalent if the character is uppercase or titlecase;
1506 otherwise returns the character itself.
1507 */
1508
1509 /*!
1510 \overload
1511 Returns the lowercase equivalent of the UCS-4-encoded character specified
1512 by \a ucs4 if the character is uppercase or titlecase; otherwise returns
1513 the character itself.
1514 */
toLower(uint ucs4)1515 uint QChar::toLower(uint ucs4) noexcept
1516 {
1517 if (ucs4 > LastValidCodePoint)
1518 return ucs4;
1519 return convertCase_helper(ucs4, QUnicodeTables::LowerCase);
1520 }
1521
1522 /*!
1523 \fn QChar QChar::toUpper() const
1524
1525 Returns the uppercase equivalent if the character is lowercase or titlecase;
1526 otherwise returns the character itself.
1527 */
1528
1529 /*!
1530 \overload
1531 Returns the uppercase equivalent of the UCS-4-encoded character specified
1532 by \a ucs4 if the character is lowercase or titlecase; otherwise returns
1533 the character itself.
1534 */
toUpper(uint ucs4)1535 uint QChar::toUpper(uint ucs4) noexcept
1536 {
1537 if (ucs4 > LastValidCodePoint)
1538 return ucs4;
1539 return convertCase_helper(ucs4, QUnicodeTables::UpperCase);
1540 }
1541
1542 /*!
1543 \fn QChar QChar::toTitleCase() const
1544
1545 Returns the title case equivalent if the character is lowercase or uppercase;
1546 otherwise returns the character itself.
1547 */
1548
1549 /*!
1550 \overload
1551 Returns the title case equivalent of the UCS-4-encoded character specified
1552 by \a ucs4 if the character is lowercase or uppercase; otherwise returns
1553 the character itself.
1554 */
toTitleCase(uint ucs4)1555 uint QChar::toTitleCase(uint ucs4) noexcept
1556 {
1557 if (ucs4 > LastValidCodePoint)
1558 return ucs4;
1559 return convertCase_helper(ucs4, QUnicodeTables::TitleCase);
1560 }
1561
foldCase(const ushort * ch,const ushort * start)1562 static inline uint foldCase(const ushort *ch, const ushort *start)
1563 {
1564 uint ucs4 = *ch;
1565 if (QChar::isLowSurrogate(ucs4) && ch > start && QChar::isHighSurrogate(*(ch - 1)))
1566 ucs4 = QChar::surrogateToUcs4(*(ch - 1), ucs4);
1567 return convertCase_helper(ucs4, QUnicodeTables::CaseFold);
1568 }
1569
foldCase(uint ch,uint & last)1570 static inline uint foldCase(uint ch, uint &last) noexcept
1571 {
1572 uint ucs4 = ch;
1573 if (QChar::isLowSurrogate(ucs4) && QChar::isHighSurrogate(last))
1574 ucs4 = QChar::surrogateToUcs4(last, ucs4);
1575 last = ch;
1576 return convertCase_helper(ucs4, QUnicodeTables::CaseFold);
1577 }
1578
foldCase(ushort ch)1579 static inline ushort foldCase(ushort ch) noexcept
1580 {
1581 return convertCase_helper(ch, QUnicodeTables::CaseFold);
1582 }
1583
foldCase(QChar ch)1584 static inline QChar foldCase(QChar ch) noexcept
1585 {
1586 return QChar(foldCase(ch.unicode()));
1587 }
1588
1589 /*!
1590 \fn QChar QChar::toCaseFolded() const
1591
1592 Returns the case folded equivalent of the character.
1593 For most Unicode characters this is the same as toLower().
1594 */
1595
1596 /*!
1597 \overload
1598 Returns the case folded equivalent of the UCS-4-encoded character specified
1599 by \a ucs4. For most Unicode characters this is the same as toLower().
1600 */
toCaseFolded(uint ucs4)1601 uint QChar::toCaseFolded(uint ucs4) noexcept
1602 {
1603 if (ucs4 > LastValidCodePoint)
1604 return ucs4;
1605 return convertCase_helper(ucs4, QUnicodeTables::CaseFold);
1606 }
1607
1608 /*!
1609 \fn char QChar::toLatin1() const
1610
1611 Returns the Latin-1 character equivalent to the QChar, or 0. This
1612 is mainly useful for non-internationalized software.
1613
1614 \note It is not possible to distinguish a non-Latin-1 character from a Latin-1 0
1615 (NUL) character. Prefer to use unicode(), which does not have this ambiguity.
1616
1617 \sa unicode()
1618 */
1619
1620 /*!
1621 \fn QChar QChar::fromLatin1(char)
1622
1623 Converts the Latin-1 character \a c to its equivalent QChar. This
1624 is mainly useful for non-internationalized software.
1625
1626 An alternative is to use QLatin1Char.
1627
1628 \sa toLatin1(), unicode()
1629 */
1630
1631 /*!
1632 \fn char QChar::toAscii() const
1633 \deprecated
1634
1635 Returns the Latin-1 character value of the QChar, or 0 if the character is not
1636 representable.
1637
1638 The main purpose of this function is to preserve ASCII characters used
1639 in C strings. This is mainly useful for developers of non-internationalized
1640 software.
1641
1642 \note It is not possible to distinguish a non-Latin 1 character from an ASCII 0
1643 (NUL) character. Prefer to use unicode(), which does not have this ambiguity.
1644
1645 \note This function does not check whether the character value is inside
1646 the valid range of US-ASCII.
1647
1648 \sa toLatin1(), unicode()
1649 */
1650
1651 /*!
1652 \fn QChar QChar::fromAscii(char)
1653 \deprecated
1654
    Converts the ASCII character \a c to its equivalent QChar. This
    is mainly useful for non-internationalized software.
1657
1658 An alternative is to use QLatin1Char.
1659
1660 \sa fromLatin1(), unicode()
1661 */
1662
1663 #ifndef QT_NO_DATASTREAM
1664 /*!
1665 \relates QChar
1666
1667 Writes the char \a chr to the stream \a out.
1668
1669 \sa {Serializing Qt Data Types}
1670 */
operator <<(QDataStream & out,QChar chr)1671 QDataStream &operator<<(QDataStream &out, QChar chr)
1672 {
1673 out << quint16(chr.unicode());
1674 return out;
1675 }
1676
1677 /*!
1678 \relates QChar
1679
1680 Reads a char from the stream \a in into char \a chr.
1681
1682 \sa {Serializing Qt Data Types}
1683 */
operator >>(QDataStream & in,QChar & chr)1684 QDataStream &operator>>(QDataStream &in, QChar &chr)
1685 {
1686 quint16 u;
1687 in >> u;
1688 chr.unicode() = ushort(u);
1689 return in;
1690 }
1691 #endif // QT_NO_DATASTREAM
1692
1693 /*!
1694 \fn ushort & QChar::unicode()
1695
1696 Returns a reference to the numeric Unicode value of the QChar.
1697 */
1698
1699 /*!
1700 \fn ushort QChar::unicode() const
1701
1702 Returns the numeric Unicode value of the QChar.
1703 */
1704
1705 /*****************************************************************************
1706 Documentation of QChar related functions
1707 *****************************************************************************/
1708
1709 /*!
1710 \fn bool operator==(QChar c1, QChar c2)
1711
1712 \relates QChar
1713
1714 Returns \c true if \a c1 and \a c2 are the same Unicode character;
1715 otherwise returns \c false.
1716 */
1717
1718 /*!
1719 \fn int operator!=(QChar c1, QChar c2)
1720
1721 \relates QChar
1722
1723 Returns \c true if \a c1 and \a c2 are not the same Unicode
1724 character; otherwise returns \c false.
1725 */
1726
1727 /*!
1728 \fn int operator<=(QChar c1, QChar c2)
1729
1730 \relates QChar
1731
1732 Returns \c true if the numeric Unicode value of \a c1 is less than
1733 or equal to that of \a c2; otherwise returns \c false.
1734 */
1735
1736 /*!
1737 \fn int operator>=(QChar c1, QChar c2)
1738
1739 \relates QChar
1740
1741 Returns \c true if the numeric Unicode value of \a c1 is greater than
1742 or equal to that of \a c2; otherwise returns \c false.
1743 */
1744
1745 /*!
1746 \fn int operator<(QChar c1, QChar c2)
1747
1748 \relates QChar
1749
1750 Returns \c true if the numeric Unicode value of \a c1 is less than
1751 that of \a c2; otherwise returns \c false.
1752 */
1753
1754 /*!
1755 \fn int operator>(QChar c1, QChar c2)
1756
1757 \relates QChar
1758
1759 Returns \c true if the numeric Unicode value of \a c1 is greater than
1760 that of \a c2; otherwise returns \c false.
1761 */
1762
1763
1764 // ---------------------------------------------------------------------------
1765
1766
decomposeHelper(QString * str,bool canonical,QChar::UnicodeVersion version,int from)1767 static void decomposeHelper(QString *str, bool canonical, QChar::UnicodeVersion version, int from)
1768 {
1769 int length;
1770 int tag;
1771 unsigned short buffer[3];
1772
1773 QString &s = *str;
1774
1775 const unsigned short *utf16 = reinterpret_cast<unsigned short *>(s.data());
1776 const unsigned short *uc = utf16 + s.length();
1777 while (uc != utf16 + from) {
1778 uint ucs4 = *(--uc);
1779 if (QChar(ucs4).isLowSurrogate() && uc != utf16) {
1780 ushort high = *(uc - 1);
1781 if (QChar(high).isHighSurrogate()) {
1782 --uc;
1783 ucs4 = QChar::surrogateToUcs4(high, ucs4);
1784 }
1785 }
1786
1787 if (QChar::unicodeVersion(ucs4) > version)
1788 continue;
1789
1790 const unsigned short *d = decompositionHelper(ucs4, &length, &tag, buffer);
1791 if (!d || (canonical && tag != QChar::Canonical))
1792 continue;
1793
1794 int pos = uc - utf16;
1795 s.replace(pos, QChar::requiresSurrogates(ucs4) ? 2 : 1, reinterpret_cast<const QChar *>(d), length);
1796 // since the replace invalidates the pointers and we do decomposition recursive
1797 utf16 = reinterpret_cast<unsigned short *>(s.data());
1798 uc = utf16 + pos + length;
1799 }
1800 }
1801
1802
1803 struct UCS2Pair {
1804 ushort u1;
1805 ushort u2;
1806 };
1807
operator <(const UCS2Pair & ligature1,const UCS2Pair & ligature2)1808 inline bool operator<(const UCS2Pair &ligature1, const UCS2Pair &ligature2)
1809 { return ligature1.u1 < ligature2.u1; }
operator <(ushort u1,const UCS2Pair & ligature)1810 inline bool operator<(ushort u1, const UCS2Pair &ligature)
1811 { return u1 < ligature.u1; }
operator <(const UCS2Pair & ligature,ushort u1)1812 inline bool operator<(const UCS2Pair &ligature, ushort u1)
1813 { return ligature.u1 < u1; }
1814
1815 struct UCS2SurrogatePair {
1816 UCS2Pair p1;
1817 UCS2Pair p2;
1818 };
1819
operator <(const UCS2SurrogatePair & ligature1,const UCS2SurrogatePair & ligature2)1820 inline bool operator<(const UCS2SurrogatePair &ligature1, const UCS2SurrogatePair &ligature2)
1821 { return QChar::surrogateToUcs4(ligature1.p1.u1, ligature1.p1.u2) < QChar::surrogateToUcs4(ligature2.p1.u1, ligature2.p1.u2); }
operator <(uint u1,const UCS2SurrogatePair & ligature)1822 inline bool operator<(uint u1, const UCS2SurrogatePair &ligature)
1823 { return u1 < QChar::surrogateToUcs4(ligature.p1.u1, ligature.p1.u2); }
operator <(const UCS2SurrogatePair & ligature,uint u1)1824 inline bool operator<(const UCS2SurrogatePair &ligature, uint u1)
1825 { return QChar::surrogateToUcs4(ligature.p1.u1, ligature.p1.u2) < u1; }
1826
ligatureHelper(uint u1,uint u2)1827 static uint inline ligatureHelper(uint u1, uint u2)
1828 {
1829 if (u1 >= Hangul_LBase && u1 <= Hangul_SBase + Hangul_SCount) {
1830 // compute Hangul syllable composition as per UAX #15
1831 // hangul L-V pair
1832 const uint LIndex = u1 - Hangul_LBase;
1833 if (LIndex < Hangul_LCount) {
1834 const uint VIndex = u2 - Hangul_VBase;
1835 if (VIndex < Hangul_VCount)
1836 return Hangul_SBase + (LIndex * Hangul_VCount + VIndex) * Hangul_TCount;
1837 }
1838 // hangul LV-T pair
1839 const uint SIndex = u1 - Hangul_SBase;
1840 if (SIndex < Hangul_SCount && (SIndex % Hangul_TCount) == 0) {
1841 const uint TIndex = u2 - Hangul_TBase;
1842 if (TIndex <= Hangul_TCount)
1843 return u1 + TIndex;
1844 }
1845 }
1846
1847 const unsigned short index = GET_LIGATURE_INDEX(u2);
1848 if (index == 0xffff)
1849 return 0;
1850 const unsigned short *ligatures = uc_ligature_map+index;
1851 ushort length = *ligatures++;
1852 if (QChar::requiresSurrogates(u1)) {
1853 const UCS2SurrogatePair *data = reinterpret_cast<const UCS2SurrogatePair *>(ligatures);
1854 const UCS2SurrogatePair *r = std::lower_bound(data, data + length, u1);
1855 if (r != data + length && QChar::surrogateToUcs4(r->p1.u1, r->p1.u2) == u1)
1856 return QChar::surrogateToUcs4(r->p2.u1, r->p2.u2);
1857 } else {
1858 const UCS2Pair *data = reinterpret_cast<const UCS2Pair *>(ligatures);
1859 const UCS2Pair *r = std::lower_bound(data, data + length, ushort(u1));
1860 if (r != data + length && r->u1 == ushort(u1))
1861 return r->u2;
1862 }
1863
1864 return 0;
1865 }
1866
// Composes *str in place, starting at index 'from': a code point that
// ligatureHelper() can combine with the most recent starter (a code point
// with combining class 0) is merged into that starter and removed from the
// string. Code points whose Unicode version is newer than 'version' are
// never composed and reset the starter tracking.
static void composeHelper(QString *str, QChar::UnicodeVersion version, int from)
{
    QString &s = *str;

    // at least two code units are needed for anything to compose
    if (from < 0 || s.length() - from < 2)
        return;

    uint stcode = 0; // starter code point
    int starter = -1; // starter position
    int next = -1; // to prevent i == next
    int lastCombining = 255; // to prevent combining > lastCombining

    int pos = from;
    while (pos < s.length()) {
        // i is where the current code point starts; pos ends up on its last
        // code unit if it is a surrogate pair
        int i = pos;
        uint uc = s.at(pos).unicode();
        if (QChar(uc).isHighSurrogate() && pos < s.length()-1) {
            ushort low = s.at(pos+1).unicode();
            if (QChar(low).isLowSurrogate()) {
                uc = QChar::surrogateToUcs4(uc, low);
                ++pos;
            }
        }

        const QUnicodeTables::Properties *p = qGetProp(uc);
        if (p->unicodeVersion > version) {
            // too new for the requested version: forget the current starter
            starter = -1;
            next = -1; // to prevent i == next
            lastCombining = 255; // to prevent combining > lastCombining
            ++pos;
            continue;
        }

        int combining = p->combiningClass;
        // Try to compose with the starter only if this code point directly
        // follows it (i == next) or its combining class is higher than that
        // of the previous code point.
        if ((i == next || combining > lastCombining) && starter >= from) {
            // allowed to form ligature with S
            uint ligature = ligatureHelper(stcode, uc);
            if (ligature) {
                stcode = ligature;
                QChar *d = s.data();
                // ligatureHelper() never changes planes
                if (QChar::requiresSurrogates(ligature)) {
                    d[starter] = QChar(QChar::highSurrogate(ligature));
                    d[starter + 1] = QChar(QChar::lowSurrogate(ligature));
                    // NOTE(review): uc is taken to occupy two code units
                    // here. If it was read as a surrogate pair, pos is
                    // already i + 1 and is not reset after the removal --
                    // confirm that resuming at i + 1 is intended.
                    s.remove(i, 2);
                } else {
                    d[starter] = QChar(ligature);
                    s.remove(i, 1);
                }
                // lastCombining is deliberately left unchanged
                continue;
            }
        }
        if (combining == 0) {
            // new starter; the code point right after it may compose with it
            starter = i;
            stcode = uc;
            next = pos + 1;
        }
        lastCombining = combining;

        ++pos;
    }
}
1929
1930
// Reorders, in place, the code points of *str from index 'from' onwards so
// that adjacent code points with non-zero combining classes are in
// non-decreasing combining-class order. A code point whose Unicode version
// is newer than 'version' is treated as having combining class 0.
static void canonicalOrderHelper(QString *str, QChar::UnicodeVersion version, int from)
{
    QString &s = *str;
    // index of the last code unit
    const int l = s.length()-1;

    // (u1, c1): first code point of the pair being compared and its class
    // (u2, c2): the code point following it and its class
    uint u1, u2;
    ushort c1, c2;

    int pos = from;
    while (pos < l) {
        // pos is where u1 starts, p2 is where u2 starts
        int p2 = pos+1;
        u1 = s.at(pos).unicode();
        if (QChar(u1).isHighSurrogate()) {
            ushort low = s.at(p2).unicode();
            if (QChar(low).isLowSurrogate()) {
                u1 = QChar::surrogateToUcs4(u1, low);
                // the pair ends the string: nothing left to compare with
                if (p2 >= l)
                    break;
                ++p2;
            }
        }
        // 0 means "not looked up yet"; it is fetched below only if needed
        c1 = 0;

    advance:
        u2 = s.at(p2).unicode();
        if (QChar(u2).isHighSurrogate() && p2 < l) {
            ushort low = s.at(p2+1).unicode();
            if (QChar(low).isLowSurrogate()) {
                u2 = QChar::surrogateToUcs4(u2, low);
                // p2 now points at the last code unit of u2
                ++p2;
            }
        }

        c2 = 0;
        {
            const QUnicodeTables::Properties *p = qGetProp(u2);
            if (p->unicodeVersion <= version)
                c2 = p->combiningClass;
        }
        if (c2 == 0) {
            // u2 is never moved; restart the scan right after it
            pos = p2+1;
            continue;
        }

        if (c1 == 0) {
            const QUnicodeTables::Properties *p = qGetProp(u1);
            if (p->unicodeVersion <= version)
                c1 = p->combiningClass;
        }

        if (c1 > c2) {
            QChar *uc = s.data();
            int p = pos;
            // exchange characters
            if (!QChar::requiresSurrogates(u2)) {
                uc[p++] = QChar(u2);
            } else {
                uc[p++] = QChar(QChar::highSurrogate(u2));
                uc[p++] = QChar(QChar::lowSurrogate(u2));
            }
            if (!QChar::requiresSurrogates(u1)) {
                uc[p++] = QChar(u1);
            } else {
                uc[p++] = QChar(QChar::highSurrogate(u1));
                uc[p++] = QChar(QChar::lowSurrogate(u1));
            }
            // step back one code point so the swapped-in character is
            // compared against its new predecessor on the next iteration
            if (pos > 0)
                --pos;
            if (pos > 0 && s.at(pos).isLowSurrogate())
                --pos;
        } else {
            // already in order: u2 becomes the new u1, reusing its class
            ++pos;
            if (QChar::requiresSurrogates(u1))
                ++pos;

            u1 = u2;
            c1 = c2; // != 0
            p2 = pos + 1;
            if (QChar::requiresSurrogates(u1))
                ++p2;
            if (p2 > l)
                break;

            goto advance;
        }
    }
}
2018
2019 // returns true if the text is in a desired Normalization Form already; false otherwise.
2020 // sets lastStable to the position of the last stable code point
normalizationQuickCheckHelper(QString * str,QString::NormalizationForm mode,int from,int * lastStable)2021 static bool normalizationQuickCheckHelper(QString *str, QString::NormalizationForm mode, int from, int *lastStable)
2022 {
2023 Q_STATIC_ASSERT(QString::NormalizationForm_D == 0);
2024 Q_STATIC_ASSERT(QString::NormalizationForm_C == 1);
2025 Q_STATIC_ASSERT(QString::NormalizationForm_KD == 2);
2026 Q_STATIC_ASSERT(QString::NormalizationForm_KC == 3);
2027
2028 enum { NFQC_YES = 0, NFQC_NO = 1, NFQC_MAYBE = 3 };
2029
2030 const ushort *string = reinterpret_cast<const ushort *>(str->constData());
2031 int length = str->length();
2032
2033 // this avoids one out of bounds check in the loop
2034 while (length > from && QChar::isHighSurrogate(string[length - 1]))
2035 --length;
2036
2037 uchar lastCombining = 0;
2038 for (int i = from; i < length; ++i) {
2039 int pos = i;
2040 uint uc = string[i];
2041 if (uc < 0x80) {
2042 // ASCII characters are stable code points
2043 lastCombining = 0;
2044 *lastStable = pos;
2045 continue;
2046 }
2047
2048 if (QChar::isHighSurrogate(uc)) {
2049 ushort low = string[i + 1];
2050 if (!QChar::isLowSurrogate(low)) {
2051 // treat surrogate like stable code point
2052 lastCombining = 0;
2053 *lastStable = pos;
2054 continue;
2055 }
2056 ++i;
2057 uc = QChar::surrogateToUcs4(uc, low);
2058 }
2059
2060 const QUnicodeTables::Properties *p = qGetProp(uc);
2061
2062 if (p->combiningClass < lastCombining && p->combiningClass > 0)
2063 return false;
2064
2065 const uchar check = (p->nfQuickCheck >> (mode << 1)) & 0x03;
2066 if (check != NFQC_YES)
2067 return false; // ### can we quick check NFQC_MAYBE ?
2068
2069 lastCombining = p->combiningClass;
2070 if (lastCombining == 0)
2071 *lastStable = pos;
2072 }
2073
2074 if (length != str->length()) // low surrogate parts at the end of text
2075 *lastStable = str->length() - 1;
2076
2077 return true;
2078 }
2079
2080 QT_END_NAMESPACE
2081