1 /****************************************************************************
2 **
3 ** Copyright (C) 2019 The Qt Company Ltd.
4 ** Contact: https://www.qt.io/licensing/
5 **
6 ** This file is part of the utils of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:GPL-EXCEPT$
9 ** Commercial License Usage
10 ** Licensees holding valid commercial Qt licenses may use this file in
11 ** accordance with the commercial license agreement provided with the
12 ** Software or, alternatively, in accordance with the terms contained in
13 ** a written agreement between you and The Qt Company. For licensing terms
14 ** and conditions see https://www.qt.io/terms-conditions. For further
15 ** information use the contact form at https://www.qt.io/contact-us.
16 **
17 ** GNU General Public License Usage
18 ** Alternatively, this file may be used under the terms of the GNU
19 ** General Public License version 3 as published by the Free Software
20 ** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
21 ** included in the packaging of this file. Please review the following
22 ** information to ensure the GNU General Public License requirements will
23 ** be met: https://www.gnu.org/licenses/gpl-3.0.html.
24 **
25 ** $QT_END_LICENSE$
26 **
27 ****************************************************************************/
28 
29 #include <qlist.h>
30 #include <qhash.h>
31 #include <qfile.h>
32 #include <qbytearray.h>
33 #include <qstring.h>
34 #include <qchar.h>
35 #include <qvector.h>
36 #include <qdebug.h>
37 #if 0
38 #include <private/qunicodetables_p.h>
39 #endif
40 
// Unicode data version as plain "major.minor" text; it equals the newest age
// string registered by initAgeMap().
// NOTE(review): the consumers of these two macros are outside this view.
#define DATA_VERSION_S "13.0"
// Textual spelling of the QChar::UnicodeVersion enumerator for that version.
#define DATA_VERSION_STR "QChar::Unicode_13_0"
43 
44 
45 static QHash<QByteArray, QChar::UnicodeVersion> age_map;
46 
initAgeMap()47 static void initAgeMap()
48 {
49     struct AgeMap {
50         const QChar::UnicodeVersion version;
51         const char *age;
52     } ageMap[] = {
53         { QChar::Unicode_1_1,   "1.1" },
54         { QChar::Unicode_2_0,   "2.0" },
55         { QChar::Unicode_2_1_2, "2.1" },
56         { QChar::Unicode_3_0,   "3.0" },
57         { QChar::Unicode_3_1,   "3.1" },
58         { QChar::Unicode_3_2,   "3.2" },
59         { QChar::Unicode_4_0,   "4.0" },
60         { QChar::Unicode_4_1,   "4.1" },
61         { QChar::Unicode_5_0,   "5.0" },
62         { QChar::Unicode_5_1,   "5.1" },
63         { QChar::Unicode_5_2,   "5.2" },
64         { QChar::Unicode_6_0,   "6.0" },
65         { QChar::Unicode_6_1,   "6.1" },
66         { QChar::Unicode_6_2,   "6.2" },
67         { QChar::Unicode_6_3,   "6.3" },
68         { QChar::Unicode_7_0,   "7.0" },
69         { QChar::Unicode_8_0,   "8.0" },
70         { QChar::Unicode_9_0,   "9.0" },
71         { QChar::Unicode_10_0,   "10.0" },
72         { QChar::Unicode_11_0,   "11.0" },
73         { QChar::Unicode_12_0,   "12.0" },
74         { QChar::Unicode_12_1,   "12.1" }, // UCD Revision 24
75         { QChar::Unicode_13_0,   "13.0" }, // UCD Revision 26
76         { QChar::Unicode_Unassigned, 0 }
77     };
78     AgeMap *d = ageMap;
79     while (d->age) {
80         age_map.insert(d->age, d->version);
81         ++d;
82     }
83 }
84 
85 static QHash<QByteArray, QChar::Category> categoryMap;
86 
initCategoryMap()87 static void initCategoryMap()
88 {
89     struct Cat {
90         QChar::Category cat;
91         const char *name;
92     } categories[] = {
93         { QChar::Mark_NonSpacing,          "Mn" },
94         { QChar::Mark_SpacingCombining,    "Mc" },
95         { QChar::Mark_Enclosing,           "Me" },
96 
97         { QChar::Number_DecimalDigit,      "Nd" },
98         { QChar::Number_Letter,            "Nl" },
99         { QChar::Number_Other,             "No" },
100 
101         { QChar::Separator_Space,          "Zs" },
102         { QChar::Separator_Line,           "Zl" },
103         { QChar::Separator_Paragraph,      "Zp" },
104 
105         { QChar::Other_Control,            "Cc" },
106         { QChar::Other_Format,             "Cf" },
107         { QChar::Other_Surrogate,          "Cs" },
108         { QChar::Other_PrivateUse,         "Co" },
109         { QChar::Other_NotAssigned,        "Cn" },
110 
111         { QChar::Letter_Uppercase,         "Lu" },
112         { QChar::Letter_Lowercase,         "Ll" },
113         { QChar::Letter_Titlecase,         "Lt" },
114         { QChar::Letter_Modifier,          "Lm" },
115         { QChar::Letter_Other,             "Lo" },
116 
117         { QChar::Punctuation_Connector,    "Pc" },
118         { QChar::Punctuation_Dash,         "Pd" },
119         { QChar::Punctuation_Open,         "Ps" },
120         { QChar::Punctuation_Close,        "Pe" },
121         { QChar::Punctuation_InitialQuote, "Pi" },
122         { QChar::Punctuation_FinalQuote,   "Pf" },
123         { QChar::Punctuation_Other,        "Po" },
124 
125         { QChar::Symbol_Math,              "Sm" },
126         { QChar::Symbol_Currency,          "Sc" },
127         { QChar::Symbol_Modifier,          "Sk" },
128         { QChar::Symbol_Other,             "So" },
129         { QChar::Other_NotAssigned, 0 }
130     };
131     Cat *c = categories;
132     while (c->name) {
133         categoryMap.insert(c->name, c->cat);
134         ++c;
135     }
136 }
137 
138 
139 static QHash<QByteArray, QChar::Decomposition> decompositionMap;
140 
initDecompositionMap()141 static void initDecompositionMap()
142 {
143     struct Dec {
144         QChar::Decomposition dec;
145         const char *name;
146     } decompositions[] = {
147         { QChar::Canonical, "<canonical>" },
148         { QChar::Font, "<font>" },
149         { QChar::NoBreak, "<noBreak>" },
150         { QChar::Initial, "<initial>" },
151         { QChar::Medial, "<medial>" },
152         { QChar::Final, "<final>" },
153         { QChar::Isolated, "<isolated>" },
154         { QChar::Circle, "<circle>" },
155         { QChar::Super, "<super>" },
156         { QChar::Sub, "<sub>" },
157         { QChar::Vertical, "<vertical>" },
158         { QChar::Wide, "<wide>" },
159         { QChar::Narrow, "<narrow>" },
160         { QChar::Small, "<small>" },
161         { QChar::Square, "<square>" },
162         { QChar::Compat, "<compat>" },
163         { QChar::Fraction, "<fraction>" },
164         { QChar::NoDecomposition, 0 }
165     };
166     Dec *d = decompositions;
167     while (d->name) {
168         decompositionMap.insert(d->name, d->dec);
169         ++d;
170     }
171 }
172 
173 
174 enum Direction {
175     DirL = QChar::DirL,
176     DirR = QChar::DirR,
177     DirEN = QChar::DirEN,
178     DirES = QChar::DirES,
179     DirET = QChar::DirET,
180     DirAN = QChar::DirAN,
181     DirCS = QChar::DirCS,
182     DirB = QChar::DirB,
183     DirS = QChar::DirS,
184     DirWS = QChar::DirWS,
185     DirON = QChar::DirON,
186     DirLRE = QChar::DirLRE,
187     DirLRO = QChar::DirLRO,
188     DirAL = QChar::DirAL,
189     DirRLE = QChar::DirRLE,
190     DirRLO = QChar::DirRLO,
191     DirPDF = QChar::DirPDF,
192     DirNSM = QChar::DirNSM,
193     DirBN = QChar::DirBN,
194     DirLRI = QChar::DirLRI,
195     DirRLI = QChar::DirRLI,
196     DirFSI = QChar::DirFSI,
197     DirPDI = QChar::DirPDI,
198 
199     Dir_Unassigned
200 };
201 
202 static QHash<QByteArray, Direction> directionMap;
203 
initDirectionMap()204 static void initDirectionMap()
205 {
206     struct Dir {
207         Direction dir;
208         const char *name;
209     } directions[] = {
210         { DirL, "L" },
211         { DirR, "R" },
212         { DirEN, "EN" },
213         { DirES, "ES" },
214         { DirET, "ET" },
215         { DirAN, "AN" },
216         { DirCS, "CS" },
217         { DirB, "B" },
218         { DirS, "S" },
219         { DirWS, "WS" },
220         { DirON, "ON" },
221         { DirLRE, "LRE" },
222         { DirLRO, "LRO" },
223         { DirAL, "AL" },
224         { DirRLE, "RLE" },
225         { DirRLO, "RLO" },
226         { DirPDF, "PDF" },
227         { DirNSM, "NSM" },
228         { DirBN, "BN" },
229         { DirLRI, "LRI" },
230         { DirRLI, "RLI" },
231         { DirFSI, "FSI" },
232         { DirPDI, "PDI" },
233         { Dir_Unassigned, 0 }
234     };
235     Dir *d = directions;
236     while (d->name) {
237         directionMap.insert(d->name, d->dir);
238         ++d;
239     }
240 }
241 
242 
// Joining types used by this tool. The numeric values are spelled out
// explicitly; Joining_Unassigned comes after the last real type.
enum JoiningType {
    Joining_None        = 0,
    Joining_Causing     = 1,
    Joining_Dual        = 2,
    Joining_Right       = 3,
    Joining_Left        = 4,
    Joining_Transparent = 5,

    Joining_Unassigned  = 6
};
253 
254 static QHash<QByteArray, JoiningType> joining_map;
255 
initJoiningMap()256 static void initJoiningMap()
257 {
258     struct JoiningList {
259         JoiningType joining;
260         const char *name;
261     } joinings[] = {
262         { Joining_None,        "U" },
263         { Joining_Causing,     "C" },
264         { Joining_Dual,        "D" },
265         { Joining_Right,       "R" },
266         { Joining_Left,        "L" },
267         { Joining_Transparent, "T" },
268         { Joining_Unassigned, 0 }
269     };
270     JoiningList *d = joinings;
271     while (d->name) {
272         joining_map.insert(d->name, d->joining);
273         ++d;
274     }
275 }
276 
277 
// Source text of an enum GraphemeBreakClass declaration, kept as a string.
// The text ends with a closing brace line followed by one empty line.
static const char *grapheme_break_class_string = R"(enum GraphemeBreakClass {
    GraphemeBreak_Any,
    GraphemeBreak_CR,
    GraphemeBreak_LF,
    GraphemeBreak_Control,
    GraphemeBreak_Extend,
    GraphemeBreak_ZWJ,
    GraphemeBreak_RegionalIndicator,
    GraphemeBreak_Prepend,
    GraphemeBreak_SpacingMark,
    GraphemeBreak_L,
    GraphemeBreak_V,
    GraphemeBreak_T,
    GraphemeBreak_LV,
    GraphemeBreak_LVT,
    Graphemebreak_E_Base,
    Graphemebreak_E_Modifier,
    Graphemebreak_Glue_After_Zwj,
    Graphemebreak_E_Base_GAZ,

    NumGraphemeBreakClasses
};

)";
301 
// Grapheme break classes used by this tool, numbered consecutively from 0.
// GraphemeBreak_Unassigned follows the last real class.
enum GraphemeBreakClass {
    GraphemeBreak_Any               = 0,
    GraphemeBreak_CR                = 1,
    GraphemeBreak_LF                = 2,
    GraphemeBreak_Control           = 3,
    GraphemeBreak_Extend            = 4,
    GraphemeBreak_ZWJ               = 5,
    GraphemeBreak_RegionalIndicator = 6,
    GraphemeBreak_Prepend           = 7,
    GraphemeBreak_SpacingMark       = 8,
    GraphemeBreak_L                 = 9,
    GraphemeBreak_V                 = 10,
    GraphemeBreak_T                 = 11,
    GraphemeBreak_LV                = 12,
    GraphemeBreak_LVT               = 13,
    Graphemebreak_E_Base            = 14,
    Graphemebreak_E_Modifier        = 15,
    Graphemebreak_Glue_After_Zwj    = 16,
    Graphemebreak_E_Base_GAZ        = 17,

    GraphemeBreak_Unassigned        = 18
};
324 
325 static QHash<QByteArray, GraphemeBreakClass> grapheme_break_map;
326 
initGraphemeBreak()327 static void initGraphemeBreak()
328 {
329     struct GraphemeBreakList {
330         GraphemeBreakClass brk;
331         const char *name;
332     } breaks[] = {
333         { GraphemeBreak_Any, "Any" },
334         { GraphemeBreak_CR, "CR" },
335         { GraphemeBreak_LF, "LF" },
336         { GraphemeBreak_Control, "Control" },
337         { GraphemeBreak_Extend, "Extend" },
338         { GraphemeBreak_ZWJ, "ZWJ" },
339         { GraphemeBreak_RegionalIndicator, "Regional_Indicator" },
340         { GraphemeBreak_Prepend, "Prepend" },
341         { GraphemeBreak_SpacingMark, "SpacingMark" },
342         { GraphemeBreak_L, "L" },
343         { GraphemeBreak_V, "V" },
344         { GraphemeBreak_T, "T" },
345         { GraphemeBreak_LV, "LV" },
346         { GraphemeBreak_LVT, "LVT" },
347         { Graphemebreak_E_Base, "E_Base" },
348         { Graphemebreak_E_Modifier, "E_Modifier" },
349         { Graphemebreak_Glue_After_Zwj, "Glue_After_Zwj" },
350         { Graphemebreak_E_Base_GAZ, "E_Base_GAZ" },
351         { GraphemeBreak_Unassigned, 0 }
352     };
353     GraphemeBreakList *d = breaks;
354     while (d->name) {
355         grapheme_break_map.insert(d->name, d->brk);
356         ++d;
357     }
358 }
359 
360 
// Source text of an enum WordBreakClass declaration, kept as a string.
// The text ends with a closing brace line followed by one empty line.
static const char *word_break_class_string = R"(enum WordBreakClass {
    WordBreak_Any,
    WordBreak_CR,
    WordBreak_LF,
    WordBreak_Newline,
    WordBreak_Extend,
    WordBreak_ZWJ,
    WordBreak_Format,
    WordBreak_RegionalIndicator,
    WordBreak_Katakana,
    WordBreak_HebrewLetter,
    WordBreak_ALetter,
    WordBreak_SingleQuote,
    WordBreak_DoubleQuote,
    WordBreak_MidNumLet,
    WordBreak_MidLetter,
    WordBreak_MidNum,
    WordBreak_Numeric,
    WordBreak_ExtendNumLet,
    WordBreak_E_Base,
    WordBreak_E_Modifier,
    WordBreak_Glue_After_Zwj,
    WordBreak_E_Base_GAZ,
    WordBreak_WSegSpace,

    NumWordBreakClasses
};

)";
389 
// Word break classes used by this tool, numbered consecutively from 0.
// WordBreak_Unassigned follows the last real class.
enum WordBreakClass {
    WordBreak_Any               = 0,
    WordBreak_CR                = 1,
    WordBreak_LF                = 2,
    WordBreak_Newline           = 3,
    WordBreak_Extend            = 4,
    WordBreak_ZWJ               = 5,
    WordBreak_Format            = 6,
    WordBreak_RegionalIndicator = 7,
    WordBreak_Katakana          = 8,
    WordBreak_HebrewLetter      = 9,
    WordBreak_ALetter           = 10,
    WordBreak_SingleQuote       = 11,
    WordBreak_DoubleQuote       = 12,
    WordBreak_MidNumLet         = 13,
    WordBreak_MidLetter         = 14,
    WordBreak_MidNum            = 15,
    WordBreak_Numeric           = 16,
    WordBreak_ExtendNumLet      = 17,
    WordBreak_E_Base            = 18,
    WordBreak_E_Modifier        = 19,
    WordBreak_Glue_After_Zwj    = 20,
    WordBreak_E_Base_GAZ        = 21,
    WordBreak_WSegSpace         = 22,

    WordBreak_Unassigned        = 23
};
417 
418 static QHash<QByteArray, WordBreakClass> word_break_map;
419 
initWordBreak()420 static void initWordBreak()
421 {
422     struct WordBreakList {
423         WordBreakClass brk;
424         const char *name;
425     } breaks[] = {
426         { WordBreak_Any, "Any" },
427         { WordBreak_CR, "CR" },
428         { WordBreak_LF, "LF" },
429         { WordBreak_Newline, "Newline" },
430         { WordBreak_Extend, "Extend" },
431         { WordBreak_ZWJ, "ZWJ" },
432         { WordBreak_Format, "Format" },
433         { WordBreak_RegionalIndicator, "Regional_Indicator" },
434         { WordBreak_Katakana, "Katakana" },
435         { WordBreak_HebrewLetter, "Hebrew_Letter" },
436         { WordBreak_ALetter, "ALetter" },
437         { WordBreak_SingleQuote, "Single_Quote" },
438         { WordBreak_DoubleQuote, "Double_Quote" },
439         { WordBreak_MidNumLet, "MidNumLet" },
440         { WordBreak_MidLetter, "MidLetter" },
441         { WordBreak_MidNum, "MidNum" },
442         { WordBreak_Numeric, "Numeric" },
443         { WordBreak_ExtendNumLet, "ExtendNumLet" },
444         { WordBreak_E_Base, "E_Base" },
445         { WordBreak_E_Modifier, "E_Modifier" },
446         { WordBreak_Glue_After_Zwj, "Glue_After_Zwj" },
447         { WordBreak_E_Base_GAZ, "E_Base_GAZ" },
448         { WordBreak_WSegSpace, "WSegSpace" },
449         { WordBreak_Unassigned, 0 }
450     };
451     WordBreakList *d = breaks;
452     while (d->name) {
453         word_break_map.insert(d->name, d->brk);
454         ++d;
455     }
456 }
457 
458 
// Source text of an enum SentenceBreakClass declaration, kept as a string.
// The text ends with a closing brace line followed by one empty line.
static const char *sentence_break_class_string = R"(enum SentenceBreakClass {
    SentenceBreak_Any,
    SentenceBreak_CR,
    SentenceBreak_LF,
    SentenceBreak_Sep,
    SentenceBreak_Extend,
    SentenceBreak_Sp,
    SentenceBreak_Lower,
    SentenceBreak_Upper,
    SentenceBreak_OLetter,
    SentenceBreak_Numeric,
    SentenceBreak_ATerm,
    SentenceBreak_SContinue,
    SentenceBreak_STerm,
    SentenceBreak_Close,

    NumSentenceBreakClasses
};

)";
478 
// Sentence break classes used by this tool, numbered consecutively from 0.
// SentenceBreak_Unassigned follows the last real class.
enum SentenceBreakClass {
    SentenceBreak_Any        = 0,
    SentenceBreak_CR         = 1,
    SentenceBreak_LF         = 2,
    SentenceBreak_Sep        = 3,
    SentenceBreak_Extend     = 4,
    SentenceBreak_Sp         = 5,
    SentenceBreak_Lower      = 6,
    SentenceBreak_Upper      = 7,
    SentenceBreak_OLetter    = 8,
    SentenceBreak_Numeric    = 9,
    SentenceBreak_ATerm      = 10,
    SentenceBreak_SContinue  = 11,
    SentenceBreak_STerm      = 12,
    SentenceBreak_Close      = 13,

    SentenceBreak_Unassigned = 14
};
497 
498 static QHash<QByteArray, SentenceBreakClass> sentence_break_map;
499 
initSentenceBreak()500 static void initSentenceBreak()
501 {
502     struct SentenceBreakList {
503         SentenceBreakClass brk;
504         const char *name;
505     } breaks[] = {
506         { SentenceBreak_Any, "Any" },
507         { SentenceBreak_CR, "CR" },
508         { SentenceBreak_LF, "LF" },
509         { SentenceBreak_Sep, "Sep" },
510         { SentenceBreak_Extend, "Extend" },
511         { SentenceBreak_Extend, "Format" },
512         { SentenceBreak_Sp, "Sp" },
513         { SentenceBreak_Lower, "Lower" },
514         { SentenceBreak_Upper, "Upper" },
515         { SentenceBreak_OLetter, "OLetter" },
516         { SentenceBreak_Numeric, "Numeric" },
517         { SentenceBreak_ATerm, "ATerm" },
518         { SentenceBreak_SContinue, "SContinue" },
519         { SentenceBreak_STerm, "STerm" },
520         { SentenceBreak_Close, "Close" },
521         { SentenceBreak_Unassigned, 0 }
522     };
523     SentenceBreakList *d = breaks;
524     while (d->name) {
525         sentence_break_map.insert(d->name, d->brk);
526         ++d;
527     }
528 }
529 
530 
// Source text of an enum LineBreakClass declaration (preceded by two comment
// lines), kept as a string. The text ends with a closing brace line followed
// by one empty line.
static const char *line_break_class_string = R"(// see http://www.unicode.org/reports/tr14/tr14-30.html
// we don't use the XX and AI classes and map them to AL instead.
enum LineBreakClass {
    LineBreak_OP, LineBreak_CL, LineBreak_CP, LineBreak_QU, LineBreak_GL,
    LineBreak_NS, LineBreak_EX, LineBreak_SY, LineBreak_IS, LineBreak_PR,
    LineBreak_PO, LineBreak_NU, LineBreak_AL, LineBreak_HL, LineBreak_ID,
    LineBreak_IN, LineBreak_HY, LineBreak_BA, LineBreak_BB, LineBreak_B2,
    LineBreak_ZW, LineBreak_CM, LineBreak_WJ, LineBreak_H2, LineBreak_H3,
    LineBreak_JL, LineBreak_JV, LineBreak_JT, LineBreak_RI, LineBreak_CB,
    LineBreak_EB, LineBreak_EM, LineBreak_ZWJ,
    LineBreak_SA, LineBreak_SG, LineBreak_SP,
    LineBreak_CR, LineBreak_LF, LineBreak_BK,

    NumLineBreakClasses
};

)";
547 
// Line break classes used by this tool, one enumerator per line and numbered
// implicitly from 0. LineBreak_Unassigned follows the last real class.
enum LineBreakClass {
    LineBreak_OP,
    LineBreak_CL,
    LineBreak_CP,
    LineBreak_QU,
    LineBreak_GL,
    LineBreak_NS,
    LineBreak_EX,
    LineBreak_SY,
    LineBreak_IS,
    LineBreak_PR,
    LineBreak_PO,
    LineBreak_NU,
    LineBreak_AL,
    LineBreak_HL,
    LineBreak_ID,
    LineBreak_IN,
    LineBreak_HY,
    LineBreak_BA,
    LineBreak_BB,
    LineBreak_B2,
    LineBreak_ZW,
    LineBreak_CM,
    LineBreak_WJ,
    LineBreak_H2,
    LineBreak_H3,
    LineBreak_JL,
    LineBreak_JV,
    LineBreak_JT,
    LineBreak_RI,
    LineBreak_CB,
    LineBreak_EB,
    LineBreak_EM,
    LineBreak_ZWJ,
    LineBreak_SA,
    LineBreak_SG,
    LineBreak_SP,
    LineBreak_CR,
    LineBreak_LF,
    LineBreak_BK,

    LineBreak_Unassigned
};
561 
562 static QHash<QByteArray, LineBreakClass> line_break_map;
563 
initLineBreak()564 static void initLineBreak()
565 {
566     // ### Classes XX and AI are left out and mapped to AL for now.
567     // ### Class NL is mapped to BK.
568     // ### Treating characters of class CJ as class NS will give CSS strict line breaking;
569     //     treating them as class ID will give CSS normal breaking.
570     struct LineBreakList {
571         LineBreakClass brk;
572         const char *name;
573     } breaks[] = {
574         { LineBreak_BK, "BK" },
575         { LineBreak_CR, "CR" },
576         { LineBreak_LF, "LF" },
577         { LineBreak_CM, "CM" },
578         { LineBreak_BK, "NL" },
579         { LineBreak_SG, "SG" },
580         { LineBreak_WJ, "WJ" },
581         { LineBreak_ZW, "ZW" },
582         { LineBreak_GL, "GL" },
583         { LineBreak_SP, "SP" },
584         { LineBreak_B2, "B2" },
585         { LineBreak_BA, "BA" },
586         { LineBreak_BB, "BB" },
587         { LineBreak_HY, "HY" },
588         { LineBreak_CB, "CB" },
589         { LineBreak_NS, "CJ" },
590         { LineBreak_CL, "CL" },
591         { LineBreak_CP, "CP" },
592         { LineBreak_EX, "EX" },
593         { LineBreak_IN, "IN" },
594         { LineBreak_NS, "NS" },
595         { LineBreak_OP, "OP" },
596         { LineBreak_QU, "QU" },
597         { LineBreak_IS, "IS" },
598         { LineBreak_NU, "NU" },
599         { LineBreak_PO, "PO" },
600         { LineBreak_PR, "PR" },
601         { LineBreak_SY, "SY" },
602         { LineBreak_AL, "AI" },
603         { LineBreak_AL, "AL" },
604         { LineBreak_HL, "HL" },
605         { LineBreak_H2, "H2" },
606         { LineBreak_H3, "H3" },
607         { LineBreak_ID, "ID" },
608         { LineBreak_JL, "JL" },
609         { LineBreak_JV, "JV" },
610         { LineBreak_JT, "JT" },
611         { LineBreak_RI, "RI" },
612         { LineBreak_SA, "SA" },
613         { LineBreak_AL, "XX" },
614         { LineBreak_EB, "EB" },
615         { LineBreak_EM, "EM" },
616         { LineBreak_ZWJ, "ZWJ" },
617         { LineBreak_Unassigned, 0 }
618     };
619     LineBreakList *d = breaks;
620     while (d->name) {
621         line_break_map.insert(d->name, d->brk);
622         ++d;
623     }
624 }
625 
626 
627 static QHash<QByteArray, QChar::Script> scriptMap;
628 
initScriptMap()629 static void initScriptMap()
630 {
631     struct Scrpt {
632         QChar::Script script;
633         const char *name;
634     } scripts[] = {
635         // general
636         { QChar::Script_Unknown,                "Unknown" },
637         { QChar::Script_Inherited,              "Inherited" },
638         { QChar::Script_Common,                 "Common" },
639         // pre-4.0
640         { QChar::Script_Latin,                  "Latin" },
641         { QChar::Script_Greek,                  "Greek" },
642         { QChar::Script_Cyrillic,               "Cyrillic" },
643         { QChar::Script_Armenian,               "Armenian" },
644         { QChar::Script_Hebrew,                 "Hebrew" },
645         { QChar::Script_Arabic,                 "Arabic" },
646         { QChar::Script_Syriac,                 "Syriac" },
647         { QChar::Script_Thaana,                 "Thaana" },
648         { QChar::Script_Devanagari,             "Devanagari" },
649         { QChar::Script_Bengali,                "Bengali" },
650         { QChar::Script_Gurmukhi,               "Gurmukhi" },
651         { QChar::Script_Gujarati,               "Gujarati" },
652         { QChar::Script_Oriya,                  "Oriya" },
653         { QChar::Script_Tamil,                  "Tamil" },
654         { QChar::Script_Telugu,                 "Telugu" },
655         { QChar::Script_Kannada,                "Kannada" },
656         { QChar::Script_Malayalam,              "Malayalam" },
657         { QChar::Script_Sinhala,                "Sinhala" },
658         { QChar::Script_Thai,                   "Thai" },
659         { QChar::Script_Lao,                    "Lao" },
660         { QChar::Script_Tibetan,                "Tibetan" },
661         { QChar::Script_Myanmar,                "Myanmar" },
662         { QChar::Script_Georgian,               "Georgian" },
663         { QChar::Script_Hangul,                 "Hangul" },
664         { QChar::Script_Ethiopic,               "Ethiopic" },
665         { QChar::Script_Cherokee,               "Cherokee" },
666         { QChar::Script_CanadianAboriginal,     "CanadianAboriginal" },
667         { QChar::Script_Ogham,                  "Ogham" },
668         { QChar::Script_Runic,                  "Runic" },
669         { QChar::Script_Khmer,                  "Khmer" },
670         { QChar::Script_Mongolian,              "Mongolian" },
671         { QChar::Script_Hiragana,               "Hiragana" },
672         { QChar::Script_Katakana,               "Katakana" },
673         { QChar::Script_Bopomofo,               "Bopomofo" },
674         { QChar::Script_Han,                    "Han" },
675         { QChar::Script_Yi,                     "Yi" },
676         { QChar::Script_OldItalic,              "OldItalic" },
677         { QChar::Script_Gothic,                 "Gothic" },
678         { QChar::Script_Deseret,                "Deseret" },
679         { QChar::Script_Tagalog,                "Tagalog" },
680         { QChar::Script_Hanunoo,                "Hanunoo" },
681         { QChar::Script_Buhid,                  "Buhid" },
682         { QChar::Script_Tagbanwa,               "Tagbanwa" },
683         { QChar::Script_Coptic,                 "Coptic" },
684         // 4.0
685         { QChar::Script_Limbu,                  "Limbu" },
686         { QChar::Script_TaiLe,                  "TaiLe" },
687         { QChar::Script_LinearB,                "LinearB" },
688         { QChar::Script_Ugaritic,               "Ugaritic" },
689         { QChar::Script_Shavian,                "Shavian" },
690         { QChar::Script_Osmanya,                "Osmanya" },
691         { QChar::Script_Cypriot,                "Cypriot" },
692         { QChar::Script_Braille,                "Braille" },
693         // 4.1
694         { QChar::Script_Buginese,               "Buginese" },
695         { QChar::Script_NewTaiLue,              "NewTaiLue" },
696         { QChar::Script_Glagolitic,             "Glagolitic" },
697         { QChar::Script_Tifinagh,               "Tifinagh" },
698         { QChar::Script_SylotiNagri,            "SylotiNagri" },
699         { QChar::Script_OldPersian,             "OldPersian" },
700         { QChar::Script_Kharoshthi,             "Kharoshthi" },
701         // 5.0
702         { QChar::Script_Balinese,               "Balinese" },
703         { QChar::Script_Cuneiform,              "Cuneiform" },
704         { QChar::Script_Phoenician,             "Phoenician" },
705         { QChar::Script_PhagsPa,                "PhagsPa" },
706         { QChar::Script_Nko,                    "Nko" },
707         // 5.1
708         { QChar::Script_Sundanese,              "Sundanese" },
709         { QChar::Script_Lepcha,                 "Lepcha" },
710         { QChar::Script_OlChiki,                "OlChiki" },
711         { QChar::Script_Vai,                    "Vai" },
712         { QChar::Script_Saurashtra,             "Saurashtra" },
713         { QChar::Script_KayahLi,                "KayahLi" },
714         { QChar::Script_Rejang,                 "Rejang" },
715         { QChar::Script_Lycian,                 "Lycian" },
716         { QChar::Script_Carian,                 "Carian" },
717         { QChar::Script_Lydian,                 "Lydian" },
718         { QChar::Script_Cham,                   "Cham" },
719         // 5.2
720         { QChar::Script_TaiTham,                "TaiTham" },
721         { QChar::Script_TaiViet,                "TaiViet" },
722         { QChar::Script_Avestan,                "Avestan" },
723         { QChar::Script_EgyptianHieroglyphs,    "EgyptianHieroglyphs" },
724         { QChar::Script_Samaritan,              "Samaritan" },
725         { QChar::Script_Lisu,                   "Lisu" },
726         { QChar::Script_Bamum,                  "Bamum" },
727         { QChar::Script_Javanese,               "Javanese" },
728         { QChar::Script_MeeteiMayek,            "MeeteiMayek" },
729         { QChar::Script_ImperialAramaic,        "ImperialAramaic" },
730         { QChar::Script_OldSouthArabian,        "OldSouthArabian" },
731         { QChar::Script_InscriptionalParthian,  "InscriptionalParthian" },
732         { QChar::Script_InscriptionalPahlavi,   "InscriptionalPahlavi" },
733         { QChar::Script_OldTurkic,              "OldTurkic" },
734         { QChar::Script_Kaithi,                 "Kaithi" },
735         // 6.0
736         { QChar::Script_Batak,                  "Batak" },
737         { QChar::Script_Brahmi,                 "Brahmi" },
738         { QChar::Script_Mandaic,                "Mandaic" },
739         // 6.1
740         { QChar::Script_Chakma,                 "Chakma" },
741         { QChar::Script_MeroiticCursive,        "MeroiticCursive" },
742         { QChar::Script_MeroiticHieroglyphs,    "MeroiticHieroglyphs" },
743         { QChar::Script_Miao,                   "Miao" },
744         { QChar::Script_Sharada,                "Sharada" },
745         { QChar::Script_SoraSompeng,            "SoraSompeng" },
746         { QChar::Script_Takri,                  "Takri" },
747         // 7.0
748         { QChar::Script_CaucasianAlbanian,      "CaucasianAlbanian" },
749         { QChar::Script_BassaVah,               "BassaVah" },
750         { QChar::Script_Duployan,               "Duployan" },
751         { QChar::Script_Elbasan,                "Elbasan" },
752         { QChar::Script_Grantha,                "Grantha" },
753         { QChar::Script_PahawhHmong,            "PahawhHmong" },
754         { QChar::Script_Khojki,                 "Khojki" },
755         { QChar::Script_LinearA,                "LinearA" },
756         { QChar::Script_Mahajani,               "Mahajani" },
757         { QChar::Script_Manichaean,             "Manichaean" },
758         { QChar::Script_MendeKikakui,           "MendeKikakui" },
759         { QChar::Script_Modi,                   "Modi" },
760         { QChar::Script_Mro,                    "Mro" },
761         { QChar::Script_OldNorthArabian,        "OldNorthArabian" },
762         { QChar::Script_Nabataean,              "Nabataean" },
763         { QChar::Script_Palmyrene,              "Palmyrene" },
764         { QChar::Script_PauCinHau,              "PauCinHau" },
765         { QChar::Script_OldPermic,              "OldPermic" },
766         { QChar::Script_PsalterPahlavi,         "PsalterPahlavi" },
767         { QChar::Script_Siddham,                "Siddham" },
768         { QChar::Script_Khudawadi,              "Khudawadi" },
769         { QChar::Script_Tirhuta,                "Tirhuta" },
770         { QChar::Script_WarangCiti,             "WarangCiti" },
771         // 8.0
772         { QChar::Script_Ahom,                   "Ahom" },
773         { QChar::Script_AnatolianHieroglyphs,   "AnatolianHieroglyphs" },
774         { QChar::Script_Hatran,                 "Hatran" },
775         { QChar::Script_Multani,                "Multani" },
776         { QChar::Script_OldHungarian,           "OldHungarian" },
777         { QChar::Script_SignWriting,            "SignWriting" },
778         // 9.0
779         { QChar::Script_Adlam,                  "Adlam" },
780         { QChar::Script_Bhaiksuki,              "Bhaiksuki" },
781         { QChar::Script_Marchen,                "Marchen" },
782         { QChar::Script_Newa,                   "Newa" },
783         { QChar::Script_Osage,                  "Osage" },
784         { QChar::Script_Tangut,                 "Tangut" },
785         // 10.0
786         { QChar::Script_MasaramGondi,           "MasaramGondi" },
787         { QChar::Script_Nushu,                  "Nushu" },
788         { QChar::Script_Soyombo,                "Soyombo" },
789         { QChar::Script_ZanabazarSquare,        "ZanabazarSquare" },
790         // 12.1
791         { QChar::Script_Dogra,                  "Dogra" },
792         { QChar::Script_GunjalaGondi,           "GunjalaGondi" },
793         { QChar::Script_HanifiRohingya,         "HanifiRohingya" },
794         { QChar::Script_Makasar,                "Makasar" },
795         { QChar::Script_Medefaidrin,            "Medefaidrin" },
796         { QChar::Script_OldSogdian,             "OldSogdian" },
797         { QChar::Script_Sogdian,                "Sogdian" },
798         { QChar::Script_Elymaic,                "Elymaic" },
799         { QChar::Script_Nandinagari,            "Nandinagari" },
800         { QChar::Script_NyiakengPuachueHmong,   "NyiakengPuachueHmong" },
801         { QChar::Script_Wancho,                 "Wancho" },
802         // 13.0
803         { QChar::Script_Chorasmian,             "Chorasmian" },
804         { QChar::Script_DivesAkuru,             "DivesAkuru" },
805         { QChar::Script_KhitanSmallScript,      "KhitanSmallScript" },
806         { QChar::Script_Yezidi,                 "Yezidi" },
807 
808         // unhandled
809         { QChar::Script_Unknown,                0 }
810     };
811     Scrpt *p = scripts;
812     while (p->name) {
813         scriptMap.insert(p->name, p->script);
814         ++p;
815     }
816 }
817 
// Keep this one in sync with the code in createPropertyInfo
//
// C++ source text (kept as one string literal) declaring the Case enum, the
// bit-field Properties struct and the two properties() lookup prototypes.
// NOTE(review): presumably written verbatim into the generated tables header;
// the writer is outside this part of the file - confirm before relying on it.
// SizeOfPropertiesStruct below records the sizeof(Properties) this layout is
// expected to have, so update both together when a field is added or resized.
static const char *property_string =
    "enum Case {\n"
    "    LowerCase,\n"
    "    UpperCase,\n"
    "    TitleCase,\n"
    "    CaseFold,\n"
    "\n"
    "    NumCases\n"
    "};\n"
    "\n"
    "struct Properties {\n"
    "    ushort category            : 8; /* 5 used */\n"
    "    ushort direction           : 8; /* 5 used */\n"
    "    ushort combiningClass      : 8;\n"
    "    ushort joining             : 3;\n"
    "    signed short digitValue    : 5;\n"
    "    signed short mirrorDiff    : 16;\n"
    "    ushort unicodeVersion      : 8; /* 5 used */\n"
    "    ushort nfQuickCheck        : 8;\n" // could be narrowed
    "#ifdef Q_OS_WASM\n"
    "    unsigned char              : 0; //wasm 64 packing trick\n"
    "#endif\n"
    "    struct {\n"
    "        ushort special    : 1;\n"
    "        signed short diff : 15;\n"
    "    } cases[NumCases];\n"
    "#ifdef Q_OS_WASM\n"
    "    unsigned char              : 0; //wasm 64 packing trick\n"
    "#endif\n"
    "    ushort graphemeBreakClass  : 5; /* 5 used */\n"
    "    ushort wordBreakClass      : 5; /* 5 used */\n"
    "    ushort lineBreakClass      : 6; /* 6 used */\n"
    "    ushort sentenceBreakClass  : 8; /* 4 used */\n"
    "    ushort script              : 8;\n"
    "};\n\n"
    "Q_CORE_EXPORT const Properties * QT_FASTCALL properties(uint ucs4) noexcept;\n"
    "Q_CORE_EXPORT const Properties * QT_FASTCALL properties(ushort ucs2) noexcept;\n"
    "\n";
857 
// C++ source text (one string literal) with the prototypes of the exported
// ucs4 lookups for the grapheme, word, sentence and line break classes, each
// followed by an inline QChar overload that forwards ch.unicode() to it.
// NOTE(review): presumably emitted next to property_string into the generated
// header; the code that writes it is outside this part of the file.
static const char *methods =
    "Q_CORE_EXPORT GraphemeBreakClass QT_FASTCALL graphemeBreakClass(uint ucs4) noexcept;\n"
    "inline GraphemeBreakClass graphemeBreakClass(QChar ch) noexcept\n"
    "{ return graphemeBreakClass(ch.unicode()); }\n"
    "\n"
    "Q_CORE_EXPORT WordBreakClass QT_FASTCALL wordBreakClass(uint ucs4) noexcept;\n"
    "inline WordBreakClass wordBreakClass(QChar ch) noexcept\n"
    "{ return wordBreakClass(ch.unicode()); }\n"
    "\n"
    "Q_CORE_EXPORT SentenceBreakClass QT_FASTCALL sentenceBreakClass(uint ucs4) noexcept;\n"
    "inline SentenceBreakClass sentenceBreakClass(QChar ch) noexcept\n"
    "{ return sentenceBreakClass(ch.unicode()); }\n"
    "\n"
    "Q_CORE_EXPORT LineBreakClass QT_FASTCALL lineBreakClass(uint ucs4) noexcept;\n"
    "inline LineBreakClass lineBreakClass(QChar ch) noexcept\n"
    "{ return lineBreakClass(ch.unicode()); }\n"
    "\n";
875 
// Expected sizeof(Properties), in bytes, for the struct declared in property_string.
static const int SizeOfPropertiesStruct = 20;

// Source line containing a Q_STATIC_ASSERT that pins sizeof(Properties) to
// SizeOfPropertiesStruct, followed by a blank line.
static const QByteArray sizeOfPropertiesStructCheck =
        "Q_STATIC_ASSERT(sizeof(Properties) == " + QByteArray::number(SizeOfPropertiesStruct) + ");\n\n";
880 
881 struct PropertyFlags {
operator ==PropertyFlags882     bool operator==(const PropertyFlags &o) const {
883         return (combiningClass == o.combiningClass
884                 && category == o.category
885                 && direction == o.direction
886                 && joining == o.joining
887                 && age == o.age
888                 && digitValue == o.digitValue
889                 && mirrorDiff == o.mirrorDiff
890                 && lowerCaseDiff == o.lowerCaseDiff
891                 && upperCaseDiff == o.upperCaseDiff
892                 && titleCaseDiff == o.titleCaseDiff
893                 && caseFoldDiff == o.caseFoldDiff
894                 && lowerCaseSpecial == o.lowerCaseSpecial
895                 && upperCaseSpecial == o.upperCaseSpecial
896                 && titleCaseSpecial == o.titleCaseSpecial
897                 && caseFoldSpecial == o.caseFoldSpecial
898                 && graphemeBreakClass == o.graphemeBreakClass
899                 && wordBreakClass == o.wordBreakClass
900                 && sentenceBreakClass == o.sentenceBreakClass
901                 && lineBreakClass == o.lineBreakClass
902                 && script == o.script
903                 && nfQuickCheck == o.nfQuickCheck
904             );
905     }
906     // from UnicodeData.txt
907     uchar combiningClass : 8;
908     QChar::Category category : 5;
909     QChar::Direction direction : 5;
910     // from ArabicShaping.txt
911     QChar::JoiningType joining : 3;
912     // from DerivedAge.txt
913     QChar::UnicodeVersion age : 5;
914     int digitValue;
915 
916     int mirrorDiff : 16;
917 
918     int lowerCaseDiff;
919     int upperCaseDiff;
920     int titleCaseDiff;
921     int caseFoldDiff;
922     bool lowerCaseSpecial;
923     bool upperCaseSpecial;
924     bool titleCaseSpecial;
925     bool caseFoldSpecial;
926     GraphemeBreakClass graphemeBreakClass;
927     WordBreakClass wordBreakClass;
928     SentenceBreakClass sentenceBreakClass;
929     LineBreakClass lineBreakClass;
930     int script;
931     // from DerivedNormalizationProps.txt
932     uchar nfQuickCheck;
933 };
934 
935 
936 static QList<int> specialCaseMap;
937 
appendToSpecialCaseMap(const QList<int> & map)938 static int appendToSpecialCaseMap(const QList<int> &map)
939 {
940     QList<int> utf16map;
941     for (int i = 0; i < map.size(); ++i) {
942         uint codepoint = map.at(i);
943         // if the condition below doesn't hold anymore we need to modify our special case mapping code
944         Q_ASSERT(!QChar::requiresSurrogates(codepoint));
945         if (QChar::requiresSurrogates(codepoint)) {
946             utf16map << QChar::highSurrogate(codepoint);
947             utf16map << QChar::lowSurrogate(codepoint);
948         } else {
949             utf16map << codepoint;
950         }
951     }
952     int length = utf16map.size();
953     utf16map.prepend(length);
954 
955     if (specialCaseMap.isEmpty())
956         specialCaseMap << 0; // placeholder
957 
958     int i = 1;
959     while (i < specialCaseMap.size()) {
960         int n = specialCaseMap.at(i);
961         if (n == length) {
962             int j;
963             for (j = 1; j <= n; ++j) {
964                 if (specialCaseMap.at(i+j) != utf16map.at(j))
965                     break;
966             }
967             if (j > n)
968                 return i;
969         }
970         i += n + 1;
971     }
972 
973     int pos = specialCaseMap.size();
974     specialCaseMap << utf16map;
975     return pos;
976 }
977 
// DerivedCoreProperties.txt
// Returns true if ucs4 is one of the Default_Ignorable_Code_Point values
// listed in the table below.
static inline bool isDefaultIgnorable(uint ucs4)
{
    // Default_Ignorable_Code_Point:
    //  Generated from
    //    Other_Default_Ignorable_Code_Point + Cf + Variation_Selector
    //    - White_Space - FFF9..FFFB (Annotation Characters)
    //    - 0600..0604, 06DD, 070F, 110BD (exceptional Cf characters that should be visible)

    // Up to U+00FF the only match is U+00AD.
    if (ucs4 <= 0xff)
        return ucs4 == 0xad;

    // Inclusive ranges; single code points have first == last.
    struct CodePointRange {
        uint first;
        uint last;
    };
    static const CodePointRange ignorable[] = {
        { 0x034f, 0x034f },
        { 0x061c, 0x061c },
        { 0x115f, 0x1160 },
        { 0x17b4, 0x17b5 },
        { 0x180b, 0x180d },
        { 0x180e, 0x180e },
        { 0x200b, 0x200f },
        { 0x202a, 0x202e },
        { 0x2060, 0x206f },
        { 0x3164, 0x3164 },
        { 0xfe00, 0xfe0f },
        { 0xfeff, 0xfeff },
        { 0xffa0, 0xffa0 },
        { 0xfff0, 0xfff8 },
        { 0x1bca0, 0x1bca3 },
        { 0x1d173, 0x1d17a },
        { 0xe0000, 0xe0fff }
    };

    const int count = int(sizeof(ignorable) / sizeof(ignorable[0]));
    for (int i = 0; i < count; ++i) {
        if (ucs4 >= ignorable[i].first && ucs4 <= ignorable[i].last)
            return true;
    }
    return false;
}
1007 
1008 struct UnicodeData {
UnicodeDataUnicodeData1009     UnicodeData(int codepoint = 0) {
1010         p.category = QChar::Other_NotAssigned; // Cn
1011         p.combiningClass = 0;
1012 
1013         p.direction = QChar::DirL;
1014         // DerivedBidiClass.txt
1015         // The unassigned code points that default to AL are in the ranges:
1016         //     [U+0600..U+07BF, U+08A0..U+08FF, U+FB50..U+FDCF, U+FDF0..U+FDFF, U+FE70..U+FEFF, U+1EE00..U+1EEFF]
1017         if ((codepoint >= 0x0600 && codepoint <= 0x07BF)
1018             || (codepoint >= 0x08A0 && codepoint <= 0x08FF)
1019             || (codepoint >= 0xFB50 && codepoint <= 0xFDCF)
1020             || (codepoint >= 0xFDF0 && codepoint <= 0xFDFF)
1021             || (codepoint >= 0xFE70 && codepoint <= 0xFEFF)
1022             || (codepoint >= 0x1EE00 && codepoint <= 0x1EEFF)) {
1023             p.direction = QChar::DirAL;
1024         }
1025         // The unassigned code points that default to R are in the ranges:
1026         //     [U+0590..U+05FF, U+07C0..U+089F, U+FB1D..U+FB4F, U+10800..U+10FFF, U+1E800..U+1EDFF, U+1EF00..U+1EFFF]
1027         else if ((codepoint >= 0x0590 && codepoint <= 0x05FF)
1028             || (codepoint >= 0x07C0 && codepoint <= 0x089F)
1029             || (codepoint >= 0xFB1D && codepoint <= 0xFB4F)
1030             || (codepoint >= 0x10800 && codepoint <= 0x10FFF)
1031             || (codepoint >= 0x1E800 && codepoint <= 0x1EDFF)
1032             || (codepoint >= 0x1EF00 && codepoint <= 0x1EFFF)) {
1033             p.direction = QChar::DirR;
1034         }
1035         // The unassigned code points that default to ET are in the range:
1036         //     [U+20A0..U+20CF]
1037         else if (codepoint >= 0x20A0 && codepoint <= 0x20CF) {
1038             p.direction = QChar::DirET;
1039         }
1040         // The unassigned code points that default to BN have one of the following properties:
1041         //     Default_Ignorable_Code_Point
1042         //     Noncharacter_Code_Point
1043         else if (QChar::isNonCharacter(codepoint) || isDefaultIgnorable(codepoint)) {
1044             p.direction = QChar::DirBN;
1045         }
1046 
1047         p.lineBreakClass = LineBreak_AL; // XX -> AL
1048         // LineBreak.txt
1049         // The unassigned code points that default to "ID" include ranges in the following blocks:
1050         //     [U+3400..U+4DBF, U+4E00..U+9FFF, U+F900..U+FAFF, U+20000..U+2A6DF, U+2A700..U+2B73F, U+2B740..U+2B81F, U+2B820..U+2CEAF, U+2F800..U+2FA1F]
1051         // and any other reserved code points on
1052         //     [U+20000..U+2FFFD, U+30000..U+3FFFD]
1053         if ((codepoint >= 0x3400 && codepoint <= 0x4DBF)
1054             || (codepoint >= 0x4E00 && codepoint <= 0x9FFF)
1055             || (codepoint >= 0xF900 && codepoint <= 0xFAFF)
1056             || (codepoint >= 0x20000 && codepoint <= 0x2A6DF)
1057             || (codepoint >= 0x2A700 && codepoint <= 0x2B73F)
1058             || (codepoint >= 0x2B740 && codepoint <= 0x2B81F)
1059             || (codepoint >= 0x2B820 && codepoint <= 0x2CEAF)
1060             || (codepoint >= 0x2F800 && codepoint <= 0x2FA1F)
1061             || (codepoint >= 0x20000 && codepoint <= 0x2FFFD)
1062             || (codepoint >= 0x30000 && codepoint <= 0x3FFFD)) {
1063             p.lineBreakClass = LineBreak_ID;
1064         }
1065         // The unassigned code points that default to "PR" comprise a range in the following block:
1066         //     [U+20A0..U+20CF]
1067         else if (codepoint >= 0x20A0 && codepoint <= 0x20CF) {
1068             p.lineBreakClass = LineBreak_PR;
1069         }
1070 
1071         mirroredChar = 0;
1072         decompositionType = QChar::NoDecomposition;
1073         p.joining = QChar::Joining_None;
1074         p.age = QChar::Unicode_Unassigned;
1075         p.mirrorDiff = 0;
1076         p.digitValue = -1;
1077         p.lowerCaseDiff = 0;
1078         p.upperCaseDiff = 0;
1079         p.titleCaseDiff = 0;
1080         p.caseFoldDiff = 0;
1081         p.lowerCaseSpecial = 0;
1082         p.upperCaseSpecial = 0;
1083         p.titleCaseSpecial = 0;
1084         p.caseFoldSpecial = 0;
1085         p.graphemeBreakClass = GraphemeBreak_Any;
1086         p.wordBreakClass = WordBreak_Any;
1087         p.sentenceBreakClass = SentenceBreak_Any;
1088         p.script = QChar::Script_Unknown;
1089         p.nfQuickCheck = 0;
1090         propertyIndex = -1;
1091         excludedComposition = false;
1092     }
1093 
1094     static UnicodeData &valueRef(int codepoint);
1095 
1096     PropertyFlags p;
1097 
1098     // from UnicodeData.txt
1099     QChar::Decomposition decompositionType;
1100     QList<int> decomposition;
1101 
1102     QList<int> specialFolding;
1103 
1104     // from BidiMirroring.txt
1105     int mirroredChar;
1106 
1107     // DerivedNormalizationProps.txt
1108     bool excludedComposition;
1109 
1110     // computed position of unicode property set
1111     int propertyIndex;
1112 };
1113 
1114 static QList<UnicodeData> unicodeData;
1115 
valueRef(int codepoint)1116 UnicodeData &UnicodeData::valueRef(int codepoint)
1117 {
1118     static bool initialized = false;
1119     if (!initialized) {
1120         unicodeData.reserve(QChar::LastValidCodePoint + 1);
1121         for (int uc = 0; uc <= QChar::LastValidCodePoint; ++uc)
1122             unicodeData.append(UnicodeData(uc));
1123         initialized = true;
1124     }
1125 
1126     Q_ASSERT(codepoint <= 0x10ffff);
1127     return unicodeData[codepoint];
1128 }
1129 
1130 
// Statistics gathered while reading the data files.
// decompositionLength: decomposition length -> number of decompositions of
// that length (counted in readUnicodeData()).
static QHash<int, int> decompositionLength;
// Highest code point seen with a non-empty decomposition field (readUnicodeData()).
static int highestComposedCharacter = 0;
// Number of two-part canonical compositions registered in ligatureHashes, and
// the highest first component among them (readDerivedNormalizationProps()).
static int numLigatures = 0;
static int highestLigature = 0;
1135 
// A canonical composition: the pair (u1, u2) composes to ligature.
struct Ligature {
    int u1;
    int u2;
    int ligature;
};

// we need them sorted after the first component for fast lookup,
// so ordering looks at u1 only
bool operator < (const Ligature &lhs, const Ligature &rhs)
{
    const int first = lhs.u1;
    const int second = rhs.u1;
    return first < second;
}
1144 
// Canonical compositions grouped by their second component (u2); filled in
// readDerivedNormalizationProps().
static QHash<int, QList<Ligature> > ligatureHashes;

// Combining class -> number of UnicodeData.txt entries carrying it; counted
// in readUnicodeData().
static QHash<int, int> combiningClassUsage;
1148 
1149 static int maxLowerCaseDiff = 0;
1150 static int maxUpperCaseDiff = 0;
1151 static int maxTitleCaseDiff = 0;
1152 
readUnicodeData()1153 static void readUnicodeData()
1154 {
1155     qDebug("Reading UnicodeData.txt");
1156 
1157     enum UniDataFields {
1158         UD_Value,
1159         UD_Name,
1160         UD_Category,
1161         UD_CombiningClass,
1162         UD_BidiCategory,
1163         UD_Decomposition,
1164         UD_DecimalDigitValue,
1165         UD_DigitValue,
1166         UD_NumericValue,
1167         UD_Mirrored,
1168         UD_OldName,
1169         UD_Comment,
1170         UD_UpperCase,
1171         UD_LowerCase,
1172         UD_TitleCase
1173     };
1174 
1175     QFile f("data/UnicodeData.txt");
1176     if (!f.exists())
1177         qFatal("Couldn't find UnicodeData.txt");
1178 
1179     f.open(QFile::ReadOnly);
1180 
1181     while (!f.atEnd()) {
1182         QByteArray line;
1183         line.resize(1024);
1184         int len = f.readLine(line.data(), 1024);
1185         line.truncate(len-1);
1186 
1187         int comment = line.indexOf('#');
1188         if (comment >= 0)
1189             line = line.left(comment);
1190         if (line.isEmpty())
1191             continue;
1192 
1193         QList<QByteArray> properties = line.split(';');
1194         bool ok;
1195         int codepoint = properties[UD_Value].toInt(&ok, 16);
1196         Q_ASSERT(ok);
1197         Q_ASSERT(codepoint <= QChar::LastValidCodePoint);
1198         int lastCodepoint = codepoint;
1199 
1200         QByteArray name = properties[UD_Name];
1201         if (name.startsWith('<') && name.contains("First")) {
1202             QByteArray nextLine;
1203             nextLine.resize(1024);
1204             f.readLine(nextLine.data(), 1024);
1205             QList<QByteArray> properties = nextLine.split(';');
1206             Q_ASSERT(properties[UD_Name].startsWith('<') && properties[UD_Name].contains("Last"));
1207             lastCodepoint = properties[UD_Value].toInt(&ok, 16);
1208             Q_ASSERT(ok);
1209             Q_ASSERT(lastCodepoint <= QChar::LastValidCodePoint);
1210         }
1211 
1212         UnicodeData &data = UnicodeData::valueRef(codepoint);
1213         data.p.category = categoryMap.value(properties[UD_Category], QChar::Other_NotAssigned);
1214         data.p.combiningClass = properties[UD_CombiningClass].toInt();
1215         if (!combiningClassUsage.contains(data.p.combiningClass))
1216             combiningClassUsage[data.p.combiningClass] = 1;
1217         else
1218             ++combiningClassUsage[data.p.combiningClass];
1219 
1220         Direction dir = directionMap.value(properties[UD_BidiCategory], Dir_Unassigned);
1221         if (dir == Dir_Unassigned)
1222             qFatal("unhandled direction value: %s", properties[UD_BidiCategory].constData());
1223         data.p.direction = QChar::Direction(dir);
1224 
1225         if (!properties[UD_UpperCase].isEmpty()) {
1226             int upperCase = properties[UD_UpperCase].toInt(&ok, 16);
1227             Q_ASSERT(ok);
1228             int diff = upperCase - codepoint;
1229             // if the conditions below doesn't hold anymore we need to modify our upper casing code
1230             Q_ASSERT(QChar::requiresSurrogates(codepoint) == QChar::requiresSurrogates(upperCase));
1231             if (QChar::requiresSurrogates(codepoint)) {
1232                 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(upperCase));
1233                 Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(upperCase));
1234             }
1235             if (qAbs(diff) >= (1<<13)) {
1236                 qWarning() << "upperCaseDiff exceeded (" << hex << codepoint << "->" << upperCase << "); map it for special case";
1237                 data.p.upperCaseSpecial = true;
1238                 data.p.upperCaseDiff = appendToSpecialCaseMap(QList<int>() << upperCase);
1239             } else {
1240                 data.p.upperCaseDiff = diff;
1241                 maxUpperCaseDiff = qMax(maxUpperCaseDiff, qAbs(diff));
1242             }
1243         }
1244         if (!properties[UD_LowerCase].isEmpty()) {
1245             int lowerCase = properties[UD_LowerCase].toInt(&ok, 16);
1246             Q_ASSERT(ok);
1247             int diff = lowerCase - codepoint;
1248             // if the conditions below doesn't hold anymore we need to modify our lower casing code
1249             Q_ASSERT(QChar::requiresSurrogates(codepoint) == QChar::requiresSurrogates(lowerCase));
1250             if (QChar::requiresSurrogates(codepoint)) {
1251                 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(lowerCase));
1252                 Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(lowerCase));
1253             }
1254             if (qAbs(diff) >= (1<<13)) {
1255                 qWarning() << "lowerCaseDiff exceeded (" << hex << codepoint << "->" << lowerCase << "); map it for special case";
1256                 data.p.lowerCaseSpecial = true;
1257                 data.p.lowerCaseDiff = appendToSpecialCaseMap(QList<int>() << lowerCase);
1258             } else {
1259                 data.p.lowerCaseDiff = diff;
1260                 maxLowerCaseDiff = qMax(maxLowerCaseDiff, qAbs(diff));
1261             }
1262         }
1263         // we want toTitleCase to map to ToUpper in case we don't have any titlecase.
1264         if (properties[UD_TitleCase].isEmpty())
1265             properties[UD_TitleCase] = properties[UD_UpperCase];
1266         if (!properties[UD_TitleCase].isEmpty()) {
1267             int titleCase = properties[UD_TitleCase].toInt(&ok, 16);
1268             Q_ASSERT(ok);
1269             int diff = titleCase - codepoint;
1270             // if the conditions below doesn't hold anymore we need to modify our title casing code
1271             Q_ASSERT(QChar::requiresSurrogates(codepoint) == QChar::requiresSurrogates(titleCase));
1272             if (QChar::requiresSurrogates(codepoint)) {
1273                 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(titleCase));
1274                 Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(titleCase));
1275             }
1276             if (qAbs(diff) >= (1<<13)) {
1277                 qWarning() << "titleCaseDiff exceeded (" << hex << codepoint << "->" << titleCase << "); map it for special case";
1278                 data.p.titleCaseSpecial = true;
1279                 data.p.titleCaseDiff = appendToSpecialCaseMap(QList<int>() << titleCase);
1280             } else {
1281                 data.p.titleCaseDiff = diff;
1282                 maxTitleCaseDiff = qMax(maxTitleCaseDiff, qAbs(diff));
1283             }
1284         }
1285 
1286         if (!properties[UD_DigitValue].isEmpty())
1287             data.p.digitValue = properties[UD_DigitValue].toInt();
1288 
1289         // decompositition
1290         QByteArray decomposition = properties[UD_Decomposition];
1291         if (!decomposition.isEmpty()) {
1292             highestComposedCharacter = qMax(highestComposedCharacter, codepoint);
1293             QList<QByteArray> d = decomposition.split(' ');
1294             if (d[0].contains('<')) {
1295                 data.decompositionType = decompositionMap.value(d[0], QChar::NoDecomposition);
1296                 if (data.decompositionType == QChar::NoDecomposition)
1297                     qFatal("unhandled decomposition type: %s", d[0].constData());
1298                 d.takeFirst();
1299             } else {
1300                 data.decompositionType = QChar::Canonical;
1301             }
1302             for (int i = 0; i < d.size(); ++i) {
1303                 data.decomposition.append(d[i].toInt(&ok, 16));
1304                 Q_ASSERT(ok);
1305             }
1306             ++decompositionLength[data.decomposition.size()];
1307         }
1308 
1309         for (int i = codepoint; i <= lastCodepoint; ++i)
1310             unicodeData[i] = data;
1311     }
1312 }
1313 
1314 static int maxMirroredDiff = 0;
1315 
readBidiMirroring()1316 static void readBidiMirroring()
1317 {
1318     qDebug("Reading BidiMirroring.txt");
1319 
1320     QFile f("data/BidiMirroring.txt");
1321     if (!f.exists())
1322         qFatal("Couldn't find BidiMirroring.txt");
1323 
1324     f.open(QFile::ReadOnly);
1325 
1326     while (!f.atEnd()) {
1327         QByteArray line;
1328         line.resize(1024);
1329         int len = f.readLine(line.data(), 1024);
1330         line.resize(len-1);
1331 
1332         int comment = line.indexOf('#');
1333         if (comment >= 0)
1334             line = line.left(comment);
1335 
1336         if (line.isEmpty())
1337             continue;
1338         line = line.replace(" ", "");
1339 
1340         QList<QByteArray> pair = line.split(';');
1341         Q_ASSERT(pair.size() == 2);
1342 
1343         bool ok;
1344         int codepoint = pair[0].toInt(&ok, 16);
1345         Q_ASSERT(ok);
1346         int mirror = pair[1].toInt(&ok, 16);
1347         Q_ASSERT(ok);
1348 
1349         UnicodeData &d = UnicodeData::valueRef(codepoint);
1350         d.mirroredChar = mirror;
1351         d.p.mirrorDiff = d.mirroredChar - codepoint;
1352         maxMirroredDiff = qMax(maxMirroredDiff, qAbs(d.p.mirrorDiff));
1353     }
1354 }
1355 
readArabicShaping()1356 static void readArabicShaping()
1357 {
1358     qDebug("Reading ArabicShaping.txt");
1359 
1360     // Initialize defaults:
1361     // Code points that are not explicitly listed in ArabicShaping.txt are either of joining type T or U:
1362     // - Those that not explicitly listed that are of General Category Mn, Me, or Cf have joining type T.
1363     // - All others not explicitly listed have joining type U.
1364     for (int codepoint = 0; codepoint <= QChar::LastValidCodePoint; ++codepoint) {
1365         UnicodeData &d = UnicodeData::valueRef(codepoint);
1366         if (d.p.joining == QChar::Joining_None) {
1367             if (d.p.category == QChar::Mark_NonSpacing || d.p.category == QChar::Mark_Enclosing || d.p.category == QChar::Other_Format)
1368                 d.p.joining = QChar::Joining_Transparent;
1369         }
1370     }
1371 
1372     QFile f("data/ArabicShaping.txt");
1373     if (!f.exists())
1374         qFatal("Couldn't find ArabicShaping.txt");
1375 
1376     f.open(QFile::ReadOnly);
1377 
1378     while (!f.atEnd()) {
1379         QByteArray line;
1380         line.resize(1024);
1381         int len = f.readLine(line.data(), 1024);
1382         line.resize(len-1);
1383 
1384         int comment = line.indexOf('#');
1385         if (comment >= 0)
1386             line = line.left(comment);
1387         line = line.trimmed();
1388 
1389         if (line.isEmpty())
1390             continue;
1391 
1392         QList<QByteArray> l = line.split(';');
1393         Q_ASSERT(l.size() == 4);
1394 
1395         bool ok;
1396         int codepoint = l[0].toInt(&ok, 16);
1397         Q_ASSERT(ok);
1398 
1399         UnicodeData &d = UnicodeData::valueRef(codepoint);
1400         JoiningType joining = joining_map.value(l[2].trimmed(), Joining_Unassigned);
1401         switch (joining) {
1402         case Joining_Unassigned:
1403             qFatal("%x: unassigned or unhandled joining type: %s", codepoint, l[2].constData());
1404             break;
1405         case Joining_Transparent:
1406             switch (d.p.category) {
1407             case QChar::Mark_Enclosing:
1408             case QChar::Mark_NonSpacing:
1409             case QChar::Letter_Modifier:
1410             case QChar::Other_Format:
1411                 break;
1412             default:
1413                 qFatal("%x: joining type '%s' was met (category: %d); "
1414                        "the current implementation needs to be revised!",
1415                        codepoint, l[2].constData(), d.p.category);
1416             }
1417             Q_FALLTHROUGH();
1418         default:
1419             d.p.joining = QChar::JoiningType(joining);
1420             break;
1421         }
1422     }
1423 }
1424 
readDerivedAge()1425 static void readDerivedAge()
1426 {
1427     qDebug("Reading DerivedAge.txt");
1428 
1429     QFile f("data/DerivedAge.txt");
1430     if (!f.exists())
1431         qFatal("Couldn't find DerivedAge.txt");
1432 
1433     f.open(QFile::ReadOnly);
1434 
1435     while (!f.atEnd()) {
1436         QByteArray line;
1437         line.resize(1024);
1438         int len = f.readLine(line.data(), 1024);
1439         line.resize(len-1);
1440 
1441         int comment = line.indexOf('#');
1442         if (comment >= 0)
1443             line = line.left(comment);
1444         line.replace(" ", "");
1445 
1446         if (line.isEmpty())
1447             continue;
1448 
1449         QList<QByteArray> l = line.split(';');
1450         Q_ASSERT(l.size() == 2);
1451 
1452         QByteArray codes = l[0];
1453         codes.replace("..", ".");
1454         QList<QByteArray> cl = codes.split('.');
1455 
1456         bool ok;
1457         int from = cl[0].toInt(&ok, 16);
1458         Q_ASSERT(ok);
1459         int to = from;
1460         if (cl.size() == 2) {
1461             to = cl[1].toInt(&ok, 16);
1462             Q_ASSERT(ok);
1463         }
1464 
1465         QChar::UnicodeVersion age = age_map.value(l[1].trimmed(), QChar::Unicode_Unassigned);
1466         //qDebug() << hex << from << ".." << to << ba << age;
1467         if (age == QChar::Unicode_Unassigned)
1468             qFatal("unassigned or unhandled age value: %s", l[1].constData());
1469 
1470         for (int codepoint = from; codepoint <= to; ++codepoint) {
1471             UnicodeData &d = UnicodeData::valueRef(codepoint);
1472             d.p.age = age;
1473         }
1474     }
1475 }
1476 
readDerivedNormalizationProps()1477 static void readDerivedNormalizationProps()
1478 {
1479     qDebug("Reading DerivedNormalizationProps.txt");
1480 
1481     QFile f("data/DerivedNormalizationProps.txt");
1482     if (!f.exists())
1483         qFatal("Couldn't find DerivedNormalizationProps.txt");
1484 
1485     f.open(QFile::ReadOnly);
1486 
1487     while (!f.atEnd()) {
1488         QByteArray line;
1489         line.resize(1024);
1490         int len = f.readLine(line.data(), 1024);
1491         line.resize(len-1);
1492 
1493         int comment = line.indexOf('#');
1494         if (comment >= 0)
1495             line = line.left(comment);
1496 
1497         if (line.trimmed().isEmpty())
1498             continue;
1499 
1500         QList<QByteArray> l = line.split(';');
1501         Q_ASSERT(l.size() >= 2);
1502 
1503         QByteArray propName = l[1].trimmed();
1504         if (propName != "Full_Composition_Exclusion" &&
1505             propName != "NFD_QC" && propName != "NFC_QC" &&
1506             propName != "NFKD_QC" && propName != "NFKC_QC") {
1507             // ###
1508             continue;
1509         }
1510 
1511         QByteArray codes = l[0].trimmed();
1512         codes.replace("..", ".");
1513         QList<QByteArray> cl = codes.split('.');
1514 
1515         bool ok;
1516         int from = cl[0].toInt(&ok, 16);
1517         Q_ASSERT(ok);
1518         int to = from;
1519         if (cl.size() == 2) {
1520             to = cl[1].toInt(&ok, 16);
1521             Q_ASSERT(ok);
1522         }
1523 
1524         for (int codepoint = from; codepoint <= to; ++codepoint) {
1525             UnicodeData &d = UnicodeData::valueRef(codepoint);
1526             if (propName == "Full_Composition_Exclusion") {
1527                 d.excludedComposition = true;
1528             } else {
1529                 Q_STATIC_ASSERT(QString::NormalizationForm_D == 0);
1530                 Q_STATIC_ASSERT(QString::NormalizationForm_C == 1);
1531                 Q_STATIC_ASSERT(QString::NormalizationForm_KD == 2);
1532                 Q_STATIC_ASSERT(QString::NormalizationForm_KC == 3);
1533 
1534                 QString::NormalizationForm form;
1535                 if (propName == "NFD_QC")
1536                     form = QString::NormalizationForm_D;
1537                 else if (propName == "NFC_QC")
1538                     form = QString::NormalizationForm_C;
1539                 else if (propName == "NFKD_QC")
1540                     form = QString::NormalizationForm_KD;
1541                 else// if (propName == "NFKC_QC")
1542                     form = QString::NormalizationForm_KC;
1543 
1544                 Q_ASSERT(l.size() == 3);
1545                 l[2] = l[2].trimmed();
1546 
1547                 enum { NFQC_YES = 0, NFQC_NO = 1, NFQC_MAYBE = 3 };
1548                 uchar ynm = (l[2] == "N" ? NFQC_NO : l[2] == "M" ? NFQC_MAYBE : NFQC_YES);
1549                 if (ynm == NFQC_MAYBE) {
1550                     // if this changes, we need to revise the normalizationQuickCheckHelper() implementation
1551                     Q_ASSERT(form == QString::NormalizationForm_C || form == QString::NormalizationForm_KC);
1552                 }
1553                 d.p.nfQuickCheck |= (ynm << (form << 1)); // 2 bits per NF
1554             }
1555         }
1556     }
1557 
1558     for (int codepoint = 0; codepoint <= QChar::LastValidCodePoint; ++codepoint) {
1559         UnicodeData &d = UnicodeData::valueRef(codepoint);
1560         if (!d.excludedComposition
1561             && d.decompositionType == QChar::Canonical
1562             && d.decomposition.size() > 1) {
1563             Q_ASSERT(d.decomposition.size() == 2);
1564 
1565             int part1 = d.decomposition.at(0);
1566             int part2 = d.decomposition.at(1);
1567 
1568             // all non-starters are listed in DerivedNormalizationProps.txt
1569             // and already excluded from composition
1570             Q_ASSERT(UnicodeData::valueRef(part1).p.combiningClass == 0);
1571 
1572             ++numLigatures;
1573             highestLigature = qMax(highestLigature, part1);
1574             Ligature l = { part1, part2, codepoint };
1575             ligatureHashes[part2].append(l);
1576         }
1577     }
1578 }
1579 
1580 
// One entry of NormalizationCorrections.txt, as filled in by
// createNormalizationCorrections().
struct NormalizationCorrection {
    uint codepoint; // first field of the entry, parsed as hexadecimal
    uint mapped;    // second field of the entry, parsed as hexadecimal
    int version;    // QChar::UnicodeVersion value chosen from the fourth field
};
1586 
createNormalizationCorrections()1587 static QByteArray createNormalizationCorrections()
1588 {
1589     qDebug("Reading NormalizationCorrections.txt");
1590 
1591     QFile f("data/NormalizationCorrections.txt");
1592     if (!f.exists())
1593         qFatal("Couldn't find NormalizationCorrections.txt");
1594 
1595     f.open(QFile::ReadOnly);
1596 
1597     QByteArray out;
1598 
1599     out += "struct NormalizationCorrection {\n"
1600            "    uint ucs4;\n"
1601            "    uint old_mapping;\n"
1602            "    int version;\n"
1603            "};\n\n"
1604 
1605            "static const NormalizationCorrection uc_normalization_corrections[] = {\n";
1606 
1607     int maxVersion = 0;
1608     int numCorrections = 0;
1609     while (!f.atEnd()) {
1610         QByteArray line;
1611         line.resize(1024);
1612         int len = f.readLine(line.data(), 1024);
1613         line.resize(len-1);
1614 
1615         int comment = line.indexOf('#');
1616         if (comment >= 0)
1617             line = line.left(comment);
1618         line.replace(" ", "");
1619 
1620         if (line.isEmpty())
1621             continue;
1622 
1623         Q_ASSERT(!line.contains(".."));
1624 
1625         QList<QByteArray> fields = line.split(';');
1626         Q_ASSERT(fields.size() == 4);
1627 
1628         NormalizationCorrection c = { 0, 0, 0 };
1629         bool ok;
1630         c.codepoint = fields.at(0).toInt(&ok, 16);
1631         Q_ASSERT(ok);
1632         c.mapped = fields.at(1).toInt(&ok, 16);
1633         Q_ASSERT(ok);
1634         if (fields.at(3) == "3.2.0")
1635             c.version = QChar::Unicode_3_2;
1636         else if (fields.at(3) == "4.0.0")
1637             c.version = QChar::Unicode_4_0;
1638         else
1639             qFatal("unknown unicode version in NormalizationCorrection.txt");
1640 
1641         out += "    { 0x" + QByteArray::number(c.codepoint, 16) + ", 0x" + QByteArray::number(c.mapped, 16)
1642              + ", " + QString::number(c.version) + " },\n";
1643         ++numCorrections;
1644         maxVersion = qMax(c.version, maxVersion);
1645     }
1646     if (out.endsWith(",\n"))
1647         out.chop(2);
1648 
1649     out += "\n};\n\n"
1650 
1651            "enum { NumNormalizationCorrections = " + QByteArray::number(numCorrections) + " };\n"
1652            "enum { NormalizationCorrectionsVersionMax = " + QByteArray::number(maxVersion) + " };\n\n";
1653 
1654     return out;
1655 }
1656 
readLineBreak()1657 static void readLineBreak()
1658 {
1659     qDebug("Reading LineBreak.txt");
1660 
1661     QFile f("data/LineBreak.txt");
1662     if (!f.exists())
1663         qFatal("Couldn't find LineBreak.txt");
1664 
1665     f.open(QFile::ReadOnly);
1666 
1667     while (!f.atEnd()) {
1668         QByteArray line;
1669         line.resize(1024);
1670         int len = f.readLine(line.data(), 1024);
1671         line.resize(len-1);
1672 
1673         int comment = line.indexOf('#');
1674         if (comment >= 0)
1675             line = line.left(comment);
1676         line.replace(" ", "");
1677 
1678         if (line.isEmpty())
1679             continue;
1680 
1681         QList<QByteArray> l = line.split(';');
1682         Q_ASSERT(l.size() == 2);
1683 
1684         QByteArray codes = l[0];
1685         codes.replace("..", ".");
1686         QList<QByteArray> cl = codes.split('.');
1687 
1688         bool ok;
1689         int from = cl[0].toInt(&ok, 16);
1690         Q_ASSERT(ok);
1691         int to = from;
1692         if (cl.size() == 2) {
1693             to = cl[1].toInt(&ok, 16);
1694             Q_ASSERT(ok);
1695         }
1696 
1697         LineBreakClass lb = line_break_map.value(l[1], LineBreak_Unassigned);
1698         if (lb == LineBreak_Unassigned)
1699             qFatal("unassigned line break class: %s", l[1].constData());
1700 
1701         for (int codepoint = from; codepoint <= to; ++codepoint) {
1702             UnicodeData &d = UnicodeData::valueRef(codepoint);
1703             d.p.lineBreakClass = lb;
1704         }
1705     }
1706 }
1707 
readSpecialCasing()1708 static void readSpecialCasing()
1709 {
1710     qDebug("Reading SpecialCasing.txt");
1711 
1712     QFile f("data/SpecialCasing.txt");
1713     if (!f.exists())
1714         qFatal("Couldn't find SpecialCasing.txt");
1715 
1716     f.open(QFile::ReadOnly);
1717 
1718     while (!f.atEnd()) {
1719         QByteArray line;
1720         line.resize(1024);
1721         int len = f.readLine(line.data(), 1024);
1722         line.resize(len-1);
1723 
1724         int comment = line.indexOf('#');
1725         if (comment >= 0)
1726             line = line.left(comment);
1727 
1728         if (line.isEmpty())
1729             continue;
1730 
1731         QList<QByteArray> l = line.split(';');
1732 
1733         QByteArray condition = l.size() < 5 ? QByteArray() : l[4].trimmed();
1734         if (!condition.isEmpty())
1735             // #####
1736             continue;
1737 
1738         bool ok;
1739         int codepoint = l[0].trimmed().toInt(&ok, 16);
1740         Q_ASSERT(ok);
1741 
1742         // if the condition below doesn't hold anymore we need to modify our
1743         // lower/upper/title casing code and case folding code
1744         Q_ASSERT(!QChar::requiresSurrogates(codepoint));
1745 
1746 //         qDebug() << "codepoint" << hex << codepoint;
1747 //         qDebug() << line;
1748 
1749         QList<QByteArray> lower = l[1].trimmed().split(' ');
1750         QList<int> lowerMap;
1751         for (int i = 0; i < lower.size(); ++i) {
1752             bool ok;
1753             lowerMap.append(lower.at(i).toInt(&ok, 16));
1754             Q_ASSERT(ok);
1755         }
1756 
1757         QList<QByteArray> title = l[2].trimmed().split(' ');
1758         QList<int> titleMap;
1759         for (int i = 0; i < title.size(); ++i) {
1760             bool ok;
1761             titleMap.append(title.at(i).toInt(&ok, 16));
1762             Q_ASSERT(ok);
1763         }
1764 
1765         QList<QByteArray> upper = l[3].trimmed().split(' ');
1766         QList<int> upperMap;
1767         for (int i = 0; i < upper.size(); ++i) {
1768             bool ok;
1769             upperMap.append(upper.at(i).toInt(&ok, 16));
1770             Q_ASSERT(ok);
1771         }
1772 
1773 
1774         UnicodeData &ud = UnicodeData::valueRef(codepoint);
1775         Q_ASSERT(lowerMap.size() > 1 || lowerMap.at(0) == codepoint + ud.p.lowerCaseDiff);
1776         Q_ASSERT(titleMap.size() > 1 || titleMap.at(0) == codepoint + ud.p.titleCaseDiff);
1777         Q_ASSERT(upperMap.size() > 1 || upperMap.at(0) == codepoint + ud.p.upperCaseDiff);
1778 
1779         if (lowerMap.size() > 1) {
1780             ud.p.lowerCaseSpecial = true;
1781             ud.p.lowerCaseDiff = appendToSpecialCaseMap(lowerMap);
1782         }
1783         if (titleMap.size() > 1) {
1784             ud.p.titleCaseSpecial = true;
1785             ud.p.titleCaseDiff = appendToSpecialCaseMap(titleMap);
1786         }
1787         if (upperMap.size() > 1) {
1788             ud.p.upperCaseSpecial = true;
1789             ud.p.upperCaseDiff = appendToSpecialCaseMap(upperMap);
1790         }
1791     }
1792 }
1793 
// Largest absolute case folding difference that readCaseFolding() stored
// directly in caseFoldDiff; createPropertyInfo() asserts that it fits.
static int maxCaseFoldDiff = 0;
1795 
readCaseFolding()1796 static void readCaseFolding()
1797 {
1798     qDebug("Reading CaseFolding.txt");
1799 
1800     QFile f("data/CaseFolding.txt");
1801     if (!f.exists())
1802         qFatal("Couldn't find CaseFolding.txt");
1803 
1804     f.open(QFile::ReadOnly);
1805 
1806     while (!f.atEnd()) {
1807         QByteArray line;
1808         line.resize(1024);
1809         int len = f.readLine(line.data(), 1024);
1810         line.resize(len-1);
1811 
1812         int comment = line.indexOf('#');
1813         if (comment >= 0)
1814             line = line.left(comment);
1815 
1816         if (line.isEmpty())
1817             continue;
1818 
1819         QList<QByteArray> l = line.split(';');
1820 
1821         bool ok;
1822         int codepoint = l[0].trimmed().toInt(&ok, 16);
1823         Q_ASSERT(ok);
1824 
1825 
1826         l[1] = l[1].trimmed();
1827         if (l[1] == "F" || l[1] == "T")
1828             continue;
1829 
1830 //         qDebug() << "codepoint" << hex << codepoint;
1831 //         qDebug() << line;
1832         QList<QByteArray> fold = l[2].trimmed().split(' ');
1833         QList<int> foldMap;
1834         for (int i = 0; i < fold.size(); ++i) {
1835             bool ok;
1836             foldMap.append(fold.at(i).toInt(&ok, 16));
1837             Q_ASSERT(ok);
1838         }
1839 
1840         UnicodeData &ud = UnicodeData::valueRef(codepoint);
1841         if (foldMap.size() == 1) {
1842             int caseFolded = foldMap.at(0);
1843             int diff = caseFolded - codepoint;
1844             // if the conditions below doesn't hold anymore we need to modify our case folding code
1845             Q_ASSERT(QChar::requiresSurrogates(codepoint) == QChar::requiresSurrogates(caseFolded));
1846             if (QChar::requiresSurrogates(codepoint)) {
1847                 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(caseFolded));
1848                 Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(caseFolded));
1849             }
1850             if (qAbs(diff) >= (1<<13)) {
1851                 qWarning() << "caseFoldDiff exceeded (" << hex << codepoint << "->" << caseFolded << "); map it for special case";
1852                 ud.p.caseFoldSpecial = true;
1853                 ud.p.caseFoldDiff = appendToSpecialCaseMap(foldMap);
1854             } else {
1855                 ud.p.caseFoldDiff = diff;
1856                 maxCaseFoldDiff = qMax(maxCaseFoldDiff, qAbs(diff));
1857             }
1858         } else {
1859             qFatal("we currently don't support full case foldings");
1860 //             qDebug() << "special" << hex << foldMap;
1861             ud.p.caseFoldSpecial = true;
1862             ud.p.caseFoldDiff = appendToSpecialCaseMap(foldMap);
1863         }
1864     }
1865 }
1866 
readGraphemeBreak()1867 static void readGraphemeBreak()
1868 {
1869     qDebug("Reading GraphemeBreakProperty.txt");
1870 
1871     QFile f("data/GraphemeBreakProperty.txt");
1872     if (!f.exists())
1873         qFatal("Couldn't find GraphemeBreakProperty.txt");
1874 
1875     f.open(QFile::ReadOnly);
1876 
1877     while (!f.atEnd()) {
1878         QByteArray line;
1879         line.resize(1024);
1880         int len = f.readLine(line.data(), 1024);
1881         line.resize(len-1);
1882 
1883         int comment = line.indexOf('#');
1884         if (comment >= 0)
1885             line = line.left(comment);
1886         line.replace(" ", "");
1887 
1888         if (line.isEmpty())
1889             continue;
1890 
1891         QList<QByteArray> l = line.split(';');
1892         Q_ASSERT(l.size() == 2);
1893 
1894         QByteArray codes = l[0];
1895         codes.replace("..", ".");
1896         QList<QByteArray> cl = codes.split('.');
1897 
1898         bool ok;
1899         int from = cl[0].toInt(&ok, 16);
1900         Q_ASSERT(ok);
1901         int to = from;
1902         if (cl.size() == 2) {
1903             to = cl[1].toInt(&ok, 16);
1904             Q_ASSERT(ok);
1905         }
1906 
1907         GraphemeBreakClass brk = grapheme_break_map.value(l[1], GraphemeBreak_Unassigned);
1908         if (brk == GraphemeBreak_Unassigned)
1909             qFatal("unassigned grapheme break class: %s", l[1].constData());
1910 
1911         for (int codepoint = from; codepoint <= to; ++codepoint) {
1912             UnicodeData &ud = UnicodeData::valueRef(codepoint);
1913             ud.p.graphemeBreakClass = brk;
1914         }
1915     }
1916 }
1917 
readWordBreak()1918 static void readWordBreak()
1919 {
1920     qDebug("Reading WordBreakProperty.txt");
1921 
1922     QFile f("data/WordBreakProperty.txt");
1923     if (!f.exists())
1924         qFatal("Couldn't find WordBreakProperty.txt");
1925 
1926     f.open(QFile::ReadOnly);
1927 
1928     while (!f.atEnd()) {
1929         QByteArray line;
1930         line.resize(1024);
1931         int len = f.readLine(line.data(), 1024);
1932         line.resize(len-1);
1933 
1934         int comment = line.indexOf('#');
1935         if (comment >= 0)
1936             line = line.left(comment);
1937         line.replace(" ", "");
1938 
1939         if (line.isEmpty())
1940             continue;
1941 
1942         QList<QByteArray> l = line.split(';');
1943         Q_ASSERT(l.size() == 2);
1944 
1945         QByteArray codes = l[0];
1946         codes.replace("..", ".");
1947         QList<QByteArray> cl = codes.split('.');
1948 
1949         bool ok;
1950         int from = cl[0].toInt(&ok, 16);
1951         Q_ASSERT(ok);
1952         int to = from;
1953         if (cl.size() == 2) {
1954             to = cl[1].toInt(&ok, 16);
1955             Q_ASSERT(ok);
1956         }
1957 
1958         WordBreakClass brk = word_break_map.value(l[1], WordBreak_Unassigned);
1959         if (brk == WordBreak_Unassigned)
1960             qFatal("unassigned word break class: %s", l[1].constData());
1961 
1962         for (int codepoint = from; codepoint <= to; ++codepoint) {
1963             // ### [
1964             // as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
1965             // which caused "hi.there" to be treated like if it were just a single word;
1966             // until we have a tailoring mechanism, retain the old behavior by remapping those characters here.
1967             if (codepoint == 0x002E) // FULL STOP
1968                 brk = WordBreak_MidNum;
1969             else if (codepoint == 0x003A) // COLON
1970                 brk = WordBreak_Any;
1971             // ] ###
1972             UnicodeData &ud = UnicodeData::valueRef(codepoint);
1973             ud.p.wordBreakClass = brk;
1974         }
1975     }
1976 }
1977 
readSentenceBreak()1978 static void readSentenceBreak()
1979 {
1980     qDebug("Reading SentenceBreakProperty.txt");
1981 
1982     QFile f("data/SentenceBreakProperty.txt");
1983     if (!f.exists())
1984         qFatal("Couldn't find SentenceBreakProperty.txt");
1985 
1986     f.open(QFile::ReadOnly);
1987 
1988     while (!f.atEnd()) {
1989         QByteArray line;
1990         line.resize(1024);
1991         int len = f.readLine(line.data(), 1024);
1992         line.resize(len-1);
1993 
1994         int comment = line.indexOf('#');
1995         if (comment >= 0)
1996             line = line.left(comment);
1997         line.replace(" ", "");
1998 
1999         if (line.isEmpty())
2000             continue;
2001 
2002         QList<QByteArray> l = line.split(';');
2003         Q_ASSERT(l.size() == 2);
2004 
2005         QByteArray codes = l[0];
2006         codes.replace("..", ".");
2007         QList<QByteArray> cl = codes.split('.');
2008 
2009         bool ok;
2010         int from = cl[0].toInt(&ok, 16);
2011         Q_ASSERT(ok);
2012         int to = from;
2013         if (cl.size() == 2) {
2014             to = cl[1].toInt(&ok, 16);
2015             Q_ASSERT(ok);
2016         }
2017 
2018         SentenceBreakClass brk = sentence_break_map.value(l[1], SentenceBreak_Unassigned);
2019         if (brk == SentenceBreak_Unassigned)
2020             qFatal("unassigned sentence break class: %s", l[1].constData());
2021 
2022         for (int codepoint = from; codepoint <= to; ++codepoint) {
2023             UnicodeData &ud = UnicodeData::valueRef(codepoint);
2024             ud.p.sentenceBreakClass = brk;
2025         }
2026     }
2027 }
2028 
2029 #if 0
// This piece of code does full case folding and comparison. We currently
// don't use it, since it causes lots of issues with things such as
// case-insensitive search and replace.
2033 static inline void foldCase(uint ch, ushort *out)
2034 {
2035     const QUnicodeTables::Properties *p = qGetProp(ch);
2036     if (!p->caseFoldSpecial) {
2037         *(out++) = ch + p->caseFoldDiff;
2038     } else {
2039         const ushort *folded = specialCaseMap + p->caseFoldDiff;
2040         ushort length = *folded++;
2041         while (length--)
2042             *out++ = *folded++;
2043     }
2044     *out = 0;
2045 }
2046 
2047 static int ucstricmp(const ushort *a, const ushort *ae, const ushort *b, const ushort *be)
2048 {
2049     if (a == b)
2050         return 0;
2051     if (a == 0)
2052         return 1;
2053     if (b == 0)
2054         return -1;
2055 
2056     while (a != ae && b != be) {
2057         const QUnicodeTables::Properties *pa = qGetProp(*a);
2058         const QUnicodeTables::Properties *pb = qGetProp(*b);
2059         if (pa->caseFoldSpecial | pb->caseFoldSpecial)
2060             goto special;
2061             int diff = (int)(*a + pa->caseFoldDiff) - (int)(*b + pb->caseFoldDiff);
2062         if ((diff))
2063             return diff;
2064         ++a;
2065         ++b;
2066         }
2067     }
2068     if (a == ae) {
2069         if (b == be)
2070             return 0;
2071         return -1;
2072     }
2073     return 1;
2074 special:
2075     ushort abuf[SPECIAL_CASE_MAX_LEN + 1];
2076     ushort bbuf[SPECIAL_CASE_MAX_LEN + 1];
2077     abuf[0] = bbuf[0] = 0;
2078     ushort *ap = abuf;
2079     ushort *bp = bbuf;
2080     while (1) {
2081         if (!*ap) {
2082             if (a == ae) {
2083                 if (!*bp && b == be)
2084                     return 0;
2085                 return -1;
2086             }
2087             foldCase(*(a++), abuf);
2088             ap = abuf;
2089         }
2090         if (!*bp) {
2091             if (b == be)
2092                 return 1;
2093             foldCase(*(b++), bbuf);
2094             bp = bbuf;
2095         }
2096         if (*ap != *bp)
2097             return (int)*ap - (int)*bp;
2098         ++ap;
2099         ++bp;
2100     }
2101 }
2102 
2103 
// Compares the ushort range [a, ae) with the zero-terminated 8-bit string b
// after case folding; every byte of b is passed to qGetProp()/foldCase() as a
// value of its own.
//
// Returns 0 when both fold to the same sequence, the difference of the first
// differing folded values otherwise, -1 when the range ends first and 1 when
// the string does. A null a yields 1 and a null b yields -1.
static int ucstricmp(const ushort *a, const ushort *ae, const uchar *b)
{
    if (a == 0)
        return 1;
    if (b == 0)
        return -1;

    // fast path: compare directly while neither side needs a special folding
    while (a != ae && *b) {
        const QUnicodeTables::Properties *pa = qGetProp(*a);
        const QUnicodeTables::Properties *pb = qGetProp((ushort)*b);
        if (pa->caseFoldSpecial | pb->caseFoldSpecial)
            goto special;
        int diff = (int)(*a + pa->caseFoldDiff) - (int)(*b + pb->caseFoldDiff);
        if ((diff))
            return diff;
        ++a;
        ++b;
    }
    if (a == ae) {
        if (!*b)
            return 0;
        return -1;
    }
    return 1;

special:
    // slow path: expand each value through foldCase() into a zero-terminated
    // buffer and compare the expansions value by value
    ushort abuf[SPECIAL_CASE_MAX_LEN + 1];
    ushort bbuf[SPECIAL_CASE_MAX_LEN + 1];
    abuf[0] = bbuf[0] = 0;
    ushort *ap = abuf;
    ushort *bp = bbuf;
    while (1) {
        // refill a side's buffer once its expansion has been consumed
        if (!*ap) {
            if (a == ae) {
                if (!*bp && !*b)
                    return 0;
                return -1;
            }
            foldCase(*(a++), abuf);
            ap = abuf;
        }
        if (!*bp) {
            if (!*b)
                return 1;
            foldCase(*(b++), bbuf);
            bp = bbuf;
        }
        if (*ap != *bp)
            return (int)*ap - (int)*bp;
        ++ap;
        ++bp;
    }
}
2157 #endif
2158 
2159 #if 0
// Distinct block names (with spaces removed) in the order readBlocks() first
// encounters them in Blocks.txt.
static QList<QByteArray> blockNames;
// One code point range read from Blocks.txt.
struct BlockInfo
{
    int blockIndex;     // index of the block's name in blockNames
    int firstCodePoint; // first code point of the range
    int lastCodePoint;  // last code point; equals firstCodePoint for a single-value entry
};
// Every range, in the order readBlocks() read it.
static QList<BlockInfo> blockInfoList;
2168 
2169 static void readBlocks()
2170 {
2171     qDebug("Reading Blocks.txt");
2172 
2173     QFile f("data/Blocks.txt");
2174     if (!f.exists())
2175         qFatal("Couldn't find Blocks.txt");
2176 
2177     f.open(QFile::ReadOnly);
2178 
2179     while (!f.atEnd()) {
2180         QByteArray line = f.readLine();
2181         line.resize(line.size() - 1);
2182 
2183         int comment = line.indexOf("#");
2184         if (comment >= 0)
2185             line = line.left(comment);
2186 
2187         line.replace(" ", "");
2188 
2189         if (line.isEmpty())
2190             continue;
2191 
2192         int semicolon = line.indexOf(';');
2193         Q_ASSERT(semicolon >= 0);
2194         QByteArray codePoints = line.left(semicolon);
2195         QByteArray blockName = line.mid(semicolon + 1);
2196 
2197         int blockIndex = blockNames.indexOf(blockName);
2198         if (blockIndex == -1) {
2199             blockIndex = blockNames.size();
2200             blockNames.append(blockName);
2201         }
2202 
2203         codePoints.replace("..", ".");
2204         QList<QByteArray> cl = codePoints.split('.');
2205 
2206         bool ok;
2207         int first = cl[0].toInt(&ok, 16);
2208         Q_ASSERT(ok);
2209         int last = first;
2210         if (cl.size() == 2) {
2211             last = cl[1].toInt(&ok, 16);
2212             Q_ASSERT(ok);
2213         }
2214 
2215         BlockInfo blockInfo = { blockIndex, first, last };
2216         blockInfoList.append(blockInfo);
2217     }
2218 }
2219 #endif
2220 
readScripts()2221 static void readScripts()
2222 {
2223     qDebug("Reading Scripts.txt");
2224 
2225     QFile f("data/Scripts.txt");
2226     if (!f.exists())
2227         qFatal("Couldn't find Scripts.txt");
2228 
2229     f.open(QFile::ReadOnly);
2230 
2231     while (!f.atEnd()) {
2232         QByteArray line = f.readLine();
2233         line.resize(line.size() - 1);
2234 
2235         int comment = line.indexOf("#");
2236         if (comment >= 0)
2237             line = line.left(comment);
2238 
2239         line.replace(" ", "");
2240         line.replace("_", "");
2241 
2242         if (line.isEmpty())
2243             continue;
2244 
2245         int semicolon = line.indexOf(';');
2246         Q_ASSERT(semicolon >= 0);
2247         QByteArray codePoints = line.left(semicolon);
2248         QByteArray scriptName = line.mid(semicolon + 1);
2249 
2250         codePoints.replace("..", ".");
2251         QList<QByteArray> cl = codePoints.split('.');
2252 
2253         bool ok;
2254         int first = cl[0].toInt(&ok, 16);
2255         Q_ASSERT(ok);
2256         int last = first;
2257         if (cl.size() == 2) {
2258             last = cl[1].toInt(&ok, 16);
2259             Q_ASSERT(ok);
2260         }
2261 
2262         if (!scriptMap.contains(scriptName))
2263             qFatal("Unhandled script property value: %s", scriptName.constData());
2264         QChar::Script script = scriptMap.value(scriptName, QChar::Script_Unknown);
2265 
2266         for (int codepoint = first; codepoint <= last; ++codepoint) {
2267             UnicodeData &ud = UnicodeData::valueRef(codepoint);
2268             ud.p.script = script;
2269         }
2270     }
2271 }
2272 
2273 #if 0
2274 static void dump(int from, int to)
2275 {
2276     for (int i = from; i <= to; ++i) {
2277         UnicodeData &d = UnicodeData::valueRef(i);
2278         qDebug("0x%04x: cat=%d combining=%d dir=%d case=%x mirror=%x joining=%d age=%d",
2279                i, d.p.category, d.p.combiningClass, d.p.direction, d.otherCase, d.mirroredChar, d.p.joining, d.p.age);
2280         if (d.decompositionType != QChar::NoDecomposition) {
2281             qDebug("    decomposition: type=%d, length=%d, first=%x", d.decompositionType, d.decomposition.size(),
2282                    d.decomposition[0]);
2283         }
2284     }
2285     qDebug(" ");
2286 }
2287 #endif
2288 
// Distinct PropertyFlags values collected by computeUniqueProperties(); the
// propertyIndex of a UnicodeData entry is an index into this list.
static QList<PropertyFlags> uniqueProperties;
2290 
computeUniqueProperties()2291 static void computeUniqueProperties()
2292 {
2293     qDebug("computeUniqueProperties:");
2294     for (int codepoint = 0; codepoint <= QChar::LastValidCodePoint; ++codepoint) {
2295         UnicodeData &d = UnicodeData::valueRef(codepoint);
2296         int index = uniqueProperties.indexOf(d.p);
2297         if (index == -1) {
2298             index = uniqueProperties.size();
2299             uniqueProperties.append(d.p);
2300         }
2301         d.propertyIndex = index;
2302     }
2303     qDebug("    %d unique unicode properties found", uniqueProperties.size());
2304 }
2305 
2306 struct UniqueBlock {
UniqueBlockUniqueBlock2307     inline UniqueBlock() : index(-1) {}
2308 
operator ==UniqueBlock2309     inline bool operator==(const UniqueBlock &other) const
2310     { return values == other.values; }
2311 
2312     int index;
2313     QVector<int> values;
2314 };
2315 
createPropertyInfo()2316 static QByteArray createPropertyInfo()
2317 {
2318     qDebug("createPropertyInfo:");
2319 
2320     // we reserve one bit more than in the assert below for the sign
2321     Q_ASSERT(maxMirroredDiff < (1<<12));
2322     Q_ASSERT(maxLowerCaseDiff < (1<<13));
2323     Q_ASSERT(maxUpperCaseDiff < (1<<13));
2324     Q_ASSERT(maxTitleCaseDiff < (1<<13));
2325     Q_ASSERT(maxCaseFoldDiff < (1<<13));
2326 
2327     const int BMP_BLOCKSIZE = 32;
2328     const int BMP_SHIFT = 5;
2329     const int BMP_END = 0x11000;
2330     const int SMP_END = 0x110000;
2331     const int SMP_BLOCKSIZE = 256;
2332     const int SMP_SHIFT = 8;
2333 
2334     QList<UniqueBlock> uniqueBlocks;
2335     QVector<int> blockMap;
2336     int used = 0;
2337 
2338     for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
2339         UniqueBlock b;
2340         b.values.reserve(BMP_BLOCKSIZE);
2341         for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
2342             int uc = block*BMP_BLOCKSIZE + i;
2343             UnicodeData &d = UnicodeData::valueRef(uc);
2344             b.values.append(d.propertyIndex);
2345         }
2346         int index = uniqueBlocks.indexOf(b);
2347         if (index == -1) {
2348             index = uniqueBlocks.size();
2349             b.index = used;
2350             used += BMP_BLOCKSIZE;
2351             uniqueBlocks.append(b);
2352         }
2353         blockMap.append(uniqueBlocks.at(index).index);
2354     }
2355     int bmp_blocks = uniqueBlocks.size();
2356 
2357     for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) {
2358         UniqueBlock b;
2359         b.values.reserve(SMP_BLOCKSIZE);
2360         for (int i = 0; i < SMP_BLOCKSIZE; ++i) {
2361             int uc = block*SMP_BLOCKSIZE + i;
2362             UnicodeData &d = UnicodeData::valueRef(uc);
2363             b.values.append(d.propertyIndex);
2364         }
2365         int index = uniqueBlocks.indexOf(b);
2366         if (index == -1) {
2367             index = uniqueBlocks.size();
2368             b.index = used;
2369             used += SMP_BLOCKSIZE;
2370             uniqueBlocks.append(b);
2371         }
2372         blockMap.append(uniqueBlocks.at(index).index);
2373     }
2374     int smp_blocks = uniqueBlocks.size() - bmp_blocks;
2375 
2376     int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*sizeof(unsigned short);
2377     int bmp_trie = BMP_END/BMP_BLOCKSIZE*sizeof(unsigned short);
2378     int bmp_mem = bmp_block_data + bmp_trie;
2379     qDebug("    %d unique blocks in BMP.", bmp_blocks);
2380     qDebug("        block data uses: %d bytes", bmp_block_data);
2381     qDebug("        trie data uses : %d bytes", bmp_trie);
2382 
2383     int smp_block_data = smp_blocks*SMP_BLOCKSIZE*sizeof(unsigned short);
2384     int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*sizeof(unsigned short);
2385     int smp_mem = smp_block_data + smp_trie;
2386     qDebug("    %d unique blocks in SMP.", smp_blocks);
2387     qDebug("        block data uses: %d bytes", smp_block_data);
2388     qDebug("        trie data uses : %d bytes", smp_trie);
2389 
2390     int prop_data = uniqueProperties.size() * SizeOfPropertiesStruct;
2391     qDebug("\n        properties data uses : %d bytes", prop_data);
2392     qDebug("    memory usage: %d bytes", bmp_mem + smp_mem + prop_data);
2393 
2394     Q_ASSERT(blockMap.last() + blockMap.size() < (1<<(sizeof(unsigned short)*8)));
2395 
2396     QByteArray out;
2397 
2398     out += "static const unsigned short uc_property_trie[] = {\n";
2399     // first write the map
2400     out += "    // [0x0..0x" + QByteArray::number(BMP_END, 16) + ")";
2401     for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
2402         if (!(i % 8)) {
2403             if (out.endsWith(' '))
2404                 out.chop(1);
2405             if (!((i*BMP_BLOCKSIZE) % 0x1000))
2406                 out += "\n";
2407             out += "\n    ";
2408         }
2409         out += QByteArray::number(blockMap.at(i) + blockMap.size());
2410         out += ", ";
2411     }
2412     if (out.endsWith(' '))
2413         out.chop(1);
2414     out += "\n\n    // [0x" + QByteArray::number(BMP_END, 16) + "..0x" + QByteArray::number(SMP_END, 16) + ")\n";
2415     for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
2416         if (!(i % 8)) {
2417             if (out.endsWith(' '))
2418                 out.chop(1);
2419             if (!(i % (0x10000/SMP_BLOCKSIZE)))
2420                 out += "\n";
2421             out += "\n    ";
2422         }
2423         out += QByteArray::number(blockMap.at(i) + blockMap.size());
2424         out += ", ";
2425     }
2426     if (out.endsWith(' '))
2427         out.chop(1);
2428     out += "\n";
2429     // write the data
2430     for (int i = 0; i < uniqueBlocks.size(); ++i) {
2431         if (out.endsWith(' '))
2432             out.chop(1);
2433         out += "\n";
2434         const UniqueBlock &b = uniqueBlocks.at(i);
2435         for (int j = 0; j < b.values.size(); ++j) {
2436             if (!(j % 8)) {
2437                 if (out.endsWith(' '))
2438                     out.chop(1);
2439                 out += "\n    ";
2440             }
2441             out += QByteArray::number(b.values.at(j));
2442             out += ", ";
2443         }
2444     }
2445     if (out.endsWith(", "))
2446         out.chop(2);
2447     out += "\n};\n\n";
2448 
2449     out += "#define GET_PROP_INDEX(ucs4) \\\n"
2450            "       (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n"
2451            "        ? (uc_property_trie[uc_property_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) +
2452            "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n"
2453            "        : (uc_property_trie[uc_property_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) +
2454            ")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]"
2455            " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")]))\n\n"
2456            "#define GET_PROP_INDEX_UCS2(ucs2) \\\n"
2457            "       (uc_property_trie[uc_property_trie[ucs2>>" + QByteArray::number(BMP_SHIFT) +
2458            "] + (ucs2 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")])\n\n";
2459 
2460     out += "static const Properties uc_properties[] = {";
2461     // keep in sync with the property declaration
2462     for (int i = 0; i < uniqueProperties.size(); ++i) {
2463         const PropertyFlags &p = uniqueProperties.at(i);
2464         out += "\n    { ";
2465 //     "        ushort category            : 8; /* 5 used */\n"
2466         out += QByteArray::number( p.category );
2467         out += ", ";
2468 //     "        ushort direction           : 8; /* 5 used */\n"
2469         out += QByteArray::number( p.direction );
2470         out += ", ";
2471 //     "        ushort combiningClass      : 8;\n"
2472         out += QByteArray::number( p.combiningClass );
2473         out += ", ";
2474 //     "        ushort joining             : 3;\n"
2475         out += QByteArray::number( p.joining );
2476         out += ", ";
2477 //     "        signed short digitValue    : 5;\n"
2478         out += QByteArray::number( p.digitValue );
2479         out += ", ";
2480 //     "        signed short mirrorDiff    : 16;\n"
2481         out += QByteArray::number( p.mirrorDiff );
2482         out += ", ";
2483 //     "        ushort unicodeVersion      : 8; /* 5 used */\n"
2484         out += QByteArray::number( p.age );
2485         out += ", ";
2486 //     "        ushort nfQuickCheck        : 8;\n"
2487         out += QByteArray::number( p.nfQuickCheck );
2488         out += ", ";
2489 //     "        struct {\n"
2490 //     "            ushort special    : 1;\n"
2491 //     "            signed short diff : 15;\n"
2492 //     "        } cases[NumCases];\n"
2493         out += " { {";
2494         out += QByteArray::number( p.lowerCaseSpecial );
2495         out += ", ";
2496         out += QByteArray::number( p.lowerCaseDiff );
2497         out += "}, {";
2498         out += QByteArray::number( p.upperCaseSpecial );
2499         out += ", ";
2500         out += QByteArray::number( p.upperCaseDiff );
2501         out += "}, {";
2502         out += QByteArray::number( p.titleCaseSpecial );
2503         out += ", ";
2504         out += QByteArray::number( p.titleCaseDiff );
2505         out += "}, {";
2506         out += QByteArray::number( p.caseFoldSpecial );
2507         out += ", ";
2508         out += QByteArray::number( p.caseFoldDiff );
2509         out += "} }, ";
2510 //     "        ushort graphemeBreakClass  : 5; /* 5 used */\n"
2511 //     "        ushort wordBreakClass      : 5; /* 5 used */\n"
2512 //     "        ushort lineBreakClass      : 6; /* 6 used */\n"
2513         out += QByteArray::number( p.graphemeBreakClass );
2514         out += ", ";
2515         out += QByteArray::number( p.wordBreakClass );
2516         out += ", ";
2517         out += QByteArray::number( p.lineBreakClass );
2518         out += ", ";
2519 //     "        ushort sentenceBreakClass  : 8; /* 4 used */\n"
2520         out += QByteArray::number( p.sentenceBreakClass );
2521         out += ", ";
2522 //     "        ushort script              : 8;\n"
2523         out += QByteArray::number( p.script );
2524         out += " },";
2525     }
2526     if (out.endsWith(','))
2527         out.chop(1);
2528     out += "\n};\n\n";
2529 
2530 
2531     out += "Q_DECL_CONST_FUNCTION static inline const Properties *qGetProp(uint ucs4) noexcept\n"
2532            "{\n"
2533            "    return uc_properties + GET_PROP_INDEX(ucs4);\n"
2534            "}\n"
2535            "\n"
2536            "Q_DECL_CONST_FUNCTION static inline const Properties *qGetProp(ushort ucs2) noexcept\n"
2537            "{\n"
2538            "    return uc_properties + GET_PROP_INDEX_UCS2(ucs2);\n"
2539            "}\n"
2540            "\n"
2541            "Q_DECL_CONST_FUNCTION Q_CORE_EXPORT const Properties * QT_FASTCALL properties(uint ucs4) noexcept\n"
2542            "{\n"
2543            "    return qGetProp(ucs4);\n"
2544            "}\n"
2545            "\n"
2546            "Q_DECL_CONST_FUNCTION Q_CORE_EXPORT const Properties * QT_FASTCALL properties(ushort ucs2) noexcept\n"
2547            "{\n"
2548            "    return qGetProp(ucs2);\n"
2549            "}\n\n";
2550 
2551     out += "Q_CORE_EXPORT GraphemeBreakClass QT_FASTCALL graphemeBreakClass(uint ucs4) noexcept\n"
2552            "{\n"
2553            "    return static_cast<GraphemeBreakClass>(qGetProp(ucs4)->graphemeBreakClass);\n"
2554            "}\n"
2555            "\n"
2556            "Q_CORE_EXPORT WordBreakClass QT_FASTCALL wordBreakClass(uint ucs4) noexcept\n"
2557            "{\n"
2558            "    return static_cast<WordBreakClass>(qGetProp(ucs4)->wordBreakClass);\n"
2559            "}\n"
2560            "\n"
2561            "Q_CORE_EXPORT SentenceBreakClass QT_FASTCALL sentenceBreakClass(uint ucs4) noexcept\n"
2562            "{\n"
2563            "    return static_cast<SentenceBreakClass>(qGetProp(ucs4)->sentenceBreakClass);\n"
2564            "}\n"
2565            "\n"
2566            "Q_CORE_EXPORT LineBreakClass QT_FASTCALL lineBreakClass(uint ucs4) noexcept\n"
2567            "{\n"
2568            "    return static_cast<LineBreakClass>(qGetProp(ucs4)->lineBreakClass);\n"
2569            "}\n"
2570            "\n";
2571 
2572     return out;
2573 }
2574 
createSpecialCaseMap()2575 static QByteArray createSpecialCaseMap()
2576 {
2577     qDebug("createSpecialCaseMap:");
2578 
2579     QByteArray out;
2580 
2581     out += "static const unsigned short specialCaseMap[] = {\n"
2582            "    0x0, // placeholder";
2583     int i = 1;
2584     while (i < specialCaseMap.size()) {
2585         out += "\n   ";
2586         int n = specialCaseMap.at(i);
2587         for (int j = 0; j <= n; ++j) {
2588             out += QByteArray(" 0x") + QByteArray::number(specialCaseMap.at(i+j), 16);
2589             out += ",";
2590         }
2591         i += n + 1;
2592     }
2593     out.chop(1);
2594     out += "\n};\n\n";
2595 
2596     qDebug("    memory usage: %ld bytes", specialCaseMap.size()*sizeof(unsigned short));
2597 
2598     return out;
2599 }
2600 
2601 
createCompositionInfo()2602 static QByteArray createCompositionInfo()
2603 {
2604     qDebug("createCompositionInfo: highestComposedCharacter=0x%x", highestComposedCharacter);
2605 
2606     const int BMP_BLOCKSIZE = 16;
2607     const int BMP_SHIFT = 4;
2608     const int BMP_END = 0x3400; // start of Han
2609     const int SMP_END = 0x30000;
2610     const int SMP_BLOCKSIZE = 256;
2611     const int SMP_SHIFT = 8;
2612 
2613     if (SMP_END <= highestComposedCharacter)
2614         qFatal("end of table smaller than highest composed character 0x%x", highestComposedCharacter);
2615 
2616     QVector<unsigned short> decompositions;
2617     int tableIndex = 0;
2618 
2619     QList<UniqueBlock> uniqueBlocks;
2620     QVector<int> blockMap;
2621     int used = 0;
2622 
2623     for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
2624         UniqueBlock b;
2625         b.values.reserve(BMP_BLOCKSIZE);
2626         for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
2627             int uc = block*BMP_BLOCKSIZE + i;
2628             UnicodeData &d = UnicodeData::valueRef(uc);
2629             if (!d.decomposition.isEmpty()) {
2630                 int utf16Length = 0;
2631                 decompositions.append(0);
2632                 for (int j = 0; j < d.decomposition.size(); ++j) {
2633                     int code = d.decomposition.at(j);
2634                     if (QChar::requiresSurrogates(code)) {
2635                         // save as surrogate pair
2636                         decompositions.append(QChar::highSurrogate(code));
2637                         decompositions.append(QChar::lowSurrogate(code));
2638                         utf16Length += 2;
2639                     } else {
2640                         decompositions.append(code);
2641                         utf16Length++;
2642                     }
2643                 }
2644                 decompositions[tableIndex] = d.decompositionType + (utf16Length<<8);
2645                 b.values.append(tableIndex);
2646                 tableIndex += utf16Length + 1;
2647             } else {
2648                 b.values.append(0xffff);
2649             }
2650         }
2651         int index = uniqueBlocks.indexOf(b);
2652         if (index == -1) {
2653             index = uniqueBlocks.size();
2654             b.index = used;
2655             used += BMP_BLOCKSIZE;
2656             uniqueBlocks.append(b);
2657         }
2658         blockMap.append(uniqueBlocks.at(index).index);
2659     }
2660     int bmp_blocks = uniqueBlocks.size();
2661 
2662     for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) {
2663         UniqueBlock b;
2664         b.values.reserve(SMP_BLOCKSIZE);
2665         for (int i = 0; i < SMP_BLOCKSIZE; ++i) {
2666             int uc = block*SMP_BLOCKSIZE + i;
2667             UnicodeData &d = UnicodeData::valueRef(uc);
2668             if (!d.decomposition.isEmpty()) {
2669                 int utf16Length = 0;
2670                 decompositions.append(0);
2671                 for (int j = 0; j < d.decomposition.size(); ++j) {
2672                     int code = d.decomposition.at(j);
2673                     if (QChar::requiresSurrogates(code)) {
2674                         // save as surrogate pair
2675                         decompositions.append(QChar::highSurrogate(code));
2676                         decompositions.append(QChar::lowSurrogate(code));
2677                         utf16Length += 2;
2678                     } else {
2679                         decompositions.append(code);
2680                         utf16Length++;
2681                     }
2682                 }
2683                 decompositions[tableIndex] = d.decompositionType + (utf16Length<<8);
2684                 b.values.append(tableIndex);
2685                 tableIndex += utf16Length + 1;
2686             } else {
2687                 b.values.append(0xffff);
2688             }
2689         }
2690         int index = uniqueBlocks.indexOf(b);
2691         if (index == -1) {
2692             index = uniqueBlocks.size();
2693             b.index = used;
2694             used += SMP_BLOCKSIZE;
2695             uniqueBlocks.append(b);
2696         }
2697         blockMap.append(uniqueBlocks.at(index).index);
2698     }
2699     int smp_blocks = uniqueBlocks.size() - bmp_blocks;
2700 
2701     // if the condition below doesn't hold anymore we need to modify our decomposition code
2702     Q_ASSERT(tableIndex < 0xffff);
2703 
2704     int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*sizeof(unsigned short);
2705     int bmp_trie = BMP_END/BMP_BLOCKSIZE*sizeof(unsigned short);
2706     int bmp_mem = bmp_block_data + bmp_trie;
2707     qDebug("    %d unique blocks in BMP.", bmp_blocks);
2708     qDebug("        block data uses: %d bytes", bmp_block_data);
2709     qDebug("        trie data uses : %d bytes", bmp_trie);
2710 
2711     int smp_block_data = smp_blocks*SMP_BLOCKSIZE*sizeof(unsigned short);
2712     int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*sizeof(unsigned short);
2713     int smp_mem = smp_block_data + smp_trie;
2714     qDebug("    %d unique blocks in SMP.", smp_blocks);
2715     qDebug("        block data uses: %d bytes", smp_block_data);
2716     qDebug("        trie data uses : %d bytes", smp_trie);
2717 
2718     int decomposition_data = decompositions.size() * 2;
2719     qDebug("\n        decomposition data uses : %d bytes", decomposition_data);
2720     qDebug("    memory usage: %d bytes", bmp_mem + smp_mem + decomposition_data);
2721 
2722     Q_ASSERT(blockMap.last() + blockMap.size() < (1<<(sizeof(unsigned short)*8)));
2723 
2724     QByteArray out;
2725 
2726     out += "static const unsigned short uc_decomposition_trie[] = {\n";
2727     // first write the map
2728     out += "    // 0 - 0x" + QByteArray::number(BMP_END, 16);
2729     for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
2730         if (!(i % 8)) {
2731             if (out.endsWith(' '))
2732                 out.chop(1);
2733             if (!((i*BMP_BLOCKSIZE) % 0x1000))
2734                 out += "\n";
2735             out += "\n    ";
2736         }
2737         out += QByteArray::number(blockMap.at(i) + blockMap.size());
2738         out += ", ";
2739     }
2740     if (out.endsWith(' '))
2741         out.chop(1);
2742     out += "\n\n    // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";
2743     for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
2744         if (!(i % 8)) {
2745             if (out.endsWith(' '))
2746                 out.chop(1);
2747             if (!(i % (0x10000/SMP_BLOCKSIZE)))
2748                 out += "\n";
2749             out += "\n    ";
2750         }
2751         out += QByteArray::number(blockMap.at(i) + blockMap.size());
2752         out += ", ";
2753     }
2754     if (out.endsWith(' '))
2755         out.chop(1);
2756     out += "\n";
2757     // write the data
2758     for (int i = 0; i < uniqueBlocks.size(); ++i) {
2759         if (out.endsWith(' '))
2760             out.chop(1);
2761         out += "\n";
2762         const UniqueBlock &b = uniqueBlocks.at(i);
2763         for (int j = 0; j < b.values.size(); ++j) {
2764             if (!(j % 8)) {
2765                 if (out.endsWith(' '))
2766                     out.chop(1);
2767                 out += "\n    ";
2768             }
2769             out += "0x" + QByteArray::number(b.values.at(j), 16);
2770             out += ", ";
2771         }
2772     }
2773     if (out.endsWith(' '))
2774         out.chop(2);
2775     out += "\n};\n\n";
2776 
2777     out += "#define GET_DECOMPOSITION_INDEX(ucs4) \\\n"
2778            "       (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n"
2779            "        ? (uc_decomposition_trie[uc_decomposition_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) +
2780            "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n"
2781            "        : (ucs4 < 0x" + QByteArray::number(SMP_END, 16) + " \\\n"
2782            "           ? uc_decomposition_trie[uc_decomposition_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) +
2783            ")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]"
2784            " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")] \\\n"
2785            "           : 0xffff))\n\n";
2786 
2787     out += "static const unsigned short uc_decomposition_map[] = {";
2788     for (int i = 0; i < decompositions.size(); ++i) {
2789         if (!(i % 8)) {
2790             if (out.endsWith(' '))
2791                 out.chop(1);
2792             out += "\n    ";
2793         }
2794         out += "0x" + QByteArray::number(decompositions.at(i), 16);
2795         out += ", ";
2796     }
2797     if (out.endsWith(' '))
2798         out.chop(2);
2799     out += "\n};\n\n";
2800 
2801     return out;
2802 }
2803 
createLigatureInfo()2804 static QByteArray createLigatureInfo()
2805 {
2806     qDebug("createLigatureInfo: numLigatures=%d, highestLigature=0x%x", numLigatures, highestLigature);
2807 
2808     for (int i = 0; i < ligatureHashes.size(); ++i) {
2809         const QList<Ligature> &l = ligatureHashes.value(i);
2810         for (int j = 0; j < l.size(); ++j) {
2811             // if the condition below doesn't hold anymore we need to modify our ligatureHelper code
2812             Q_ASSERT(QChar::requiresSurrogates(l.at(j).u2) == QChar::requiresSurrogates(l.at(j).ligature) &&
2813                      QChar::requiresSurrogates(l.at(j).u2) == QChar::requiresSurrogates(l.at(j).u1));
2814         }
2815     }
2816 
2817     const int BMP_BLOCKSIZE = 32;
2818     const int BMP_SHIFT = 5;
2819     const int BMP_END = 0x3100;
2820     const int SMP_END = 0x12000;
2821     const int SMP_BLOCKSIZE = 256;
2822     const int SMP_SHIFT = 8;
2823 
2824     if (SMP_END <= highestLigature)
2825         qFatal("end of table smaller than highest ligature character 0x%x", highestLigature);
2826 
2827     QList<unsigned short> ligatures;
2828     int tableIndex = 0;
2829 
2830     QList<UniqueBlock> uniqueBlocks;
2831     QVector<int> blockMap;
2832     int used = 0;
2833 
2834     for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
2835         UniqueBlock b;
2836         b.values.reserve(BMP_BLOCKSIZE);
2837         for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
2838             int uc = block*BMP_BLOCKSIZE + i;
2839             QList<Ligature> l = ligatureHashes.value(uc);
2840             if (!l.isEmpty()) {
2841                 Q_ASSERT(!QChar::requiresSurrogates(uc));
2842                 std::sort(l.begin(), l.end()); // needed for bsearch in ligatureHelper code
2843 
2844                 ligatures.append(l.size());
2845                 for (int j = 0; j < l.size(); ++j) {
2846                     ligatures.append(l.at(j).u1);
2847                     ligatures.append(l.at(j).ligature);
2848                 }
2849                 b.values.append(tableIndex);
2850                 tableIndex += 2*l.size() + 1;
2851             } else {
2852                 b.values.append(0xffff);
2853             }
2854         }
2855         int index = uniqueBlocks.indexOf(b);
2856         if (index == -1) {
2857             index = uniqueBlocks.size();
2858             b.index = used;
2859             used += BMP_BLOCKSIZE;
2860             uniqueBlocks.append(b);
2861         }
2862         blockMap.append(uniqueBlocks.at(index).index);
2863     }
2864     int bmp_blocks = uniqueBlocks.size();
2865 
2866     for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) {
2867         UniqueBlock b;
2868         b.values.reserve(SMP_BLOCKSIZE);
2869         for (int i = 0; i < SMP_BLOCKSIZE; ++i) {
2870             int uc = block*SMP_BLOCKSIZE + i;
2871             QList<Ligature> l = ligatureHashes.value(uc);
2872             if (!l.isEmpty()) {
2873                 Q_ASSERT(QChar::requiresSurrogates(uc));
2874                 std::sort(l.begin(), l.end()); // needed for bsearch in ligatureHelper code
2875 
2876                 ligatures.append(l.size());
2877                 for (int j = 0; j < l.size(); ++j) {
2878                     ligatures.append(QChar::highSurrogate(l.at(j).u1));
2879                     ligatures.append(QChar::lowSurrogate(l.at(j).u1));
2880                     ligatures.append(QChar::highSurrogate(l.at(j).ligature));
2881                     ligatures.append(QChar::lowSurrogate(l.at(j).ligature));
2882                 }
2883                 b.values.append(tableIndex);
2884                 tableIndex += 4*l.size() + 1;
2885             } else {
2886                 b.values.append(0xffff);
2887             }
2888         }
2889         int index = uniqueBlocks.indexOf(b);
2890         if (index == -1) {
2891             index = uniqueBlocks.size();
2892             b.index = used;
2893             used += SMP_BLOCKSIZE;
2894             uniqueBlocks.append(b);
2895         }
2896         blockMap.append(uniqueBlocks.at(index).index);
2897     }
2898     int smp_blocks = uniqueBlocks.size() - bmp_blocks;
2899 
2900     // if the condition below doesn't hold anymore we need to modify our composition code
2901     Q_ASSERT(tableIndex < 0xffff);
2902 
2903     int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*sizeof(unsigned short);
2904     int bmp_trie = BMP_END/BMP_BLOCKSIZE*sizeof(unsigned short);
2905     int bmp_mem = bmp_block_data + bmp_trie;
2906     qDebug("    %d unique blocks in BMP.", bmp_blocks);
2907     qDebug("        block data uses: %d bytes", bmp_block_data);
2908     qDebug("        trie data uses : %d bytes", bmp_trie);
2909 
2910     int smp_block_data = smp_blocks*SMP_BLOCKSIZE*sizeof(unsigned short);
2911     int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*sizeof(unsigned short);
2912     int smp_mem = smp_block_data + smp_trie;
2913     qDebug("    %d unique blocks in SMP.", smp_blocks);
2914     qDebug("        block data uses: %d bytes", smp_block_data);
2915     qDebug("        trie data uses : %d bytes", smp_trie);
2916 
2917     int ligature_data = ligatures.size() * 2;
2918     qDebug("\n        ligature data uses : %d bytes", ligature_data);
2919     qDebug("    memory usage: %d bytes", bmp_mem + smp_mem + ligature_data);
2920 
2921     Q_ASSERT(blockMap.last() + blockMap.size() < (1<<(sizeof(unsigned short)*8)));
2922 
2923     QByteArray out;
2924 
2925     out += "static const unsigned short uc_ligature_trie[] = {\n";
2926     // first write the map
2927     out += "    // 0 - 0x" + QByteArray::number(BMP_END, 16);
2928     for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
2929         if (!(i % 8)) {
2930             if (out.endsWith(' '))
2931                 out.chop(1);
2932             if (!((i*BMP_BLOCKSIZE) % 0x1000))
2933                 out += "\n";
2934             out += "\n    ";
2935         }
2936         out += QByteArray::number(blockMap.at(i) + blockMap.size());
2937         out += ", ";
2938     }
2939     if (out.endsWith(' '))
2940         out.chop(1);
2941     out += "\n\n    // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";
2942     for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
2943         if (!(i % 8)) {
2944             if (out.endsWith(' '))
2945                 out.chop(1);
2946             if (!(i % (0x10000/SMP_BLOCKSIZE)))
2947                 out += "\n";
2948             out += "\n    ";
2949         }
2950         out += QByteArray::number(blockMap.at(i) + blockMap.size());
2951         out += ", ";
2952     }
2953     if (out.endsWith(' '))
2954         out.chop(1);
2955     out += "\n";
2956     // write the data
2957     for (int i = 0; i < uniqueBlocks.size(); ++i) {
2958         if (out.endsWith(' '))
2959             out.chop(1);
2960         out += "\n";
2961         const UniqueBlock &b = uniqueBlocks.at(i);
2962         for (int j = 0; j < b.values.size(); ++j) {
2963             if (!(j % 8)) {
2964                 if (out.endsWith(' '))
2965                     out.chop(1);
2966                 out += "\n    ";
2967             }
2968             out += "0x" + QByteArray::number(b.values.at(j), 16);
2969             out += ", ";
2970         }
2971     }
2972     if (out.endsWith(' '))
2973         out.chop(2);
2974     out += "\n};\n\n";
2975 
2976     out += "#define GET_LIGATURE_INDEX(ucs4) \\\n"
2977            "       (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n"
2978            "        ? (uc_ligature_trie[uc_ligature_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) +
2979            "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n"
2980            "        : (ucs4 < 0x" + QByteArray::number(SMP_END, 16) + " \\\n"
2981            "           ? uc_ligature_trie[uc_ligature_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) +
2982            ")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]"
2983            " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")] \\\n"
2984            "           : 0xffff))\n\n";
2985 
2986     out += "static const unsigned short uc_ligature_map[] = {";
2987     for (int i = 0; i < ligatures.size(); ++i) {
2988         if (!(i % 8)) {
2989             if (out.endsWith(' '))
2990                 out.chop(1);
2991             out += "\n    ";
2992         }
2993         out += "0x" + QByteArray::number(ligatures.at(i), 16);
2994         out += ", ";
2995     }
2996     if (out.endsWith(' '))
2997         out.chop(2);
2998     out += "\n};\n\n";
2999 
3000     return out;
3001 }
3002 
createCasingInfo()3003 QByteArray createCasingInfo()
3004 {
3005     QByteArray out;
3006 
3007     out += "struct CasingInfo {\n"
3008            "    uint codePoint : 16;\n"
3009            "    uint flags : 8;\n"
3010            "    uint offset : 8;\n"
3011            "};\n\n";
3012 
3013     return out;
3014 }
3015 
3016 
main(int,char **)3017 int main(int, char **)
3018 {
3019     initAgeMap();
3020     initCategoryMap();
3021     initDecompositionMap();
3022     initDirectionMap();
3023     initJoiningMap();
3024     initGraphemeBreak();
3025     initWordBreak();
3026     initSentenceBreak();
3027     initLineBreak();
3028     initScriptMap();
3029 
3030     readUnicodeData();
3031     readBidiMirroring();
3032     readArabicShaping();
3033     readDerivedAge();
3034     readDerivedNormalizationProps();
3035     readSpecialCasing();
3036     readCaseFolding();
3037     // readBlocks();
3038     readScripts();
3039     readGraphemeBreak();
3040     readWordBreak();
3041     readSentenceBreak();
3042     readLineBreak();
3043 
3044     computeUniqueProperties();
3045     QByteArray properties = createPropertyInfo();
3046     QByteArray specialCases = createSpecialCaseMap();
3047     QByteArray compositions = createCompositionInfo();
3048     QByteArray ligatures = createLigatureInfo();
3049     QByteArray normalizationCorrections = createNormalizationCorrections();
3050 
3051     QByteArray header =
3052         "/****************************************************************************\n"
3053         "**\n"
3054         "** Copyright (C) 2020 The Qt Company Ltd.\n"
3055         "** Contact: https://www.qt.io/licensing/\n"
3056         "**\n"
3057         "** This file is part of the QtCore module of the Qt Toolkit.\n"
3058         "**\n"
3059         "** $QT_BEGIN_LICENSE:LGPL$\n"
3060         "** Commercial License Usage\n"
3061         "** Licensees holding valid commercial Qt licenses may use this file in\n"
3062         "** accordance with the commercial license agreement provided with the\n"
3063         "** Software or, alternatively, in accordance with the terms contained in\n"
3064         "** a written agreement between you and The Qt Company. For licensing terms\n"
3065         "** and conditions see https://www.qt.io/terms-conditions. For further\n"
3066         "** information use the contact form at https://www.qt.io/contact-us.\n"
3067         "**\n"
3068         "** GNU Lesser General Public License Usage\n"
3069         "** Alternatively, this file may be used under the terms of the GNU Lesser\n"
3070         "** General Public License version 3 as published by the Free Software\n"
3071         "** Foundation and appearing in the file LICENSE.LGPL3 included in the\n"
3072         "** packaging of this file. Please review the following information to\n"
3073         "** ensure the GNU Lesser General Public License version 3 requirements\n"
3074         "** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.\n"
3075         "**\n"
3076         "** GNU General Public License Usage\n"
3077         "** Alternatively, this file may be used under the terms of the GNU\n"
3078         "** General Public License version 2.0 or (at your option) the GNU General\n"
3079         "** Public license version 3 or any later version approved by the KDE Free\n"
3080         "** Qt Foundation. The licenses are as published by the Free Software\n"
3081         "** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3\n"
3082         "** included in the packaging of this file. Please review the following\n"
3083         "** information to ensure the GNU General Public License requirements will\n"
3084         "** be met: https://www.gnu.org/licenses/gpl-2.0.html and\n"
3085         "** https://www.gnu.org/licenses/gpl-3.0.html.\n"
3086         "**\n"
3087         "** $QT_END_LICENSE$\n"
3088         "**\n"
3089         "****************************************************************************/\n\n";
3090 
3091     QByteArray note =
3092         "/* This file is autogenerated from the Unicode " DATA_VERSION_S " database. Do not edit */\n\n";
3093 
3094     QByteArray warning =
3095         "//\n"
3096         "//  W A R N I N G\n"
3097         "//  -------------\n"
3098         "//\n"
3099         "// This file is not part of the Qt API.  It exists for the convenience\n"
3100         "// of internal files.  This header file may change from version to version\n"
3101         "// without notice, or even be removed.\n"
3102         "//\n"
3103         "// We mean it.\n"
3104         "//\n\n";
3105 
3106     QFile f("../../src/corelib/text/qunicodetables.cpp");
3107     f.open(QFile::WriteOnly|QFile::Truncate);
3108     f.write(header);
3109     f.write(note);
3110     f.write("#include \"qunicodetables_p.h\"\n\n");
3111     f.write("QT_BEGIN_NAMESPACE\n\n");
3112     f.write("namespace QUnicodeTables {\n\n");
3113     f.write(properties);
3114     f.write("\n");
3115     f.write(specialCases);
3116     f.write("\n");
3117     f.write(compositions);
3118     f.write(ligatures);
3119     f.write("\n");
3120     f.write(normalizationCorrections);
3121     f.write("} // namespace QUnicodeTables\n\n");
3122     f.write("using namespace QUnicodeTables;\n\n");
3123     f.write("QT_END_NAMESPACE\n");
3124     f.close();
3125 
3126     f.setFileName("../../src/corelib/text/qunicodetables_p.h");
3127     f.open(QFile::WriteOnly | QFile::Truncate);
3128     f.write(header);
3129     f.write(note);
3130     f.write(warning);
3131     f.write("#ifndef QUNICODETABLES_P_H\n"
3132             "#define QUNICODETABLES_P_H\n\n"
3133             "#include <QtCore/private/qglobal_p.h>\n\n"
3134             "#include <QtCore/qchar.h>\n\n"
3135             "QT_BEGIN_NAMESPACE\n\n");
3136     f.write("#define UNICODE_DATA_VERSION " DATA_VERSION_STR "\n\n");
3137     f.write("namespace QUnicodeTables {\n\n");
3138     f.write(property_string);
3139     f.write(sizeOfPropertiesStructCheck);
3140     f.write(grapheme_break_class_string);
3141     f.write(word_break_class_string);
3142     f.write(sentence_break_class_string);
3143     f.write(line_break_class_string);
3144     f.write(methods);
3145     f.write("} // namespace QUnicodeTables\n\n"
3146             "QT_END_NAMESPACE\n\n"
3147             "#endif // QUNICODETABLES_P_H\n");
3148     f.close();
3149 
3150     qDebug() << "maxMirroredDiff  = " << hex << maxMirroredDiff;
3151     qDebug() << "maxLowerCaseDiff = " << hex << maxLowerCaseDiff;
3152     qDebug() << "maxUpperCaseDiff = " << hex << maxUpperCaseDiff;
3153     qDebug() << "maxTitleCaseDiff = " << hex << maxTitleCaseDiff;
3154     qDebug() << "maxCaseFoldDiff  = " << hex << maxCaseFoldDiff;
3155 #if 0
3156 //     dump(0, 0x7f);
3157 //     dump(0x620, 0x640);
3158 //     dump(0x10000, 0x10020);
3159 //     dump(0x10800, 0x10820);
3160 
3161     qDebug("decompositionLength used:");
3162     int totalcompositions = 0;
3163     int sum = 0;
3164     for (int i = 1; i < 20; ++i) {
3165         qDebug("    length %d used %d times", i, decompositionLength.value(i, 0));
3166         totalcompositions += i*decompositionLength.value(i, 0);
3167         sum += decompositionLength.value(i, 0);
3168     }
3169     qDebug("    len decomposition map %d, average length %f, num composed chars %d",
3170            totalcompositions, (float)totalcompositions/(float)sum, sum);
3171     qDebug("highest composed character %x", highestComposedCharacter);
3172     qDebug("num ligatures = %d highest=%x, maxLength=%d", numLigatures, highestLigature, longestLigature);
3173 
3174     qBubbleSort(ligatures);
3175     for (int i = 0; i < ligatures.size(); ++i)
3176         qDebug("%s", ligatures.at(i).data());
3177 
3178 //     qDebug("combiningClass usage:");
3179 //     int numClasses = 0;
3180 //     for (int i = 0; i < 255; ++i) {
3181 //         int num = combiningClassUsage.value(i, 0);
3182 //         if (num) {
3183 //             ++numClasses;
3184 //             qDebug("    combiningClass %d used %d times", i, num);
3185 //         }
3186 //     }
3187 //     qDebug("total of %d combining classes used", numClasses);
3188 
3189 #endif
3190 }
3191