1 /****************************************************************************
2 **
3 ** Copyright (C) 2019 The Qt Company Ltd.
4 ** Contact: https://www.qt.io/licensing/
5 **
6 ** This file is part of the utils of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:GPL-EXCEPT$
9 ** Commercial License Usage
10 ** Licensees holding valid commercial Qt licenses may use this file in
11 ** accordance with the commercial license agreement provided with the
12 ** Software or, alternatively, in accordance with the terms contained in
13 ** a written agreement between you and The Qt Company. For licensing terms
14 ** and conditions see https://www.qt.io/terms-conditions. For further
15 ** information use the contact form at https://www.qt.io/contact-us.
16 **
17 ** GNU General Public License Usage
18 ** Alternatively, this file may be used under the terms of the GNU
19 ** General Public License version 3 as published by the Free Software
20 ** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
21 ** included in the packaging of this file. Please review the following
22 ** information to ensure the GNU General Public License requirements will
23 ** be met: https://www.gnu.org/licenses/gpl-3.0.html.
24 **
25 ** $QT_END_LICENSE$
26 **
27 ****************************************************************************/
28
29 #include <qlist.h>
30 #include <qhash.h>
31 #include <qfile.h>
32 #include <qbytearray.h>
33 #include <qstring.h>
34 #include <qchar.h>
35 #include <qvector.h>
36 #include <qdebug.h>
37 #if 0
38 #include <private/qunicodetables_p.h>
39 #endif
40
// Version of the Unicode data, spelled two ways: as a bare version string and as
// the textual name of the corresponding QChar::UnicodeVersion enumerator (a string
// literal, not the enum value itself).
// NOTE(review): the uses of both macros are outside this view.
#define DATA_VERSION_S "13.0"
#define DATA_VERSION_STR "QChar::Unicode_13_0"
43
44
45 static QHash<QByteArray, QChar::UnicodeVersion> age_map;
46
initAgeMap()47 static void initAgeMap()
48 {
49 struct AgeMap {
50 const QChar::UnicodeVersion version;
51 const char *age;
52 } ageMap[] = {
53 { QChar::Unicode_1_1, "1.1" },
54 { QChar::Unicode_2_0, "2.0" },
55 { QChar::Unicode_2_1_2, "2.1" },
56 { QChar::Unicode_3_0, "3.0" },
57 { QChar::Unicode_3_1, "3.1" },
58 { QChar::Unicode_3_2, "3.2" },
59 { QChar::Unicode_4_0, "4.0" },
60 { QChar::Unicode_4_1, "4.1" },
61 { QChar::Unicode_5_0, "5.0" },
62 { QChar::Unicode_5_1, "5.1" },
63 { QChar::Unicode_5_2, "5.2" },
64 { QChar::Unicode_6_0, "6.0" },
65 { QChar::Unicode_6_1, "6.1" },
66 { QChar::Unicode_6_2, "6.2" },
67 { QChar::Unicode_6_3, "6.3" },
68 { QChar::Unicode_7_0, "7.0" },
69 { QChar::Unicode_8_0, "8.0" },
70 { QChar::Unicode_9_0, "9.0" },
71 { QChar::Unicode_10_0, "10.0" },
72 { QChar::Unicode_11_0, "11.0" },
73 { QChar::Unicode_12_0, "12.0" },
74 { QChar::Unicode_12_1, "12.1" }, // UCD Revision 24
75 { QChar::Unicode_13_0, "13.0" }, // UCD Revision 26
76 { QChar::Unicode_Unassigned, 0 }
77 };
78 AgeMap *d = ageMap;
79 while (d->age) {
80 age_map.insert(d->age, d->version);
81 ++d;
82 }
83 }
84
85 static QHash<QByteArray, QChar::Category> categoryMap;
86
initCategoryMap()87 static void initCategoryMap()
88 {
89 struct Cat {
90 QChar::Category cat;
91 const char *name;
92 } categories[] = {
93 { QChar::Mark_NonSpacing, "Mn" },
94 { QChar::Mark_SpacingCombining, "Mc" },
95 { QChar::Mark_Enclosing, "Me" },
96
97 { QChar::Number_DecimalDigit, "Nd" },
98 { QChar::Number_Letter, "Nl" },
99 { QChar::Number_Other, "No" },
100
101 { QChar::Separator_Space, "Zs" },
102 { QChar::Separator_Line, "Zl" },
103 { QChar::Separator_Paragraph, "Zp" },
104
105 { QChar::Other_Control, "Cc" },
106 { QChar::Other_Format, "Cf" },
107 { QChar::Other_Surrogate, "Cs" },
108 { QChar::Other_PrivateUse, "Co" },
109 { QChar::Other_NotAssigned, "Cn" },
110
111 { QChar::Letter_Uppercase, "Lu" },
112 { QChar::Letter_Lowercase, "Ll" },
113 { QChar::Letter_Titlecase, "Lt" },
114 { QChar::Letter_Modifier, "Lm" },
115 { QChar::Letter_Other, "Lo" },
116
117 { QChar::Punctuation_Connector, "Pc" },
118 { QChar::Punctuation_Dash, "Pd" },
119 { QChar::Punctuation_Open, "Ps" },
120 { QChar::Punctuation_Close, "Pe" },
121 { QChar::Punctuation_InitialQuote, "Pi" },
122 { QChar::Punctuation_FinalQuote, "Pf" },
123 { QChar::Punctuation_Other, "Po" },
124
125 { QChar::Symbol_Math, "Sm" },
126 { QChar::Symbol_Currency, "Sc" },
127 { QChar::Symbol_Modifier, "Sk" },
128 { QChar::Symbol_Other, "So" },
129 { QChar::Other_NotAssigned, 0 }
130 };
131 Cat *c = categories;
132 while (c->name) {
133 categoryMap.insert(c->name, c->cat);
134 ++c;
135 }
136 }
137
138
139 static QHash<QByteArray, QChar::Decomposition> decompositionMap;
140
initDecompositionMap()141 static void initDecompositionMap()
142 {
143 struct Dec {
144 QChar::Decomposition dec;
145 const char *name;
146 } decompositions[] = {
147 { QChar::Canonical, "<canonical>" },
148 { QChar::Font, "<font>" },
149 { QChar::NoBreak, "<noBreak>" },
150 { QChar::Initial, "<initial>" },
151 { QChar::Medial, "<medial>" },
152 { QChar::Final, "<final>" },
153 { QChar::Isolated, "<isolated>" },
154 { QChar::Circle, "<circle>" },
155 { QChar::Super, "<super>" },
156 { QChar::Sub, "<sub>" },
157 { QChar::Vertical, "<vertical>" },
158 { QChar::Wide, "<wide>" },
159 { QChar::Narrow, "<narrow>" },
160 { QChar::Small, "<small>" },
161 { QChar::Square, "<square>" },
162 { QChar::Compat, "<compat>" },
163 { QChar::Fraction, "<fraction>" },
164 { QChar::NoDecomposition, 0 }
165 };
166 Dec *d = decompositions;
167 while (d->name) {
168 decompositionMap.insert(d->name, d->dec);
169 ++d;
170 }
171 }
172
173
// Local copy of the QChar direction constants: every enumerator takes the value of
// the QChar enumerator of the same name. Dir_Unassigned has no QChar counterpart;
// it follows DirPDI and therefore gets the value QChar::DirPDI + 1.
enum Direction {
    DirL = QChar::DirL,
    DirR = QChar::DirR,
    DirEN = QChar::DirEN,
    DirES = QChar::DirES,
    DirET = QChar::DirET,
    DirAN = QChar::DirAN,
    DirCS = QChar::DirCS,
    DirB = QChar::DirB,
    DirS = QChar::DirS,
    DirWS = QChar::DirWS,
    DirON = QChar::DirON,
    DirLRE = QChar::DirLRE,
    DirLRO = QChar::DirLRO,
    DirAL = QChar::DirAL,
    DirRLE = QChar::DirRLE,
    DirRLO = QChar::DirRLO,
    DirPDF = QChar::DirPDF,
    DirNSM = QChar::DirNSM,
    DirBN = QChar::DirBN,
    DirLRI = QChar::DirLRI,
    DirRLI = QChar::DirRLI,
    DirFSI = QChar::DirFSI,
    DirPDI = QChar::DirPDI,

    // extra value with no QChar equivalent
    Dir_Unassigned
};
201
202 static QHash<QByteArray, Direction> directionMap;
203
initDirectionMap()204 static void initDirectionMap()
205 {
206 struct Dir {
207 Direction dir;
208 const char *name;
209 } directions[] = {
210 { DirL, "L" },
211 { DirR, "R" },
212 { DirEN, "EN" },
213 { DirES, "ES" },
214 { DirET, "ET" },
215 { DirAN, "AN" },
216 { DirCS, "CS" },
217 { DirB, "B" },
218 { DirS, "S" },
219 { DirWS, "WS" },
220 { DirON, "ON" },
221 { DirLRE, "LRE" },
222 { DirLRO, "LRO" },
223 { DirAL, "AL" },
224 { DirRLE, "RLE" },
225 { DirRLO, "RLO" },
226 { DirPDF, "PDF" },
227 { DirNSM, "NSM" },
228 { DirBN, "BN" },
229 { DirLRI, "LRI" },
230 { DirRLI, "RLI" },
231 { DirFSI, "FSI" },
232 { DirPDI, "PDI" },
233 { Dir_Unassigned, 0 }
234 };
235 Dir *d = directions;
236 while (d->name) {
237 directionMap.insert(d->name, d->dir);
238 ++d;
239 }
240 }
241
242
// Joining types, numbered from 0 in declaration order. The single-letter name each
// value is registered under in joining_map is noted next to it.
enum JoiningType {
    Joining_None,        // "U"
    Joining_Causing,     // "C"
    Joining_Dual,        // "D"
    Joining_Right,       // "R"
    Joining_Left,        // "L"
    Joining_Transparent, // "T"

    // extra trailing value; it is not registered under any name
    Joining_Unassigned
};
253
254 static QHash<QByteArray, JoiningType> joining_map;
255
initJoiningMap()256 static void initJoiningMap()
257 {
258 struct JoiningList {
259 JoiningType joining;
260 const char *name;
261 } joinings[] = {
262 { Joining_None, "U" },
263 { Joining_Causing, "C" },
264 { Joining_Dual, "D" },
265 { Joining_Right, "R" },
266 { Joining_Left, "L" },
267 { Joining_Transparent, "T" },
268 { Joining_Unassigned, 0 }
269 };
270 JoiningList *d = joinings;
271 while (d->name) {
272 joining_map.insert(d->name, d->joining);
273 ++d;
274 }
275 }
276
277
// C++ source text for an "enum GraphemeBreakClass" declaration, kept as a string.
// The enumerators appear in the same order as in the GraphemeBreakClass enum that
// this file declares for its own use; the text ends with NumGraphemeBreakClasses
// where the local enum ends with GraphemeBreak_Unassigned.
// NOTE(review): presumably written verbatim into generated output - the code that
// consumes this string is outside this view.
static const char *grapheme_break_class_string =
    "enum GraphemeBreakClass {\n"
    " GraphemeBreak_Any,\n"
    " GraphemeBreak_CR,\n"
    " GraphemeBreak_LF,\n"
    " GraphemeBreak_Control,\n"
    " GraphemeBreak_Extend,\n"
    " GraphemeBreak_ZWJ,\n"
    " GraphemeBreak_RegionalIndicator,\n"
    " GraphemeBreak_Prepend,\n"
    " GraphemeBreak_SpacingMark,\n"
    " GraphemeBreak_L,\n"
    " GraphemeBreak_V,\n"
    " GraphemeBreak_T,\n"
    " GraphemeBreak_LV,\n"
    " GraphemeBreak_LVT,\n"
    " Graphemebreak_E_Base,\n"
    " Graphemebreak_E_Modifier,\n"
    " Graphemebreak_Glue_After_Zwj,\n"
    " Graphemebreak_E_Base_GAZ,\n"
    "\n"
    " NumGraphemeBreakClasses\n"
    "};\n\n";
301
// Grapheme break classes as used inside this tool, numbered from 0 in declaration
// order. The enumerator names and their order match the enum text stored in
// grapheme_break_class_string. Note the lower-case 'b' in the four
// "Graphemebreak_" names: it is spelled the same way in that string.
enum GraphemeBreakClass {
    GraphemeBreak_Any,
    GraphemeBreak_CR,
    GraphemeBreak_LF,
    GraphemeBreak_Control,
    GraphemeBreak_Extend,
    GraphemeBreak_ZWJ,
    GraphemeBreak_RegionalIndicator,
    GraphemeBreak_Prepend,
    GraphemeBreak_SpacingMark,
    GraphemeBreak_L,
    GraphemeBreak_V,
    GraphemeBreak_T,
    GraphemeBreak_LV,
    GraphemeBreak_LVT,
    Graphemebreak_E_Base,
    Graphemebreak_E_Modifier,
    Graphemebreak_Glue_After_Zwj,
    Graphemebreak_E_Base_GAZ,

    // takes the position that the string form names NumGraphemeBreakClasses
    GraphemeBreak_Unassigned
};
324
325 static QHash<QByteArray, GraphemeBreakClass> grapheme_break_map;
326
initGraphemeBreak()327 static void initGraphemeBreak()
328 {
329 struct GraphemeBreakList {
330 GraphemeBreakClass brk;
331 const char *name;
332 } breaks[] = {
333 { GraphemeBreak_Any, "Any" },
334 { GraphemeBreak_CR, "CR" },
335 { GraphemeBreak_LF, "LF" },
336 { GraphemeBreak_Control, "Control" },
337 { GraphemeBreak_Extend, "Extend" },
338 { GraphemeBreak_ZWJ, "ZWJ" },
339 { GraphemeBreak_RegionalIndicator, "Regional_Indicator" },
340 { GraphemeBreak_Prepend, "Prepend" },
341 { GraphemeBreak_SpacingMark, "SpacingMark" },
342 { GraphemeBreak_L, "L" },
343 { GraphemeBreak_V, "V" },
344 { GraphemeBreak_T, "T" },
345 { GraphemeBreak_LV, "LV" },
346 { GraphemeBreak_LVT, "LVT" },
347 { Graphemebreak_E_Base, "E_Base" },
348 { Graphemebreak_E_Modifier, "E_Modifier" },
349 { Graphemebreak_Glue_After_Zwj, "Glue_After_Zwj" },
350 { Graphemebreak_E_Base_GAZ, "E_Base_GAZ" },
351 { GraphemeBreak_Unassigned, 0 }
352 };
353 GraphemeBreakList *d = breaks;
354 while (d->name) {
355 grapheme_break_map.insert(d->name, d->brk);
356 ++d;
357 }
358 }
359
360
// C++ source text for an "enum WordBreakClass" declaration, kept as a string.
// The enumerators appear in the same order as in the WordBreakClass enum that this
// file declares for its own use; the text ends with NumWordBreakClasses where the
// local enum ends with WordBreak_Unassigned.
// NOTE(review): presumably written verbatim into generated output - the code that
// consumes this string is outside this view.
static const char *word_break_class_string =
    "enum WordBreakClass {\n"
    " WordBreak_Any,\n"
    " WordBreak_CR,\n"
    " WordBreak_LF,\n"
    " WordBreak_Newline,\n"
    " WordBreak_Extend,\n"
    " WordBreak_ZWJ,\n"
    " WordBreak_Format,\n"
    " WordBreak_RegionalIndicator,\n"
    " WordBreak_Katakana,\n"
    " WordBreak_HebrewLetter,\n"
    " WordBreak_ALetter,\n"
    " WordBreak_SingleQuote,\n"
    " WordBreak_DoubleQuote,\n"
    " WordBreak_MidNumLet,\n"
    " WordBreak_MidLetter,\n"
    " WordBreak_MidNum,\n"
    " WordBreak_Numeric,\n"
    " WordBreak_ExtendNumLet,\n"
    " WordBreak_E_Base,\n"
    " WordBreak_E_Modifier,\n"
    " WordBreak_Glue_After_Zwj,\n"
    " WordBreak_E_Base_GAZ,\n"
    " WordBreak_WSegSpace,\n"
    "\n"
    " NumWordBreakClasses\n"
    "};\n\n";
389
// Word break classes as used inside this tool, numbered from 0 in declaration
// order. The enumerator names and their order match the enum text stored in
// word_break_class_string.
enum WordBreakClass {
    WordBreak_Any,
    WordBreak_CR,
    WordBreak_LF,
    WordBreak_Newline,
    WordBreak_Extend,
    WordBreak_ZWJ,
    WordBreak_Format,
    WordBreak_RegionalIndicator,
    WordBreak_Katakana,
    WordBreak_HebrewLetter,
    WordBreak_ALetter,
    WordBreak_SingleQuote,
    WordBreak_DoubleQuote,
    WordBreak_MidNumLet,
    WordBreak_MidLetter,
    WordBreak_MidNum,
    WordBreak_Numeric,
    WordBreak_ExtendNumLet,
    WordBreak_E_Base,
    WordBreak_E_Modifier,
    WordBreak_Glue_After_Zwj,
    WordBreak_E_Base_GAZ,
    WordBreak_WSegSpace,

    // takes the position that the string form names NumWordBreakClasses
    WordBreak_Unassigned
};
417
418 static QHash<QByteArray, WordBreakClass> word_break_map;
419
initWordBreak()420 static void initWordBreak()
421 {
422 struct WordBreakList {
423 WordBreakClass brk;
424 const char *name;
425 } breaks[] = {
426 { WordBreak_Any, "Any" },
427 { WordBreak_CR, "CR" },
428 { WordBreak_LF, "LF" },
429 { WordBreak_Newline, "Newline" },
430 { WordBreak_Extend, "Extend" },
431 { WordBreak_ZWJ, "ZWJ" },
432 { WordBreak_Format, "Format" },
433 { WordBreak_RegionalIndicator, "Regional_Indicator" },
434 { WordBreak_Katakana, "Katakana" },
435 { WordBreak_HebrewLetter, "Hebrew_Letter" },
436 { WordBreak_ALetter, "ALetter" },
437 { WordBreak_SingleQuote, "Single_Quote" },
438 { WordBreak_DoubleQuote, "Double_Quote" },
439 { WordBreak_MidNumLet, "MidNumLet" },
440 { WordBreak_MidLetter, "MidLetter" },
441 { WordBreak_MidNum, "MidNum" },
442 { WordBreak_Numeric, "Numeric" },
443 { WordBreak_ExtendNumLet, "ExtendNumLet" },
444 { WordBreak_E_Base, "E_Base" },
445 { WordBreak_E_Modifier, "E_Modifier" },
446 { WordBreak_Glue_After_Zwj, "Glue_After_Zwj" },
447 { WordBreak_E_Base_GAZ, "E_Base_GAZ" },
448 { WordBreak_WSegSpace, "WSegSpace" },
449 { WordBreak_Unassigned, 0 }
450 };
451 WordBreakList *d = breaks;
452 while (d->name) {
453 word_break_map.insert(d->name, d->brk);
454 ++d;
455 }
456 }
457
458
// C++ source text for an "enum SentenceBreakClass" declaration, kept as a string.
// The enumerators appear in the same order as in the SentenceBreakClass enum that
// this file declares for its own use; the text ends with NumSentenceBreakClasses
// where the local enum ends with SentenceBreak_Unassigned.
// NOTE(review): presumably written verbatim into generated output - the code that
// consumes this string is outside this view.
static const char *sentence_break_class_string =
    "enum SentenceBreakClass {\n"
    " SentenceBreak_Any,\n"
    " SentenceBreak_CR,\n"
    " SentenceBreak_LF,\n"
    " SentenceBreak_Sep,\n"
    " SentenceBreak_Extend,\n"
    " SentenceBreak_Sp,\n"
    " SentenceBreak_Lower,\n"
    " SentenceBreak_Upper,\n"
    " SentenceBreak_OLetter,\n"
    " SentenceBreak_Numeric,\n"
    " SentenceBreak_ATerm,\n"
    " SentenceBreak_SContinue,\n"
    " SentenceBreak_STerm,\n"
    " SentenceBreak_Close,\n"
    "\n"
    " NumSentenceBreakClasses\n"
    "};\n\n";
478
// Sentence break classes as used inside this tool, numbered from 0 in declaration
// order. The enumerator names and their order match the enum text stored in
// sentence_break_class_string.
enum SentenceBreakClass {
    SentenceBreak_Any,
    SentenceBreak_CR,
    SentenceBreak_LF,
    SentenceBreak_Sep,
    SentenceBreak_Extend,
    SentenceBreak_Sp,
    SentenceBreak_Lower,
    SentenceBreak_Upper,
    SentenceBreak_OLetter,
    SentenceBreak_Numeric,
    SentenceBreak_ATerm,
    SentenceBreak_SContinue,
    SentenceBreak_STerm,
    SentenceBreak_Close,

    // takes the position that the string form names NumSentenceBreakClasses
    SentenceBreak_Unassigned
};
497
498 static QHash<QByteArray, SentenceBreakClass> sentence_break_map;
499
initSentenceBreak()500 static void initSentenceBreak()
501 {
502 struct SentenceBreakList {
503 SentenceBreakClass brk;
504 const char *name;
505 } breaks[] = {
506 { SentenceBreak_Any, "Any" },
507 { SentenceBreak_CR, "CR" },
508 { SentenceBreak_LF, "LF" },
509 { SentenceBreak_Sep, "Sep" },
510 { SentenceBreak_Extend, "Extend" },
511 { SentenceBreak_Extend, "Format" },
512 { SentenceBreak_Sp, "Sp" },
513 { SentenceBreak_Lower, "Lower" },
514 { SentenceBreak_Upper, "Upper" },
515 { SentenceBreak_OLetter, "OLetter" },
516 { SentenceBreak_Numeric, "Numeric" },
517 { SentenceBreak_ATerm, "ATerm" },
518 { SentenceBreak_SContinue, "SContinue" },
519 { SentenceBreak_STerm, "STerm" },
520 { SentenceBreak_Close, "Close" },
521 { SentenceBreak_Unassigned, 0 }
522 };
523 SentenceBreakList *d = breaks;
524 while (d->name) {
525 sentence_break_map.insert(d->name, d->brk);
526 ++d;
527 }
528 }
529
530
// C++ source text for an "enum LineBreakClass" declaration (preceded by two comment
// lines that are part of the text), kept as a string. The enumerators appear in the
// same order as in the LineBreakClass enum that this file declares for its own use;
// the text ends with NumLineBreakClasses where the local enum ends with
// LineBreak_Unassigned.
// NOTE(review): presumably written verbatim into generated output - the code that
// consumes this string is outside this view.
static const char *line_break_class_string =
    "// see http://www.unicode.org/reports/tr14/tr14-30.html\n"
    "// we don't use the XX and AI classes and map them to AL instead.\n"
    "enum LineBreakClass {\n"
    " LineBreak_OP, LineBreak_CL, LineBreak_CP, LineBreak_QU, LineBreak_GL,\n"
    " LineBreak_NS, LineBreak_EX, LineBreak_SY, LineBreak_IS, LineBreak_PR,\n"
    " LineBreak_PO, LineBreak_NU, LineBreak_AL, LineBreak_HL, LineBreak_ID,\n"
    " LineBreak_IN, LineBreak_HY, LineBreak_BA, LineBreak_BB, LineBreak_B2,\n"
    " LineBreak_ZW, LineBreak_CM, LineBreak_WJ, LineBreak_H2, LineBreak_H3,\n"
    " LineBreak_JL, LineBreak_JV, LineBreak_JT, LineBreak_RI, LineBreak_CB,\n"
    " LineBreak_EB, LineBreak_EM, LineBreak_ZWJ,\n"
    " LineBreak_SA, LineBreak_SG, LineBreak_SP,\n"
    " LineBreak_CR, LineBreak_LF, LineBreak_BK,\n"
    "\n"
    " NumLineBreakClasses\n"
    "};\n\n";
547
// Line break classes as used inside this tool, numbered from 0 in declaration
// order. The enumerator names, their order and the row grouping match the enum
// text stored in line_break_class_string. There are no enumerators for XX, AI, NL
// or CJ; initLineBreak() registers those names under AL, AL, BK and NS.
enum LineBreakClass {
    LineBreak_OP, LineBreak_CL, LineBreak_CP, LineBreak_QU, LineBreak_GL,
    LineBreak_NS, LineBreak_EX, LineBreak_SY, LineBreak_IS, LineBreak_PR,
    LineBreak_PO, LineBreak_NU, LineBreak_AL, LineBreak_HL, LineBreak_ID,
    LineBreak_IN, LineBreak_HY, LineBreak_BA, LineBreak_BB, LineBreak_B2,
    LineBreak_ZW, LineBreak_CM, LineBreak_WJ, LineBreak_H2, LineBreak_H3,
    LineBreak_JL, LineBreak_JV, LineBreak_JT, LineBreak_RI, LineBreak_CB,
    LineBreak_EB, LineBreak_EM, LineBreak_ZWJ,
    LineBreak_SA, LineBreak_SG, LineBreak_SP,
    LineBreak_CR, LineBreak_LF, LineBreak_BK,

    // takes the position that the string form names NumLineBreakClasses
    LineBreak_Unassigned
};
561
562 static QHash<QByteArray, LineBreakClass> line_break_map;
563
initLineBreak()564 static void initLineBreak()
565 {
566 // ### Classes XX and AI are left out and mapped to AL for now.
567 // ### Class NL is mapped to BK.
568 // ### Treating characters of class CJ as class NS will give CSS strict line breaking;
569 // treating them as class ID will give CSS normal breaking.
570 struct LineBreakList {
571 LineBreakClass brk;
572 const char *name;
573 } breaks[] = {
574 { LineBreak_BK, "BK" },
575 { LineBreak_CR, "CR" },
576 { LineBreak_LF, "LF" },
577 { LineBreak_CM, "CM" },
578 { LineBreak_BK, "NL" },
579 { LineBreak_SG, "SG" },
580 { LineBreak_WJ, "WJ" },
581 { LineBreak_ZW, "ZW" },
582 { LineBreak_GL, "GL" },
583 { LineBreak_SP, "SP" },
584 { LineBreak_B2, "B2" },
585 { LineBreak_BA, "BA" },
586 { LineBreak_BB, "BB" },
587 { LineBreak_HY, "HY" },
588 { LineBreak_CB, "CB" },
589 { LineBreak_NS, "CJ" },
590 { LineBreak_CL, "CL" },
591 { LineBreak_CP, "CP" },
592 { LineBreak_EX, "EX" },
593 { LineBreak_IN, "IN" },
594 { LineBreak_NS, "NS" },
595 { LineBreak_OP, "OP" },
596 { LineBreak_QU, "QU" },
597 { LineBreak_IS, "IS" },
598 { LineBreak_NU, "NU" },
599 { LineBreak_PO, "PO" },
600 { LineBreak_PR, "PR" },
601 { LineBreak_SY, "SY" },
602 { LineBreak_AL, "AI" },
603 { LineBreak_AL, "AL" },
604 { LineBreak_HL, "HL" },
605 { LineBreak_H2, "H2" },
606 { LineBreak_H3, "H3" },
607 { LineBreak_ID, "ID" },
608 { LineBreak_JL, "JL" },
609 { LineBreak_JV, "JV" },
610 { LineBreak_JT, "JT" },
611 { LineBreak_RI, "RI" },
612 { LineBreak_SA, "SA" },
613 { LineBreak_AL, "XX" },
614 { LineBreak_EB, "EB" },
615 { LineBreak_EM, "EM" },
616 { LineBreak_ZWJ, "ZWJ" },
617 { LineBreak_Unassigned, 0 }
618 };
619 LineBreakList *d = breaks;
620 while (d->name) {
621 line_break_map.insert(d->name, d->brk);
622 ++d;
623 }
624 }
625
626
627 static QHash<QByteArray, QChar::Script> scriptMap;
628
initScriptMap()629 static void initScriptMap()
630 {
631 struct Scrpt {
632 QChar::Script script;
633 const char *name;
634 } scripts[] = {
635 // general
636 { QChar::Script_Unknown, "Unknown" },
637 { QChar::Script_Inherited, "Inherited" },
638 { QChar::Script_Common, "Common" },
639 // pre-4.0
640 { QChar::Script_Latin, "Latin" },
641 { QChar::Script_Greek, "Greek" },
642 { QChar::Script_Cyrillic, "Cyrillic" },
643 { QChar::Script_Armenian, "Armenian" },
644 { QChar::Script_Hebrew, "Hebrew" },
645 { QChar::Script_Arabic, "Arabic" },
646 { QChar::Script_Syriac, "Syriac" },
647 { QChar::Script_Thaana, "Thaana" },
648 { QChar::Script_Devanagari, "Devanagari" },
649 { QChar::Script_Bengali, "Bengali" },
650 { QChar::Script_Gurmukhi, "Gurmukhi" },
651 { QChar::Script_Gujarati, "Gujarati" },
652 { QChar::Script_Oriya, "Oriya" },
653 { QChar::Script_Tamil, "Tamil" },
654 { QChar::Script_Telugu, "Telugu" },
655 { QChar::Script_Kannada, "Kannada" },
656 { QChar::Script_Malayalam, "Malayalam" },
657 { QChar::Script_Sinhala, "Sinhala" },
658 { QChar::Script_Thai, "Thai" },
659 { QChar::Script_Lao, "Lao" },
660 { QChar::Script_Tibetan, "Tibetan" },
661 { QChar::Script_Myanmar, "Myanmar" },
662 { QChar::Script_Georgian, "Georgian" },
663 { QChar::Script_Hangul, "Hangul" },
664 { QChar::Script_Ethiopic, "Ethiopic" },
665 { QChar::Script_Cherokee, "Cherokee" },
666 { QChar::Script_CanadianAboriginal, "CanadianAboriginal" },
667 { QChar::Script_Ogham, "Ogham" },
668 { QChar::Script_Runic, "Runic" },
669 { QChar::Script_Khmer, "Khmer" },
670 { QChar::Script_Mongolian, "Mongolian" },
671 { QChar::Script_Hiragana, "Hiragana" },
672 { QChar::Script_Katakana, "Katakana" },
673 { QChar::Script_Bopomofo, "Bopomofo" },
674 { QChar::Script_Han, "Han" },
675 { QChar::Script_Yi, "Yi" },
676 { QChar::Script_OldItalic, "OldItalic" },
677 { QChar::Script_Gothic, "Gothic" },
678 { QChar::Script_Deseret, "Deseret" },
679 { QChar::Script_Tagalog, "Tagalog" },
680 { QChar::Script_Hanunoo, "Hanunoo" },
681 { QChar::Script_Buhid, "Buhid" },
682 { QChar::Script_Tagbanwa, "Tagbanwa" },
683 { QChar::Script_Coptic, "Coptic" },
684 // 4.0
685 { QChar::Script_Limbu, "Limbu" },
686 { QChar::Script_TaiLe, "TaiLe" },
687 { QChar::Script_LinearB, "LinearB" },
688 { QChar::Script_Ugaritic, "Ugaritic" },
689 { QChar::Script_Shavian, "Shavian" },
690 { QChar::Script_Osmanya, "Osmanya" },
691 { QChar::Script_Cypriot, "Cypriot" },
692 { QChar::Script_Braille, "Braille" },
693 // 4.1
694 { QChar::Script_Buginese, "Buginese" },
695 { QChar::Script_NewTaiLue, "NewTaiLue" },
696 { QChar::Script_Glagolitic, "Glagolitic" },
697 { QChar::Script_Tifinagh, "Tifinagh" },
698 { QChar::Script_SylotiNagri, "SylotiNagri" },
699 { QChar::Script_OldPersian, "OldPersian" },
700 { QChar::Script_Kharoshthi, "Kharoshthi" },
701 // 5.0
702 { QChar::Script_Balinese, "Balinese" },
703 { QChar::Script_Cuneiform, "Cuneiform" },
704 { QChar::Script_Phoenician, "Phoenician" },
705 { QChar::Script_PhagsPa, "PhagsPa" },
706 { QChar::Script_Nko, "Nko" },
707 // 5.1
708 { QChar::Script_Sundanese, "Sundanese" },
709 { QChar::Script_Lepcha, "Lepcha" },
710 { QChar::Script_OlChiki, "OlChiki" },
711 { QChar::Script_Vai, "Vai" },
712 { QChar::Script_Saurashtra, "Saurashtra" },
713 { QChar::Script_KayahLi, "KayahLi" },
714 { QChar::Script_Rejang, "Rejang" },
715 { QChar::Script_Lycian, "Lycian" },
716 { QChar::Script_Carian, "Carian" },
717 { QChar::Script_Lydian, "Lydian" },
718 { QChar::Script_Cham, "Cham" },
719 // 5.2
720 { QChar::Script_TaiTham, "TaiTham" },
721 { QChar::Script_TaiViet, "TaiViet" },
722 { QChar::Script_Avestan, "Avestan" },
723 { QChar::Script_EgyptianHieroglyphs, "EgyptianHieroglyphs" },
724 { QChar::Script_Samaritan, "Samaritan" },
725 { QChar::Script_Lisu, "Lisu" },
726 { QChar::Script_Bamum, "Bamum" },
727 { QChar::Script_Javanese, "Javanese" },
728 { QChar::Script_MeeteiMayek, "MeeteiMayek" },
729 { QChar::Script_ImperialAramaic, "ImperialAramaic" },
730 { QChar::Script_OldSouthArabian, "OldSouthArabian" },
731 { QChar::Script_InscriptionalParthian, "InscriptionalParthian" },
732 { QChar::Script_InscriptionalPahlavi, "InscriptionalPahlavi" },
733 { QChar::Script_OldTurkic, "OldTurkic" },
734 { QChar::Script_Kaithi, "Kaithi" },
735 // 6.0
736 { QChar::Script_Batak, "Batak" },
737 { QChar::Script_Brahmi, "Brahmi" },
738 { QChar::Script_Mandaic, "Mandaic" },
739 // 6.1
740 { QChar::Script_Chakma, "Chakma" },
741 { QChar::Script_MeroiticCursive, "MeroiticCursive" },
742 { QChar::Script_MeroiticHieroglyphs, "MeroiticHieroglyphs" },
743 { QChar::Script_Miao, "Miao" },
744 { QChar::Script_Sharada, "Sharada" },
745 { QChar::Script_SoraSompeng, "SoraSompeng" },
746 { QChar::Script_Takri, "Takri" },
747 // 7.0
748 { QChar::Script_CaucasianAlbanian, "CaucasianAlbanian" },
749 { QChar::Script_BassaVah, "BassaVah" },
750 { QChar::Script_Duployan, "Duployan" },
751 { QChar::Script_Elbasan, "Elbasan" },
752 { QChar::Script_Grantha, "Grantha" },
753 { QChar::Script_PahawhHmong, "PahawhHmong" },
754 { QChar::Script_Khojki, "Khojki" },
755 { QChar::Script_LinearA, "LinearA" },
756 { QChar::Script_Mahajani, "Mahajani" },
757 { QChar::Script_Manichaean, "Manichaean" },
758 { QChar::Script_MendeKikakui, "MendeKikakui" },
759 { QChar::Script_Modi, "Modi" },
760 { QChar::Script_Mro, "Mro" },
761 { QChar::Script_OldNorthArabian, "OldNorthArabian" },
762 { QChar::Script_Nabataean, "Nabataean" },
763 { QChar::Script_Palmyrene, "Palmyrene" },
764 { QChar::Script_PauCinHau, "PauCinHau" },
765 { QChar::Script_OldPermic, "OldPermic" },
766 { QChar::Script_PsalterPahlavi, "PsalterPahlavi" },
767 { QChar::Script_Siddham, "Siddham" },
768 { QChar::Script_Khudawadi, "Khudawadi" },
769 { QChar::Script_Tirhuta, "Tirhuta" },
770 { QChar::Script_WarangCiti, "WarangCiti" },
771 // 8.0
772 { QChar::Script_Ahom, "Ahom" },
773 { QChar::Script_AnatolianHieroglyphs, "AnatolianHieroglyphs" },
774 { QChar::Script_Hatran, "Hatran" },
775 { QChar::Script_Multani, "Multani" },
776 { QChar::Script_OldHungarian, "OldHungarian" },
777 { QChar::Script_SignWriting, "SignWriting" },
778 // 9.0
779 { QChar::Script_Adlam, "Adlam" },
780 { QChar::Script_Bhaiksuki, "Bhaiksuki" },
781 { QChar::Script_Marchen, "Marchen" },
782 { QChar::Script_Newa, "Newa" },
783 { QChar::Script_Osage, "Osage" },
784 { QChar::Script_Tangut, "Tangut" },
785 // 10.0
786 { QChar::Script_MasaramGondi, "MasaramGondi" },
787 { QChar::Script_Nushu, "Nushu" },
788 { QChar::Script_Soyombo, "Soyombo" },
789 { QChar::Script_ZanabazarSquare, "ZanabazarSquare" },
790 // 12.1
791 { QChar::Script_Dogra, "Dogra" },
792 { QChar::Script_GunjalaGondi, "GunjalaGondi" },
793 { QChar::Script_HanifiRohingya, "HanifiRohingya" },
794 { QChar::Script_Makasar, "Makasar" },
795 { QChar::Script_Medefaidrin, "Medefaidrin" },
796 { QChar::Script_OldSogdian, "OldSogdian" },
797 { QChar::Script_Sogdian, "Sogdian" },
798 { QChar::Script_Elymaic, "Elymaic" },
799 { QChar::Script_Nandinagari, "Nandinagari" },
800 { QChar::Script_NyiakengPuachueHmong, "NyiakengPuachueHmong" },
801 { QChar::Script_Wancho, "Wancho" },
802 // 13.0
803 { QChar::Script_Chorasmian, "Chorasmian" },
804 { QChar::Script_DivesAkuru, "DivesAkuru" },
805 { QChar::Script_KhitanSmallScript, "KhitanSmallScript" },
806 { QChar::Script_Yezidi, "Yezidi" },
807
808 // unhandled
809 { QChar::Script_Unknown, 0 }
810 };
811 Scrpt *p = scripts;
812 while (p->name) {
813 scriptMap.insert(p->name, p->script);
814 ++p;
815 }
816 }
817
// Keep this one in sync with the code in createPropertyInfo
//
// C++ source text, kept as a string, that declares:
//   - enum Case (LowerCase, UpperCase, TitleCase, CaseFold, NumCases),
//   - struct Properties, made entirely of bit-fields plus a cases[NumCases] array
//     of { special : 1, diff : 15 } pairs, with zero-width bit-fields inserted
//     under Q_OS_WASM,
//   - two exported properties() overloads (uint ucs4 / ushort ucs2).
// The bit-field widths in the text add up to 160 bits, which is the 20 bytes that
// SizeOfPropertiesStruct records.
// NOTE(review): presumably written verbatim into generated output - the code that
// consumes this string is outside this view.
static const char *property_string =
    "enum Case {\n"
    " LowerCase,\n"
    " UpperCase,\n"
    " TitleCase,\n"
    " CaseFold,\n"
    "\n"
    " NumCases\n"
    "};\n"
    "\n"
    "struct Properties {\n"
    " ushort category : 8; /* 5 used */\n"
    " ushort direction : 8; /* 5 used */\n"
    " ushort combiningClass : 8;\n"
    " ushort joining : 3;\n"
    " signed short digitValue : 5;\n"
    " signed short mirrorDiff : 16;\n"
    " ushort unicodeVersion : 8; /* 5 used */\n"
    " ushort nfQuickCheck : 8;\n" // could be narrowed
    "#ifdef Q_OS_WASM\n"
    " unsigned char : 0; //wasm 64 packing trick\n"
    "#endif\n"
    " struct {\n"
    " ushort special : 1;\n"
    " signed short diff : 15;\n"
    " } cases[NumCases];\n"
    "#ifdef Q_OS_WASM\n"
    " unsigned char : 0; //wasm 64 packing trick\n"
    "#endif\n"
    " ushort graphemeBreakClass : 5; /* 5 used */\n"
    " ushort wordBreakClass : 5; /* 5 used */\n"
    " ushort lineBreakClass : 6; /* 6 used */\n"
    " ushort sentenceBreakClass : 8; /* 4 used */\n"
    " ushort script : 8;\n"
    "};\n\n"
    "Q_CORE_EXPORT const Properties * QT_FASTCALL properties(uint ucs4) noexcept;\n"
    "Q_CORE_EXPORT const Properties * QT_FASTCALL properties(ushort ucs2) noexcept;\n"
    "\n";
857
// C++ source text, kept as a string, declaring the four break-class accessors
// (grapheme, word, sentence, line). Each one appears twice: an exported function
// taking a uint ucs4, and an inline overload taking a QChar that forwards
// ch.unicode() to the first.
// NOTE(review): presumably written verbatim into generated output - the code that
// consumes this string is outside this view.
static const char *methods =
    "Q_CORE_EXPORT GraphemeBreakClass QT_FASTCALL graphemeBreakClass(uint ucs4) noexcept;\n"
    "inline GraphemeBreakClass graphemeBreakClass(QChar ch) noexcept\n"
    "{ return graphemeBreakClass(ch.unicode()); }\n"
    "\n"
    "Q_CORE_EXPORT WordBreakClass QT_FASTCALL wordBreakClass(uint ucs4) noexcept;\n"
    "inline WordBreakClass wordBreakClass(QChar ch) noexcept\n"
    "{ return wordBreakClass(ch.unicode()); }\n"
    "\n"
    "Q_CORE_EXPORT SentenceBreakClass QT_FASTCALL sentenceBreakClass(uint ucs4) noexcept;\n"
    "inline SentenceBreakClass sentenceBreakClass(QChar ch) noexcept\n"
    "{ return sentenceBreakClass(ch.unicode()); }\n"
    "\n"
    "Q_CORE_EXPORT LineBreakClass QT_FASTCALL lineBreakClass(uint ucs4) noexcept;\n"
    "inline LineBreakClass lineBreakClass(QChar ch) noexcept\n"
    "{ return lineBreakClass(ch.unicode()); }\n"
    "\n";
875
// Size in bytes recorded for the Properties struct whose declaration text lives in
// property_string (its bit-field widths total 160 bits).
static const int SizeOfPropertiesStruct = 20;

// Source text of a Q_STATIC_ASSERT line comparing sizeof(Properties) with the
// number above, followed by a blank line.
// NOTE(review): presumably emitted next to the Properties declaration - the code
// that consumes this byte array is outside this view.
static const QByteArray sizeOfPropertiesStructCheck =
    "Q_STATIC_ASSERT(sizeof(Properties) == " + QByteArray::number(SizeOfPropertiesStruct) + ");\n\n";
880
881 struct PropertyFlags {
operator ==PropertyFlags882 bool operator==(const PropertyFlags &o) const {
883 return (combiningClass == o.combiningClass
884 && category == o.category
885 && direction == o.direction
886 && joining == o.joining
887 && age == o.age
888 && digitValue == o.digitValue
889 && mirrorDiff == o.mirrorDiff
890 && lowerCaseDiff == o.lowerCaseDiff
891 && upperCaseDiff == o.upperCaseDiff
892 && titleCaseDiff == o.titleCaseDiff
893 && caseFoldDiff == o.caseFoldDiff
894 && lowerCaseSpecial == o.lowerCaseSpecial
895 && upperCaseSpecial == o.upperCaseSpecial
896 && titleCaseSpecial == o.titleCaseSpecial
897 && caseFoldSpecial == o.caseFoldSpecial
898 && graphemeBreakClass == o.graphemeBreakClass
899 && wordBreakClass == o.wordBreakClass
900 && sentenceBreakClass == o.sentenceBreakClass
901 && lineBreakClass == o.lineBreakClass
902 && script == o.script
903 && nfQuickCheck == o.nfQuickCheck
904 );
905 }
906 // from UnicodeData.txt
907 uchar combiningClass : 8;
908 QChar::Category category : 5;
909 QChar::Direction direction : 5;
910 // from ArabicShaping.txt
911 QChar::JoiningType joining : 3;
912 // from DerivedAge.txt
913 QChar::UnicodeVersion age : 5;
914 int digitValue;
915
916 int mirrorDiff : 16;
917
918 int lowerCaseDiff;
919 int upperCaseDiff;
920 int titleCaseDiff;
921 int caseFoldDiff;
922 bool lowerCaseSpecial;
923 bool upperCaseSpecial;
924 bool titleCaseSpecial;
925 bool caseFoldSpecial;
926 GraphemeBreakClass graphemeBreakClass;
927 WordBreakClass wordBreakClass;
928 SentenceBreakClass sentenceBreakClass;
929 LineBreakClass lineBreakClass;
930 int script;
931 // from DerivedNormalizationProps.txt
932 uchar nfQuickCheck;
933 };
934
935
936 static QList<int> specialCaseMap;
937
appendToSpecialCaseMap(const QList<int> & map)938 static int appendToSpecialCaseMap(const QList<int> &map)
939 {
940 QList<int> utf16map;
941 for (int i = 0; i < map.size(); ++i) {
942 uint codepoint = map.at(i);
943 // if the condition below doesn't hold anymore we need to modify our special case mapping code
944 Q_ASSERT(!QChar::requiresSurrogates(codepoint));
945 if (QChar::requiresSurrogates(codepoint)) {
946 utf16map << QChar::highSurrogate(codepoint);
947 utf16map << QChar::lowSurrogate(codepoint);
948 } else {
949 utf16map << codepoint;
950 }
951 }
952 int length = utf16map.size();
953 utf16map.prepend(length);
954
955 if (specialCaseMap.isEmpty())
956 specialCaseMap << 0; // placeholder
957
958 int i = 1;
959 while (i < specialCaseMap.size()) {
960 int n = specialCaseMap.at(i);
961 if (n == length) {
962 int j;
963 for (j = 1; j <= n; ++j) {
964 if (specialCaseMap.at(i+j) != utf16map.at(j))
965 break;
966 }
967 if (j > n)
968 return i;
969 }
970 i += n + 1;
971 }
972
973 int pos = specialCaseMap.size();
974 specialCaseMap << utf16map;
975 return pos;
976 }
977
// DerivedCoreProperties.txt
// Returns true when ucs4 lies in one of the hard-coded Default_Ignorable_Code_Point
// ranges below, false otherwise.
static inline bool isDefaultIgnorable(uint ucs4)
{
    // Default_Ignorable_Code_Point:
    //  Generated from
    //    Other_Default_Ignorable_Code_Point + Cf + Variation_Selector
    //    - White_Space - FFF9..FFFB (Annotation Characters)
    //    - 0600..0604, 06DD, 070F, 110BD (exceptional Cf characters that should be visible)

    // Up to U+00FF the only member is U+00AD.
    if (ucs4 <= 0xff)
        return ucs4 == 0xad;

    struct CodePointRange {
        uint first;
        uint last;
    };
    static const CodePointRange ignorableRanges[] = {
        { 0x034f,  0x034f },
        { 0x061c,  0x061c },
        { 0x115f,  0x1160 },
        { 0x17b4,  0x17b5 },
        { 0x180b,  0x180d },
        { 0x180e,  0x180e },
        { 0x200b,  0x200f },
        { 0x202a,  0x202e },
        { 0x2060,  0x206f },
        { 0x3164,  0x3164 },
        { 0xfe00,  0xfe0f },
        { 0xfeff,  0xfeff },
        { 0xffa0,  0xffa0 },
        { 0xfff0,  0xfff8 },
        { 0x1bca0, 0x1bca3 },
        { 0x1d173, 0x1d17a },
        { 0xe0000, 0xe0fff }
    };

    for (const CodePointRange &range : ignorableRanges) {
        if (ucs4 >= range.first && ucs4 <= range.last)
            return true;
    }
    return false;
}
1007
1008 struct UnicodeData {
UnicodeDataUnicodeData1009 UnicodeData(int codepoint = 0) {
1010 p.category = QChar::Other_NotAssigned; // Cn
1011 p.combiningClass = 0;
1012
1013 p.direction = QChar::DirL;
1014 // DerivedBidiClass.txt
1015 // The unassigned code points that default to AL are in the ranges:
1016 // [U+0600..U+07BF, U+08A0..U+08FF, U+FB50..U+FDCF, U+FDF0..U+FDFF, U+FE70..U+FEFF, U+1EE00..U+1EEFF]
1017 if ((codepoint >= 0x0600 && codepoint <= 0x07BF)
1018 || (codepoint >= 0x08A0 && codepoint <= 0x08FF)
1019 || (codepoint >= 0xFB50 && codepoint <= 0xFDCF)
1020 || (codepoint >= 0xFDF0 && codepoint <= 0xFDFF)
1021 || (codepoint >= 0xFE70 && codepoint <= 0xFEFF)
1022 || (codepoint >= 0x1EE00 && codepoint <= 0x1EEFF)) {
1023 p.direction = QChar::DirAL;
1024 }
1025 // The unassigned code points that default to R are in the ranges:
1026 // [U+0590..U+05FF, U+07C0..U+089F, U+FB1D..U+FB4F, U+10800..U+10FFF, U+1E800..U+1EDFF, U+1EF00..U+1EFFF]
1027 else if ((codepoint >= 0x0590 && codepoint <= 0x05FF)
1028 || (codepoint >= 0x07C0 && codepoint <= 0x089F)
1029 || (codepoint >= 0xFB1D && codepoint <= 0xFB4F)
1030 || (codepoint >= 0x10800 && codepoint <= 0x10FFF)
1031 || (codepoint >= 0x1E800 && codepoint <= 0x1EDFF)
1032 || (codepoint >= 0x1EF00 && codepoint <= 0x1EFFF)) {
1033 p.direction = QChar::DirR;
1034 }
1035 // The unassigned code points that default to ET are in the range:
1036 // [U+20A0..U+20CF]
1037 else if (codepoint >= 0x20A0 && codepoint <= 0x20CF) {
1038 p.direction = QChar::DirET;
1039 }
1040 // The unassigned code points that default to BN have one of the following properties:
1041 // Default_Ignorable_Code_Point
1042 // Noncharacter_Code_Point
1043 else if (QChar::isNonCharacter(codepoint) || isDefaultIgnorable(codepoint)) {
1044 p.direction = QChar::DirBN;
1045 }
1046
1047 p.lineBreakClass = LineBreak_AL; // XX -> AL
1048 // LineBreak.txt
1049 // The unassigned code points that default to "ID" include ranges in the following blocks:
1050 // [U+3400..U+4DBF, U+4E00..U+9FFF, U+F900..U+FAFF, U+20000..U+2A6DF, U+2A700..U+2B73F, U+2B740..U+2B81F, U+2B820..U+2CEAF, U+2F800..U+2FA1F]
1051 // and any other reserved code points on
1052 // [U+20000..U+2FFFD, U+30000..U+3FFFD]
1053 if ((codepoint >= 0x3400 && codepoint <= 0x4DBF)
1054 || (codepoint >= 0x4E00 && codepoint <= 0x9FFF)
1055 || (codepoint >= 0xF900 && codepoint <= 0xFAFF)
1056 || (codepoint >= 0x20000 && codepoint <= 0x2A6DF)
1057 || (codepoint >= 0x2A700 && codepoint <= 0x2B73F)
1058 || (codepoint >= 0x2B740 && codepoint <= 0x2B81F)
1059 || (codepoint >= 0x2B820 && codepoint <= 0x2CEAF)
1060 || (codepoint >= 0x2F800 && codepoint <= 0x2FA1F)
1061 || (codepoint >= 0x20000 && codepoint <= 0x2FFFD)
1062 || (codepoint >= 0x30000 && codepoint <= 0x3FFFD)) {
1063 p.lineBreakClass = LineBreak_ID;
1064 }
1065 // The unassigned code points that default to "PR" comprise a range in the following block:
1066 // [U+20A0..U+20CF]
1067 else if (codepoint >= 0x20A0 && codepoint <= 0x20CF) {
1068 p.lineBreakClass = LineBreak_PR;
1069 }
1070
1071 mirroredChar = 0;
1072 decompositionType = QChar::NoDecomposition;
1073 p.joining = QChar::Joining_None;
1074 p.age = QChar::Unicode_Unassigned;
1075 p.mirrorDiff = 0;
1076 p.digitValue = -1;
1077 p.lowerCaseDiff = 0;
1078 p.upperCaseDiff = 0;
1079 p.titleCaseDiff = 0;
1080 p.caseFoldDiff = 0;
1081 p.lowerCaseSpecial = 0;
1082 p.upperCaseSpecial = 0;
1083 p.titleCaseSpecial = 0;
1084 p.caseFoldSpecial = 0;
1085 p.graphemeBreakClass = GraphemeBreak_Any;
1086 p.wordBreakClass = WordBreak_Any;
1087 p.sentenceBreakClass = SentenceBreak_Any;
1088 p.script = QChar::Script_Unknown;
1089 p.nfQuickCheck = 0;
1090 propertyIndex = -1;
1091 excludedComposition = false;
1092 }
1093
1094 static UnicodeData &valueRef(int codepoint);
1095
1096 PropertyFlags p;
1097
1098 // from UnicodeData.txt
1099 QChar::Decomposition decompositionType;
1100 QList<int> decomposition;
1101
1102 QList<int> specialFolding;
1103
1104 // from BidiMirroring.txt
1105 int mirroredChar;
1106
1107 // DerivedNormalizationProps.txt
1108 bool excludedComposition;
1109
1110 // computed position of unicode property set
1111 int propertyIndex;
1112 };
1113
1114 static QList<UnicodeData> unicodeData;
1115
valueRef(int codepoint)1116 UnicodeData &UnicodeData::valueRef(int codepoint)
1117 {
1118 static bool initialized = false;
1119 if (!initialized) {
1120 unicodeData.reserve(QChar::LastValidCodePoint + 1);
1121 for (int uc = 0; uc <= QChar::LastValidCodePoint; ++uc)
1122 unicodeData.append(UnicodeData(uc));
1123 initialized = true;
1124 }
1125
1126 Q_ASSERT(codepoint <= 0x10ffff);
1127 return unicodeData[codepoint];
1128 }
1129
1130
1131 static QHash<int, int> decompositionLength;
1132 static int highestComposedCharacter = 0;
1133 static int numLigatures = 0;
1134 static int highestLigature = 0;
1135
1136 struct Ligature {
1137 int u1;
1138 int u2;
1139 int ligature;
1140 };
1141 // we need them sorted after the first component for fast lookup
operator <(const Ligature & l1,const Ligature & l2)1142 bool operator < (const Ligature &l1, const Ligature &l2)
1143 { return l1.u1 < l2.u1; }
1144
1145 static QHash<int, QList<Ligature> > ligatureHashes;
1146
1147 static QHash<int, int> combiningClassUsage;
1148
1149 static int maxLowerCaseDiff = 0;
1150 static int maxUpperCaseDiff = 0;
1151 static int maxTitleCaseDiff = 0;
1152
readUnicodeData()1153 static void readUnicodeData()
1154 {
1155 qDebug("Reading UnicodeData.txt");
1156
1157 enum UniDataFields {
1158 UD_Value,
1159 UD_Name,
1160 UD_Category,
1161 UD_CombiningClass,
1162 UD_BidiCategory,
1163 UD_Decomposition,
1164 UD_DecimalDigitValue,
1165 UD_DigitValue,
1166 UD_NumericValue,
1167 UD_Mirrored,
1168 UD_OldName,
1169 UD_Comment,
1170 UD_UpperCase,
1171 UD_LowerCase,
1172 UD_TitleCase
1173 };
1174
1175 QFile f("data/UnicodeData.txt");
1176 if (!f.exists())
1177 qFatal("Couldn't find UnicodeData.txt");
1178
1179 f.open(QFile::ReadOnly);
1180
1181 while (!f.atEnd()) {
1182 QByteArray line;
1183 line.resize(1024);
1184 int len = f.readLine(line.data(), 1024);
1185 line.truncate(len-1);
1186
1187 int comment = line.indexOf('#');
1188 if (comment >= 0)
1189 line = line.left(comment);
1190 if (line.isEmpty())
1191 continue;
1192
1193 QList<QByteArray> properties = line.split(';');
1194 bool ok;
1195 int codepoint = properties[UD_Value].toInt(&ok, 16);
1196 Q_ASSERT(ok);
1197 Q_ASSERT(codepoint <= QChar::LastValidCodePoint);
1198 int lastCodepoint = codepoint;
1199
1200 QByteArray name = properties[UD_Name];
1201 if (name.startsWith('<') && name.contains("First")) {
1202 QByteArray nextLine;
1203 nextLine.resize(1024);
1204 f.readLine(nextLine.data(), 1024);
1205 QList<QByteArray> properties = nextLine.split(';');
1206 Q_ASSERT(properties[UD_Name].startsWith('<') && properties[UD_Name].contains("Last"));
1207 lastCodepoint = properties[UD_Value].toInt(&ok, 16);
1208 Q_ASSERT(ok);
1209 Q_ASSERT(lastCodepoint <= QChar::LastValidCodePoint);
1210 }
1211
1212 UnicodeData &data = UnicodeData::valueRef(codepoint);
1213 data.p.category = categoryMap.value(properties[UD_Category], QChar::Other_NotAssigned);
1214 data.p.combiningClass = properties[UD_CombiningClass].toInt();
1215 if (!combiningClassUsage.contains(data.p.combiningClass))
1216 combiningClassUsage[data.p.combiningClass] = 1;
1217 else
1218 ++combiningClassUsage[data.p.combiningClass];
1219
1220 Direction dir = directionMap.value(properties[UD_BidiCategory], Dir_Unassigned);
1221 if (dir == Dir_Unassigned)
1222 qFatal("unhandled direction value: %s", properties[UD_BidiCategory].constData());
1223 data.p.direction = QChar::Direction(dir);
1224
1225 if (!properties[UD_UpperCase].isEmpty()) {
1226 int upperCase = properties[UD_UpperCase].toInt(&ok, 16);
1227 Q_ASSERT(ok);
1228 int diff = upperCase - codepoint;
1229 // if the conditions below doesn't hold anymore we need to modify our upper casing code
1230 Q_ASSERT(QChar::requiresSurrogates(codepoint) == QChar::requiresSurrogates(upperCase));
1231 if (QChar::requiresSurrogates(codepoint)) {
1232 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(upperCase));
1233 Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(upperCase));
1234 }
1235 if (qAbs(diff) >= (1<<13)) {
1236 qWarning() << "upperCaseDiff exceeded (" << hex << codepoint << "->" << upperCase << "); map it for special case";
1237 data.p.upperCaseSpecial = true;
1238 data.p.upperCaseDiff = appendToSpecialCaseMap(QList<int>() << upperCase);
1239 } else {
1240 data.p.upperCaseDiff = diff;
1241 maxUpperCaseDiff = qMax(maxUpperCaseDiff, qAbs(diff));
1242 }
1243 }
1244 if (!properties[UD_LowerCase].isEmpty()) {
1245 int lowerCase = properties[UD_LowerCase].toInt(&ok, 16);
1246 Q_ASSERT(ok);
1247 int diff = lowerCase - codepoint;
1248 // if the conditions below doesn't hold anymore we need to modify our lower casing code
1249 Q_ASSERT(QChar::requiresSurrogates(codepoint) == QChar::requiresSurrogates(lowerCase));
1250 if (QChar::requiresSurrogates(codepoint)) {
1251 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(lowerCase));
1252 Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(lowerCase));
1253 }
1254 if (qAbs(diff) >= (1<<13)) {
1255 qWarning() << "lowerCaseDiff exceeded (" << hex << codepoint << "->" << lowerCase << "); map it for special case";
1256 data.p.lowerCaseSpecial = true;
1257 data.p.lowerCaseDiff = appendToSpecialCaseMap(QList<int>() << lowerCase);
1258 } else {
1259 data.p.lowerCaseDiff = diff;
1260 maxLowerCaseDiff = qMax(maxLowerCaseDiff, qAbs(diff));
1261 }
1262 }
1263 // we want toTitleCase to map to ToUpper in case we don't have any titlecase.
1264 if (properties[UD_TitleCase].isEmpty())
1265 properties[UD_TitleCase] = properties[UD_UpperCase];
1266 if (!properties[UD_TitleCase].isEmpty()) {
1267 int titleCase = properties[UD_TitleCase].toInt(&ok, 16);
1268 Q_ASSERT(ok);
1269 int diff = titleCase - codepoint;
1270 // if the conditions below doesn't hold anymore we need to modify our title casing code
1271 Q_ASSERT(QChar::requiresSurrogates(codepoint) == QChar::requiresSurrogates(titleCase));
1272 if (QChar::requiresSurrogates(codepoint)) {
1273 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(titleCase));
1274 Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(titleCase));
1275 }
1276 if (qAbs(diff) >= (1<<13)) {
1277 qWarning() << "titleCaseDiff exceeded (" << hex << codepoint << "->" << titleCase << "); map it for special case";
1278 data.p.titleCaseSpecial = true;
1279 data.p.titleCaseDiff = appendToSpecialCaseMap(QList<int>() << titleCase);
1280 } else {
1281 data.p.titleCaseDiff = diff;
1282 maxTitleCaseDiff = qMax(maxTitleCaseDiff, qAbs(diff));
1283 }
1284 }
1285
1286 if (!properties[UD_DigitValue].isEmpty())
1287 data.p.digitValue = properties[UD_DigitValue].toInt();
1288
1289 // decompositition
1290 QByteArray decomposition = properties[UD_Decomposition];
1291 if (!decomposition.isEmpty()) {
1292 highestComposedCharacter = qMax(highestComposedCharacter, codepoint);
1293 QList<QByteArray> d = decomposition.split(' ');
1294 if (d[0].contains('<')) {
1295 data.decompositionType = decompositionMap.value(d[0], QChar::NoDecomposition);
1296 if (data.decompositionType == QChar::NoDecomposition)
1297 qFatal("unhandled decomposition type: %s", d[0].constData());
1298 d.takeFirst();
1299 } else {
1300 data.decompositionType = QChar::Canonical;
1301 }
1302 for (int i = 0; i < d.size(); ++i) {
1303 data.decomposition.append(d[i].toInt(&ok, 16));
1304 Q_ASSERT(ok);
1305 }
1306 ++decompositionLength[data.decomposition.size()];
1307 }
1308
1309 for (int i = codepoint; i <= lastCodepoint; ++i)
1310 unicodeData[i] = data;
1311 }
1312 }
1313
1314 static int maxMirroredDiff = 0;
1315
readBidiMirroring()1316 static void readBidiMirroring()
1317 {
1318 qDebug("Reading BidiMirroring.txt");
1319
1320 QFile f("data/BidiMirroring.txt");
1321 if (!f.exists())
1322 qFatal("Couldn't find BidiMirroring.txt");
1323
1324 f.open(QFile::ReadOnly);
1325
1326 while (!f.atEnd()) {
1327 QByteArray line;
1328 line.resize(1024);
1329 int len = f.readLine(line.data(), 1024);
1330 line.resize(len-1);
1331
1332 int comment = line.indexOf('#');
1333 if (comment >= 0)
1334 line = line.left(comment);
1335
1336 if (line.isEmpty())
1337 continue;
1338 line = line.replace(" ", "");
1339
1340 QList<QByteArray> pair = line.split(';');
1341 Q_ASSERT(pair.size() == 2);
1342
1343 bool ok;
1344 int codepoint = pair[0].toInt(&ok, 16);
1345 Q_ASSERT(ok);
1346 int mirror = pair[1].toInt(&ok, 16);
1347 Q_ASSERT(ok);
1348
1349 UnicodeData &d = UnicodeData::valueRef(codepoint);
1350 d.mirroredChar = mirror;
1351 d.p.mirrorDiff = d.mirroredChar - codepoint;
1352 maxMirroredDiff = qMax(maxMirroredDiff, qAbs(d.p.mirrorDiff));
1353 }
1354 }
1355
readArabicShaping()1356 static void readArabicShaping()
1357 {
1358 qDebug("Reading ArabicShaping.txt");
1359
1360 // Initialize defaults:
1361 // Code points that are not explicitly listed in ArabicShaping.txt are either of joining type T or U:
1362 // - Those that not explicitly listed that are of General Category Mn, Me, or Cf have joining type T.
1363 // - All others not explicitly listed have joining type U.
1364 for (int codepoint = 0; codepoint <= QChar::LastValidCodePoint; ++codepoint) {
1365 UnicodeData &d = UnicodeData::valueRef(codepoint);
1366 if (d.p.joining == QChar::Joining_None) {
1367 if (d.p.category == QChar::Mark_NonSpacing || d.p.category == QChar::Mark_Enclosing || d.p.category == QChar::Other_Format)
1368 d.p.joining = QChar::Joining_Transparent;
1369 }
1370 }
1371
1372 QFile f("data/ArabicShaping.txt");
1373 if (!f.exists())
1374 qFatal("Couldn't find ArabicShaping.txt");
1375
1376 f.open(QFile::ReadOnly);
1377
1378 while (!f.atEnd()) {
1379 QByteArray line;
1380 line.resize(1024);
1381 int len = f.readLine(line.data(), 1024);
1382 line.resize(len-1);
1383
1384 int comment = line.indexOf('#');
1385 if (comment >= 0)
1386 line = line.left(comment);
1387 line = line.trimmed();
1388
1389 if (line.isEmpty())
1390 continue;
1391
1392 QList<QByteArray> l = line.split(';');
1393 Q_ASSERT(l.size() == 4);
1394
1395 bool ok;
1396 int codepoint = l[0].toInt(&ok, 16);
1397 Q_ASSERT(ok);
1398
1399 UnicodeData &d = UnicodeData::valueRef(codepoint);
1400 JoiningType joining = joining_map.value(l[2].trimmed(), Joining_Unassigned);
1401 switch (joining) {
1402 case Joining_Unassigned:
1403 qFatal("%x: unassigned or unhandled joining type: %s", codepoint, l[2].constData());
1404 break;
1405 case Joining_Transparent:
1406 switch (d.p.category) {
1407 case QChar::Mark_Enclosing:
1408 case QChar::Mark_NonSpacing:
1409 case QChar::Letter_Modifier:
1410 case QChar::Other_Format:
1411 break;
1412 default:
1413 qFatal("%x: joining type '%s' was met (category: %d); "
1414 "the current implementation needs to be revised!",
1415 codepoint, l[2].constData(), d.p.category);
1416 }
1417 Q_FALLTHROUGH();
1418 default:
1419 d.p.joining = QChar::JoiningType(joining);
1420 break;
1421 }
1422 }
1423 }
1424
readDerivedAge()1425 static void readDerivedAge()
1426 {
1427 qDebug("Reading DerivedAge.txt");
1428
1429 QFile f("data/DerivedAge.txt");
1430 if (!f.exists())
1431 qFatal("Couldn't find DerivedAge.txt");
1432
1433 f.open(QFile::ReadOnly);
1434
1435 while (!f.atEnd()) {
1436 QByteArray line;
1437 line.resize(1024);
1438 int len = f.readLine(line.data(), 1024);
1439 line.resize(len-1);
1440
1441 int comment = line.indexOf('#');
1442 if (comment >= 0)
1443 line = line.left(comment);
1444 line.replace(" ", "");
1445
1446 if (line.isEmpty())
1447 continue;
1448
1449 QList<QByteArray> l = line.split(';');
1450 Q_ASSERT(l.size() == 2);
1451
1452 QByteArray codes = l[0];
1453 codes.replace("..", ".");
1454 QList<QByteArray> cl = codes.split('.');
1455
1456 bool ok;
1457 int from = cl[0].toInt(&ok, 16);
1458 Q_ASSERT(ok);
1459 int to = from;
1460 if (cl.size() == 2) {
1461 to = cl[1].toInt(&ok, 16);
1462 Q_ASSERT(ok);
1463 }
1464
1465 QChar::UnicodeVersion age = age_map.value(l[1].trimmed(), QChar::Unicode_Unassigned);
1466 //qDebug() << hex << from << ".." << to << ba << age;
1467 if (age == QChar::Unicode_Unassigned)
1468 qFatal("unassigned or unhandled age value: %s", l[1].constData());
1469
1470 for (int codepoint = from; codepoint <= to; ++codepoint) {
1471 UnicodeData &d = UnicodeData::valueRef(codepoint);
1472 d.p.age = age;
1473 }
1474 }
1475 }
1476
readDerivedNormalizationProps()1477 static void readDerivedNormalizationProps()
1478 {
1479 qDebug("Reading DerivedNormalizationProps.txt");
1480
1481 QFile f("data/DerivedNormalizationProps.txt");
1482 if (!f.exists())
1483 qFatal("Couldn't find DerivedNormalizationProps.txt");
1484
1485 f.open(QFile::ReadOnly);
1486
1487 while (!f.atEnd()) {
1488 QByteArray line;
1489 line.resize(1024);
1490 int len = f.readLine(line.data(), 1024);
1491 line.resize(len-1);
1492
1493 int comment = line.indexOf('#');
1494 if (comment >= 0)
1495 line = line.left(comment);
1496
1497 if (line.trimmed().isEmpty())
1498 continue;
1499
1500 QList<QByteArray> l = line.split(';');
1501 Q_ASSERT(l.size() >= 2);
1502
1503 QByteArray propName = l[1].trimmed();
1504 if (propName != "Full_Composition_Exclusion" &&
1505 propName != "NFD_QC" && propName != "NFC_QC" &&
1506 propName != "NFKD_QC" && propName != "NFKC_QC") {
1507 // ###
1508 continue;
1509 }
1510
1511 QByteArray codes = l[0].trimmed();
1512 codes.replace("..", ".");
1513 QList<QByteArray> cl = codes.split('.');
1514
1515 bool ok;
1516 int from = cl[0].toInt(&ok, 16);
1517 Q_ASSERT(ok);
1518 int to = from;
1519 if (cl.size() == 2) {
1520 to = cl[1].toInt(&ok, 16);
1521 Q_ASSERT(ok);
1522 }
1523
1524 for (int codepoint = from; codepoint <= to; ++codepoint) {
1525 UnicodeData &d = UnicodeData::valueRef(codepoint);
1526 if (propName == "Full_Composition_Exclusion") {
1527 d.excludedComposition = true;
1528 } else {
1529 Q_STATIC_ASSERT(QString::NormalizationForm_D == 0);
1530 Q_STATIC_ASSERT(QString::NormalizationForm_C == 1);
1531 Q_STATIC_ASSERT(QString::NormalizationForm_KD == 2);
1532 Q_STATIC_ASSERT(QString::NormalizationForm_KC == 3);
1533
1534 QString::NormalizationForm form;
1535 if (propName == "NFD_QC")
1536 form = QString::NormalizationForm_D;
1537 else if (propName == "NFC_QC")
1538 form = QString::NormalizationForm_C;
1539 else if (propName == "NFKD_QC")
1540 form = QString::NormalizationForm_KD;
1541 else// if (propName == "NFKC_QC")
1542 form = QString::NormalizationForm_KC;
1543
1544 Q_ASSERT(l.size() == 3);
1545 l[2] = l[2].trimmed();
1546
1547 enum { NFQC_YES = 0, NFQC_NO = 1, NFQC_MAYBE = 3 };
1548 uchar ynm = (l[2] == "N" ? NFQC_NO : l[2] == "M" ? NFQC_MAYBE : NFQC_YES);
1549 if (ynm == NFQC_MAYBE) {
1550 // if this changes, we need to revise the normalizationQuickCheckHelper() implementation
1551 Q_ASSERT(form == QString::NormalizationForm_C || form == QString::NormalizationForm_KC);
1552 }
1553 d.p.nfQuickCheck |= (ynm << (form << 1)); // 2 bits per NF
1554 }
1555 }
1556 }
1557
1558 for (int codepoint = 0; codepoint <= QChar::LastValidCodePoint; ++codepoint) {
1559 UnicodeData &d = UnicodeData::valueRef(codepoint);
1560 if (!d.excludedComposition
1561 && d.decompositionType == QChar::Canonical
1562 && d.decomposition.size() > 1) {
1563 Q_ASSERT(d.decomposition.size() == 2);
1564
1565 int part1 = d.decomposition.at(0);
1566 int part2 = d.decomposition.at(1);
1567
1568 // all non-starters are listed in DerivedNormalizationProps.txt
1569 // and already excluded from composition
1570 Q_ASSERT(UnicodeData::valueRef(part1).p.combiningClass == 0);
1571
1572 ++numLigatures;
1573 highestLigature = qMax(highestLigature, part1);
1574 Ligature l = { part1, part2, codepoint };
1575 ligatureHashes[part2].append(l);
1576 }
1577 }
1578 }
1579
1580
1581 struct NormalizationCorrection {
1582 uint codepoint;
1583 uint mapped;
1584 int version;
1585 };
1586
createNormalizationCorrections()1587 static QByteArray createNormalizationCorrections()
1588 {
1589 qDebug("Reading NormalizationCorrections.txt");
1590
1591 QFile f("data/NormalizationCorrections.txt");
1592 if (!f.exists())
1593 qFatal("Couldn't find NormalizationCorrections.txt");
1594
1595 f.open(QFile::ReadOnly);
1596
1597 QByteArray out;
1598
1599 out += "struct NormalizationCorrection {\n"
1600 " uint ucs4;\n"
1601 " uint old_mapping;\n"
1602 " int version;\n"
1603 "};\n\n"
1604
1605 "static const NormalizationCorrection uc_normalization_corrections[] = {\n";
1606
1607 int maxVersion = 0;
1608 int numCorrections = 0;
1609 while (!f.atEnd()) {
1610 QByteArray line;
1611 line.resize(1024);
1612 int len = f.readLine(line.data(), 1024);
1613 line.resize(len-1);
1614
1615 int comment = line.indexOf('#');
1616 if (comment >= 0)
1617 line = line.left(comment);
1618 line.replace(" ", "");
1619
1620 if (line.isEmpty())
1621 continue;
1622
1623 Q_ASSERT(!line.contains(".."));
1624
1625 QList<QByteArray> fields = line.split(';');
1626 Q_ASSERT(fields.size() == 4);
1627
1628 NormalizationCorrection c = { 0, 0, 0 };
1629 bool ok;
1630 c.codepoint = fields.at(0).toInt(&ok, 16);
1631 Q_ASSERT(ok);
1632 c.mapped = fields.at(1).toInt(&ok, 16);
1633 Q_ASSERT(ok);
1634 if (fields.at(3) == "3.2.0")
1635 c.version = QChar::Unicode_3_2;
1636 else if (fields.at(3) == "4.0.0")
1637 c.version = QChar::Unicode_4_0;
1638 else
1639 qFatal("unknown unicode version in NormalizationCorrection.txt");
1640
1641 out += " { 0x" + QByteArray::number(c.codepoint, 16) + ", 0x" + QByteArray::number(c.mapped, 16)
1642 + ", " + QString::number(c.version) + " },\n";
1643 ++numCorrections;
1644 maxVersion = qMax(c.version, maxVersion);
1645 }
1646 if (out.endsWith(",\n"))
1647 out.chop(2);
1648
1649 out += "\n};\n\n"
1650
1651 "enum { NumNormalizationCorrections = " + QByteArray::number(numCorrections) + " };\n"
1652 "enum { NormalizationCorrectionsVersionMax = " + QByteArray::number(maxVersion) + " };\n\n";
1653
1654 return out;
1655 }
1656
readLineBreak()1657 static void readLineBreak()
1658 {
1659 qDebug("Reading LineBreak.txt");
1660
1661 QFile f("data/LineBreak.txt");
1662 if (!f.exists())
1663 qFatal("Couldn't find LineBreak.txt");
1664
1665 f.open(QFile::ReadOnly);
1666
1667 while (!f.atEnd()) {
1668 QByteArray line;
1669 line.resize(1024);
1670 int len = f.readLine(line.data(), 1024);
1671 line.resize(len-1);
1672
1673 int comment = line.indexOf('#');
1674 if (comment >= 0)
1675 line = line.left(comment);
1676 line.replace(" ", "");
1677
1678 if (line.isEmpty())
1679 continue;
1680
1681 QList<QByteArray> l = line.split(';');
1682 Q_ASSERT(l.size() == 2);
1683
1684 QByteArray codes = l[0];
1685 codes.replace("..", ".");
1686 QList<QByteArray> cl = codes.split('.');
1687
1688 bool ok;
1689 int from = cl[0].toInt(&ok, 16);
1690 Q_ASSERT(ok);
1691 int to = from;
1692 if (cl.size() == 2) {
1693 to = cl[1].toInt(&ok, 16);
1694 Q_ASSERT(ok);
1695 }
1696
1697 LineBreakClass lb = line_break_map.value(l[1], LineBreak_Unassigned);
1698 if (lb == LineBreak_Unassigned)
1699 qFatal("unassigned line break class: %s", l[1].constData());
1700
1701 for (int codepoint = from; codepoint <= to; ++codepoint) {
1702 UnicodeData &d = UnicodeData::valueRef(codepoint);
1703 d.p.lineBreakClass = lb;
1704 }
1705 }
1706 }
1707
readSpecialCasing()1708 static void readSpecialCasing()
1709 {
1710 qDebug("Reading SpecialCasing.txt");
1711
1712 QFile f("data/SpecialCasing.txt");
1713 if (!f.exists())
1714 qFatal("Couldn't find SpecialCasing.txt");
1715
1716 f.open(QFile::ReadOnly);
1717
1718 while (!f.atEnd()) {
1719 QByteArray line;
1720 line.resize(1024);
1721 int len = f.readLine(line.data(), 1024);
1722 line.resize(len-1);
1723
1724 int comment = line.indexOf('#');
1725 if (comment >= 0)
1726 line = line.left(comment);
1727
1728 if (line.isEmpty())
1729 continue;
1730
1731 QList<QByteArray> l = line.split(';');
1732
1733 QByteArray condition = l.size() < 5 ? QByteArray() : l[4].trimmed();
1734 if (!condition.isEmpty())
1735 // #####
1736 continue;
1737
1738 bool ok;
1739 int codepoint = l[0].trimmed().toInt(&ok, 16);
1740 Q_ASSERT(ok);
1741
1742 // if the condition below doesn't hold anymore we need to modify our
1743 // lower/upper/title casing code and case folding code
1744 Q_ASSERT(!QChar::requiresSurrogates(codepoint));
1745
1746 // qDebug() << "codepoint" << hex << codepoint;
1747 // qDebug() << line;
1748
1749 QList<QByteArray> lower = l[1].trimmed().split(' ');
1750 QList<int> lowerMap;
1751 for (int i = 0; i < lower.size(); ++i) {
1752 bool ok;
1753 lowerMap.append(lower.at(i).toInt(&ok, 16));
1754 Q_ASSERT(ok);
1755 }
1756
1757 QList<QByteArray> title = l[2].trimmed().split(' ');
1758 QList<int> titleMap;
1759 for (int i = 0; i < title.size(); ++i) {
1760 bool ok;
1761 titleMap.append(title.at(i).toInt(&ok, 16));
1762 Q_ASSERT(ok);
1763 }
1764
1765 QList<QByteArray> upper = l[3].trimmed().split(' ');
1766 QList<int> upperMap;
1767 for (int i = 0; i < upper.size(); ++i) {
1768 bool ok;
1769 upperMap.append(upper.at(i).toInt(&ok, 16));
1770 Q_ASSERT(ok);
1771 }
1772
1773
1774 UnicodeData &ud = UnicodeData::valueRef(codepoint);
1775 Q_ASSERT(lowerMap.size() > 1 || lowerMap.at(0) == codepoint + ud.p.lowerCaseDiff);
1776 Q_ASSERT(titleMap.size() > 1 || titleMap.at(0) == codepoint + ud.p.titleCaseDiff);
1777 Q_ASSERT(upperMap.size() > 1 || upperMap.at(0) == codepoint + ud.p.upperCaseDiff);
1778
1779 if (lowerMap.size() > 1) {
1780 ud.p.lowerCaseSpecial = true;
1781 ud.p.lowerCaseDiff = appendToSpecialCaseMap(lowerMap);
1782 }
1783 if (titleMap.size() > 1) {
1784 ud.p.titleCaseSpecial = true;
1785 ud.p.titleCaseDiff = appendToSpecialCaseMap(titleMap);
1786 }
1787 if (upperMap.size() > 1) {
1788 ud.p.upperCaseSpecial = true;
1789 ud.p.upperCaseDiff = appendToSpecialCaseMap(upperMap);
1790 }
1791 }
1792 }
1793
1794 static int maxCaseFoldDiff = 0;
1795
readCaseFolding()1796 static void readCaseFolding()
1797 {
1798 qDebug("Reading CaseFolding.txt");
1799
1800 QFile f("data/CaseFolding.txt");
1801 if (!f.exists())
1802 qFatal("Couldn't find CaseFolding.txt");
1803
1804 f.open(QFile::ReadOnly);
1805
1806 while (!f.atEnd()) {
1807 QByteArray line;
1808 line.resize(1024);
1809 int len = f.readLine(line.data(), 1024);
1810 line.resize(len-1);
1811
1812 int comment = line.indexOf('#');
1813 if (comment >= 0)
1814 line = line.left(comment);
1815
1816 if (line.isEmpty())
1817 continue;
1818
1819 QList<QByteArray> l = line.split(';');
1820
1821 bool ok;
1822 int codepoint = l[0].trimmed().toInt(&ok, 16);
1823 Q_ASSERT(ok);
1824
1825
1826 l[1] = l[1].trimmed();
1827 if (l[1] == "F" || l[1] == "T")
1828 continue;
1829
1830 // qDebug() << "codepoint" << hex << codepoint;
1831 // qDebug() << line;
1832 QList<QByteArray> fold = l[2].trimmed().split(' ');
1833 QList<int> foldMap;
1834 for (int i = 0; i < fold.size(); ++i) {
1835 bool ok;
1836 foldMap.append(fold.at(i).toInt(&ok, 16));
1837 Q_ASSERT(ok);
1838 }
1839
1840 UnicodeData &ud = UnicodeData::valueRef(codepoint);
1841 if (foldMap.size() == 1) {
1842 int caseFolded = foldMap.at(0);
1843 int diff = caseFolded - codepoint;
1844 // if the conditions below doesn't hold anymore we need to modify our case folding code
1845 Q_ASSERT(QChar::requiresSurrogates(codepoint) == QChar::requiresSurrogates(caseFolded));
1846 if (QChar::requiresSurrogates(codepoint)) {
1847 Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(caseFolded));
1848 Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(caseFolded));
1849 }
1850 if (qAbs(diff) >= (1<<13)) {
1851 qWarning() << "caseFoldDiff exceeded (" << hex << codepoint << "->" << caseFolded << "); map it for special case";
1852 ud.p.caseFoldSpecial = true;
1853 ud.p.caseFoldDiff = appendToSpecialCaseMap(foldMap);
1854 } else {
1855 ud.p.caseFoldDiff = diff;
1856 maxCaseFoldDiff = qMax(maxCaseFoldDiff, qAbs(diff));
1857 }
1858 } else {
1859 qFatal("we currently don't support full case foldings");
1860 // qDebug() << "special" << hex << foldMap;
1861 ud.p.caseFoldSpecial = true;
1862 ud.p.caseFoldDiff = appendToSpecialCaseMap(foldMap);
1863 }
1864 }
1865 }
1866
readGraphemeBreak()1867 static void readGraphemeBreak()
1868 {
1869 qDebug("Reading GraphemeBreakProperty.txt");
1870
1871 QFile f("data/GraphemeBreakProperty.txt");
1872 if (!f.exists())
1873 qFatal("Couldn't find GraphemeBreakProperty.txt");
1874
1875 f.open(QFile::ReadOnly);
1876
1877 while (!f.atEnd()) {
1878 QByteArray line;
1879 line.resize(1024);
1880 int len = f.readLine(line.data(), 1024);
1881 line.resize(len-1);
1882
1883 int comment = line.indexOf('#');
1884 if (comment >= 0)
1885 line = line.left(comment);
1886 line.replace(" ", "");
1887
1888 if (line.isEmpty())
1889 continue;
1890
1891 QList<QByteArray> l = line.split(';');
1892 Q_ASSERT(l.size() == 2);
1893
1894 QByteArray codes = l[0];
1895 codes.replace("..", ".");
1896 QList<QByteArray> cl = codes.split('.');
1897
1898 bool ok;
1899 int from = cl[0].toInt(&ok, 16);
1900 Q_ASSERT(ok);
1901 int to = from;
1902 if (cl.size() == 2) {
1903 to = cl[1].toInt(&ok, 16);
1904 Q_ASSERT(ok);
1905 }
1906
1907 GraphemeBreakClass brk = grapheme_break_map.value(l[1], GraphemeBreak_Unassigned);
1908 if (brk == GraphemeBreak_Unassigned)
1909 qFatal("unassigned grapheme break class: %s", l[1].constData());
1910
1911 for (int codepoint = from; codepoint <= to; ++codepoint) {
1912 UnicodeData &ud = UnicodeData::valueRef(codepoint);
1913 ud.p.graphemeBreakClass = brk;
1914 }
1915 }
1916 }
1917
readWordBreak()1918 static void readWordBreak()
1919 {
1920 qDebug("Reading WordBreakProperty.txt");
1921
1922 QFile f("data/WordBreakProperty.txt");
1923 if (!f.exists())
1924 qFatal("Couldn't find WordBreakProperty.txt");
1925
1926 f.open(QFile::ReadOnly);
1927
1928 while (!f.atEnd()) {
1929 QByteArray line;
1930 line.resize(1024);
1931 int len = f.readLine(line.data(), 1024);
1932 line.resize(len-1);
1933
1934 int comment = line.indexOf('#');
1935 if (comment >= 0)
1936 line = line.left(comment);
1937 line.replace(" ", "");
1938
1939 if (line.isEmpty())
1940 continue;
1941
1942 QList<QByteArray> l = line.split(';');
1943 Q_ASSERT(l.size() == 2);
1944
1945 QByteArray codes = l[0];
1946 codes.replace("..", ".");
1947 QList<QByteArray> cl = codes.split('.');
1948
1949 bool ok;
1950 int from = cl[0].toInt(&ok, 16);
1951 Q_ASSERT(ok);
1952 int to = from;
1953 if (cl.size() == 2) {
1954 to = cl[1].toInt(&ok, 16);
1955 Q_ASSERT(ok);
1956 }
1957
1958 WordBreakClass brk = word_break_map.value(l[1], WordBreak_Unassigned);
1959 if (brk == WordBreak_Unassigned)
1960 qFatal("unassigned word break class: %s", l[1].constData());
1961
1962 for (int codepoint = from; codepoint <= to; ++codepoint) {
1963 // ### [
1964 // as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
1965 // which caused "hi.there" to be treated like if it were just a single word;
1966 // until we have a tailoring mechanism, retain the old behavior by remapping those characters here.
1967 if (codepoint == 0x002E) // FULL STOP
1968 brk = WordBreak_MidNum;
1969 else if (codepoint == 0x003A) // COLON
1970 brk = WordBreak_Any;
1971 // ] ###
1972 UnicodeData &ud = UnicodeData::valueRef(codepoint);
1973 ud.p.wordBreakClass = brk;
1974 }
1975 }
1976 }
1977
readSentenceBreak()1978 static void readSentenceBreak()
1979 {
1980 qDebug("Reading SentenceBreakProperty.txt");
1981
1982 QFile f("data/SentenceBreakProperty.txt");
1983 if (!f.exists())
1984 qFatal("Couldn't find SentenceBreakProperty.txt");
1985
1986 f.open(QFile::ReadOnly);
1987
1988 while (!f.atEnd()) {
1989 QByteArray line;
1990 line.resize(1024);
1991 int len = f.readLine(line.data(), 1024);
1992 line.resize(len-1);
1993
1994 int comment = line.indexOf('#');
1995 if (comment >= 0)
1996 line = line.left(comment);
1997 line.replace(" ", "");
1998
1999 if (line.isEmpty())
2000 continue;
2001
2002 QList<QByteArray> l = line.split(';');
2003 Q_ASSERT(l.size() == 2);
2004
2005 QByteArray codes = l[0];
2006 codes.replace("..", ".");
2007 QList<QByteArray> cl = codes.split('.');
2008
2009 bool ok;
2010 int from = cl[0].toInt(&ok, 16);
2011 Q_ASSERT(ok);
2012 int to = from;
2013 if (cl.size() == 2) {
2014 to = cl[1].toInt(&ok, 16);
2015 Q_ASSERT(ok);
2016 }
2017
2018 SentenceBreakClass brk = sentence_break_map.value(l[1], SentenceBreak_Unassigned);
2019 if (brk == SentenceBreak_Unassigned)
2020 qFatal("unassigned sentence break class: %s", l[1].constData());
2021
2022 for (int codepoint = from; codepoint <= to; ++codepoint) {
2023 UnicodeData &ud = UnicodeData::valueRef(codepoint);
2024 ud.p.sentenceBreakClass = brk;
2025 }
2026 }
2027 }
2028
2029 #if 0
// This code performs full case folding and comparison. It is currently
// disabled because full case folding causes many problems for operations
// such as case-insensitive search and replace.
2033 static inline void foldCase(uint ch, ushort *out)
2034 {
2035 const QUnicodeTables::Properties *p = qGetProp(ch);
2036 if (!p->caseFoldSpecial) {
2037 *(out++) = ch + p->caseFoldDiff;
2038 } else {
2039 const ushort *folded = specialCaseMap + p->caseFoldDiff;
2040 ushort length = *folded++;
2041 while (length--)
2042 *out++ = *folded++;
2043 }
2044 *out = 0;
2045 }
2046
2047 static int ucstricmp(const ushort *a, const ushort *ae, const ushort *b, const ushort *be)
2048 {
2049 if (a == b)
2050 return 0;
2051 if (a == 0)
2052 return 1;
2053 if (b == 0)
2054 return -1;
2055
2056 while (a != ae && b != be) {
2057 const QUnicodeTables::Properties *pa = qGetProp(*a);
2058 const QUnicodeTables::Properties *pb = qGetProp(*b);
2059 if (pa->caseFoldSpecial | pb->caseFoldSpecial)
2060 goto special;
2061 int diff = (int)(*a + pa->caseFoldDiff) - (int)(*b + pb->caseFoldDiff);
2062 if ((diff))
2063 return diff;
2064 ++a;
2065 ++b;
2066 }
2067 }
2068 if (a == ae) {
2069 if (b == be)
2070 return 0;
2071 return -1;
2072 }
2073 return 1;
2074 special:
2075 ushort abuf[SPECIAL_CASE_MAX_LEN + 1];
2076 ushort bbuf[SPECIAL_CASE_MAX_LEN + 1];
2077 abuf[0] = bbuf[0] = 0;
2078 ushort *ap = abuf;
2079 ushort *bp = bbuf;
2080 while (1) {
2081 if (!*ap) {
2082 if (a == ae) {
2083 if (!*bp && b == be)
2084 return 0;
2085 return -1;
2086 }
2087 foldCase(*(a++), abuf);
2088 ap = abuf;
2089 }
2090 if (!*bp) {
2091 if (b == be)
2092 return 1;
2093 foldCase(*(b++), bbuf);
2094 bp = bbuf;
2095 }
2096 if (*ap != *bp)
2097 return (int)*ap - (int)*bp;
2098 ++ap;
2099 ++bp;
2100 }
2101 }
2102
2103
/*
    Case-insensitive comparison of the 16-bit range [a, ae) with the
    zero-terminated 8-bit string b.

    Returns 1 when a is null and -1 when b is null. Otherwise the result is 0
    when both inputs fold to the same sequence, the difference of the first
    pair of folded units that differ, or -1 / 1 when a / b ends first.

    Each byte of b is passed to qGetProp() as a 16-bit value
    (NOTE(review): presumably b is Latin-1 encoded; confirm with callers).
*/
static int ucstricmp(const ushort *a, const ushort *ae, const uchar *b)
{
    if (a == 0)
        return 1;
    if (b == 0)
        return -1;

    // Fast path: used while neither current unit has a special case folding.
    while (a != ae && *b) {
        const QUnicodeTables::Properties *pa = qGetProp(*a);
        const QUnicodeTables::Properties *pb = qGetProp((ushort)*b);
        if (pa->caseFoldSpecial | pb->caseFoldSpecial)
            goto special;
        int diff = (int)(*a + pa->caseFoldDiff) - (int)(*b + pb->caseFoldDiff);
        if ((diff))
            return diff;
        ++a;
        ++b;
    }
    if (a == ae) {
        if (!*b)
            return 0;
        return -1;
    }
    return 1;

special:
    // Slow path: each input unit is expanded by foldCase() into a
    // zero-terminated buffer; a buffer whose current unit is 0 is used up and
    // gets refilled from the next input unit.
    ushort abuf[SPECIAL_CASE_MAX_LEN + 1];
    ushort bbuf[SPECIAL_CASE_MAX_LEN + 1];
    abuf[0] = bbuf[0] = 0;
    ushort *ap = abuf;
    ushort *bp = bbuf;
    while (1) {
        if (!*ap) {
            if (a == ae) {
                // a is exhausted: equal only if b is exhausted as well.
                if (!*bp && !*b)
                    return 0;
                return -1;
            }
            foldCase(*(a++), abuf);
            ap = abuf;
        }
        if (!*bp) {
            if (!*b)
                return 1;
            foldCase(*(b++), bbuf);
            bp = bbuf;
        }
        if (*ap != *bp)
            return (int)*ap - (int)*bp;
        ++ap;
        ++bp;
    }
}
2157 #endif
2158
2159 #if 0
2160 static QList<QByteArray> blockNames;
2161 struct BlockInfo
2162 {
2163 int blockIndex;
2164 int firstCodePoint;
2165 int lastCodePoint;
2166 };
2167 static QList<BlockInfo> blockInfoList;
2168
2169 static void readBlocks()
2170 {
2171 qDebug("Reading Blocks.txt");
2172
2173 QFile f("data/Blocks.txt");
2174 if (!f.exists())
2175 qFatal("Couldn't find Blocks.txt");
2176
2177 f.open(QFile::ReadOnly);
2178
2179 while (!f.atEnd()) {
2180 QByteArray line = f.readLine();
2181 line.resize(line.size() - 1);
2182
2183 int comment = line.indexOf("#");
2184 if (comment >= 0)
2185 line = line.left(comment);
2186
2187 line.replace(" ", "");
2188
2189 if (line.isEmpty())
2190 continue;
2191
2192 int semicolon = line.indexOf(';');
2193 Q_ASSERT(semicolon >= 0);
2194 QByteArray codePoints = line.left(semicolon);
2195 QByteArray blockName = line.mid(semicolon + 1);
2196
2197 int blockIndex = blockNames.indexOf(blockName);
2198 if (blockIndex == -1) {
2199 blockIndex = blockNames.size();
2200 blockNames.append(blockName);
2201 }
2202
2203 codePoints.replace("..", ".");
2204 QList<QByteArray> cl = codePoints.split('.');
2205
2206 bool ok;
2207 int first = cl[0].toInt(&ok, 16);
2208 Q_ASSERT(ok);
2209 int last = first;
2210 if (cl.size() == 2) {
2211 last = cl[1].toInt(&ok, 16);
2212 Q_ASSERT(ok);
2213 }
2214
2215 BlockInfo blockInfo = { blockIndex, first, last };
2216 blockInfoList.append(blockInfo);
2217 }
2218 }
2219 #endif
2220
readScripts()2221 static void readScripts()
2222 {
2223 qDebug("Reading Scripts.txt");
2224
2225 QFile f("data/Scripts.txt");
2226 if (!f.exists())
2227 qFatal("Couldn't find Scripts.txt");
2228
2229 f.open(QFile::ReadOnly);
2230
2231 while (!f.atEnd()) {
2232 QByteArray line = f.readLine();
2233 line.resize(line.size() - 1);
2234
2235 int comment = line.indexOf("#");
2236 if (comment >= 0)
2237 line = line.left(comment);
2238
2239 line.replace(" ", "");
2240 line.replace("_", "");
2241
2242 if (line.isEmpty())
2243 continue;
2244
2245 int semicolon = line.indexOf(';');
2246 Q_ASSERT(semicolon >= 0);
2247 QByteArray codePoints = line.left(semicolon);
2248 QByteArray scriptName = line.mid(semicolon + 1);
2249
2250 codePoints.replace("..", ".");
2251 QList<QByteArray> cl = codePoints.split('.');
2252
2253 bool ok;
2254 int first = cl[0].toInt(&ok, 16);
2255 Q_ASSERT(ok);
2256 int last = first;
2257 if (cl.size() == 2) {
2258 last = cl[1].toInt(&ok, 16);
2259 Q_ASSERT(ok);
2260 }
2261
2262 if (!scriptMap.contains(scriptName))
2263 qFatal("Unhandled script property value: %s", scriptName.constData());
2264 QChar::Script script = scriptMap.value(scriptName, QChar::Script_Unknown);
2265
2266 for (int codepoint = first; codepoint <= last; ++codepoint) {
2267 UnicodeData &ud = UnicodeData::valueRef(codepoint);
2268 ud.p.script = script;
2269 }
2270 }
2271 }
2272
2273 #if 0
2274 static void dump(int from, int to)
2275 {
2276 for (int i = from; i <= to; ++i) {
2277 UnicodeData &d = UnicodeData::valueRef(i);
2278 qDebug("0x%04x: cat=%d combining=%d dir=%d case=%x mirror=%x joining=%d age=%d",
2279 i, d.p.category, d.p.combiningClass, d.p.direction, d.otherCase, d.mirroredChar, d.p.joining, d.p.age);
2280 if (d.decompositionType != QChar::NoDecomposition) {
2281 qDebug(" decomposition: type=%d, length=%d, first=%x", d.decompositionType, d.decomposition.size(),
2282 d.decomposition[0]);
2283 }
2284 }
2285 qDebug(" ");
2286 }
2287 #endif
2288
2289 static QList<PropertyFlags> uniqueProperties;
2290
computeUniqueProperties()2291 static void computeUniqueProperties()
2292 {
2293 qDebug("computeUniqueProperties:");
2294 for (int codepoint = 0; codepoint <= QChar::LastValidCodePoint; ++codepoint) {
2295 UnicodeData &d = UnicodeData::valueRef(codepoint);
2296 int index = uniqueProperties.indexOf(d.p);
2297 if (index == -1) {
2298 index = uniqueProperties.size();
2299 uniqueProperties.append(d.p);
2300 }
2301 d.propertyIndex = index;
2302 }
2303 qDebug(" %d unique unicode properties found", uniqueProperties.size());
2304 }
2305
2306 struct UniqueBlock {
UniqueBlockUniqueBlock2307 inline UniqueBlock() : index(-1) {}
2308
operator ==UniqueBlock2309 inline bool operator==(const UniqueBlock &other) const
2310 { return values == other.values; }
2311
2312 int index;
2313 QVector<int> values;
2314 };
2315
/*
    Builds and returns the C++ source text for the property lookup:

    - the array uc_property_trie, consisting of a block map followed by the
      data of all distinct blocks,
    - the macros GET_PROP_INDEX and GET_PROP_INDEX_UCS2 that index it,
    - the array uc_properties with one initializer per entry of
      uniqueProperties,
    - the qGetProp()/properties() accessors and the break class getters.

    Code points below BMP_END are grouped into blocks of BMP_BLOCKSIZE, the
    code points from BMP_END up to SMP_END into blocks of SMP_BLOCKSIZE. Each
    block holds the propertyIndex of its code points; blocks with identical
    content are stored only once.

    Relies on UnicodeData::propertyIndex having been filled in for every code
    point (this is what computeUniqueProperties() does).
*/
static QByteArray createPropertyInfo()
{
    qDebug("createPropertyInfo:");

    // we reserve one bit more than in the assert below for the sign
    Q_ASSERT(maxMirroredDiff < (1<<12));
    Q_ASSERT(maxLowerCaseDiff < (1<<13));
    Q_ASSERT(maxUpperCaseDiff < (1<<13));
    Q_ASSERT(maxTitleCaseDiff < (1<<13));
    Q_ASSERT(maxCaseFoldDiff < (1<<13));

    // Note that, despite its name, BMP_END (0x11000) lies above U+FFFF: it is
    // simply the boundary between the small-block and the large-block range.
    // The *_SHIFT values are log2 of the corresponding block sizes.
    const int BMP_BLOCKSIZE = 32;
    const int BMP_SHIFT = 5;
    const int BMP_END = 0x11000;
    const int SMP_END = 0x110000;
    const int SMP_BLOCKSIZE = 256;
    const int SMP_SHIFT = 8;

    QList<UniqueBlock> uniqueBlocks; // distinct blocks, in order of first appearance
    QVector<int> blockMap;           // per block: offset of its data within the block data
    int used = 0;                    // number of values occupied by the blocks stored so far

    // Small blocks covering [0, BMP_END).
    for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
        UniqueBlock b;
        b.values.reserve(BMP_BLOCKSIZE);
        for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
            int uc = block*BMP_BLOCKSIZE + i;
            UnicodeData &d = UnicodeData::valueRef(uc);
            b.values.append(d.propertyIndex);
        }
        // Reuse an identical block if there is one, otherwise store this one.
        int index = uniqueBlocks.indexOf(b);
        if (index == -1) {
            index = uniqueBlocks.size();
            b.index = used;
            used += BMP_BLOCKSIZE;
            uniqueBlocks.append(b);
        }
        blockMap.append(uniqueBlocks.at(index).index);
    }
    int bmp_blocks = uniqueBlocks.size();

    // Large blocks covering [BMP_END, SMP_END).
    for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) {
        UniqueBlock b;
        b.values.reserve(SMP_BLOCKSIZE);
        for (int i = 0; i < SMP_BLOCKSIZE; ++i) {
            int uc = block*SMP_BLOCKSIZE + i;
            UnicodeData &d = UnicodeData::valueRef(uc);
            b.values.append(d.propertyIndex);
        }
        int index = uniqueBlocks.indexOf(b);
        if (index == -1) {
            index = uniqueBlocks.size();
            b.index = used;
            used += SMP_BLOCKSIZE;
            uniqueBlocks.append(b);
        }
        blockMap.append(uniqueBlocks.at(index).index);
    }
    int smp_blocks = uniqueBlocks.size() - bmp_blocks;

    // Size statistics, for information only.
    int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*sizeof(unsigned short);
    int bmp_trie = BMP_END/BMP_BLOCKSIZE*sizeof(unsigned short);
    int bmp_mem = bmp_block_data + bmp_trie;
    qDebug(" %d unique blocks in BMP.", bmp_blocks);
    qDebug(" block data uses: %d bytes", bmp_block_data);
    qDebug(" trie data uses : %d bytes", bmp_trie);

    int smp_block_data = smp_blocks*SMP_BLOCKSIZE*sizeof(unsigned short);
    int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*sizeof(unsigned short);
    int smp_mem = smp_block_data + smp_trie;
    qDebug(" %d unique blocks in SMP.", smp_blocks);
    qDebug(" block data uses: %d bytes", smp_block_data);
    qDebug(" trie data uses : %d bytes", smp_trie);

    int prop_data = uniqueProperties.size() * SizeOfPropertiesStruct;
    qDebug("\n properties data uses : %d bytes", prop_data);
    qDebug(" memory usage: %d bytes", bmp_mem + smp_mem + prop_data);

    // The map entries are written as unsigned short, so the last one
    // (offset plus map size) has to fit into 16 bits.
    Q_ASSERT(blockMap.last() + blockMap.size() < (1<<(sizeof(unsigned short)*8)));

    QByteArray out;

    out += "static const unsigned short uc_property_trie[] = {\n";
    // first write the map
    // Every entry is the block's offset plus blockMap.size(), because the
    // block data follows the map within the same array.
    out += " // [0x0..0x" + QByteArray::number(BMP_END, 16) + ")";
    for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
        // Eight entries per line; an extra blank line every 0x1000 code points.
        if (!(i % 8)) {
            if (out.endsWith(' '))
                out.chop(1);
            if (!((i*BMP_BLOCKSIZE) % 0x1000))
                out += "\n";
            out += "\n ";
        }
        out += QByteArray::number(blockMap.at(i) + blockMap.size());
        out += ", ";
    }
    if (out.endsWith(' '))
        out.chop(1);
    out += "\n\n // [0x" + QByteArray::number(BMP_END, 16) + "..0x" + QByteArray::number(SMP_END, 16) + ")\n";
    for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
        if (!(i % 8)) {
            if (out.endsWith(' '))
                out.chop(1);
            if (!(i % (0x10000/SMP_BLOCKSIZE)))
                out += "\n";
            out += "\n ";
        }
        out += QByteArray::number(blockMap.at(i) + blockMap.size());
        out += ", ";
    }
    if (out.endsWith(' '))
        out.chop(1);
    out += "\n";
    // write the data
    for (int i = 0; i < uniqueBlocks.size(); ++i) {
        if (out.endsWith(' '))
            out.chop(1);
        out += "\n";
        const UniqueBlock &b = uniqueBlocks.at(i);
        for (int j = 0; j < b.values.size(); ++j) {
            if (!(j % 8)) {
                if (out.endsWith(' '))
                    out.chop(1);
                out += "\n ";
            }
            out += QByteArray::number(b.values.at(j));
            out += ", ";
        }
    }
    // Remove the separator after the very last value.
    if (out.endsWith(", "))
        out.chop(2);
    out += "\n};\n\n";

    // Lookup macros: the map entry selected by the upper bits of the code
    // point is the start of the block, the lower bits select the value in it.
    out += "#define GET_PROP_INDEX(ucs4) \\\n"
           " (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n"
           " ? (uc_property_trie[uc_property_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) +
           "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n"
           " : (uc_property_trie[uc_property_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) +
           ")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]"
           " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")]))\n\n"
           "#define GET_PROP_INDEX_UCS2(ucs2) \\\n"
           " (uc_property_trie[uc_property_trie[ucs2>>" + QByteArray::number(BMP_SHIFT) +
           "] + (ucs2 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")])\n\n";

    out += "static const Properties uc_properties[] = {";
    // keep in sync with the property declaration
    // (the quoted lines below show the field each emitted value initializes)
    for (int i = 0; i < uniqueProperties.size(); ++i) {
        const PropertyFlags &p = uniqueProperties.at(i);
        out += "\n { ";
        // " ushort category : 8; /* 5 used */\n"
        out += QByteArray::number( p.category );
        out += ", ";
        // " ushort direction : 8; /* 5 used */\n"
        out += QByteArray::number( p.direction );
        out += ", ";
        // " ushort combiningClass : 8;\n"
        out += QByteArray::number( p.combiningClass );
        out += ", ";
        // " ushort joining : 3;\n"
        out += QByteArray::number( p.joining );
        out += ", ";
        // " signed short digitValue : 5;\n"
        out += QByteArray::number( p.digitValue );
        out += ", ";
        // " signed short mirrorDiff : 16;\n"
        out += QByteArray::number( p.mirrorDiff );
        out += ", ";
        // " ushort unicodeVersion : 8; /* 5 used */\n"
        out += QByteArray::number( p.age );
        out += ", ";
        // " ushort nfQuickCheck : 8;\n"
        out += QByteArray::number( p.nfQuickCheck );
        out += ", ";
        // " struct {\n"
        // " ushort special : 1;\n"
        // " signed short diff : 15;\n"
        // " } cases[NumCases];\n"
        // Emitted in the order lower, upper, title, fold.
        out += " { {";
        out += QByteArray::number( p.lowerCaseSpecial );
        out += ", ";
        out += QByteArray::number( p.lowerCaseDiff );
        out += "}, {";
        out += QByteArray::number( p.upperCaseSpecial );
        out += ", ";
        out += QByteArray::number( p.upperCaseDiff );
        out += "}, {";
        out += QByteArray::number( p.titleCaseSpecial );
        out += ", ";
        out += QByteArray::number( p.titleCaseDiff );
        out += "}, {";
        out += QByteArray::number( p.caseFoldSpecial );
        out += ", ";
        out += QByteArray::number( p.caseFoldDiff );
        out += "} }, ";
        // " ushort graphemeBreakClass : 5; /* 5 used */\n"
        // " ushort wordBreakClass : 5; /* 5 used */\n"
        // " ushort lineBreakClass : 6; /* 6 used */\n"
        out += QByteArray::number( p.graphemeBreakClass );
        out += ", ";
        out += QByteArray::number( p.wordBreakClass );
        out += ", ";
        out += QByteArray::number( p.lineBreakClass );
        out += ", ";
        // " ushort sentenceBreakClass : 8; /* 4 used */\n"
        out += QByteArray::number( p.sentenceBreakClass );
        out += ", ";
        // " ushort script : 8;\n"
        out += QByteArray::number( p.script );
        out += " },";
    }
    // Remove the comma after the last initializer.
    if (out.endsWith(','))
        out.chop(1);
    out += "\n};\n\n";


    // Accessor functions, emitted verbatim.
    out += "Q_DECL_CONST_FUNCTION static inline const Properties *qGetProp(uint ucs4) noexcept\n"
           "{\n"
           " return uc_properties + GET_PROP_INDEX(ucs4);\n"
           "}\n"
           "\n"
           "Q_DECL_CONST_FUNCTION static inline const Properties *qGetProp(ushort ucs2) noexcept\n"
           "{\n"
           " return uc_properties + GET_PROP_INDEX_UCS2(ucs2);\n"
           "}\n"
           "\n"
           "Q_DECL_CONST_FUNCTION Q_CORE_EXPORT const Properties * QT_FASTCALL properties(uint ucs4) noexcept\n"
           "{\n"
           " return qGetProp(ucs4);\n"
           "}\n"
           "\n"
           "Q_DECL_CONST_FUNCTION Q_CORE_EXPORT const Properties * QT_FASTCALL properties(ushort ucs2) noexcept\n"
           "{\n"
           " return qGetProp(ucs2);\n"
           "}\n\n";

    out += "Q_CORE_EXPORT GraphemeBreakClass QT_FASTCALL graphemeBreakClass(uint ucs4) noexcept\n"
           "{\n"
           " return static_cast<GraphemeBreakClass>(qGetProp(ucs4)->graphemeBreakClass);\n"
           "}\n"
           "\n"
           "Q_CORE_EXPORT WordBreakClass QT_FASTCALL wordBreakClass(uint ucs4) noexcept\n"
           "{\n"
           " return static_cast<WordBreakClass>(qGetProp(ucs4)->wordBreakClass);\n"
           "}\n"
           "\n"
           "Q_CORE_EXPORT SentenceBreakClass QT_FASTCALL sentenceBreakClass(uint ucs4) noexcept\n"
           "{\n"
           " return static_cast<SentenceBreakClass>(qGetProp(ucs4)->sentenceBreakClass);\n"
           "}\n"
           "\n"
           "Q_CORE_EXPORT LineBreakClass QT_FASTCALL lineBreakClass(uint ucs4) noexcept\n"
           "{\n"
           " return static_cast<LineBreakClass>(qGetProp(ucs4)->lineBreakClass);\n"
           "}\n"
           "\n";

    return out;
}
2574
createSpecialCaseMap()2575 static QByteArray createSpecialCaseMap()
2576 {
2577 qDebug("createSpecialCaseMap:");
2578
2579 QByteArray out;
2580
2581 out += "static const unsigned short specialCaseMap[] = {\n"
2582 " 0x0, // placeholder";
2583 int i = 1;
2584 while (i < specialCaseMap.size()) {
2585 out += "\n ";
2586 int n = specialCaseMap.at(i);
2587 for (int j = 0; j <= n; ++j) {
2588 out += QByteArray(" 0x") + QByteArray::number(specialCaseMap.at(i+j), 16);
2589 out += ",";
2590 }
2591 i += n + 1;
2592 }
2593 out.chop(1);
2594 out += "\n};\n\n";
2595
2596 qDebug(" memory usage: %ld bytes", specialCaseMap.size()*sizeof(unsigned short));
2597
2598 return out;
2599 }
2600
2601
/*
    Builds and returns the C++ source text for the decomposition lookup:

    - the array uc_decomposition_trie, consisting of a block map followed by
      the data of all distinct blocks,
    - the macro GET_DECOMPOSITION_INDEX that indexes it and yields 0xffff for
      code points at or above SMP_END,
    - the array uc_decomposition_map holding the decompositions themselves.

    Code points below BMP_END are grouped into blocks of BMP_BLOCKSIZE, the
    code points from BMP_END up to SMP_END into blocks of SMP_BLOCKSIZE. A
    block value is the index of the code point's entry in the decomposition
    map, or 0xffff if the code point has no decomposition.

    Terminates via qFatal() when highestComposedCharacter is not below SMP_END.
*/
static QByteArray createCompositionInfo()
{
    qDebug("createCompositionInfo: highestComposedCharacter=0x%x", highestComposedCharacter);

    // The *_SHIFT values are log2 of the corresponding block sizes.
    const int BMP_BLOCKSIZE = 16;
    const int BMP_SHIFT = 4;
    const int BMP_END = 0x3400; // start of Han
    const int SMP_END = 0x30000;
    const int SMP_BLOCKSIZE = 256;
    const int SMP_SHIFT = 8;

    if (SMP_END <= highestComposedCharacter)
        qFatal("end of table smaller than highest composed character 0x%x", highestComposedCharacter);

    // Each entry in decompositions is one header unit, decompositionType plus
    // (length in UTF-16 units << 8), followed by the decomposition in UTF-16.
    QVector<unsigned short> decompositions;
    int tableIndex = 0; // index at which the next entry starts

    QList<UniqueBlock> uniqueBlocks; // distinct blocks, in order of first appearance
    QVector<int> blockMap;           // per block: offset of its data within the block data
    int used = 0;                    // number of values occupied by the blocks stored so far

    // Small blocks covering [0, BMP_END).
    for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
        UniqueBlock b;
        b.values.reserve(BMP_BLOCKSIZE);
        for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
            int uc = block*BMP_BLOCKSIZE + i;
            UnicodeData &d = UnicodeData::valueRef(uc);
            if (!d.decomposition.isEmpty()) {
                int utf16Length = 0;
                // Reserve the header unit; it is filled in once the length is known.
                decompositions.append(0);
                for (int j = 0; j < d.decomposition.size(); ++j) {
                    int code = d.decomposition.at(j);
                    if (QChar::requiresSurrogates(code)) {
                        // save as surrogate pair
                        decompositions.append(QChar::highSurrogate(code));
                        decompositions.append(QChar::lowSurrogate(code));
                        utf16Length += 2;
                    } else {
                        decompositions.append(code);
                        utf16Length++;
                    }
                }
                decompositions[tableIndex] = d.decompositionType + (utf16Length<<8);
                b.values.append(tableIndex);
                tableIndex += utf16Length + 1;
            } else {
                b.values.append(0xffff);
            }
        }
        // Reuse an identical block if there is one, otherwise store this one.
        int index = uniqueBlocks.indexOf(b);
        if (index == -1) {
            index = uniqueBlocks.size();
            b.index = used;
            used += BMP_BLOCKSIZE;
            uniqueBlocks.append(b);
        }
        blockMap.append(uniqueBlocks.at(index).index);
    }
    int bmp_blocks = uniqueBlocks.size();

    // Large blocks covering [BMP_END, SMP_END); same procedure as above.
    for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) {
        UniqueBlock b;
        b.values.reserve(SMP_BLOCKSIZE);
        for (int i = 0; i < SMP_BLOCKSIZE; ++i) {
            int uc = block*SMP_BLOCKSIZE + i;
            UnicodeData &d = UnicodeData::valueRef(uc);
            if (!d.decomposition.isEmpty()) {
                int utf16Length = 0;
                decompositions.append(0);
                for (int j = 0; j < d.decomposition.size(); ++j) {
                    int code = d.decomposition.at(j);
                    if (QChar::requiresSurrogates(code)) {
                        // save as surrogate pair
                        decompositions.append(QChar::highSurrogate(code));
                        decompositions.append(QChar::lowSurrogate(code));
                        utf16Length += 2;
                    } else {
                        decompositions.append(code);
                        utf16Length++;
                    }
                }
                decompositions[tableIndex] = d.decompositionType + (utf16Length<<8);
                b.values.append(tableIndex);
                tableIndex += utf16Length + 1;
            } else {
                b.values.append(0xffff);
            }
        }
        int index = uniqueBlocks.indexOf(b);
        if (index == -1) {
            index = uniqueBlocks.size();
            b.index = used;
            used += SMP_BLOCKSIZE;
            uniqueBlocks.append(b);
        }
        blockMap.append(uniqueBlocks.at(index).index);
    }
    int smp_blocks = uniqueBlocks.size() - bmp_blocks;

    // if the condition below doesn't hold anymore we need to modify our decomposition code
    // (0xffff is the block value that stands for "no decomposition")
    Q_ASSERT(tableIndex < 0xffff);

    // Size statistics, for information only.
    int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*sizeof(unsigned short);
    int bmp_trie = BMP_END/BMP_BLOCKSIZE*sizeof(unsigned short);
    int bmp_mem = bmp_block_data + bmp_trie;
    qDebug(" %d unique blocks in BMP.", bmp_blocks);
    qDebug(" block data uses: %d bytes", bmp_block_data);
    qDebug(" trie data uses : %d bytes", bmp_trie);

    int smp_block_data = smp_blocks*SMP_BLOCKSIZE*sizeof(unsigned short);
    int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*sizeof(unsigned short);
    int smp_mem = smp_block_data + smp_trie;
    qDebug(" %d unique blocks in SMP.", smp_blocks);
    qDebug(" block data uses: %d bytes", smp_block_data);
    qDebug(" trie data uses : %d bytes", smp_trie);

    int decomposition_data = decompositions.size() * 2;
    qDebug("\n decomposition data uses : %d bytes", decomposition_data);
    qDebug(" memory usage: %d bytes", bmp_mem + smp_mem + decomposition_data);

    // The map entries are written as unsigned short, so the last one
    // (offset plus map size) has to fit into 16 bits.
    Q_ASSERT(blockMap.last() + blockMap.size() < (1<<(sizeof(unsigned short)*8)));

    QByteArray out;

    out += "static const unsigned short uc_decomposition_trie[] = {\n";
    // first write the map
    // Every entry is the block's offset plus blockMap.size(), because the
    // block data follows the map within the same array.
    out += " // 0 - 0x" + QByteArray::number(BMP_END, 16);
    for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
        // Eight entries per line; an extra blank line every 0x1000 code points.
        if (!(i % 8)) {
            if (out.endsWith(' '))
                out.chop(1);
            if (!((i*BMP_BLOCKSIZE) % 0x1000))
                out += "\n";
            out += "\n ";
        }
        out += QByteArray::number(blockMap.at(i) + blockMap.size());
        out += ", ";
    }
    if (out.endsWith(' '))
        out.chop(1);
    out += "\n\n // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";
    for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
        if (!(i % 8)) {
            if (out.endsWith(' '))
                out.chop(1);
            if (!(i % (0x10000/SMP_BLOCKSIZE)))
                out += "\n";
            out += "\n ";
        }
        out += QByteArray::number(blockMap.at(i) + blockMap.size());
        out += ", ";
    }
    if (out.endsWith(' '))
        out.chop(1);
    out += "\n";
    // write the data
    for (int i = 0; i < uniqueBlocks.size(); ++i) {
        if (out.endsWith(' '))
            out.chop(1);
        out += "\n";
        const UniqueBlock &b = uniqueBlocks.at(i);
        for (int j = 0; j < b.values.size(); ++j) {
            if (!(j % 8)) {
                if (out.endsWith(' '))
                    out.chop(1);
                out += "\n ";
            }
            out += "0x" + QByteArray::number(b.values.at(j), 16);
            out += ", ";
        }
    }
    // Remove the ", " after the very last value.
    if (out.endsWith(' '))
        out.chop(2);
    out += "\n};\n\n";

    // Lookup macro: the map entry selected by the upper bits of the code point
    // is the start of the block, the lower bits select the value in it.
    out += "#define GET_DECOMPOSITION_INDEX(ucs4) \\\n"
           " (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n"
           " ? (uc_decomposition_trie[uc_decomposition_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) +
           "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n"
           " : (ucs4 < 0x" + QByteArray::number(SMP_END, 16) + " \\\n"
           " ? uc_decomposition_trie[uc_decomposition_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) +
           ")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]"
           " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")] \\\n"
           " : 0xffff))\n\n";

    out += "static const unsigned short uc_decomposition_map[] = {";
    for (int i = 0; i < decompositions.size(); ++i) {
        if (!(i % 8)) {
            if (out.endsWith(' '))
                out.chop(1);
            out += "\n ";
        }
        out += "0x" + QByteArray::number(decompositions.at(i), 16);
        out += ", ";
    }
    if (out.endsWith(' '))
        out.chop(2);
    out += "\n};\n\n";

    return out;
}
2803
createLigatureInfo()2804 static QByteArray createLigatureInfo()
2805 {
2806 qDebug("createLigatureInfo: numLigatures=%d, highestLigature=0x%x", numLigatures, highestLigature);
2807
2808 for (int i = 0; i < ligatureHashes.size(); ++i) {
2809 const QList<Ligature> &l = ligatureHashes.value(i);
2810 for (int j = 0; j < l.size(); ++j) {
2811 // if the condition below doesn't hold anymore we need to modify our ligatureHelper code
2812 Q_ASSERT(QChar::requiresSurrogates(l.at(j).u2) == QChar::requiresSurrogates(l.at(j).ligature) &&
2813 QChar::requiresSurrogates(l.at(j).u2) == QChar::requiresSurrogates(l.at(j).u1));
2814 }
2815 }
2816
2817 const int BMP_BLOCKSIZE = 32;
2818 const int BMP_SHIFT = 5;
2819 const int BMP_END = 0x3100;
2820 const int SMP_END = 0x12000;
2821 const int SMP_BLOCKSIZE = 256;
2822 const int SMP_SHIFT = 8;
2823
2824 if (SMP_END <= highestLigature)
2825 qFatal("end of table smaller than highest ligature character 0x%x", highestLigature);
2826
2827 QList<unsigned short> ligatures;
2828 int tableIndex = 0;
2829
2830 QList<UniqueBlock> uniqueBlocks;
2831 QVector<int> blockMap;
2832 int used = 0;
2833
2834 for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
2835 UniqueBlock b;
2836 b.values.reserve(BMP_BLOCKSIZE);
2837 for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
2838 int uc = block*BMP_BLOCKSIZE + i;
2839 QList<Ligature> l = ligatureHashes.value(uc);
2840 if (!l.isEmpty()) {
2841 Q_ASSERT(!QChar::requiresSurrogates(uc));
2842 std::sort(l.begin(), l.end()); // needed for bsearch in ligatureHelper code
2843
2844 ligatures.append(l.size());
2845 for (int j = 0; j < l.size(); ++j) {
2846 ligatures.append(l.at(j).u1);
2847 ligatures.append(l.at(j).ligature);
2848 }
2849 b.values.append(tableIndex);
2850 tableIndex += 2*l.size() + 1;
2851 } else {
2852 b.values.append(0xffff);
2853 }
2854 }
2855 int index = uniqueBlocks.indexOf(b);
2856 if (index == -1) {
2857 index = uniqueBlocks.size();
2858 b.index = used;
2859 used += BMP_BLOCKSIZE;
2860 uniqueBlocks.append(b);
2861 }
2862 blockMap.append(uniqueBlocks.at(index).index);
2863 }
2864 int bmp_blocks = uniqueBlocks.size();
2865
2866 for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) {
2867 UniqueBlock b;
2868 b.values.reserve(SMP_BLOCKSIZE);
2869 for (int i = 0; i < SMP_BLOCKSIZE; ++i) {
2870 int uc = block*SMP_BLOCKSIZE + i;
2871 QList<Ligature> l = ligatureHashes.value(uc);
2872 if (!l.isEmpty()) {
2873 Q_ASSERT(QChar::requiresSurrogates(uc));
2874 std::sort(l.begin(), l.end()); // needed for bsearch in ligatureHelper code
2875
2876 ligatures.append(l.size());
2877 for (int j = 0; j < l.size(); ++j) {
2878 ligatures.append(QChar::highSurrogate(l.at(j).u1));
2879 ligatures.append(QChar::lowSurrogate(l.at(j).u1));
2880 ligatures.append(QChar::highSurrogate(l.at(j).ligature));
2881 ligatures.append(QChar::lowSurrogate(l.at(j).ligature));
2882 }
2883 b.values.append(tableIndex);
2884 tableIndex += 4*l.size() + 1;
2885 } else {
2886 b.values.append(0xffff);
2887 }
2888 }
2889 int index = uniqueBlocks.indexOf(b);
2890 if (index == -1) {
2891 index = uniqueBlocks.size();
2892 b.index = used;
2893 used += SMP_BLOCKSIZE;
2894 uniqueBlocks.append(b);
2895 }
2896 blockMap.append(uniqueBlocks.at(index).index);
2897 }
2898 int smp_blocks = uniqueBlocks.size() - bmp_blocks;
2899
2900 // if the condition below doesn't hold anymore we need to modify our composition code
2901 Q_ASSERT(tableIndex < 0xffff);
2902
2903 int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*sizeof(unsigned short);
2904 int bmp_trie = BMP_END/BMP_BLOCKSIZE*sizeof(unsigned short);
2905 int bmp_mem = bmp_block_data + bmp_trie;
2906 qDebug(" %d unique blocks in BMP.", bmp_blocks);
2907 qDebug(" block data uses: %d bytes", bmp_block_data);
2908 qDebug(" trie data uses : %d bytes", bmp_trie);
2909
2910 int smp_block_data = smp_blocks*SMP_BLOCKSIZE*sizeof(unsigned short);
2911 int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*sizeof(unsigned short);
2912 int smp_mem = smp_block_data + smp_trie;
2913 qDebug(" %d unique blocks in SMP.", smp_blocks);
2914 qDebug(" block data uses: %d bytes", smp_block_data);
2915 qDebug(" trie data uses : %d bytes", smp_trie);
2916
2917 int ligature_data = ligatures.size() * 2;
2918 qDebug("\n ligature data uses : %d bytes", ligature_data);
2919 qDebug(" memory usage: %d bytes", bmp_mem + smp_mem + ligature_data);
2920
2921 Q_ASSERT(blockMap.last() + blockMap.size() < (1<<(sizeof(unsigned short)*8)));
2922
2923 QByteArray out;
2924
2925 out += "static const unsigned short uc_ligature_trie[] = {\n";
2926 // first write the map
2927 out += " // 0 - 0x" + QByteArray::number(BMP_END, 16);
2928 for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
2929 if (!(i % 8)) {
2930 if (out.endsWith(' '))
2931 out.chop(1);
2932 if (!((i*BMP_BLOCKSIZE) % 0x1000))
2933 out += "\n";
2934 out += "\n ";
2935 }
2936 out += QByteArray::number(blockMap.at(i) + blockMap.size());
2937 out += ", ";
2938 }
2939 if (out.endsWith(' '))
2940 out.chop(1);
2941 out += "\n\n // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";
2942 for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
2943 if (!(i % 8)) {
2944 if (out.endsWith(' '))
2945 out.chop(1);
2946 if (!(i % (0x10000/SMP_BLOCKSIZE)))
2947 out += "\n";
2948 out += "\n ";
2949 }
2950 out += QByteArray::number(blockMap.at(i) + blockMap.size());
2951 out += ", ";
2952 }
2953 if (out.endsWith(' '))
2954 out.chop(1);
2955 out += "\n";
2956 // write the data
2957 for (int i = 0; i < uniqueBlocks.size(); ++i) {
2958 if (out.endsWith(' '))
2959 out.chop(1);
2960 out += "\n";
2961 const UniqueBlock &b = uniqueBlocks.at(i);
2962 for (int j = 0; j < b.values.size(); ++j) {
2963 if (!(j % 8)) {
2964 if (out.endsWith(' '))
2965 out.chop(1);
2966 out += "\n ";
2967 }
2968 out += "0x" + QByteArray::number(b.values.at(j), 16);
2969 out += ", ";
2970 }
2971 }
2972 if (out.endsWith(' '))
2973 out.chop(2);
2974 out += "\n};\n\n";
2975
2976 out += "#define GET_LIGATURE_INDEX(ucs4) \\\n"
2977 " (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n"
2978 " ? (uc_ligature_trie[uc_ligature_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) +
2979 "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n"
2980 " : (ucs4 < 0x" + QByteArray::number(SMP_END, 16) + " \\\n"
2981 " ? uc_ligature_trie[uc_ligature_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) +
2982 ")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]"
2983 " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")] \\\n"
2984 " : 0xffff))\n\n";
2985
2986 out += "static const unsigned short uc_ligature_map[] = {";
2987 for (int i = 0; i < ligatures.size(); ++i) {
2988 if (!(i % 8)) {
2989 if (out.endsWith(' '))
2990 out.chop(1);
2991 out += "\n ";
2992 }
2993 out += "0x" + QByteArray::number(ligatures.at(i), 16);
2994 out += ", ";
2995 }
2996 if (out.endsWith(' '))
2997 out.chop(2);
2998 out += "\n};\n\n";
2999
3000 return out;
3001 }
3002
createCasingInfo()3003 QByteArray createCasingInfo()
3004 {
3005 QByteArray out;
3006
3007 out += "struct CasingInfo {\n"
3008 " uint codePoint : 16;\n"
3009 " uint flags : 8;\n"
3010 " uint offset : 8;\n"
3011 "};\n\n";
3012
3013 return out;
3014 }
3015
3016
main(int,char **)3017 int main(int, char **)
3018 {
3019 initAgeMap();
3020 initCategoryMap();
3021 initDecompositionMap();
3022 initDirectionMap();
3023 initJoiningMap();
3024 initGraphemeBreak();
3025 initWordBreak();
3026 initSentenceBreak();
3027 initLineBreak();
3028 initScriptMap();
3029
3030 readUnicodeData();
3031 readBidiMirroring();
3032 readArabicShaping();
3033 readDerivedAge();
3034 readDerivedNormalizationProps();
3035 readSpecialCasing();
3036 readCaseFolding();
3037 // readBlocks();
3038 readScripts();
3039 readGraphemeBreak();
3040 readWordBreak();
3041 readSentenceBreak();
3042 readLineBreak();
3043
3044 computeUniqueProperties();
3045 QByteArray properties = createPropertyInfo();
3046 QByteArray specialCases = createSpecialCaseMap();
3047 QByteArray compositions = createCompositionInfo();
3048 QByteArray ligatures = createLigatureInfo();
3049 QByteArray normalizationCorrections = createNormalizationCorrections();
3050
3051 QByteArray header =
3052 "/****************************************************************************\n"
3053 "**\n"
3054 "** Copyright (C) 2020 The Qt Company Ltd.\n"
3055 "** Contact: https://www.qt.io/licensing/\n"
3056 "**\n"
3057 "** This file is part of the QtCore module of the Qt Toolkit.\n"
3058 "**\n"
3059 "** $QT_BEGIN_LICENSE:LGPL$\n"
3060 "** Commercial License Usage\n"
3061 "** Licensees holding valid commercial Qt licenses may use this file in\n"
3062 "** accordance with the commercial license agreement provided with the\n"
3063 "** Software or, alternatively, in accordance with the terms contained in\n"
3064 "** a written agreement between you and The Qt Company. For licensing terms\n"
3065 "** and conditions see https://www.qt.io/terms-conditions. For further\n"
3066 "** information use the contact form at https://www.qt.io/contact-us.\n"
3067 "**\n"
3068 "** GNU Lesser General Public License Usage\n"
3069 "** Alternatively, this file may be used under the terms of the GNU Lesser\n"
3070 "** General Public License version 3 as published by the Free Software\n"
3071 "** Foundation and appearing in the file LICENSE.LGPL3 included in the\n"
3072 "** packaging of this file. Please review the following information to\n"
3073 "** ensure the GNU Lesser General Public License version 3 requirements\n"
3074 "** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.\n"
3075 "**\n"
3076 "** GNU General Public License Usage\n"
3077 "** Alternatively, this file may be used under the terms of the GNU\n"
3078 "** General Public License version 2.0 or (at your option) the GNU General\n"
3079 "** Public license version 3 or any later version approved by the KDE Free\n"
3080 "** Qt Foundation. The licenses are as published by the Free Software\n"
3081 "** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3\n"
3082 "** included in the packaging of this file. Please review the following\n"
3083 "** information to ensure the GNU General Public License requirements will\n"
3084 "** be met: https://www.gnu.org/licenses/gpl-2.0.html and\n"
3085 "** https://www.gnu.org/licenses/gpl-3.0.html.\n"
3086 "**\n"
3087 "** $QT_END_LICENSE$\n"
3088 "**\n"
3089 "****************************************************************************/\n\n";
3090
3091 QByteArray note =
3092 "/* This file is autogenerated from the Unicode " DATA_VERSION_S " database. Do not edit */\n\n";
3093
3094 QByteArray warning =
3095 "//\n"
3096 "// W A R N I N G\n"
3097 "// -------------\n"
3098 "//\n"
3099 "// This file is not part of the Qt API. It exists for the convenience\n"
3100 "// of internal files. This header file may change from version to version\n"
3101 "// without notice, or even be removed.\n"
3102 "//\n"
3103 "// We mean it.\n"
3104 "//\n\n";
3105
3106 QFile f("../../src/corelib/text/qunicodetables.cpp");
3107 f.open(QFile::WriteOnly|QFile::Truncate);
3108 f.write(header);
3109 f.write(note);
3110 f.write("#include \"qunicodetables_p.h\"\n\n");
3111 f.write("QT_BEGIN_NAMESPACE\n\n");
3112 f.write("namespace QUnicodeTables {\n\n");
3113 f.write(properties);
3114 f.write("\n");
3115 f.write(specialCases);
3116 f.write("\n");
3117 f.write(compositions);
3118 f.write(ligatures);
3119 f.write("\n");
3120 f.write(normalizationCorrections);
3121 f.write("} // namespace QUnicodeTables\n\n");
3122 f.write("using namespace QUnicodeTables;\n\n");
3123 f.write("QT_END_NAMESPACE\n");
3124 f.close();
3125
3126 f.setFileName("../../src/corelib/text/qunicodetables_p.h");
3127 f.open(QFile::WriteOnly | QFile::Truncate);
3128 f.write(header);
3129 f.write(note);
3130 f.write(warning);
3131 f.write("#ifndef QUNICODETABLES_P_H\n"
3132 "#define QUNICODETABLES_P_H\n\n"
3133 "#include <QtCore/private/qglobal_p.h>\n\n"
3134 "#include <QtCore/qchar.h>\n\n"
3135 "QT_BEGIN_NAMESPACE\n\n");
3136 f.write("#define UNICODE_DATA_VERSION " DATA_VERSION_STR "\n\n");
3137 f.write("namespace QUnicodeTables {\n\n");
3138 f.write(property_string);
3139 f.write(sizeOfPropertiesStructCheck);
3140 f.write(grapheme_break_class_string);
3141 f.write(word_break_class_string);
3142 f.write(sentence_break_class_string);
3143 f.write(line_break_class_string);
3144 f.write(methods);
3145 f.write("} // namespace QUnicodeTables\n\n"
3146 "QT_END_NAMESPACE\n\n"
3147 "#endif // QUNICODETABLES_P_H\n");
3148 f.close();
3149
3150 qDebug() << "maxMirroredDiff = " << hex << maxMirroredDiff;
3151 qDebug() << "maxLowerCaseDiff = " << hex << maxLowerCaseDiff;
3152 qDebug() << "maxUpperCaseDiff = " << hex << maxUpperCaseDiff;
3153 qDebug() << "maxTitleCaseDiff = " << hex << maxTitleCaseDiff;
3154 qDebug() << "maxCaseFoldDiff = " << hex << maxCaseFoldDiff;
3155 #if 0
3156 // dump(0, 0x7f);
3157 // dump(0x620, 0x640);
3158 // dump(0x10000, 0x10020);
3159 // dump(0x10800, 0x10820);
3160
3161 qDebug("decompositionLength used:");
3162 int totalcompositions = 0;
3163 int sum = 0;
3164 for (int i = 1; i < 20; ++i) {
3165 qDebug(" length %d used %d times", i, decompositionLength.value(i, 0));
3166 totalcompositions += i*decompositionLength.value(i, 0);
3167 sum += decompositionLength.value(i, 0);
3168 }
3169 qDebug(" len decomposition map %d, average length %f, num composed chars %d",
3170 totalcompositions, (float)totalcompositions/(float)sum, sum);
3171 qDebug("highest composed character %x", highestComposedCharacter);
3172 qDebug("num ligatures = %d highest=%x, maxLength=%d", numLigatures, highestLigature, longestLigature);
3173
3174 qBubbleSort(ligatures);
3175 for (int i = 0; i < ligatures.size(); ++i)
3176 qDebug("%s", ligatures.at(i).data());
3177
3178 // qDebug("combiningClass usage:");
3179 // int numClasses = 0;
3180 // for (int i = 0; i < 255; ++i) {
3181 // int num = combiningClassUsage.value(i, 0);
3182 // if (num) {
3183 // ++numClasses;
3184 // qDebug(" combiningClass %d used %d times", i, num);
3185 // }
3186 // }
3187 // qDebug("total of %d combining classes used", numClasses);
3188
3189 #endif
3190 }
3191