1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  *   Licensed to the Apache Software Foundation (ASF) under one or more
12  *   contributor license agreements. See the NOTICE file distributed
13  *   with this work for additional information regarding copyright
14  *   ownership. The ASF licenses this file to you under the Apache
15  *   License, Version 2.0 (the "License"); you may not use this file
16  *   except in compliance with the License. You may obtain a copy of
17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include <com/sun/star/i18n/UnicodeType.hpp>
21 #include <com/sun/star/i18n/ScriptType.hpp>
22 #include <i18nlangtag/languagetag.hxx>
23 #include <i18nlangtag/languagetagicu.hxx>
24 #include <i18nutil/unicode.hxx>
25 #include <sal/log.hxx>
26 #include <unicode/numfmt.h>
27 #include "unicode_data.h"
28 #include <rtl/character.hxx>
29 #include <memory>
30 
31 // Workaround for glibc braindamage:
32 // glibc 2.4's langinfo.h does "#define CURRENCY_SYMBOL __CURRENCY_SYMBOL"
33 // which (obviously) breaks UnicodeType::CURRENCY_SYMBOL
34 #undef CURRENCY_SYMBOL
35 
36 using namespace ::com::sun::star::i18n;
37 
38 template<class L, typename T>
getScriptType(const sal_Unicode ch,const L * typeList,T unknownType)39 static T getScriptType( const sal_Unicode ch, const L* typeList, T unknownType ) {
40 
41     sal_Int16 i = 0;
42     css::i18n::UnicodeScript type = typeList[0].to;
43     while (type < UnicodeScript_kScriptCount && ch > UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeTo]) {
44         type = typeList[++i].to;
45     }
46 
47     return (type < UnicodeScript_kScriptCount &&
48             ch >= UnicodeScriptType[static_cast<int>(typeList[i].from)][int(UnicodeScriptTypeFrom)]) ?
49             typeList[i].value : unknownType;
50 }
51 
52 sal_Int16
getUnicodeScriptType(const sal_Unicode ch,const ScriptTypeList * typeList,sal_Int16 unknownType)53 unicode::getUnicodeScriptType( const sal_Unicode ch, const ScriptTypeList* typeList, sal_Int16 unknownType ) {
54     return getScriptType(ch, typeList, unknownType);
55 }
56 
57 sal_Unicode
getUnicodeScriptStart(UnicodeScript type)58 unicode::getUnicodeScriptStart( UnicodeScript type) {
59     return UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeFrom];
60 }
61 
62 sal_Unicode
getUnicodeScriptEnd(UnicodeScript type)63 unicode::getUnicodeScriptEnd( UnicodeScript type) {
64     return UnicodeScriptType[static_cast<int>(type)][UnicodeScriptTypeTo];
65 }
66 
67 sal_Int16
getUnicodeType(const sal_Unicode ch)68 unicode::getUnicodeType( const sal_Unicode ch ) {
69     static sal_Unicode c = 0x00;
70     static sal_Int16 r = 0x00;
71 
72     if (ch == c) return r;
73     else c = ch;
74 
75     sal_Int16 address = UnicodeTypeIndex[ch >> 8];
76     r = static_cast<sal_Int16>(
77             (address < UnicodeTypeNumberBlock)
78             ? UnicodeTypeBlockValue[address]
79             : UnicodeTypeValue[((address - UnicodeTypeNumberBlock) << 8) + (ch & 0xff)]);
80     return r;
81 }
82 
83 sal_uInt8
getUnicodeDirection(const sal_Unicode ch)84 unicode::getUnicodeDirection( const sal_Unicode ch ) {
85     static sal_Unicode c = 0x00;
86     static sal_uInt8 r = 0x00;
87 
88     if (ch == c) return r;
89     else c = ch;
90 
91     sal_Int16 address = UnicodeDirectionIndex[ch >> 8];
92     r = (address < UnicodeDirectionNumberBlock)
93             ? UnicodeDirectionBlockValue[address]
94             : UnicodeDirectionValue[((address - UnicodeDirectionNumberBlock) << 8) + (ch & 0xff)];
95     return r;
96 }
97 
98 #define bit(name)   (1U << name)
99 
100 #define UPPERMASK   bit(UnicodeType::UPPERCASE_LETTER)
101 
102 #define LOWERMASK   bit(UnicodeType::LOWERCASE_LETTER)
103 
104 #define TITLEMASK   bit(UnicodeType::TITLECASE_LETTER)
105 
106 #define ALPHAMASK   UPPERMASK|LOWERMASK|TITLEMASK|\
107             bit(UnicodeType::MODIFIER_LETTER)|\
108             bit(UnicodeType::OTHER_LETTER)
109 
110 #define SPACEMASK   bit(UnicodeType::SPACE_SEPARATOR)|\
111             bit(UnicodeType::LINE_SEPARATOR)|\
112             bit(UnicodeType::PARAGRAPH_SEPARATOR)
113 
114 #define CONTROLMASK bit(UnicodeType::CONTROL)|\
115             bit(UnicodeType::FORMAT)|\
116             bit(UnicodeType::LINE_SEPARATOR)|\
117             bit(UnicodeType::PARAGRAPH_SEPARATOR)
118 
119 #define IsType(func, mask)  \
120 bool func( const sal_Unicode ch) {\
121     return (bit(getUnicodeType(ch)) & (mask)) != 0;\
122 }
123 
IsType(unicode::isControl,CONTROLMASK)124 IsType(unicode::isControl, CONTROLMASK)
125 IsType(unicode::isAlpha, ALPHAMASK)
126 IsType(unicode::isSpace, SPACEMASK)
127 
128 #define CONTROLSPACE    bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\
129             bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f)
130 
131 bool unicode::isWhiteSpace( const sal_Unicode ch) {
132     return (ch != 0xa0 && isSpace(ch)) || (ch <= 0x1F && (bit(ch) & (CONTROLSPACE)));
133 }
134 
getScriptClassFromUScriptCode(UScriptCode eScript)135 sal_Int16 unicode::getScriptClassFromUScriptCode(UScriptCode eScript)
136 {
137     //See unicode/uscript.h
138     static const sal_Int16 scriptTypes[] =
139     {
140         ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX,
141         ScriptType::ASIAN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX,
142         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN,
143     // 15
144         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN, ScriptType::COMPLEX,
145         ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX,
146         ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN,
147     // 30
148         ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX,
149         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
150         ScriptType::LATIN, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
151     // 45
152         ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX,
153         ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN,
154         ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
155     // 60
156         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
157         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX,
158         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN,
159     // 75
160         ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
161         ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
162         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
163     // 90
164         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
165         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
166         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX,
167     // 105
168         ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
169         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
170         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN,
171     // 120
172         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
173         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK,
174         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
175     // 135
176         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
177         ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
178         ScriptType::COMPLEX,
179         ScriptType::WEAK
180     };
181 
182     sal_Int16 nRet;
183     if (eScript < USCRIPT_COMMON)
184         nRet = ScriptType::WEAK;
185     else if (static_cast<size_t>(eScript) >= SAL_N_ELEMENTS(scriptTypes))
186         nRet = ScriptType::COMPLEX;         // anything new is going to be pretty wild
187     else
188         nRet = scriptTypes[eScript];
189     return nRet;
190 }
191 
getExemplarLanguageForUScriptCode(UScriptCode eScript)192 OString unicode::getExemplarLanguageForUScriptCode(UScriptCode eScript)
193 {
194     OString sRet;
195     switch (eScript)
196     {
197         case USCRIPT_CODE_LIMIT:
198         case USCRIPT_INVALID_CODE:
199             sRet = "zxx";
200             break;
201         case USCRIPT_COMMON:
202         case USCRIPT_INHERITED:
203             sRet = "und";
204             break;
205         case USCRIPT_MATHEMATICAL_NOTATION:
206         case USCRIPT_SYMBOLS:
207             sRet = "zxx";
208             break;
209         case USCRIPT_UNWRITTEN_LANGUAGES:
210         case USCRIPT_UNKNOWN:
211             sRet = "und";
212             break;
213         case USCRIPT_ARABIC:
214             sRet = "ar";
215             break;
216         case USCRIPT_ARMENIAN:
217             sRet = "hy";
218             break;
219         case USCRIPT_BENGALI:
220             sRet = "bn";
221             break;
222         case USCRIPT_BOPOMOFO:
223             sRet = "zh";
224             break;
225         case USCRIPT_CHEROKEE:
226             sRet = "chr";
227             break;
228         case USCRIPT_COPTIC:
229             sRet = "cop";
230             break;
231         case USCRIPT_CYRILLIC:
232             sRet = "ru";
233             break;
234         case USCRIPT_DESERET:
235             sRet = "en";
236             break;
237         case USCRIPT_DEVANAGARI:
238             sRet = "hi";
239             break;
240         case USCRIPT_ETHIOPIC:
241             sRet = "am";
242             break;
243         case USCRIPT_GEORGIAN:
244             sRet = "ka";
245             break;
246         case USCRIPT_GOTHIC:
247             sRet = "got";
248             break;
249         case USCRIPT_GREEK:
250             sRet = "el";
251             break;
252         case USCRIPT_GUJARATI:
253             sRet = "gu";
254             break;
255         case USCRIPT_GURMUKHI:
256             sRet = "pa";
257             break;
258         case USCRIPT_HAN:
259             sRet = "zh";
260             break;
261         case USCRIPT_HANGUL:
262             sRet = "ko";
263             break;
264         case USCRIPT_HEBREW:
265             sRet = "hr";
266             break;
267         case USCRIPT_HIRAGANA:
268             sRet = "ja";
269             break;
270         case USCRIPT_KANNADA:
271             sRet = "kn";
272             break;
273         case USCRIPT_KATAKANA:
274             sRet = "ja";
275             break;
276         case USCRIPT_KHMER:
277             sRet = "km";
278             break;
279         case USCRIPT_LAO:
280             sRet = "lo";
281             break;
282         case USCRIPT_LATIN:
283             sRet = "en";
284             break;
285         case USCRIPT_MALAYALAM:
286             sRet = "ml";
287             break;
288         case USCRIPT_MONGOLIAN:
289             sRet = "mn";
290             break;
291         case USCRIPT_MYANMAR:
292             sRet = "my";
293             break;
294         case USCRIPT_OGHAM:
295             sRet = "pgl";
296             break;
297         case USCRIPT_OLD_ITALIC:
298             sRet = "osc";
299             break;
300         case USCRIPT_ORIYA:
301             sRet = "or";
302             break;
303         case USCRIPT_RUNIC:
304             sRet = "ang";
305             break;
306         case USCRIPT_SINHALA:
307             sRet = "si";
308             break;
309         case USCRIPT_SYRIAC:
310             sRet = "syr";
311             break;
312         case USCRIPT_TAMIL:
313             sRet = "ta";
314             break;
315         case USCRIPT_TELUGU:
316             sRet = "te";
317             break;
318         case USCRIPT_THAANA:
319             sRet = "dv";
320             break;
321         case USCRIPT_THAI:
322             sRet = "th";
323             break;
324         case USCRIPT_TIBETAN:
325             sRet = "bo";
326             break;
327         case USCRIPT_CANADIAN_ABORIGINAL:
328             sRet = "iu";
329             break;
330         case USCRIPT_YI:
331             sRet = "ii";
332             break;
333         case USCRIPT_TAGALOG:
334             sRet = "tl";
335             break;
336         case USCRIPT_HANUNOO:
337             sRet = "hnn";
338             break;
339         case USCRIPT_BUHID:
340             sRet = "bku";
341             break;
342         case USCRIPT_TAGBANWA:
343             sRet = "tbw";
344             break;
345         case USCRIPT_BRAILLE:
346             sRet = "en";
347             break;
348         case USCRIPT_CYPRIOT:
349             sRet = "ecy";
350             break;
351         case USCRIPT_LIMBU:
352             sRet = "lif";
353             break;
354         case USCRIPT_LINEAR_B:
355             sRet = "gmy";
356             break;
357         case USCRIPT_OSMANYA:
358             sRet = "so";
359             break;
360         case USCRIPT_SHAVIAN:
361             sRet = "en";
362             break;
363         case USCRIPT_TAI_LE:
364             sRet = "tdd";
365             break;
366         case USCRIPT_UGARITIC:
367             sRet = "uga";
368             break;
369         case USCRIPT_KATAKANA_OR_HIRAGANA:
370             sRet = "ja";
371             break;
372         case USCRIPT_BUGINESE:
373             sRet = "bug";
374             break;
375         case USCRIPT_GLAGOLITIC:
376             sRet = "ch";
377             break;
378         case USCRIPT_KHAROSHTHI:
379             sRet = "pra";
380             break;
381         case USCRIPT_SYLOTI_NAGRI:
382             sRet = "syl";
383             break;
384         case USCRIPT_NEW_TAI_LUE:
385             sRet = "khb";
386             break;
387         case USCRIPT_TIFINAGH:
388             sRet = "tmh";
389             break;
390         case USCRIPT_OLD_PERSIAN:
391             sRet = "peo";
392             break;
393         case USCRIPT_BALINESE:
394             sRet = "ban";
395             break;
396         case USCRIPT_BATAK:
397             sRet = "btk";
398             break;
399         case USCRIPT_BLISSYMBOLS:
400             sRet = "en";
401             break;
402         case USCRIPT_BRAHMI:
403             sRet = "pra";
404             break;
405         case USCRIPT_CHAM:
406             sRet = "cja";
407             break;
408         case USCRIPT_CIRTH:
409             sRet = "sjn";
410             break;
411         case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC:
412             sRet = "cu";
413             break;
414         case USCRIPT_DEMOTIC_EGYPTIAN:
415         case USCRIPT_HIERATIC_EGYPTIAN:
416         case USCRIPT_EGYPTIAN_HIEROGLYPHS:
417             sRet = "egy";
418             break;
419         case USCRIPT_KHUTSURI:
420             sRet = "ka";
421             break;
422         case USCRIPT_SIMPLIFIED_HAN:
423             sRet = "zh";
424             break;
425         case USCRIPT_TRADITIONAL_HAN:
426             sRet = "zh";
427             break;
428         case USCRIPT_PAHAWH_HMONG:
429             sRet = "blu";
430             break;
431         case USCRIPT_OLD_HUNGARIAN:
432             sRet = "ohu";
433             break;
434         case USCRIPT_HARAPPAN_INDUS:
435             sRet = "xiv";
436             break;
437         case USCRIPT_JAVANESE:
438             sRet = "kaw";
439             break;
440         case USCRIPT_KAYAH_LI:
441             sRet = "eky";
442             break;
443         case USCRIPT_LATIN_FRAKTUR:
444             sRet = "de";
445             break;
446         case USCRIPT_LATIN_GAELIC:
447             sRet = "ga";
448             break;
449         case USCRIPT_LEPCHA:
450             sRet = "lep";
451             break;
452         case USCRIPT_LINEAR_A:
453             sRet = "ecr";
454             break;
455         case USCRIPT_MAYAN_HIEROGLYPHS:
456             sRet = "myn";
457             break;
458         case USCRIPT_MEROITIC:
459             sRet = "xmr";
460             break;
461         case USCRIPT_NKO:
462             sRet = "nqo";
463             break;
464         case USCRIPT_ORKHON:
465             sRet = "otk";
466             break;
467         case USCRIPT_OLD_PERMIC:
468             sRet = "kv";
469             break;
470         case USCRIPT_PHAGS_PA:
471             sRet = "xng";
472             break;
473         case USCRIPT_PHOENICIAN:
474             sRet = "phn";
475             break;
476         case USCRIPT_PHONETIC_POLLARD:
477             sRet = "hmd";
478             break;
479         case USCRIPT_RONGORONGO:
480             sRet = "rap";
481             break;
482         case USCRIPT_SARATI:
483             sRet = "qya";
484             break;
485         case USCRIPT_ESTRANGELO_SYRIAC:
486             sRet = "syr";
487             break;
488         case USCRIPT_WESTERN_SYRIAC:
489             sRet = "tru";
490             break;
491         case USCRIPT_EASTERN_SYRIAC:
492             sRet = "aii";
493             break;
494         case USCRIPT_TENGWAR:
495             sRet = "sjn";
496             break;
497         case USCRIPT_VAI:
498             sRet = "vai";
499             break;
500         case USCRIPT_VISIBLE_SPEECH:
501             sRet = "en";
502             break;
503         case USCRIPT_CUNEIFORM:
504             sRet = "akk";
505             break;
506         case USCRIPT_CARIAN:
507             sRet = "xcr";
508             break;
509         case USCRIPT_JAPANESE:
510             sRet = "ja";
511             break;
512         case USCRIPT_LANNA:
513             sRet = "nod";
514             break;
515         case USCRIPT_LYCIAN:
516             sRet = "xlc";
517             break;
518         case USCRIPT_LYDIAN:
519             sRet = "xld";
520             break;
521         case USCRIPT_OL_CHIKI:
522             sRet = "sat";
523             break;
524         case USCRIPT_REJANG:
525             sRet = "rej";
526             break;
527         case USCRIPT_SAURASHTRA:
528             sRet = "saz";
529             break;
530         case USCRIPT_SIGN_WRITING:
531             sRet = "en";
532             break;
533         case USCRIPT_SUNDANESE:
534             sRet = "su";
535             break;
536         case USCRIPT_MOON:
537             sRet = "en";
538             break;
539         case USCRIPT_MEITEI_MAYEK:
540             sRet = "mni";
541             break;
542         case USCRIPT_IMPERIAL_ARAMAIC:
543             sRet = "arc";
544             break;
545         case USCRIPT_AVESTAN:
546             sRet = "ae";
547             break;
548         case USCRIPT_CHAKMA:
549             sRet = "ccp";
550             break;
551         case USCRIPT_KOREAN:
552             sRet = "ko";
553             break;
554         case USCRIPT_KAITHI:
555             sRet = "awa";
556             break;
557         case USCRIPT_MANICHAEAN:
558             sRet = "xmn";
559             break;
560         case USCRIPT_INSCRIPTIONAL_PAHLAVI:
561         case USCRIPT_PSALTER_PAHLAVI:
562         case USCRIPT_BOOK_PAHLAVI:
563         case USCRIPT_INSCRIPTIONAL_PARTHIAN:
564             sRet = "xpr";
565             break;
566         case USCRIPT_SAMARITAN:
567             sRet = "heb";
568             break;
569         case USCRIPT_TAI_VIET:
570             sRet = "blt";
571             break;
572         case USCRIPT_MANDAEAN: /* Aliased to USCRIPT_MANDAIC in icu 4.6. */
573             sRet = "mic";
574             break;
575 #if (U_ICU_VERSION_MAJOR_NUM > 4) || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 4)
576         case USCRIPT_NABATAEAN: //no language with an assigned code yet
577             sRet = "mis";
578             break;
579         case USCRIPT_PALMYRENE: //no language with an assigned code yet
580             sRet = "mis";
581             break;
582         case USCRIPT_BAMUM:
583             sRet = "bax";
584             break;
585         case USCRIPT_LISU:
586             sRet = "lis";
587             break;
588         case USCRIPT_NAKHI_GEBA:
589             sRet = "nxq";
590             break;
591         case USCRIPT_OLD_SOUTH_ARABIAN:
592             sRet = "xsa";
593             break;
594         case USCRIPT_BASSA_VAH:
595             sRet = "bsq";
596             break;
597         case USCRIPT_DUPLOYAN_SHORTAND:
598             sRet = "fr";
599             break;
600         case USCRIPT_ELBASAN:
601             sRet = "sq";
602             break;
603         case USCRIPT_GRANTHA:
604             sRet = "ta";
605             break;
606         case USCRIPT_KPELLE:
607             sRet = "kpe";
608             break;
609         case USCRIPT_LOMA:
610             sRet = "lom";
611             break;
612         case USCRIPT_MENDE:
613             sRet = "men";
614             break;
615         case USCRIPT_MEROITIC_CURSIVE:
616             sRet = "xmr";
617             break;
618         case USCRIPT_OLD_NORTH_ARABIAN:
619             sRet = "xna";
620             break;
621         case USCRIPT_SINDHI:
622             sRet = "sd";
623             break;
624         case USCRIPT_WARANG_CITI:
625             sRet = "hoc";
626             break;
627 #endif
628 #if (U_ICU_VERSION_MAJOR_NUM > 4) || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 8)
629         case USCRIPT_AFAKA:
630             sRet = "djk";
631             break;
632         case USCRIPT_JURCHEN:
633             sRet = "juc";
634             break;
635         case USCRIPT_MRO:
636             sRet = "cmr";
637             break;
638         case USCRIPT_NUSHU: //no language with an assigned code yet
639             sRet = "mis";
640             break;
641         case USCRIPT_SHARADA:
642             sRet = "sa";
643             break;
644         case USCRIPT_SORA_SOMPENG:
645             sRet = "srb";
646             break;
647         case USCRIPT_TAKRI:
648             sRet = "doi";
649             break;
650         case USCRIPT_TANGUT:
651             sRet = "txg";
652             break;
653         case USCRIPT_WOLEAI:
654             sRet = "woe";
655             break;
656 #endif
657 #if (U_ICU_VERSION_MAJOR_NUM >= 49)
658         case USCRIPT_ANATOLIAN_HIEROGLYPHS:
659             sRet = "hlu";
660             break;
661         case USCRIPT_KHOJKI:
662             sRet = "gu";
663             break;
664         case USCRIPT_TIRHUTA:
665             sRet = "mai";
666             break;
667 #endif
668 #if (U_ICU_VERSION_MAJOR_NUM >= 52)
669         case USCRIPT_CAUCASIAN_ALBANIAN:
670             sRet = "xag";
671             break;
672         case USCRIPT_MAHAJANI:
673             sRet = "mwr";
674             break;
675 #endif
676 #if (U_ICU_VERSION_MAJOR_NUM >= 54)
677         case USCRIPT_AHOM:
678             sRet = "aho";
679             break;
680         case USCRIPT_HATRAN:
681             sRet = "qly-Hatr";
682             break;
683         case USCRIPT_MODI:
684             sRet = "mr-Modi";
685             break;
686         case USCRIPT_MULTANI:
687             sRet = "skr-Mutl";
688             break;
689         case USCRIPT_PAU_CIN_HAU:
690             sRet = "ctd-Pauc";
691             break;
692         case USCRIPT_SIDDHAM:
693             sRet = "sa-Sidd";
694             break;
695 #endif
696 #if (U_ICU_VERSION_MAJOR_NUM >= 58)
697         case USCRIPT_ADLAM:
698             sRet = "mis";   // Adlm - Adlam for Fulani, no language code
699             break;
700         case USCRIPT_BHAIKSUKI:
701             sRet = "mis";   // Bhks - Bhaiksuki for some Buddhist texts, no language code
702             break;
703         case USCRIPT_MARCHEN:
704             sRet = "bo-Marc";
705             break;
706         case USCRIPT_NEWA:
707             sRet = "new-Newa";
708             break;
709         case USCRIPT_OSAGE:
710             sRet = "osa-Osge";
711             break;
712         case USCRIPT_HAN_WITH_BOPOMOFO:
713             sRet = "mis";   // Hanb - Han with Bopomofo, zh-Hanb ?
714             break;
715         case USCRIPT_JAMO:
716             sRet = "ko";   // Jamo - elements of Hangul Syllables
717             break;
718         case USCRIPT_SYMBOLS_EMOJI:
719             sRet = "mis";   // Zsye - Emoji variant
720             break;
721 #endif
722 #if (U_ICU_VERSION_MAJOR_NUM >= 60)
723         case USCRIPT_MASARAM_GONDI:
724             sRet = "gon-Gonm";  // macro language code, could be wsg,esg,gno
725             break;
726         case USCRIPT_SOYOMBO:
727             sRet = "mn-Soyo";   // abugida to write Mongolian, also Tibetan and Sanskrit
728             break;
729         case USCRIPT_ZANABAZAR_SQUARE:
730             sRet = "mn-Zanb";   // abugida to write Mongolian
731             break;
732 #endif
733 #if (U_ICU_VERSION_MAJOR_NUM >= 62)
734         case USCRIPT_DOGRA:
735             sRet = "dgo";       // Dogri proper
736             break;
737         case USCRIPT_GUNJALA_GONDI:
738             sRet = "wsg";       // Adilabad Gondi
739             break;
740         case USCRIPT_MAKASAR:
741             sRet = "mak";
742             break;
743         case USCRIPT_MEDEFAIDRIN:
744             sRet = "mis-Medf";  // Uncoded with script
745             break;
746         case USCRIPT_HANIFI_ROHINGYA:
747             sRet = "rhg";
748             break;
749         case USCRIPT_SOGDIAN:
750             sRet = "sog";
751             break;
752         case USCRIPT_OLD_SOGDIAN:
753             sRet = "sog";
754             break;
755 #endif
756 #if (U_ICU_VERSION_MAJOR_NUM >= 64)
757         case USCRIPT_ELYMAIC:
758             sRet = "arc-Elym";
759             break;
760         case USCRIPT_NYIAKENG_PUACHUE_HMONG:
761             sRet = "hmn-Hmnp";  // macrolanguage code
762             break;
763         case USCRIPT_NANDINAGARI:
764             sRet = "sa-Nand";
765             break;
766         case USCRIPT_WANCHO:
767             sRet = "nnp-Wcho";
768             break;
769 #endif
770 #if (U_ICU_VERSION_MAJOR_NUM >= 66)
771         case USCRIPT_CHORASMIAN:
772             sRet = "xco-Chrs";
773             break;
774         case USCRIPT_DIVES_AKURU:
775             sRet = "dv-Diak";
776             break;
777         case USCRIPT_KHITAN_SMALL_SCRIPT:
778             sRet = "zkt-Kits";
779             break;
780         case USCRIPT_YEZIDI:
781             sRet = "kmr-Yezi";
782             break;
783 #endif
784     }
785     return sRet;
786 }
787 
788 //Format a number as a percentage according to the rules of the given
789 //language, e.g. 100 -> "100%" for en-US vs "100 %" for de-DE
formatPercent(double dNumber,const LanguageTag & rLangTag)790 OUString unicode::formatPercent(double dNumber,
791     const LanguageTag &rLangTag)
792 {
793     // get a currency formatter for this locale ID
794     UErrorCode errorCode=U_ZERO_ERROR;
795 
796     LanguageTag aLangTag(rLangTag);
797 
798     // As of CLDR Version 24 these languages were not listed as using spacing
799     // between number and % but are reported as such by our l10n groups
800     // http://www.unicode.org/cldr/charts/24/by_type/numbers.number_formatting_patterns.html
801     // so format using French which has the desired rules
802     if (aLangTag.getLanguage() == "es" || aLangTag.getLanguage() == "sl")
803         aLangTag.reset("fr-FR");
804 
805     icu::Locale aLocale = LanguageTagIcu::getIcuLocale(aLangTag);
806 
807     std::unique_ptr<icu::NumberFormat> xF(
808         icu::NumberFormat::createPercentInstance(aLocale, errorCode));
809     if(U_FAILURE(errorCode))
810     {
811         SAL_WARN("i18n", "icu::NumberFormat::createPercentInstance failed");
812         return OUString::number(dNumber) + "%";
813     }
814 
815     icu::UnicodeString output;
816     xF->format(dNumber/100, output);
817     OUString aRet(reinterpret_cast<const sal_Unicode *>(output.getBuffer()),
818         output.length());
819     if (rLangTag.getLanguage() == "de")
820     {
821         //narrow no-break space instead of (normal) no-break space
822         return aRet.replace(0x00A0, 0x202F);
823     }
824     return aRet;
825 }
826 
AllowMoreInput(sal_Unicode uChar)827 bool ToggleUnicodeCodepoint::AllowMoreInput(sal_Unicode uChar)
828 {
829     //arbitrarily chosen maximum length allowed - normal max usage would be around 30.
830     if( maInput.getLength() > 255 )
831         mbAllowMoreChars = false;
832 
833     if( !mbAllowMoreChars )
834         return false;
835 
836     bool bPreventNonHex = false;
837     if( maInput.indexOf("U+") != -1 )
838         bPreventNonHex = true;
839 
840     switch ( unicode::getUnicodeType(uChar) )
841     {
842         case css::i18n::UnicodeType::SURROGATE:
843             if( bPreventNonHex )
844             {
845                 mbAllowMoreChars = false;
846                 return false;
847             }
848 
849             if( rtl::isLowSurrogate(uChar) && maUtf16.isEmpty() && maInput.isEmpty()  )
850             {
851                 maUtf16.append(uChar);
852                 return true;
853             }
854             if( rtl::isHighSurrogate(uChar) && maInput.isEmpty() )
855                 maUtf16.insert(0, uChar );
856             //end of hex strings, or unexpected order of high/low, so don't accept more
857             if( !maUtf16.isEmpty() )
858                 maInput.append(maUtf16);
859             if( !maCombining.isEmpty() )
860                 maInput.append(maCombining);
861             mbAllowMoreChars = false;
862             break;
863 
864         case css::i18n::UnicodeType::NON_SPACING_MARK:
865         case css::i18n::UnicodeType::COMBINING_SPACING_MARK:
866             if( bPreventNonHex )
867             {
868                 mbAllowMoreChars = false;
869                 return false;
870             }
871 
872             //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra combining mark.
873             if( !maUtf16.isEmpty() )
874             {
875                 maInput = maUtf16;
876                 if( !maCombining.isEmpty() )
877                     maInput.append(maCombining);
878                 mbAllowMoreChars = false;
879                 return false;
880             }
881             maCombining.insert(0, uChar);
882             break;
883 
884         default:
885             //extreme edge case: already invalid high/low surrogates with preceding combining chars, and now an extra character.
886             if( !maUtf16.isEmpty() )
887             {
888                 maInput = maUtf16;
889                 if( !maCombining.isEmpty() )
890                     maInput.append(maCombining);
891                 mbAllowMoreChars = false;
892                 return false;
893             }
894 
895             if( !maCombining.isEmpty() )
896             {
897                 maCombining.insert(0, uChar);
898                 maInput = maCombining;
899                 mbAllowMoreChars = false;
900                 return false;
901             }
902 
903             // 0 - 1f are control characters.  Do not process those.
904             if( uChar < 0x20 )
905             {
906                 mbAllowMoreChars = false;
907                 return false;
908             }
909 
910             switch( uChar )
911             {
912                 case 'u':
913                 case 'U':
914                     // U+ notation found.  Continue looking for another one.
915                     if( mbRequiresU )
916                     {
917                         mbRequiresU = false;
918                         maInput.insert(0,"U+");
919                     }
920                     // treat as a normal character
921                     else
922                     {
923                         mbAllowMoreChars = false;
924                         if( !bPreventNonHex )
925                             maInput.insertUtf32(0, uChar);
926                     }
927                     break;
928                 case '+':
929                     // + already found: skip when not U, or edge case of +U+xxxx
930                     if( mbRequiresU || (maInput.indexOf("U+") == 0) )
931                         mbAllowMoreChars = false;
932                     // hex chars followed by '+' - now require a 'U'
933                     else if ( !maInput.isEmpty() )
934                         mbRequiresU = true;
935                     // treat as a normal character
936                     else
937                     {
938                         mbAllowMoreChars = false;
939                         if( !bPreventNonHex )
940                             maInput.insertUtf32(0, uChar);
941                     }
942                     break;
943                 default:
944                     // + already found. Since not U, cancel further input
945                     if( mbRequiresU )
946                         mbAllowMoreChars = false;
947                     // maximum digits per notation is 8: only one notation
948                     else if( maInput.indexOf("U+") == -1 && maInput.getLength() == 8 )
949                         mbAllowMoreChars = false;
950                     // maximum digits per notation is 8: previous notation found
951                     else if( maInput.indexOf("U+") == 8 )
952                         mbAllowMoreChars = false;
953                     // a hex character. Add to string.
954                     else if( rtl::isAsciiHexDigit(uChar) )
955                     {
956                         mbIsHexString = true;
957                         maInput.insertUtf32(0, uChar);
958                     }
959                     // not a hex character: stop input. keep if it is the first input provided
960                     else
961                     {
962                         mbAllowMoreChars = false;
963                         if( maInput.isEmpty() )
964                             maInput.insertUtf32(0, uChar);
965                     }
966             }
967     }
968     return mbAllowMoreChars;
969 }
970 
StringToReplace()971 OUString ToggleUnicodeCodepoint::StringToReplace()
972 {
973     if( maInput.isEmpty() )
974     {
975         //edge case - input finished with incomplete low surrogate or combining characters without a base
976         if( mbAllowMoreChars )
977         {
978             if( !maUtf16.isEmpty() )
979                 maInput = maUtf16;
980             if( !maCombining.isEmpty() )
981                 maInput.append(maCombining);
982         }
983         return maInput.toString();
984     }
985 
986     if( !mbIsHexString )
987         return maInput.toString();
988 
989     //this function potentially modifies the input string.  Prevent addition of further characters
990     mbAllowMoreChars = false;
991 
992     //validate unicode notation.
993     OUString sIn;
994     sal_uInt32 nUnicode = 0;
995     sal_Int32 nUPlus = maInput.indexOf("U+");
996     //if U+ notation used, strip off all extra chars added not in U+ notation
997     if( nUPlus != -1 )
998     {
999         maInput.remove(0, nUPlus);
1000         sIn = maInput.copy(2).makeStringAndClear();
1001         nUPlus = sIn.indexOf("U+");
1002     }
1003     else
1004         sIn = maInput.toString();
1005     while( nUPlus != -1 )
1006     {
1007         nUnicode = sIn.copy(0, nUPlus).toUInt32(16);
1008         //prevent creating control characters or invalid Unicode values
1009         if( !rtl::isUnicodeCodePoint(nUnicode) || nUnicode < 0x20  )
1010             maInput = sIn.subView(nUPlus);
1011         sIn = sIn.copy(nUPlus+2);
1012         nUPlus =  sIn.indexOf("U+");
1013     }
1014 
1015     nUnicode = sIn.toUInt32(16);
1016     if( !rtl::isUnicodeCodePoint(nUnicode) || nUnicode < 0x20 )
1017        maInput.truncate().append( sIn[sIn.getLength()-1] );
1018     return maInput.toString();
1019 }
1020 
CharsToDelete()1021 sal_uInt32 ToggleUnicodeCodepoint::CharsToDelete()
1022 {
1023     OUString sIn = StringToReplace();
1024     sal_Int32 nPos = 0;
1025     sal_uInt32 counter = 0;
1026     while( nPos < sIn.getLength() )
1027     {
1028         sIn.iterateCodePoints(&nPos);
1029         ++counter;
1030     }
1031     return counter;
1032 }
1033 
ReplacementString()1034 OUString ToggleUnicodeCodepoint::ReplacementString()
1035 {
1036     OUString sIn = StringToReplace();
1037     OUStringBuffer output = "";
1038     sal_Int32 nUPlus = sIn.indexOf("U+");
1039     // convert from hex notation to glyph
1040     if( nUPlus != -1 || (sIn.getLength() > 1 && mbIsHexString) )
1041     {
1042         sal_uInt32 nUnicode = 0;
1043         if( nUPlus == 0)
1044         {
1045             sIn = sIn.copy(2);
1046             nUPlus = sIn.indexOf("U+");
1047         }
1048         while( nUPlus > 0 )
1049         {
1050             nUnicode = sIn.copy(0, nUPlus).toUInt32(16);
1051             output.appendUtf32( nUnicode );
1052 
1053             sIn = sIn.copy(nUPlus+2);
1054             nUPlus = sIn.indexOf("U+");
1055         }
1056         nUnicode = sIn.toUInt32(16);
1057         output.appendUtf32( nUnicode );
1058     }
1059     // convert from glyph to hex notation
1060     else
1061     {
1062         sal_Int32 nPos = 0;
1063         while( nPos < sIn.getLength() )
1064         {
1065             OUStringBuffer aTmp = OUString::number(sIn.iterateCodePoints(&nPos),16);
1066             //pad with zeros - minimum length of 4.
1067             for( sal_Int32 i = 4 - aTmp.getLength(); i > 0; --i )
1068                 aTmp.insert( 0,"0" );
1069             output.append( "U+" );
1070             output.append( aTmp );
1071         }
1072     }
1073     return output.toString();
1074 }
1075 
1076 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
1077