1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *   Copyright (C) 2013-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 *   file name:  uscript_props.cpp
9 *   encoding:   UTF-8
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2013feb16
14 *   created by: Markus W. Scherer
15 */
16 
17 #include "unicode/utypes.h"
18 #include "unicode/unistr.h"
19 #include "unicode/uscript.h"
20 #include "unicode/utf16.h"
21 #include "ustr_imp.h"
22 #include "cmemory.h"
23 
24 namespace {
25 
// Script metadata (script properties).
// See http://unicode.org/cldr/trac/browser/trunk/common/properties/scriptMetadata.txt
//
// Each script is described by one packed int32_t word:
//   Bits 20.. 0: sample character (code point)
//   Bits 23..21: usage
//   Bits 31..24: single-bit flags
// A word of 0 means NOT_ENCODED, no sample character, and all flags false.

// Position of the 3-bit usage field inside a packed word.
const int32_t USAGE_SHIFT = 21;

// Usage values, already shifted into bits 23..21.
const int32_t UNKNOWN = 1 << USAGE_SHIFT;
const int32_t EXCLUSION = 2 << USAGE_SHIFT;
const int32_t LIMITED_USE = 3 << USAGE_SHIFT;
// const int32_t ASPIRATIONAL = 4 << USAGE_SHIFT; -- not used any more since Unicode 10
const int32_t RECOMMENDED = 5 << USAGE_SHIFT;

// Single-bit flags, starting at bit 24.
const int32_t RTL = 1 << 24;         // tested by uscript_isRightToLeft()
const int32_t LB_LETTERS = 1 << 25;  // tested by uscript_breaksBetweenLetters()
const int32_t CASED = 1 << 26;       // tested by uscript_isCased()

// Packed properties, indexed by the numeric UScriptCode value
// (see getScriptProps()). Entries without a trailing script tag are 0,
// i.e. no data for that code.
// The entries are tool output; regenerate them rather than editing by hand.
const int32_t SCRIPT_PROPS[] = {
    // Begin copy-paste output from
    // tools/trunk/unicode/py/parsescriptmetadata.py
    0x0040 | RECOMMENDED,  // Zyyy
    0x0308 | RECOMMENDED,  // Zinh
    0x0628 | RECOMMENDED | RTL,  // Arab
    0x0531 | RECOMMENDED | CASED,  // Armn
    0x0995 | RECOMMENDED,  // Beng
    0x3105 | RECOMMENDED | LB_LETTERS,  // Bopo
    0x13C4 | LIMITED_USE | CASED,  // Cher
    0x03E2 | EXCLUSION | CASED,  // Copt
    0x042F | RECOMMENDED | CASED,  // Cyrl
    0x10414 | EXCLUSION | CASED,  // Dsrt
    0x0905 | RECOMMENDED,  // Deva
    0x12A0 | RECOMMENDED,  // Ethi
    0x10D3 | RECOMMENDED,  // Geor
    0x10330 | EXCLUSION,  // Goth
    0x03A9 | RECOMMENDED | CASED,  // Grek
    0x0A95 | RECOMMENDED,  // Gujr
    0x0A15 | RECOMMENDED,  // Guru
    0x5B57 | RECOMMENDED | LB_LETTERS,  // Hani
    0xAC00 | RECOMMENDED,  // Hang
    0x05D0 | RECOMMENDED | RTL,  // Hebr
    0x304B | RECOMMENDED | LB_LETTERS,  // Hira
    0x0C95 | RECOMMENDED,  // Knda
    0x30AB | RECOMMENDED | LB_LETTERS,  // Kana
    0x1780 | RECOMMENDED | LB_LETTERS,  // Khmr
    0x0EA5 | RECOMMENDED | LB_LETTERS,  // Laoo
    0x004C | RECOMMENDED | CASED,  // Latn
    0x0D15 | RECOMMENDED,  // Mlym
    0x1826 | EXCLUSION,  // Mong
    0x1000 | RECOMMENDED | LB_LETTERS,  // Mymr
    0x168F | EXCLUSION,  // Ogam
    0x10300 | EXCLUSION,  // Ital
    0x0B15 | RECOMMENDED,  // Orya
    0x16A0 | EXCLUSION,  // Runr
    0x0D85 | RECOMMENDED,  // Sinh
    0x0710 | LIMITED_USE | RTL,  // Syrc
    0x0B95 | RECOMMENDED,  // Taml
    0x0C15 | RECOMMENDED,  // Telu
    0x078C | RECOMMENDED | RTL,  // Thaa
    0x0E17 | RECOMMENDED | LB_LETTERS,  // Thai
    0x0F40 | RECOMMENDED,  // Tibt
    0x14C0 | LIMITED_USE,  // Cans
    0xA288 | LIMITED_USE | LB_LETTERS,  // Yiii
    0x1703 | EXCLUSION,  // Tglg
    0x1723 | EXCLUSION,  // Hano
    0x1743 | EXCLUSION,  // Buhd
    0x1763 | EXCLUSION,  // Tagb
    0x280E | UNKNOWN,  // Brai
    0x10800 | EXCLUSION | RTL,  // Cprt
    0x1900 | LIMITED_USE,  // Limb
    0x10000 | EXCLUSION,  // Linb
    0x10480 | EXCLUSION,  // Osma
    0x10450 | EXCLUSION,  // Shaw
    0x1950 | LIMITED_USE | LB_LETTERS,  // Tale
    0x10380 | EXCLUSION,  // Ugar
    0,
    0x1A00 | EXCLUSION,  // Bugi
    0x2C00 | EXCLUSION | CASED,  // Glag
    0x10A00 | EXCLUSION | RTL,  // Khar
    0xA800 | LIMITED_USE,  // Sylo
    0x1980 | LIMITED_USE | LB_LETTERS,  // Talu
    0x2D30 | LIMITED_USE,  // Tfng
    0x103A0 | EXCLUSION,  // Xpeo
    0x1B05 | LIMITED_USE,  // Bali
    0x1BC0 | LIMITED_USE,  // Batk
    0,
    0x11005 | EXCLUSION,  // Brah
    0xAA00 | LIMITED_USE,  // Cham
    0,
    0,
    0,
    0,
    0x13153 | EXCLUSION,  // Egyp
    0,
    0x5B57 | RECOMMENDED | LB_LETTERS,  // Hans
    0x5B57 | RECOMMENDED | LB_LETTERS,  // Hant
    0x16B1C | EXCLUSION,  // Hmng
    0x10CA1 | EXCLUSION | RTL | CASED,  // Hung
    0,
    0xA984 | LIMITED_USE,  // Java
    0xA90A | LIMITED_USE,  // Kali
    0,
    0,
    0x1C00 | LIMITED_USE,  // Lepc
    0x10647 | EXCLUSION,  // Lina
    0x0840 | LIMITED_USE | RTL,  // Mand
    0,
    0x10980 | EXCLUSION | RTL,  // Mero
    0x07CA | LIMITED_USE | RTL,  // Nkoo
    0x10C00 | EXCLUSION | RTL,  // Orkh
    0x1036B | EXCLUSION,  // Perm
    0xA840 | EXCLUSION,  // Phag
    0x10900 | EXCLUSION | RTL,  // Phnx
    0x16F00 | LIMITED_USE,  // Plrd
    0,
    0,
    0,
    0,
    0,
    0,
    0xA549 | LIMITED_USE,  // Vaii
    0,
    0x12000 | EXCLUSION,  // Xsux
    0,
    0xFDD0 | UNKNOWN,  // Zzzz
    0x102A0 | EXCLUSION,  // Cari
    0x304B | RECOMMENDED | LB_LETTERS,  // Jpan
    0x1A20 | LIMITED_USE | LB_LETTERS,  // Lana
    0x10280 | EXCLUSION,  // Lyci
    0x10920 | EXCLUSION | RTL,  // Lydi
    0x1C5A | LIMITED_USE,  // Olck
    0xA930 | EXCLUSION,  // Rjng
    0xA882 | LIMITED_USE,  // Saur
    0x1D850 | EXCLUSION,  // Sgnw
    0x1B83 | LIMITED_USE,  // Sund
    0,
    0xABC0 | LIMITED_USE,  // Mtei
    0x10840 | EXCLUSION | RTL,  // Armi
    0x10B00 | EXCLUSION | RTL,  // Avst
    0x11103 | LIMITED_USE,  // Cakm
    0xAC00 | RECOMMENDED,  // Kore
    0x11083 | EXCLUSION,  // Kthi
    0x10AD8 | EXCLUSION | RTL,  // Mani
    0x10B60 | EXCLUSION | RTL,  // Phli
    0x10B8F | EXCLUSION | RTL,  // Phlp
    0,
    0x10B40 | EXCLUSION | RTL,  // Prti
    0x0800 | EXCLUSION | RTL,  // Samr
    0xAA80 | LIMITED_USE | LB_LETTERS,  // Tavt
    0,
    0,
    0xA6A0 | LIMITED_USE,  // Bamu
    0xA4D0 | LIMITED_USE,  // Lisu
    0,
    0x10A60 | EXCLUSION | RTL,  // Sarb
    0x16AE6 | EXCLUSION,  // Bass
    0x1BC20 | EXCLUSION,  // Dupl
    0x10500 | EXCLUSION,  // Elba
    0x11315 | EXCLUSION,  // Gran
    0,
    0,
    0x1E802 | EXCLUSION | RTL,  // Mend
    0x109A0 | EXCLUSION | RTL,  // Merc
    0x10A95 | EXCLUSION | RTL,  // Narb
    0x10896 | EXCLUSION | RTL,  // Nbat
    0x10873 | EXCLUSION | RTL,  // Palm
    0x112BE | EXCLUSION,  // Sind
    0x118B4 | EXCLUSION | CASED,  // Wara
    0,
    0,
    0x16A4F | EXCLUSION,  // Mroo
    0x1B1C4 | EXCLUSION | LB_LETTERS,  // Nshu
    0x11183 | EXCLUSION,  // Shrd
    0x110D0 | EXCLUSION,  // Sora
    0x11680 | EXCLUSION,  // Takr
    0x18229 | EXCLUSION | LB_LETTERS,  // Tang
    0,
    0x14400 | EXCLUSION,  // Hluw
    0x11208 | EXCLUSION,  // Khoj
    0x11484 | EXCLUSION,  // Tirh
    0x10537 | EXCLUSION,  // Aghb
    0x11152 | EXCLUSION,  // Mahj
    0x11717 | EXCLUSION | LB_LETTERS,  // Ahom
    0x108F4 | EXCLUSION | RTL,  // Hatr
    0x1160E | EXCLUSION,  // Modi
    0x1128F | EXCLUSION,  // Mult
    0x11AC0 | EXCLUSION,  // Pauc
    0x1158E | EXCLUSION,  // Sidd
    0x1E909 | LIMITED_USE | RTL | CASED,  // Adlm
    0x11C0E | EXCLUSION,  // Bhks
    0x11C72 | EXCLUSION,  // Marc
    0x11412 | LIMITED_USE,  // Newa
    0x104B5 | LIMITED_USE | CASED,  // Osge
    0x5B57 | RECOMMENDED | LB_LETTERS,  // Hanb
    0x1112 | RECOMMENDED,  // Jamo
    0,
    0x11D10 | EXCLUSION,  // Gonm
    0x11A5C | EXCLUSION,  // Soyo
    0x11A0B | EXCLUSION,  // Zanb
    0x1180B | EXCLUSION,  // Dogr
    0x11D71 | LIMITED_USE,  // Gong
    0x11EE5 | EXCLUSION,  // Maka
    0x16E40 | EXCLUSION | CASED,  // Medf
    0x10D12 | LIMITED_USE | RTL,  // Rohg
    0x10F42 | EXCLUSION | RTL,  // Sogd
    0x10F19 | EXCLUSION | RTL,  // Sogo
    0x10FF1 | EXCLUSION | RTL,  // Elym
    0x1E108 | LIMITED_USE,  // Hmnp
    0x119CE | EXCLUSION,  // Nand
    0x1E2E1 | LIMITED_USE,  // Wcho
    0x10FBF | EXCLUSION | RTL,  // Chrs
    0x1190C | EXCLUSION,  // Diak
    0x18C65 | EXCLUSION | LB_LETTERS,  // Kits
    0x10E88 | EXCLUSION | RTL,  // Yezi
    // End copy-paste from parsescriptmetadata.py
};
242 
getScriptProps(UScriptCode script)243 int32_t getScriptProps(UScriptCode script) {
244     if (0 <= script && script < UPRV_LENGTHOF(SCRIPT_PROPS)) {
245         return SCRIPT_PROPS[script];
246     } else {
247         return 0;
248     }
249 }
250 
251 }  // namespace
252 
253 U_CAPI int32_t U_EXPORT2
uscript_getSampleString(UScriptCode script,UChar * dest,int32_t capacity,UErrorCode * pErrorCode)254 uscript_getSampleString(UScriptCode script, UChar *dest, int32_t capacity, UErrorCode *pErrorCode) {
255     if(U_FAILURE(*pErrorCode)) { return 0; }
256     if(capacity < 0 || (capacity > 0 && dest == NULL)) {
257         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
258         return 0;
259     }
260     int32_t sampleChar = getScriptProps(script) & 0x1fffff;
261     int32_t length;
262     if(sampleChar == 0) {
263         length = 0;
264     } else {
265         length = U16_LENGTH(sampleChar);
266         if(length <= capacity) {
267             int32_t i = 0;
268             U16_APPEND_UNSAFE(dest, i, sampleChar);
269         }
270     }
271     return u_terminateUChars(dest, capacity, length, pErrorCode);
272 }
273 
274 U_COMMON_API icu::UnicodeString U_EXPORT2
uscript_getSampleUnicodeString(UScriptCode script)275 uscript_getSampleUnicodeString(UScriptCode script) {
276     icu::UnicodeString sample;
277     int32_t sampleChar = getScriptProps(script) & 0x1fffff;
278     if(sampleChar != 0) {
279         sample.append(sampleChar);
280     }
281     return sample;
282 }
283 
284 U_CAPI UScriptUsage U_EXPORT2
uscript_getUsage(UScriptCode script)285 uscript_getUsage(UScriptCode script) {
286     return (UScriptUsage)((getScriptProps(script) >> 21) & 7);
287 }
288 
289 U_CAPI UBool U_EXPORT2
uscript_isRightToLeft(UScriptCode script)290 uscript_isRightToLeft(UScriptCode script) {
291     return (getScriptProps(script) & RTL) != 0;
292 }
293 
294 U_CAPI UBool U_EXPORT2
uscript_breaksBetweenLetters(UScriptCode script)295 uscript_breaksBetweenLetters(UScriptCode script) {
296     return (getScriptProps(script) & LB_LETTERS) != 0;
297 }
298 
299 U_CAPI UBool U_EXPORT2
uscript_isCased(UScriptCode script)300 uscript_isCased(UScriptCode script) {
301     return (getScriptProps(script) & CASED) != 0;
302 }
303