1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 1999-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *
9 * File USC_IMPL.C
10 *
11 * Modification History:
12 *
13 *   Date        Name        Description
14 *   07/08/2002  Eric Mader  Creation.
15 ******************************************************************************
16 */
17 
18 #include "unicode/uscript.h"
19 #include "usc_impl.h"
20 #include "cmemory.h"
21 
/* Number of entries in the circular paired-punctuation stack. */
#define PAREN_STACK_DEPTH 32

/* Reduce a non-negative stack index into the range [0, PAREN_STACK_DEPTH). */
#define MOD(sp) ((sp) % PAREN_STACK_DEPTH)

/* Increment a counter, saturating at PAREN_STACK_DEPTH. */
#define LIMIT_INC(sp) (((sp) >= PAREN_STACK_DEPTH) ? PAREN_STACK_DEPTH : ((sp) + 1))

/* Advance a stack index by count (or by one), wrapping around the end. */
#define INC(sp,count) MOD((sp) + (count))
#define INC1(sp) INC((sp), 1)

/*
 * Move a stack index back by count (or by one), wrapping around the start.
 * PAREN_STACK_DEPTH is added first so the operand of MOD stays non-negative.
 */
#define DEC(sp,count) MOD((sp) + PAREN_STACK_DEPTH - (count))
#define DEC1(sp) DEC((sp), 1)

/* The stack is empty when no pushes are outstanding. */
#define STACK_IS_EMPTY(scriptRun) ((scriptRun)->pushCount <= 0)
#define STACK_IS_NOT_EMPTY(scriptRun) (!STACK_IS_EMPTY(scriptRun))

/* The entry currently on top of the stack; only meaningful when not empty. */
#define TOP(scriptRun) ((scriptRun)->parenStack[(scriptRun)->parenSP])

/* Forget any pending fixups. */
#define SYNC_FIXUP(scriptRun) ((scriptRun)->fixupCount = 0)
34 
/*
 * One entry of the paired-punctuation stack. push() fills it in with the
 * index of a paired character and a script code.
 */
struct ParenStackEntry
{
    int32_t pairIndex;      /* index into pairedChars[] of the pushed character */
    UScriptCode scriptCode; /* script stored with the entry; fixup() may overwrite it */
};
40 
/*
 * State of a script-run iteration over a UTF-16 text buffer.
 */
struct UScriptRun
{
    int32_t textLength;      /* number of UChars in textArray */
    const UChar *textArray;  /* text being scanned; the pointer is stored, the text is not copied */

    int32_t scriptStart;     /* index of the first UChar of the current run */
    int32_t scriptLimit;     /* index one past the last UChar of the current run */
    UScriptCode scriptCode;  /* script of the current run */

    /* Paired-punctuation stack, indexed circularly (see INC / DEC). */
    struct ParenStackEntry parenStack[PAREN_STACK_DEPTH];
    int32_t parenSP;         /* index of the top entry; -1 when the stack is empty */
    int32_t pushCount;       /* outstanding pushes, saturating at PAREN_STACK_DEPTH */
    int32_t fixupCount;      /* number of top entries fixup() rewrites; zeroed at the start of each run */
};
55 
/* Forward declaration; highBit() is defined further down in this file. */
static int8_t highBit(int32_t value);
57 
58 static const UChar32 pairedChars[] = {
59     0x0028, 0x0029, /* ascii paired punctuation */
60     0x003c, 0x003e,
61     0x005b, 0x005d,
62     0x007b, 0x007d,
63     0x00ab, 0x00bb, /* guillemets */
64     0x2018, 0x2019, /* general punctuation */
65     0x201c, 0x201d,
66     0x2039, 0x203a,
67     0x3008, 0x3009, /* chinese paired punctuation */
68     0x300a, 0x300b,
69     0x300c, 0x300d,
70     0x300e, 0x300f,
71     0x3010, 0x3011,
72     0x3014, 0x3015,
73     0x3016, 0x3017,
74     0x3018, 0x3019,
75     0x301a, 0x301b
76 };
77 
/*
 * Push an entry onto the paired-punctuation stack.
 *
 * The stack is circular: once PAREN_STACK_DEPTH entries are outstanding,
 * a further push overwrites the oldest slot while both counters stay
 * saturated at PAREN_STACK_DEPTH.
 *
 * @param scriptRun   the iterator whose stack is modified
 * @param pairIndex   index into pairedChars[] to record
 * @param scriptCode  script code to record with the entry
 */
static void push(UScriptRun *scriptRun, int32_t pairIndex, UScriptCode scriptCode)
{
    /* INC1 maps the "empty" value -1 to slot 0 and wraps slot 31 to 0 */
    const int32_t slot = INC1(scriptRun->parenSP);

    scriptRun->parenSP = slot;
    scriptRun->parenStack[slot].pairIndex  = pairIndex;
    scriptRun->parenStack[slot].scriptCode = scriptCode;

    scriptRun->fixupCount = LIMIT_INC(scriptRun->fixupCount);
    scriptRun->pushCount  = LIMIT_INC(scriptRun->pushCount);
}
87 
/*
 * Remove the top entry of the paired-punctuation stack.
 * Does nothing when the stack is already empty.
 *
 * @param scriptRun  the iterator whose stack is modified
 */
static void pop(UScriptRun *scriptRun)
{
    if (STACK_IS_NOT_EMPTY(scriptRun)) {
        /* the popped entry no longer needs a fixup */
        if (scriptRun->fixupCount > 0) {
            --scriptRun->fixupCount;
        }

        --scriptRun->pushCount;

        /*
         * Step the stack pointer back one slot, unless that was the
         * last entry: an empty stack always has its pointer at the
         * initial value of -1.
         */
        scriptRun->parenSP = STACK_IS_EMPTY(scriptRun) ? -1 : DEC1(scriptRun->parenSP);
    }
}
108 
/*
 * Overwrite the script code of the top fixupCount stack entries.
 *
 * The walk starts fixupCount slots below the stack pointer and moves
 * upward, wrapping circularly, until it reaches the top entry.
 *
 * On return fixupCount is below zero (one less than zero when it started
 * out positive); uscript_nextRun() zeroes it again at the start of each run.
 *
 * @param scriptRun   the iterator whose stack is modified
 * @param scriptCode  script code to store in the affected entries
 */
static void fixup(UScriptRun *scriptRun, UScriptCode scriptCode)
{
    int32_t slot = DEC(scriptRun->parenSP, scriptRun->fixupCount);

    for (;;) {
        const int32_t pending = scriptRun->fixupCount;

        /* the counter is decremented even on the final, failing test */
        scriptRun->fixupCount = pending - 1;

        if (pending <= 0) {
            break;
        }

        slot = INC1(slot);
        scriptRun->parenStack[slot].scriptCode = scriptCode;
    }
}
118 
/*
 * Return the zero-based position of the highest set bit of value,
 * i.e. the largest n such that (1 << n) <= value.
 *
 * @param value  the number to examine
 * @return the bit position (0..30), or -32 if value is zero or negative
 */
static int8_t
highBit(int32_t value)
{
    int8_t bit = 0;
    int32_t shift;

    if (value <= 0) {
        return -32;
    }

    /*
     * Halve the search window each step: if anything is set at or above
     * bit "shift", discard the low bits and account for them in the result.
     */
    for (shift = 16; shift >= 1; shift >>= 1) {
        if ((value >> shift) != 0) {
            value >>= shift;
            bit = (int8_t)(bit + shift);
        }
    }

    return bit;
}
155 
156 static int32_t
getPairIndex(UChar32 ch)157 getPairIndex(UChar32 ch)
158 {
159     int32_t pairedCharCount = UPRV_LENGTHOF(pairedChars);
160     int32_t pairedCharPower = 1 << highBit(pairedCharCount);
161     int32_t pairedCharExtra = pairedCharCount - pairedCharPower;
162 
163     int32_t probe = pairedCharPower;
164     int32_t pairIndex = 0;
165 
166     if (ch >= pairedChars[pairedCharExtra]) {
167         pairIndex = pairedCharExtra;
168     }
169 
170     while (probe > (1 << 0)) {
171         probe >>= 1;
172 
173         if (ch >= pairedChars[pairIndex + probe]) {
174             pairIndex += probe;
175         }
176     }
177 
178     if (pairedChars[pairIndex] != ch) {
179         pairIndex = -1;
180     }
181 
182     return pairIndex;
183 }
184 
185 static UBool
sameScript(UScriptCode scriptOne,UScriptCode scriptTwo)186 sameScript(UScriptCode scriptOne, UScriptCode scriptTwo)
187 {
188     return scriptOne <= USCRIPT_INHERITED || scriptTwo <= USCRIPT_INHERITED || scriptOne == scriptTwo;
189 }
190 
191 U_CAPI UScriptRun * U_EXPORT2
uscript_openRun(const UChar * src,int32_t length,UErrorCode * pErrorCode)192 uscript_openRun(const UChar *src, int32_t length, UErrorCode *pErrorCode)
193 {
194     UScriptRun *result = NULL;
195 
196     if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) {
197         return NULL;
198     }
199 
200     result = (UScriptRun *)uprv_malloc(sizeof (UScriptRun));
201 
202     if (result == NULL) {
203         *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
204         return NULL;
205     }
206 
207     uscript_setRunText(result, src, length, pErrorCode);
208 
209     /* Release the UScriptRun if uscript_setRunText() returns an error */
210     if (U_FAILURE(*pErrorCode)) {
211         uprv_free(result);
212         result = NULL;
213     }
214 
215     return result;
216 }
217 
218 U_CAPI void U_EXPORT2
uscript_closeRun(UScriptRun * scriptRun)219 uscript_closeRun(UScriptRun *scriptRun)
220 {
221     if (scriptRun != NULL) {
222         uprv_free(scriptRun);
223     }
224 }
225 
226 U_CAPI void U_EXPORT2
uscript_resetRun(UScriptRun * scriptRun)227 uscript_resetRun(UScriptRun *scriptRun)
228 {
229     if (scriptRun != NULL) {
230         scriptRun->scriptStart = 0;
231         scriptRun->scriptLimit = 0;
232         scriptRun->scriptCode  = USCRIPT_INVALID_CODE;
233         scriptRun->parenSP     = -1;
234         scriptRun->pushCount   =  0;
235         scriptRun->fixupCount  =  0;
236     }
237 }
238 
239 U_CAPI void U_EXPORT2
uscript_setRunText(UScriptRun * scriptRun,const UChar * src,int32_t length,UErrorCode * pErrorCode)240 uscript_setRunText(UScriptRun *scriptRun, const UChar *src, int32_t length, UErrorCode *pErrorCode)
241 {
242     if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) {
243         return;
244     }
245 
246     if (scriptRun == NULL || length < 0 || ((src == NULL) != (length == 0))) {
247         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
248         return;
249     }
250 
251     scriptRun->textArray  = src;
252     scriptRun->textLength = length;
253 
254     uscript_resetRun(scriptRun);
255 }
256 
257 U_CAPI UBool U_EXPORT2
uscript_nextRun(UScriptRun * scriptRun,int32_t * pRunStart,int32_t * pRunLimit,UScriptCode * pRunScript)258 uscript_nextRun(UScriptRun *scriptRun, int32_t *pRunStart, int32_t *pRunLimit, UScriptCode *pRunScript)
259 {
260     UErrorCode error = U_ZERO_ERROR;
261 
262     /* if we've fallen off the end of the text, we're done */
263     if (scriptRun == NULL || scriptRun->scriptLimit >= scriptRun->textLength) {
264         return FALSE;
265     }
266 
267     SYNC_FIXUP(scriptRun);
268     scriptRun->scriptCode = USCRIPT_COMMON;
269 
270     for (scriptRun->scriptStart = scriptRun->scriptLimit; scriptRun->scriptLimit < scriptRun->textLength; scriptRun->scriptLimit += 1) {
271         UChar   high = scriptRun->textArray[scriptRun->scriptLimit];
272         UChar32 ch   = high;
273         UScriptCode sc;
274         int32_t pairIndex;
275 
276         /*
277          * if the character is a high surrogate and it's not the last one
278          * in the text, see if it's followed by a low surrogate
279          */
280         if (high >= 0xD800 && high <= 0xDBFF && scriptRun->scriptLimit < scriptRun->textLength - 1) {
281             UChar low = scriptRun->textArray[scriptRun->scriptLimit + 1];
282 
283             /*
284              * if it is followed by a low surrogate,
285              * consume it and form the full character
286              */
287             if (low >= 0xDC00 && low <= 0xDFFF) {
288                 ch = (high - 0xD800) * 0x0400 + low - 0xDC00 + 0x10000;
289                 scriptRun->scriptLimit += 1;
290             }
291         }
292 
293         sc = uscript_getScript(ch, &error);
294         pairIndex = getPairIndex(ch);
295 
296         /*
297          * Paired character handling:
298          *
299          * if it's an open character, push it onto the stack.
300          * if it's a close character, find the matching open on the
301          * stack, and use that script code. Any non-matching open
302          * characters above it on the stack will be poped.
303          */
304         if (pairIndex >= 0) {
305             if ((pairIndex & 1) == 0) {
306                 push(scriptRun, pairIndex, scriptRun->scriptCode);
307             } else {
308                 int32_t pi = pairIndex & ~1;
309 
310                 while (STACK_IS_NOT_EMPTY(scriptRun) && TOP(scriptRun).pairIndex != pi) {
311                     pop(scriptRun);
312                 }
313 
314                 if (STACK_IS_NOT_EMPTY(scriptRun)) {
315                     sc = TOP(scriptRun).scriptCode;
316                 }
317             }
318         }
319 
320         if (sameScript(scriptRun->scriptCode, sc)) {
321             if (scriptRun->scriptCode <= USCRIPT_INHERITED && sc > USCRIPT_INHERITED) {
322                 scriptRun->scriptCode = sc;
323 
324                 fixup(scriptRun, scriptRun->scriptCode);
325             }
326 
327             /*
328              * if this character is a close paired character,
329              * pop the matching open character from the stack
330              */
331             if (pairIndex >= 0 && (pairIndex & 1) != 0) {
332                 pop(scriptRun);
333             }
334         } else {
335             /*
336              * if the run broke on a surrogate pair,
337              * end it before the high surrogate
338              */
339             if (ch >= 0x10000) {
340                 scriptRun->scriptLimit -= 1;
341             }
342 
343             break;
344         }
345     }
346 
347 
348     if (pRunStart != NULL) {
349         *pRunStart = scriptRun->scriptStart;
350     }
351 
352     if (pRunLimit != NULL) {
353         *pRunLimit = scriptRun->scriptLimit;
354     }
355 
356     if (pRunScript != NULL) {
357         *pRunScript = scriptRun->scriptCode;
358     }
359 
360     return TRUE;
361 }
362