1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 1999-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: scrptrun.cpp
11 *
12 * created on: 10/17/2001
13 * created by: Eric R. Mader
14 */
15
16 #include "unicode/utypes.h"
17 #include "unicode/uscript.h"
18
19 #include "cmemory.h"
20 #include "scrptrun.h"
21
22 U_NAMESPACE_BEGIN
23
// Definition of the static fgClassID member; only its value (0) is set here.
// NOTE(review): presumably the address of this byte, not its value, is what
// identifies the class -- confirm against the declaration in scrptrun.h.
const char ScriptRun::fgClassID=0;
25
26 UChar32 ScriptRun::pairedChars[] = {
27 0x0028, 0x0029, // ascii paired punctuation
28 0x003c, 0x003e,
29 0x005b, 0x005d,
30 0x007b, 0x007d,
31 0x00ab, 0x00bb, // guillemets
32 0x2018, 0x2019, // general punctuation
33 0x201c, 0x201d,
34 0x2039, 0x203a,
35 0x3008, 0x3009, // chinese paired punctuation
36 0x300a, 0x300b,
37 0x300c, 0x300d,
38 0x300e, 0x300f,
39 0x3010, 0x3011,
40 0x3014, 0x3015,
41 0x3016, 0x3017,
42 0x3018, 0x3019,
43 0x301a, 0x301b
44 };
45
// Search parameters used by getPairIndex(), all derived from pairedChars:
//   pairedCharCount - number of entries in pairedChars (opens and closes
//                     are counted individually)
//   pairedCharPower - 1 << highBit(pairedCharCount), i.e. the largest power
//                     of two that does not exceed pairedCharCount
//   pairedCharExtra - how many entries lie beyond that power of two; used as
//                     the starting offset of the binary search
const int32_t ScriptRun::pairedCharCount = UPRV_LENGTHOF(pairedChars);
const int32_t ScriptRun::pairedCharPower = 1 << highBit(pairedCharCount);
const int32_t ScriptRun::pairedCharExtra = pairedCharCount - pairedCharPower;
49
highBit(int32_t value)50 int8_t ScriptRun::highBit(int32_t value)
51 {
52 if (value <= 0) {
53 return -32;
54 }
55
56 int8_t bit = 0;
57
58 if (value >= 1 << 16) {
59 value >>= 16;
60 bit += 16;
61 }
62
63 if (value >= 1 << 8) {
64 value >>= 8;
65 bit += 8;
66 }
67
68 if (value >= 1 << 4) {
69 value >>= 4;
70 bit += 4;
71 }
72
73 if (value >= 1 << 2) {
74 value >>= 2;
75 bit += 2;
76 }
77
78 if (value >= 1 << 1) {
79 value >>= 1;
80 bit += 1;
81 }
82
83 return bit;
84 }
85
getPairIndex(UChar32 ch)86 int32_t ScriptRun::getPairIndex(UChar32 ch)
87 {
88 int32_t probe = pairedCharPower;
89 int32_t index = 0;
90
91 if (ch >= pairedChars[pairedCharExtra]) {
92 index = pairedCharExtra;
93 }
94
95 while (probe > (1 << 0)) {
96 probe >>= 1;
97
98 if (ch >= pairedChars[index + probe]) {
99 index += probe;
100 }
101 }
102
103 if (pairedChars[index] != ch) {
104 index = -1;
105 }
106
107 return index;
108 }
109
sameScript(int32_t scriptOne,int32_t scriptTwo)110 UBool ScriptRun::sameScript(int32_t scriptOne, int32_t scriptTwo)
111 {
112 return scriptOne <= USCRIPT_INHERITED || scriptTwo <= USCRIPT_INHERITED || scriptOne == scriptTwo;
113 }
114
next()115 UBool ScriptRun::next()
116 {
117 int32_t startSP = parenSP; // used to find the first new open character
118 UErrorCode error = U_ZERO_ERROR;
119
120 // if we've fallen off the end of the text, we're done
121 if (scriptEnd >= charLimit) {
122 return false;
123 }
124
125 scriptCode = USCRIPT_COMMON;
126
127 for (scriptStart = scriptEnd; scriptEnd < charLimit; scriptEnd += 1) {
128 UChar high = charArray[scriptEnd];
129 UChar32 ch = high;
130
131 // if the character is a high surrogate and it's not the last one
132 // in the text, see if it's followed by a low surrogate
133 if (high >= 0xD800 && high <= 0xDBFF && scriptEnd < charLimit - 1)
134 {
135 UChar low = charArray[scriptEnd + 1];
136
137 // if it is followed by a low surrogate,
138 // consume it and form the full character
139 if (low >= 0xDC00 && low <= 0xDFFF) {
140 ch = (high - 0xD800) * 0x0400 + low - 0xDC00 + 0x10000;
141 scriptEnd += 1;
142 }
143 }
144
145 UScriptCode sc = uscript_getScript(ch, &error);
146 int32_t pairIndex = getPairIndex(ch);
147
148 // Paired character handling:
149 //
150 // if it's an open character, push it onto the stack.
151 // if it's a close character, find the matching open on the
152 // stack, and use that script code. Any non-matching open
153 // characters above it on the stack will be poped.
154 if (pairIndex >= 0) {
155 if ((pairIndex & 1) == 0) {
156 parenStack[++parenSP].pairIndex = pairIndex;
157 parenStack[parenSP].scriptCode = scriptCode;
158 } else if (parenSP >= 0) {
159 int32_t pi = pairIndex & ~1;
160
161 while (parenSP >= 0 && parenStack[parenSP].pairIndex != pi) {
162 parenSP -= 1;
163 }
164
165 if (parenSP < startSP) {
166 startSP = parenSP;
167 }
168
169 if (parenSP >= 0) {
170 sc = parenStack[parenSP].scriptCode;
171 }
172 }
173 }
174
175 if (sameScript(scriptCode, sc)) {
176 if (scriptCode <= USCRIPT_INHERITED && sc > USCRIPT_INHERITED) {
177 scriptCode = sc;
178
179 // now that we have a final script code, fix any open
180 // characters we pushed before we knew the script code.
181 while (startSP < parenSP) {
182 parenStack[++startSP].scriptCode = scriptCode;
183 }
184 }
185
186 // if this character is a close paired character,
187 // pop it from the stack
188 if (pairIndex >= 0 && (pairIndex & 1) != 0 && parenSP >= 0) {
189 parenSP -= 1;
190 startSP -= 1;
191 }
192 } else {
193 // if the run broke on a surrogate pair,
194 // end it before the high surrogate
195 if (ch >= 0x10000) {
196 scriptEnd -= 1;
197 }
198
199 break;
200 }
201 }
202
203 return true;
204 }
205
206 U_NAMESPACE_END
207