1 // Scintilla source code edit control
2 // Encoding: UTF-8
3 /** @file CaseConvert.cxx
4 ** Case fold characters and convert them to upper or lower case.
5 ** Tables automatically regenerated by scripts/GenerateCaseConvert.py
6 ** Should only be rarely regenerated for new versions of Unicode.
7 **/
8 // Copyright 2013 by Neil Hodgson <neilh@scintilla.org>
9 // The License.txt file describes the conditions under which this software may be distributed.
10
11 #include <cstring>
12
13 #include <vector>
14 #include <algorithm>
15
16 #include "CaseConvert.h"
17 #include "UniConversion.h"
18 #include "UnicodeFromUTF8.h"
19
20 #ifdef SCI_NAMESPACE
21 using namespace Scintilla;
22 #endif
23
24 namespace {
25 // Use an unnamed namespace to protect the declarations from name conflicts
26
27 // Unicode code points are ordered by groups and follow patterns.
28 // Most characters (pitch==1) are in ranges for a particular alphabet and their
29 // upper case forms are a fixed distance away.
30 // Another pattern (pitch==2) is where each lower case letter is preceded by
31 // the upper case form. These are also grouped into ranges.
32
33 int symmetricCaseConversionRanges[] = {
34 //lower, upper, range length, range pitch
35 //++Autogenerated -- start of section automatically generated
36 //**\(\*\n\)
37 97,65,26,1,
38 224,192,23,1,
39 248,216,7,1,
40 257,256,24,2,
41 314,313,8,2,
42 331,330,23,2,
43 462,461,8,2,
44 479,478,9,2,
45 505,504,20,2,
46 547,546,9,2,
47 583,582,5,2,
48 945,913,17,1,
49 963,931,9,1,
50 985,984,12,2,
51 1072,1040,32,1,
52 1104,1024,16,1,
53 1121,1120,17,2,
54 1163,1162,27,2,
55 1218,1217,7,2,
56 1233,1232,44,2,
57 1377,1329,38,1,
58 7681,7680,75,2,
59 7841,7840,48,2,
60 7936,7944,8,1,
61 7952,7960,6,1,
62 7968,7976,8,1,
63 7984,7992,8,1,
64 8000,8008,6,1,
65 8032,8040,8,1,
66 8560,8544,16,1,
67 9424,9398,26,1,
68 11312,11264,47,1,
69 11393,11392,50,2,
70 11520,4256,38,1,
71 42561,42560,23,2,
72 42625,42624,12,2,
73 42787,42786,7,2,
74 42803,42802,31,2,
75 42879,42878,5,2,
76 42913,42912,5,2,
77 65345,65313,26,1,
78 66600,66560,40,1,
79
80 //--Autogenerated -- end of section automatically generated
81 };
82
83 // Code points that are symmetric but don't fit into a range of similar characters
84 // are listed here.
85
86 int symmetricCaseConversions[] = {
87 //lower, upper
88 //++Autogenerated -- start of section automatically generated
89 //**1 \(\*\n\)
90 255,376,
91 307,306,
92 309,308,
93 311,310,
94 378,377,
95 380,379,
96 382,381,
97 384,579,
98 387,386,
99 389,388,
100 392,391,
101 396,395,
102 402,401,
103 405,502,
104 409,408,
105 410,573,
106 414,544,
107 417,416,
108 419,418,
109 421,420,
110 424,423,
111 429,428,
112 432,431,
113 436,435,
114 438,437,
115 441,440,
116 445,444,
117 447,503,
118 454,452,
119 457,455,
120 460,458,
121 477,398,
122 499,497,
123 501,500,
124 572,571,
125 575,11390,
126 576,11391,
127 578,577,
128 592,11375,
129 593,11373,
130 594,11376,
131 595,385,
132 596,390,
133 598,393,
134 599,394,
135 601,399,
136 603,400,
137 608,403,
138 611,404,
139 613,42893,
140 614,42922,
141 616,407,
142 617,406,
143 619,11362,
144 623,412,
145 625,11374,
146 626,413,
147 629,415,
148 637,11364,
149 640,422,
150 643,425,
151 648,430,
152 649,580,
153 650,433,
154 651,434,
155 652,581,
156 658,439,
157 881,880,
158 883,882,
159 887,886,
160 891,1021,
161 892,1022,
162 893,1023,
163 940,902,
164 941,904,
165 942,905,
166 943,906,
167 972,908,
168 973,910,
169 974,911,
170 983,975,
171 1010,1017,
172 1016,1015,
173 1019,1018,
174 1231,1216,
175 7545,42877,
176 7549,11363,
177 8017,8025,
178 8019,8027,
179 8021,8029,
180 8023,8031,
181 8048,8122,
182 8049,8123,
183 8050,8136,
184 8051,8137,
185 8052,8138,
186 8053,8139,
187 8054,8154,
188 8055,8155,
189 8056,8184,
190 8057,8185,
191 8058,8170,
192 8059,8171,
193 8060,8186,
194 8061,8187,
195 8112,8120,
196 8113,8121,
197 8144,8152,
198 8145,8153,
199 8160,8168,
200 8161,8169,
201 8165,8172,
202 8526,8498,
203 8580,8579,
204 11361,11360,
205 11365,570,
206 11366,574,
207 11368,11367,
208 11370,11369,
209 11372,11371,
210 11379,11378,
211 11382,11381,
212 11500,11499,
213 11502,11501,
214 11507,11506,
215 11559,4295,
216 11565,4301,
217 42874,42873,
218 42876,42875,
219 42892,42891,
220 42897,42896,
221 42899,42898,
222
223 //--Autogenerated -- end of section automatically generated
224 };
225
226 // Characters that have complex case conversions are listed here.
227 // This includes cases where more than one character is needed for a conversion,
228 // folding is different to lowering, or (as appropriate) upper(lower(x)) != x or
229 // lower(upper(x)) != x.
230
231 const char *complexCaseConversions =
232 // Original | Folded | Upper | Lower |
233 //++Autogenerated -- start of section automatically generated
234 //**2 \(\*\n\)
235 "\xc2\xb5|\xce\xbc|\xce\x9c||"
236 "\xc3\x9f|ss|SS||"
237 "\xc4\xb0|i\xcc\x87||i\xcc\x87|"
238 "\xc4\xb1||I||"
239 "\xc5\x89|\xca\xbcn|\xca\xbcN||"
240 "\xc5\xbf|s|S||"
241 "\xc7\x85|\xc7\x86|\xc7\x84|\xc7\x86|"
242 "\xc7\x88|\xc7\x89|\xc7\x87|\xc7\x89|"
243 "\xc7\x8b|\xc7\x8c|\xc7\x8a|\xc7\x8c|"
244 "\xc7\xb0|j\xcc\x8c|J\xcc\x8c||"
245 "\xc7\xb2|\xc7\xb3|\xc7\xb1|\xc7\xb3|"
246 "\xcd\x85|\xce\xb9|\xce\x99||"
247 "\xce\x90|\xce\xb9\xcc\x88\xcc\x81|\xce\x99\xcc\x88\xcc\x81||"
248 "\xce\xb0|\xcf\x85\xcc\x88\xcc\x81|\xce\xa5\xcc\x88\xcc\x81||"
249 "\xcf\x82|\xcf\x83|\xce\xa3||"
250 "\xcf\x90|\xce\xb2|\xce\x92||"
251 "\xcf\x91|\xce\xb8|\xce\x98||"
252 "\xcf\x95|\xcf\x86|\xce\xa6||"
253 "\xcf\x96|\xcf\x80|\xce\xa0||"
254 "\xcf\xb0|\xce\xba|\xce\x9a||"
255 "\xcf\xb1|\xcf\x81|\xce\xa1||"
256 "\xcf\xb4|\xce\xb8||\xce\xb8|"
257 "\xcf\xb5|\xce\xb5|\xce\x95||"
258 "\xd6\x87|\xd5\xa5\xd6\x82|\xd4\xb5\xd5\x92||"
259 "\xe1\xba\x96|h\xcc\xb1|H\xcc\xb1||"
260 "\xe1\xba\x97|t\xcc\x88|T\xcc\x88||"
261 "\xe1\xba\x98|w\xcc\x8a|W\xcc\x8a||"
262 "\xe1\xba\x99|y\xcc\x8a|Y\xcc\x8a||"
263 "\xe1\xba\x9a|a\xca\xbe|A\xca\xbe||"
264 "\xe1\xba\x9b|\xe1\xb9\xa1|\xe1\xb9\xa0||"
265 "\xe1\xba\x9e|ss||\xc3\x9f|"
266 "\xe1\xbd\x90|\xcf\x85\xcc\x93|\xce\xa5\xcc\x93||"
267 "\xe1\xbd\x92|\xcf\x85\xcc\x93\xcc\x80|\xce\xa5\xcc\x93\xcc\x80||"
268 "\xe1\xbd\x94|\xcf\x85\xcc\x93\xcc\x81|\xce\xa5\xcc\x93\xcc\x81||"
269 "\xe1\xbd\x96|\xcf\x85\xcc\x93\xcd\x82|\xce\xa5\xcc\x93\xcd\x82||"
270 "\xe1\xbe\x80|\xe1\xbc\x80\xce\xb9|\xe1\xbc\x88\xce\x99||"
271 "\xe1\xbe\x81|\xe1\xbc\x81\xce\xb9|\xe1\xbc\x89\xce\x99||"
272 "\xe1\xbe\x82|\xe1\xbc\x82\xce\xb9|\xe1\xbc\x8a\xce\x99||"
273 "\xe1\xbe\x83|\xe1\xbc\x83\xce\xb9|\xe1\xbc\x8b\xce\x99||"
274 "\xe1\xbe\x84|\xe1\xbc\x84\xce\xb9|\xe1\xbc\x8c\xce\x99||"
275 "\xe1\xbe\x85|\xe1\xbc\x85\xce\xb9|\xe1\xbc\x8d\xce\x99||"
276 "\xe1\xbe\x86|\xe1\xbc\x86\xce\xb9|\xe1\xbc\x8e\xce\x99||"
277 "\xe1\xbe\x87|\xe1\xbc\x87\xce\xb9|\xe1\xbc\x8f\xce\x99||"
278 "\xe1\xbe\x88|\xe1\xbc\x80\xce\xb9|\xe1\xbc\x88\xce\x99|\xe1\xbe\x80|"
279 "\xe1\xbe\x89|\xe1\xbc\x81\xce\xb9|\xe1\xbc\x89\xce\x99|\xe1\xbe\x81|"
280 "\xe1\xbe\x8a|\xe1\xbc\x82\xce\xb9|\xe1\xbc\x8a\xce\x99|\xe1\xbe\x82|"
281 "\xe1\xbe\x8b|\xe1\xbc\x83\xce\xb9|\xe1\xbc\x8b\xce\x99|\xe1\xbe\x83|"
282 "\xe1\xbe\x8c|\xe1\xbc\x84\xce\xb9|\xe1\xbc\x8c\xce\x99|\xe1\xbe\x84|"
283 "\xe1\xbe\x8d|\xe1\xbc\x85\xce\xb9|\xe1\xbc\x8d\xce\x99|\xe1\xbe\x85|"
284 "\xe1\xbe\x8e|\xe1\xbc\x86\xce\xb9|\xe1\xbc\x8e\xce\x99|\xe1\xbe\x86|"
285 "\xe1\xbe\x8f|\xe1\xbc\x87\xce\xb9|\xe1\xbc\x8f\xce\x99|\xe1\xbe\x87|"
286 "\xe1\xbe\x90|\xe1\xbc\xa0\xce\xb9|\xe1\xbc\xa8\xce\x99||"
287 "\xe1\xbe\x91|\xe1\xbc\xa1\xce\xb9|\xe1\xbc\xa9\xce\x99||"
288 "\xe1\xbe\x92|\xe1\xbc\xa2\xce\xb9|\xe1\xbc\xaa\xce\x99||"
289 "\xe1\xbe\x93|\xe1\xbc\xa3\xce\xb9|\xe1\xbc\xab\xce\x99||"
290 "\xe1\xbe\x94|\xe1\xbc\xa4\xce\xb9|\xe1\xbc\xac\xce\x99||"
291 "\xe1\xbe\x95|\xe1\xbc\xa5\xce\xb9|\xe1\xbc\xad\xce\x99||"
292 "\xe1\xbe\x96|\xe1\xbc\xa6\xce\xb9|\xe1\xbc\xae\xce\x99||"
293 "\xe1\xbe\x97|\xe1\xbc\xa7\xce\xb9|\xe1\xbc\xaf\xce\x99||"
294 "\xe1\xbe\x98|\xe1\xbc\xa0\xce\xb9|\xe1\xbc\xa8\xce\x99|\xe1\xbe\x90|"
295 "\xe1\xbe\x99|\xe1\xbc\xa1\xce\xb9|\xe1\xbc\xa9\xce\x99|\xe1\xbe\x91|"
296 "\xe1\xbe\x9a|\xe1\xbc\xa2\xce\xb9|\xe1\xbc\xaa\xce\x99|\xe1\xbe\x92|"
297 "\xe1\xbe\x9b|\xe1\xbc\xa3\xce\xb9|\xe1\xbc\xab\xce\x99|\xe1\xbe\x93|"
298 "\xe1\xbe\x9c|\xe1\xbc\xa4\xce\xb9|\xe1\xbc\xac\xce\x99|\xe1\xbe\x94|"
299 "\xe1\xbe\x9d|\xe1\xbc\xa5\xce\xb9|\xe1\xbc\xad\xce\x99|\xe1\xbe\x95|"
300 "\xe1\xbe\x9e|\xe1\xbc\xa6\xce\xb9|\xe1\xbc\xae\xce\x99|\xe1\xbe\x96|"
301 "\xe1\xbe\x9f|\xe1\xbc\xa7\xce\xb9|\xe1\xbc\xaf\xce\x99|\xe1\xbe\x97|"
302 "\xe1\xbe\xa0|\xe1\xbd\xa0\xce\xb9|\xe1\xbd\xa8\xce\x99||"
303 "\xe1\xbe\xa1|\xe1\xbd\xa1\xce\xb9|\xe1\xbd\xa9\xce\x99||"
304 "\xe1\xbe\xa2|\xe1\xbd\xa2\xce\xb9|\xe1\xbd\xaa\xce\x99||"
305 "\xe1\xbe\xa3|\xe1\xbd\xa3\xce\xb9|\xe1\xbd\xab\xce\x99||"
306 "\xe1\xbe\xa4|\xe1\xbd\xa4\xce\xb9|\xe1\xbd\xac\xce\x99||"
307 "\xe1\xbe\xa5|\xe1\xbd\xa5\xce\xb9|\xe1\xbd\xad\xce\x99||"
308 "\xe1\xbe\xa6|\xe1\xbd\xa6\xce\xb9|\xe1\xbd\xae\xce\x99||"
309 "\xe1\xbe\xa7|\xe1\xbd\xa7\xce\xb9|\xe1\xbd\xaf\xce\x99||"
310 "\xe1\xbe\xa8|\xe1\xbd\xa0\xce\xb9|\xe1\xbd\xa8\xce\x99|\xe1\xbe\xa0|"
311 "\xe1\xbe\xa9|\xe1\xbd\xa1\xce\xb9|\xe1\xbd\xa9\xce\x99|\xe1\xbe\xa1|"
312 "\xe1\xbe\xaa|\xe1\xbd\xa2\xce\xb9|\xe1\xbd\xaa\xce\x99|\xe1\xbe\xa2|"
313 "\xe1\xbe\xab|\xe1\xbd\xa3\xce\xb9|\xe1\xbd\xab\xce\x99|\xe1\xbe\xa3|"
314 "\xe1\xbe\xac|\xe1\xbd\xa4\xce\xb9|\xe1\xbd\xac\xce\x99|\xe1\xbe\xa4|"
315 "\xe1\xbe\xad|\xe1\xbd\xa5\xce\xb9|\xe1\xbd\xad\xce\x99|\xe1\xbe\xa5|"
316 "\xe1\xbe\xae|\xe1\xbd\xa6\xce\xb9|\xe1\xbd\xae\xce\x99|\xe1\xbe\xa6|"
317 "\xe1\xbe\xaf|\xe1\xbd\xa7\xce\xb9|\xe1\xbd\xaf\xce\x99|\xe1\xbe\xa7|"
318 "\xe1\xbe\xb2|\xe1\xbd\xb0\xce\xb9|\xe1\xbe\xba\xce\x99||"
319 "\xe1\xbe\xb3|\xce\xb1\xce\xb9|\xce\x91\xce\x99||"
320 "\xe1\xbe\xb4|\xce\xac\xce\xb9|\xce\x86\xce\x99||"
321 "\xe1\xbe\xb6|\xce\xb1\xcd\x82|\xce\x91\xcd\x82||"
322 "\xe1\xbe\xb7|\xce\xb1\xcd\x82\xce\xb9|\xce\x91\xcd\x82\xce\x99||"
323 "\xe1\xbe\xbc|\xce\xb1\xce\xb9|\xce\x91\xce\x99|\xe1\xbe\xb3|"
324 "\xe1\xbe\xbe|\xce\xb9|\xce\x99||"
325 "\xe1\xbf\x82|\xe1\xbd\xb4\xce\xb9|\xe1\xbf\x8a\xce\x99||"
326 "\xe1\xbf\x83|\xce\xb7\xce\xb9|\xce\x97\xce\x99||"
327 "\xe1\xbf\x84|\xce\xae\xce\xb9|\xce\x89\xce\x99||"
328 "\xe1\xbf\x86|\xce\xb7\xcd\x82|\xce\x97\xcd\x82||"
329 "\xe1\xbf\x87|\xce\xb7\xcd\x82\xce\xb9|\xce\x97\xcd\x82\xce\x99||"
330 "\xe1\xbf\x8c|\xce\xb7\xce\xb9|\xce\x97\xce\x99|\xe1\xbf\x83|"
331 "\xe1\xbf\x92|\xce\xb9\xcc\x88\xcc\x80|\xce\x99\xcc\x88\xcc\x80||"
332 "\xe1\xbf\x93|\xce\xb9\xcc\x88\xcc\x81|\xce\x99\xcc\x88\xcc\x81||"
333 "\xe1\xbf\x96|\xce\xb9\xcd\x82|\xce\x99\xcd\x82||"
334 "\xe1\xbf\x97|\xce\xb9\xcc\x88\xcd\x82|\xce\x99\xcc\x88\xcd\x82||"
335 "\xe1\xbf\xa2|\xcf\x85\xcc\x88\xcc\x80|\xce\xa5\xcc\x88\xcc\x80||"
336 "\xe1\xbf\xa3|\xcf\x85\xcc\x88\xcc\x81|\xce\xa5\xcc\x88\xcc\x81||"
337 "\xe1\xbf\xa4|\xcf\x81\xcc\x93|\xce\xa1\xcc\x93||"
338 "\xe1\xbf\xa6|\xcf\x85\xcd\x82|\xce\xa5\xcd\x82||"
339 "\xe1\xbf\xa7|\xcf\x85\xcc\x88\xcd\x82|\xce\xa5\xcc\x88\xcd\x82||"
340 "\xe1\xbf\xb2|\xe1\xbd\xbc\xce\xb9|\xe1\xbf\xba\xce\x99||"
341 "\xe1\xbf\xb3|\xcf\x89\xce\xb9|\xce\xa9\xce\x99||"
342 "\xe1\xbf\xb4|\xcf\x8e\xce\xb9|\xce\x8f\xce\x99||"
343 "\xe1\xbf\xb6|\xcf\x89\xcd\x82|\xce\xa9\xcd\x82||"
344 "\xe1\xbf\xb7|\xcf\x89\xcd\x82\xce\xb9|\xce\xa9\xcd\x82\xce\x99||"
345 "\xe1\xbf\xbc|\xcf\x89\xce\xb9|\xce\xa9\xce\x99|\xe1\xbf\xb3|"
346 "\xe2\x84\xa6|\xcf\x89||\xcf\x89|"
347 "\xe2\x84\xaa|k||k|"
348 "\xe2\x84\xab|\xc3\xa5||\xc3\xa5|"
349 "\xef\xac\x80|ff|FF||"
350 "\xef\xac\x81|fi|FI||"
351 "\xef\xac\x82|fl|FL||"
352 "\xef\xac\x83|ffi|FFI||"
353 "\xef\xac\x84|ffl|FFL||"
354 "\xef\xac\x85|st|ST||"
355 "\xef\xac\x86|st|ST||"
356 "\xef\xac\x93|\xd5\xb4\xd5\xb6|\xd5\x84\xd5\x86||"
357 "\xef\xac\x94|\xd5\xb4\xd5\xa5|\xd5\x84\xd4\xb5||"
358 "\xef\xac\x95|\xd5\xb4\xd5\xab|\xd5\x84\xd4\xbb||"
359 "\xef\xac\x96|\xd5\xbe\xd5\xb6|\xd5\x8e\xd5\x86||"
360 "\xef\xac\x97|\xd5\xb4\xd5\xad|\xd5\x84\xd4\xbd||"
361
362 //--Autogenerated -- end of section automatically generated
363 ;
364
365 class CaseConverter : public ICaseConverter {
366 // Maximum length of a case conversion result is 6 bytes in UTF-8
367 enum { maxConversionLength=6 };
368 struct ConversionString {
369 char conversion[maxConversionLength+1];
370 };
371 // Conversions are initially store in a vector of structs but then decomposed into
372 // parallel arrays as that is about 10% faster to search.
373 struct CharacterConversion {
374 int character;
375 ConversionString conversion;
CharacterConversion__anon03f3b3130111::CaseConverter::CharacterConversion376 CharacterConversion(int character_=0, const char *conversion_="") : character(character_) {
377 strcpy(conversion.conversion, conversion_);
378 }
operator <__anon03f3b3130111::CaseConverter::CharacterConversion379 bool operator<(const CharacterConversion &other) const {
380 return character < other.character;
381 }
382 };
383 typedef std::vector<CharacterConversion> CharacterToConversion;
384 CharacterToConversion characterToConversion;
385 // The parallel arrays
386 std::vector<int> characters;
387 std::vector<ConversionString> conversions;
388
389 public:
CaseConverter()390 CaseConverter() {
391 }
Initialised() const392 bool Initialised() const {
393 return characters.size() > 0;
394 }
Add(int character,const char * conversion)395 void Add(int character, const char *conversion) {
396 characterToConversion.push_back(CharacterConversion(character, conversion));
397 }
Find(int character)398 const char *Find(int character) {
399 const std::vector<int>::iterator it = std::lower_bound(characters.begin(), characters.end(), character);
400 if (it == characters.end())
401 return 0;
402 else if (*it == character)
403 return conversions[it - characters.begin()].conversion;
404 else
405 return 0;
406 }
CaseConvertString(char * converted,size_t sizeConverted,const char * mixed,size_t lenMixed)407 size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed) {
408 size_t lenConverted = 0;
409 size_t mixedPos = 0;
410 unsigned char bytes[UTF8MaxBytes + 1];
411 while (mixedPos < lenMixed) {
412 const unsigned char leadByte = static_cast<unsigned char>(mixed[mixedPos]);
413 const char *caseConverted = 0;
414 size_t lenMixedChar = 1;
415 if (UTF8IsAscii(leadByte)) {
416 caseConverted = Find(leadByte);
417 } else {
418 bytes[0] = leadByte;
419 const int widthCharBytes = UTF8BytesOfLead[leadByte];
420 for (int b=1; b<widthCharBytes; b++) {
421 bytes[b] = (mixedPos+b < lenMixed) ? mixed[mixedPos+b] : 0;
422 }
423 int classified = UTF8Classify(bytes, widthCharBytes);
424 if (!(classified & UTF8MaskInvalid)) {
425 // valid UTF-8
426 lenMixedChar = classified & UTF8MaskWidth;
427 int character = UnicodeFromUTF8(bytes);
428 caseConverted = Find(character);
429 }
430 }
431 if (caseConverted) {
432 // Character has a conversion so copy that conversion in
433 while (*caseConverted) {
434 converted[lenConverted++] = *caseConverted++;
435 if (lenConverted >= sizeConverted)
436 return 0;
437 }
438 } else {
439 // Character has no conversion so copy the input to output
440 for (size_t i=0; i<lenMixedChar; i++) {
441 converted[lenConverted++] = mixed[mixedPos+i];
442 if (lenConverted >= sizeConverted)
443 return 0;
444 }
445 }
446 mixedPos += lenMixedChar;
447 }
448 return lenConverted;
449 }
FinishedAdding()450 void FinishedAdding() {
451 std::sort(characterToConversion.begin(), characterToConversion.end());
452 characters.reserve(characterToConversion.size());
453 conversions.reserve(characterToConversion.size());
454 for (CharacterToConversion::iterator it = characterToConversion.begin(); it != characterToConversion.end(); ++it) {
455 characters.push_back(it->character);
456 conversions.push_back(it->conversion);
457 }
458 // Empty the original calculated data completely
459 CharacterToConversion().swap(characterToConversion);
460 }
461 };
462
463 CaseConverter caseConvFold;
464 CaseConverter caseConvUp;
465 CaseConverter caseConvLow;
466
// Encode the code point 'uch' as UTF-8 into 'putf' and append a terminating NUL.
// The output takes 1 to 4 bytes plus the terminator, so 'putf' must have room for 5.
void UTF8FromUTF32Character(int uch, char *putf) {
	// Every trail byte is 10xxxxxx: a fixed marker plus 6 payload bits.
	const int trailMarker = 0x80;
	const int trailPayload = 0x3f;
	char *out = putf;
	if (uch >= 0x10000) {
		// 4 bytes: 11110xxx then three trail bytes
		*out++ = static_cast<char>(0xF0 | (uch >> 18));
		*out++ = static_cast<char>(trailMarker | ((uch >> 12) & trailPayload));
		*out++ = static_cast<char>(trailMarker | ((uch >> 6) & trailPayload));
		*out++ = static_cast<char>(trailMarker | (uch & trailPayload));
	} else if (uch >= 0x800) {
		// 3 bytes: 1110xxxx then two trail bytes
		*out++ = static_cast<char>(0xE0 | (uch >> 12));
		*out++ = static_cast<char>(trailMarker | ((uch >> 6) & trailPayload));
		*out++ = static_cast<char>(trailMarker | (uch & trailPayload));
	} else if (uch >= 0x80) {
		// 2 bytes: 110xxxxx then one trail byte
		*out++ = static_cast<char>(0xC0 | (uch >> 6));
		*out++ = static_cast<char>(trailMarker | (uch & trailPayload));
	} else {
		// Single byte
		*out++ = static_cast<char>(uch);
	}
	*out = 0;
}
486
AddSymmetric(enum CaseConversion conversion,int lower,int upper)487 void AddSymmetric(enum CaseConversion conversion, int lower,int upper) {
488 char lowerUTF8[UTF8MaxBytes+1];
489 UTF8FromUTF32Character(lower, lowerUTF8);
490 char upperUTF8[UTF8MaxBytes+1];
491 UTF8FromUTF32Character(upper, upperUTF8);
492
493 switch (conversion) {
494 case CaseConversionFold:
495 caseConvFold.Add(upper, lowerUTF8);
496 break;
497 case CaseConversionUpper:
498 caseConvUp.Add(lower, upperUTF8);
499 break;
500 case CaseConversionLower:
501 caseConvLow.Add(upper, lowerUTF8);
502 break;
503 }
504 }
505
SetupConversions(enum CaseConversion conversion)506 void SetupConversions(enum CaseConversion conversion) {
507 // First initialize for the symmetric ranges
508 for (size_t i=0; i<sizeof(symmetricCaseConversionRanges)/sizeof(symmetricCaseConversionRanges[0]);) {
509 int lower = symmetricCaseConversionRanges[i++];
510 int upper = symmetricCaseConversionRanges[i++];
511 int length = symmetricCaseConversionRanges[i++];
512 int pitch = symmetricCaseConversionRanges[i++];
513 for (int j=0;j<length*pitch;j+=pitch) {
514 AddSymmetric(conversion, lower+j, upper+j);
515 }
516 }
517 // Add the symmetric singletons
518 for (size_t i=0; i<sizeof(symmetricCaseConversions)/sizeof(symmetricCaseConversions[0]);) {
519 int lower = symmetricCaseConversions[i++];
520 int upper = symmetricCaseConversions[i++];
521 AddSymmetric(conversion, lower, upper);
522 }
523 // Add the complex cases
524 const char *sComplex = complexCaseConversions;
525 while (*sComplex) {
526 // Longest ligature is 3 character so 5 for safety
527 const size_t lenUTF8 = 5*UTF8MaxBytes+1;
528 char originUTF8[lenUTF8];
529 char foldedUTF8[lenUTF8];
530 char lowerUTF8[lenUTF8];
531 char upperUTF8[lenUTF8];
532 size_t i = 0;
533 while (*sComplex && *sComplex != '|') {
534 originUTF8[i++] = *sComplex;
535 sComplex++;
536 }
537 sComplex++;
538 originUTF8[i] = 0;
539 i = 0;
540 while (*sComplex && *sComplex != '|') {
541 foldedUTF8[i++] = *sComplex;
542 sComplex++;
543 }
544 sComplex++;
545 foldedUTF8[i] = 0;
546 i = 0;
547 while (*sComplex && *sComplex != '|') {
548 upperUTF8[i++] = *sComplex;
549 sComplex++;
550 }
551 sComplex++;
552 upperUTF8[i] = 0;
553 i = 0;
554 while (*sComplex && *sComplex != '|') {
555 lowerUTF8[i++] = *sComplex;
556 sComplex++;
557 }
558 sComplex++;
559 lowerUTF8[i] = 0;
560
561 int character = UnicodeFromUTF8(reinterpret_cast<unsigned char *>(originUTF8));
562
563 if (conversion == CaseConversionFold && foldedUTF8[0]) {
564 caseConvFold.Add(character, foldedUTF8);
565 }
566
567 if (conversion == CaseConversionUpper && upperUTF8[0]) {
568 caseConvUp.Add(character, upperUTF8);
569 }
570
571 if (conversion == CaseConversionLower && lowerUTF8[0]) {
572 caseConvLow.Add(character, lowerUTF8);
573 }
574 }
575
576 switch (conversion) {
577 case CaseConversionFold:
578 caseConvFold.FinishedAdding();
579 break;
580 case CaseConversionUpper:
581 caseConvUp.FinishedAdding();
582 break;
583 case CaseConversionLower:
584 caseConvLow.FinishedAdding();
585 break;
586 }
587 }
588
ConverterForConversion(enum CaseConversion conversion)589 CaseConverter *ConverterForConversion(enum CaseConversion conversion) {
590 switch (conversion) {
591 case CaseConversionFold:
592 return &caseConvFold;
593 case CaseConversionUpper:
594 return &caseConvUp;
595 case CaseConversionLower:
596 return &caseConvLow;
597 }
598 return 0;
599 }
600
601 }
602
603 #ifdef SCI_NAMESPACE
604 namespace Scintilla {
605 #endif
606
ConverterFor(enum CaseConversion conversion)607 ICaseConverter *ConverterFor(enum CaseConversion conversion) {
608 CaseConverter *pCaseConv = ConverterForConversion(conversion);
609 if (!pCaseConv->Initialised())
610 SetupConversions(conversion);
611 return pCaseConv;
612 }
613
CaseConvert(int character,enum CaseConversion conversion)614 const char *CaseConvert(int character, enum CaseConversion conversion) {
615 CaseConverter *pCaseConv = ConverterForConversion(conversion);
616 if (!pCaseConv->Initialised())
617 SetupConversions(conversion);
618 return pCaseConv->Find(character);
619 }
620
CaseConvertString(char * converted,size_t sizeConverted,const char * mixed,size_t lenMixed,enum CaseConversion conversion)621 size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed, enum CaseConversion conversion) {
622 CaseConverter *pCaseConv = ConverterForConversion(conversion);
623 if (!pCaseConv->Initialised())
624 SetupConversions(conversion);
625 return pCaseConv->CaseConvertString(converted, sizeConverted, mixed, lenMixed);
626 }
627
628 #ifdef SCI_NAMESPACE
629 }
630 #endif
631