1 // Scintilla source code edit control
2 // Encoding: UTF-8
3 /** @file CaseConvert.cxx
4  ** Case fold characters and convert them to upper or lower case.
5  ** Tables automatically regenerated by scripts/GenerateCaseConvert.py
6  ** Should only be rarely regenerated for new versions of Unicode.
7  **/
8 // Copyright 2013 by Neil Hodgson <neilh@scintilla.org>
9 // The License.txt file describes the conditions under which this software may be distributed.
10 
11 #include <cstring>
12 
13 #include <stdexcept>
14 #include <string>
15 #include <vector>
16 #include <algorithm>
17 
18 #include "StringCopy.h"
19 #include "CaseConvert.h"
20 #include "UniConversion.h"
21 
22 #include "Compat.h"
23 
24 using namespace Scintilla;
25 
26 namespace {
27 	// Use an unnamed namespace to protect the declarations from name conflicts
28 
// Unicode code points are ordered by groups and follow patterns.
// Most characters (pitch==1) are in ranges for a particular alphabet and their
// upper case forms are a fixed distance away.
// Another pattern (pitch==2) is where each lower case letter is preceded by
// the upper case form. These are also grouped into ranges.
//
// The table is a flat sequence of 4-int records. SetupConversions expands each
// record into 'range length' lower/upper pairs: (lower + k*pitch, upper + k*pitch)
// for k = 0 .. length-1.
// Do not hand-edit between the Autogenerated markers.

int symmetricCaseConversionRanges[] = {
//lower, upper, range length, range pitch
//++Autogenerated -- start of section automatically generated
//**\(\*\n\)
97,65,26,1,
224,192,23,1,
248,216,7,1,
257,256,24,2,
314,313,8,2,
331,330,23,2,
462,461,8,2,
479,478,9,2,
505,504,20,2,
547,546,9,2,
583,582,5,2,
945,913,17,1,
963,931,9,1,
985,984,12,2,
1072,1040,32,1,
1104,1024,16,1,
1121,1120,17,2,
1163,1162,27,2,
1218,1217,7,2,
1233,1232,48,2,
1377,1329,38,1,
4304,7312,43,1,
7681,7680,75,2,
7841,7840,48,2,
7936,7944,8,1,
7952,7960,6,1,
7968,7976,8,1,
7984,7992,8,1,
8000,8008,6,1,
8032,8040,8,1,
8560,8544,16,1,
9424,9398,26,1,
11312,11264,47,1,
11393,11392,50,2,
11520,4256,38,1,
42561,42560,23,2,
42625,42624,14,2,
42787,42786,7,2,
42803,42802,31,2,
42879,42878,5,2,
42903,42902,10,2,
42933,42932,6,2,
65345,65313,26,1,
66600,66560,40,1,
66776,66736,36,1,
68800,68736,51,1,
71872,71840,32,1,
93792,93760,32,1,
125218,125184,34,1,

//--Autogenerated -- end of section automatically generated
};
91 
// Code points that are symmetric but don't fit into a range of similar characters
// are listed here.
//
// The table is a flat sequence of (lower, upper) code point pairs; SetupConversions
// reads the values two at a time and passes each pair to AddSymmetric.
// Do not hand-edit between the Autogenerated markers.

int symmetricCaseConversions[] = {
//lower, upper
//++Autogenerated -- start of section automatically generated
//**1 \(\*\n\)
255,376,
307,306,
309,308,
311,310,
378,377,
380,379,
382,381,
384,579,
387,386,
389,388,
392,391,
396,395,
402,401,
405,502,
409,408,
410,573,
414,544,
417,416,
419,418,
421,420,
424,423,
429,428,
432,431,
436,435,
438,437,
441,440,
445,444,
447,503,
454,452,
457,455,
460,458,
477,398,
499,497,
501,500,
572,571,
575,11390,
576,11391,
578,577,
592,11375,
593,11373,
594,11376,
595,385,
596,390,
598,393,
599,394,
601,399,
603,400,
604,42923,
608,403,
609,42924,
611,404,
613,42893,
614,42922,
616,407,
617,406,
618,42926,
619,11362,
620,42925,
623,412,
625,11374,
626,413,
629,415,
637,11364,
640,422,
642,42949,
643,425,
647,42929,
648,430,
649,580,
650,433,
651,434,
652,581,
658,439,
669,42930,
670,42928,
881,880,
883,882,
887,886,
891,1021,
892,1022,
893,1023,
940,902,
941,904,
942,905,
943,906,
972,908,
973,910,
974,911,
983,975,
1010,1017,
1011,895,
1016,1015,
1019,1018,
1231,1216,
4349,7357,
4350,7358,
4351,7359,
7545,42877,
7549,11363,
7566,42950,
8017,8025,
8019,8027,
8021,8029,
8023,8031,
8048,8122,
8049,8123,
8050,8136,
8051,8137,
8052,8138,
8053,8139,
8054,8154,
8055,8155,
8056,8184,
8057,8185,
8058,8170,
8059,8171,
8060,8186,
8061,8187,
8112,8120,
8113,8121,
8144,8152,
8145,8153,
8160,8168,
8161,8169,
8165,8172,
8526,8498,
8580,8579,
11361,11360,
11365,570,
11366,574,
11368,11367,
11370,11369,
11372,11371,
11379,11378,
11382,11381,
11500,11499,
11502,11501,
11507,11506,
11559,4295,
11565,4301,
42874,42873,
42876,42875,
42892,42891,
42897,42896,
42899,42898,
42900,42948,
42947,42946,
43859,42931,

//--Autogenerated -- end of section automatically generated
};
250 
// Characters that have complex case conversions are listed here.
// This includes cases where more than one character is needed for a conversion,
// folding is different to lowering, or (as appropriate) upper(lower(x)) != x or
// lower(upper(x)) != x.
//
// The data is one long string of records, each made of four '|'-terminated UTF-8
// fields in the order Original|Folded|Upper|Lower|. SetupConversions only registers
// a conversion for a field that is non-empty, so an empty field means this table
// adds no conversion of that kind for the character.
// Do not hand-edit between the Autogenerated markers.

const char *complexCaseConversions =
// Original | Folded | Upper | Lower |
//++Autogenerated -- start of section automatically generated
//**2 \(\*\n\)
"\xc2\xb5|\xce\xbc|\xce\x9c||"
"\xc3\x9f|ss|SS||"
"\xc4\xb0|i\xcc\x87||i\xcc\x87|"
"\xc4\xb1||I||"
"\xc5\x89|\xca\xbcn|\xca\xbcN||"
"\xc5\xbf|s|S||"
"\xc7\x85|\xc7\x86|\xc7\x84|\xc7\x86|"
"\xc7\x88|\xc7\x89|\xc7\x87|\xc7\x89|"
"\xc7\x8b|\xc7\x8c|\xc7\x8a|\xc7\x8c|"
"\xc7\xb0|j\xcc\x8c|J\xcc\x8c||"
"\xc7\xb2|\xc7\xb3|\xc7\xb1|\xc7\xb3|"
"\xcd\x85|\xce\xb9|\xce\x99||"
"\xce\x90|\xce\xb9\xcc\x88\xcc\x81|\xce\x99\xcc\x88\xcc\x81||"
"\xce\xb0|\xcf\x85\xcc\x88\xcc\x81|\xce\xa5\xcc\x88\xcc\x81||"
"\xcf\x82|\xcf\x83|\xce\xa3||"
"\xcf\x90|\xce\xb2|\xce\x92||"
"\xcf\x91|\xce\xb8|\xce\x98||"
"\xcf\x95|\xcf\x86|\xce\xa6||"
"\xcf\x96|\xcf\x80|\xce\xa0||"
"\xcf\xb0|\xce\xba|\xce\x9a||"
"\xcf\xb1|\xcf\x81|\xce\xa1||"
"\xcf\xb4|\xce\xb8||\xce\xb8|"
"\xcf\xb5|\xce\xb5|\xce\x95||"
"\xd6\x87|\xd5\xa5\xd6\x82|\xd4\xb5\xd5\x92||"
"\xe1\x8e\xa0|||\xea\xad\xb0|"
"\xe1\x8e\xa1|||\xea\xad\xb1|"
"\xe1\x8e\xa2|||\xea\xad\xb2|"
"\xe1\x8e\xa3|||\xea\xad\xb3|"
"\xe1\x8e\xa4|||\xea\xad\xb4|"
"\xe1\x8e\xa5|||\xea\xad\xb5|"
"\xe1\x8e\xa6|||\xea\xad\xb6|"
"\xe1\x8e\xa7|||\xea\xad\xb7|"
"\xe1\x8e\xa8|||\xea\xad\xb8|"
"\xe1\x8e\xa9|||\xea\xad\xb9|"
"\xe1\x8e\xaa|||\xea\xad\xba|"
"\xe1\x8e\xab|||\xea\xad\xbb|"
"\xe1\x8e\xac|||\xea\xad\xbc|"
"\xe1\x8e\xad|||\xea\xad\xbd|"
"\xe1\x8e\xae|||\xea\xad\xbe|"
"\xe1\x8e\xaf|||\xea\xad\xbf|"
"\xe1\x8e\xb0|||\xea\xae\x80|"
"\xe1\x8e\xb1|||\xea\xae\x81|"
"\xe1\x8e\xb2|||\xea\xae\x82|"
"\xe1\x8e\xb3|||\xea\xae\x83|"
"\xe1\x8e\xb4|||\xea\xae\x84|"
"\xe1\x8e\xb5|||\xea\xae\x85|"
"\xe1\x8e\xb6|||\xea\xae\x86|"
"\xe1\x8e\xb7|||\xea\xae\x87|"
"\xe1\x8e\xb8|||\xea\xae\x88|"
"\xe1\x8e\xb9|||\xea\xae\x89|"
"\xe1\x8e\xba|||\xea\xae\x8a|"
"\xe1\x8e\xbb|||\xea\xae\x8b|"
"\xe1\x8e\xbc|||\xea\xae\x8c|"
"\xe1\x8e\xbd|||\xea\xae\x8d|"
"\xe1\x8e\xbe|||\xea\xae\x8e|"
"\xe1\x8e\xbf|||\xea\xae\x8f|"
"\xe1\x8f\x80|||\xea\xae\x90|"
"\xe1\x8f\x81|||\xea\xae\x91|"
"\xe1\x8f\x82|||\xea\xae\x92|"
"\xe1\x8f\x83|||\xea\xae\x93|"
"\xe1\x8f\x84|||\xea\xae\x94|"
"\xe1\x8f\x85|||\xea\xae\x95|"
"\xe1\x8f\x86|||\xea\xae\x96|"
"\xe1\x8f\x87|||\xea\xae\x97|"
"\xe1\x8f\x88|||\xea\xae\x98|"
"\xe1\x8f\x89|||\xea\xae\x99|"
"\xe1\x8f\x8a|||\xea\xae\x9a|"
"\xe1\x8f\x8b|||\xea\xae\x9b|"
"\xe1\x8f\x8c|||\xea\xae\x9c|"
"\xe1\x8f\x8d|||\xea\xae\x9d|"
"\xe1\x8f\x8e|||\xea\xae\x9e|"
"\xe1\x8f\x8f|||\xea\xae\x9f|"
"\xe1\x8f\x90|||\xea\xae\xa0|"
"\xe1\x8f\x91|||\xea\xae\xa1|"
"\xe1\x8f\x92|||\xea\xae\xa2|"
"\xe1\x8f\x93|||\xea\xae\xa3|"
"\xe1\x8f\x94|||\xea\xae\xa4|"
"\xe1\x8f\x95|||\xea\xae\xa5|"
"\xe1\x8f\x96|||\xea\xae\xa6|"
"\xe1\x8f\x97|||\xea\xae\xa7|"
"\xe1\x8f\x98|||\xea\xae\xa8|"
"\xe1\x8f\x99|||\xea\xae\xa9|"
"\xe1\x8f\x9a|||\xea\xae\xaa|"
"\xe1\x8f\x9b|||\xea\xae\xab|"
"\xe1\x8f\x9c|||\xea\xae\xac|"
"\xe1\x8f\x9d|||\xea\xae\xad|"
"\xe1\x8f\x9e|||\xea\xae\xae|"
"\xe1\x8f\x9f|||\xea\xae\xaf|"
"\xe1\x8f\xa0|||\xea\xae\xb0|"
"\xe1\x8f\xa1|||\xea\xae\xb1|"
"\xe1\x8f\xa2|||\xea\xae\xb2|"
"\xe1\x8f\xa3|||\xea\xae\xb3|"
"\xe1\x8f\xa4|||\xea\xae\xb4|"
"\xe1\x8f\xa5|||\xea\xae\xb5|"
"\xe1\x8f\xa6|||\xea\xae\xb6|"
"\xe1\x8f\xa7|||\xea\xae\xb7|"
"\xe1\x8f\xa8|||\xea\xae\xb8|"
"\xe1\x8f\xa9|||\xea\xae\xb9|"
"\xe1\x8f\xaa|||\xea\xae\xba|"
"\xe1\x8f\xab|||\xea\xae\xbb|"
"\xe1\x8f\xac|||\xea\xae\xbc|"
"\xe1\x8f\xad|||\xea\xae\xbd|"
"\xe1\x8f\xae|||\xea\xae\xbe|"
"\xe1\x8f\xaf|||\xea\xae\xbf|"
"\xe1\x8f\xb0|||\xe1\x8f\xb8|"
"\xe1\x8f\xb1|||\xe1\x8f\xb9|"
"\xe1\x8f\xb2|||\xe1\x8f\xba|"
"\xe1\x8f\xb3|||\xe1\x8f\xbb|"
"\xe1\x8f\xb4|||\xe1\x8f\xbc|"
"\xe1\x8f\xb5|||\xe1\x8f\xbd|"
"\xe1\x8f\xb8|\xe1\x8f\xb0|\xe1\x8f\xb0||"
"\xe1\x8f\xb9|\xe1\x8f\xb1|\xe1\x8f\xb1||"
"\xe1\x8f\xba|\xe1\x8f\xb2|\xe1\x8f\xb2||"
"\xe1\x8f\xbb|\xe1\x8f\xb3|\xe1\x8f\xb3||"
"\xe1\x8f\xbc|\xe1\x8f\xb4|\xe1\x8f\xb4||"
"\xe1\x8f\xbd|\xe1\x8f\xb5|\xe1\x8f\xb5||"
"\xe1\xb2\x80|\xd0\xb2|\xd0\x92||"
"\xe1\xb2\x81|\xd0\xb4|\xd0\x94||"
"\xe1\xb2\x82|\xd0\xbe|\xd0\x9e||"
"\xe1\xb2\x83|\xd1\x81|\xd0\xa1||"
"\xe1\xb2\x84|\xd1\x82|\xd0\xa2||"
"\xe1\xb2\x85|\xd1\x82|\xd0\xa2||"
"\xe1\xb2\x86|\xd1\x8a|\xd0\xaa||"
"\xe1\xb2\x87|\xd1\xa3|\xd1\xa2||"
"\xe1\xb2\x88|\xea\x99\x8b|\xea\x99\x8a||"
"\xe1\xba\x96|h\xcc\xb1|H\xcc\xb1||"
"\xe1\xba\x97|t\xcc\x88|T\xcc\x88||"
"\xe1\xba\x98|w\xcc\x8a|W\xcc\x8a||"
"\xe1\xba\x99|y\xcc\x8a|Y\xcc\x8a||"
"\xe1\xba\x9a|a\xca\xbe|A\xca\xbe||"
"\xe1\xba\x9b|\xe1\xb9\xa1|\xe1\xb9\xa0||"
"\xe1\xba\x9e|ss||\xc3\x9f|"
"\xe1\xbd\x90|\xcf\x85\xcc\x93|\xce\xa5\xcc\x93||"
"\xe1\xbd\x92|\xcf\x85\xcc\x93\xcc\x80|\xce\xa5\xcc\x93\xcc\x80||"
"\xe1\xbd\x94|\xcf\x85\xcc\x93\xcc\x81|\xce\xa5\xcc\x93\xcc\x81||"
"\xe1\xbd\x96|\xcf\x85\xcc\x93\xcd\x82|\xce\xa5\xcc\x93\xcd\x82||"
"\xe1\xbe\x80|\xe1\xbc\x80\xce\xb9|\xe1\xbc\x88\xce\x99||"
"\xe1\xbe\x81|\xe1\xbc\x81\xce\xb9|\xe1\xbc\x89\xce\x99||"
"\xe1\xbe\x82|\xe1\xbc\x82\xce\xb9|\xe1\xbc\x8a\xce\x99||"
"\xe1\xbe\x83|\xe1\xbc\x83\xce\xb9|\xe1\xbc\x8b\xce\x99||"
"\xe1\xbe\x84|\xe1\xbc\x84\xce\xb9|\xe1\xbc\x8c\xce\x99||"
"\xe1\xbe\x85|\xe1\xbc\x85\xce\xb9|\xe1\xbc\x8d\xce\x99||"
"\xe1\xbe\x86|\xe1\xbc\x86\xce\xb9|\xe1\xbc\x8e\xce\x99||"
"\xe1\xbe\x87|\xe1\xbc\x87\xce\xb9|\xe1\xbc\x8f\xce\x99||"
"\xe1\xbe\x88|\xe1\xbc\x80\xce\xb9|\xe1\xbc\x88\xce\x99|\xe1\xbe\x80|"
"\xe1\xbe\x89|\xe1\xbc\x81\xce\xb9|\xe1\xbc\x89\xce\x99|\xe1\xbe\x81|"
"\xe1\xbe\x8a|\xe1\xbc\x82\xce\xb9|\xe1\xbc\x8a\xce\x99|\xe1\xbe\x82|"
"\xe1\xbe\x8b|\xe1\xbc\x83\xce\xb9|\xe1\xbc\x8b\xce\x99|\xe1\xbe\x83|"
"\xe1\xbe\x8c|\xe1\xbc\x84\xce\xb9|\xe1\xbc\x8c\xce\x99|\xe1\xbe\x84|"
"\xe1\xbe\x8d|\xe1\xbc\x85\xce\xb9|\xe1\xbc\x8d\xce\x99|\xe1\xbe\x85|"
"\xe1\xbe\x8e|\xe1\xbc\x86\xce\xb9|\xe1\xbc\x8e\xce\x99|\xe1\xbe\x86|"
"\xe1\xbe\x8f|\xe1\xbc\x87\xce\xb9|\xe1\xbc\x8f\xce\x99|\xe1\xbe\x87|"
"\xe1\xbe\x90|\xe1\xbc\xa0\xce\xb9|\xe1\xbc\xa8\xce\x99||"
"\xe1\xbe\x91|\xe1\xbc\xa1\xce\xb9|\xe1\xbc\xa9\xce\x99||"
"\xe1\xbe\x92|\xe1\xbc\xa2\xce\xb9|\xe1\xbc\xaa\xce\x99||"
"\xe1\xbe\x93|\xe1\xbc\xa3\xce\xb9|\xe1\xbc\xab\xce\x99||"
"\xe1\xbe\x94|\xe1\xbc\xa4\xce\xb9|\xe1\xbc\xac\xce\x99||"
"\xe1\xbe\x95|\xe1\xbc\xa5\xce\xb9|\xe1\xbc\xad\xce\x99||"
"\xe1\xbe\x96|\xe1\xbc\xa6\xce\xb9|\xe1\xbc\xae\xce\x99||"
"\xe1\xbe\x97|\xe1\xbc\xa7\xce\xb9|\xe1\xbc\xaf\xce\x99||"
"\xe1\xbe\x98|\xe1\xbc\xa0\xce\xb9|\xe1\xbc\xa8\xce\x99|\xe1\xbe\x90|"
"\xe1\xbe\x99|\xe1\xbc\xa1\xce\xb9|\xe1\xbc\xa9\xce\x99|\xe1\xbe\x91|"
"\xe1\xbe\x9a|\xe1\xbc\xa2\xce\xb9|\xe1\xbc\xaa\xce\x99|\xe1\xbe\x92|"
"\xe1\xbe\x9b|\xe1\xbc\xa3\xce\xb9|\xe1\xbc\xab\xce\x99|\xe1\xbe\x93|"
"\xe1\xbe\x9c|\xe1\xbc\xa4\xce\xb9|\xe1\xbc\xac\xce\x99|\xe1\xbe\x94|"
"\xe1\xbe\x9d|\xe1\xbc\xa5\xce\xb9|\xe1\xbc\xad\xce\x99|\xe1\xbe\x95|"
"\xe1\xbe\x9e|\xe1\xbc\xa6\xce\xb9|\xe1\xbc\xae\xce\x99|\xe1\xbe\x96|"
"\xe1\xbe\x9f|\xe1\xbc\xa7\xce\xb9|\xe1\xbc\xaf\xce\x99|\xe1\xbe\x97|"
"\xe1\xbe\xa0|\xe1\xbd\xa0\xce\xb9|\xe1\xbd\xa8\xce\x99||"
"\xe1\xbe\xa1|\xe1\xbd\xa1\xce\xb9|\xe1\xbd\xa9\xce\x99||"
"\xe1\xbe\xa2|\xe1\xbd\xa2\xce\xb9|\xe1\xbd\xaa\xce\x99||"
"\xe1\xbe\xa3|\xe1\xbd\xa3\xce\xb9|\xe1\xbd\xab\xce\x99||"
"\xe1\xbe\xa4|\xe1\xbd\xa4\xce\xb9|\xe1\xbd\xac\xce\x99||"
"\xe1\xbe\xa5|\xe1\xbd\xa5\xce\xb9|\xe1\xbd\xad\xce\x99||"
"\xe1\xbe\xa6|\xe1\xbd\xa6\xce\xb9|\xe1\xbd\xae\xce\x99||"
"\xe1\xbe\xa7|\xe1\xbd\xa7\xce\xb9|\xe1\xbd\xaf\xce\x99||"
"\xe1\xbe\xa8|\xe1\xbd\xa0\xce\xb9|\xe1\xbd\xa8\xce\x99|\xe1\xbe\xa0|"
"\xe1\xbe\xa9|\xe1\xbd\xa1\xce\xb9|\xe1\xbd\xa9\xce\x99|\xe1\xbe\xa1|"
"\xe1\xbe\xaa|\xe1\xbd\xa2\xce\xb9|\xe1\xbd\xaa\xce\x99|\xe1\xbe\xa2|"
"\xe1\xbe\xab|\xe1\xbd\xa3\xce\xb9|\xe1\xbd\xab\xce\x99|\xe1\xbe\xa3|"
"\xe1\xbe\xac|\xe1\xbd\xa4\xce\xb9|\xe1\xbd\xac\xce\x99|\xe1\xbe\xa4|"
"\xe1\xbe\xad|\xe1\xbd\xa5\xce\xb9|\xe1\xbd\xad\xce\x99|\xe1\xbe\xa5|"
"\xe1\xbe\xae|\xe1\xbd\xa6\xce\xb9|\xe1\xbd\xae\xce\x99|\xe1\xbe\xa6|"
"\xe1\xbe\xaf|\xe1\xbd\xa7\xce\xb9|\xe1\xbd\xaf\xce\x99|\xe1\xbe\xa7|"
"\xe1\xbe\xb2|\xe1\xbd\xb0\xce\xb9|\xe1\xbe\xba\xce\x99||"
"\xe1\xbe\xb3|\xce\xb1\xce\xb9|\xce\x91\xce\x99||"
"\xe1\xbe\xb4|\xce\xac\xce\xb9|\xce\x86\xce\x99||"
"\xe1\xbe\xb6|\xce\xb1\xcd\x82|\xce\x91\xcd\x82||"
"\xe1\xbe\xb7|\xce\xb1\xcd\x82\xce\xb9|\xce\x91\xcd\x82\xce\x99||"
"\xe1\xbe\xbc|\xce\xb1\xce\xb9|\xce\x91\xce\x99|\xe1\xbe\xb3|"
"\xe1\xbe\xbe|\xce\xb9|\xce\x99||"
"\xe1\xbf\x82|\xe1\xbd\xb4\xce\xb9|\xe1\xbf\x8a\xce\x99||"
"\xe1\xbf\x83|\xce\xb7\xce\xb9|\xce\x97\xce\x99||"
"\xe1\xbf\x84|\xce\xae\xce\xb9|\xce\x89\xce\x99||"
"\xe1\xbf\x86|\xce\xb7\xcd\x82|\xce\x97\xcd\x82||"
"\xe1\xbf\x87|\xce\xb7\xcd\x82\xce\xb9|\xce\x97\xcd\x82\xce\x99||"
"\xe1\xbf\x8c|\xce\xb7\xce\xb9|\xce\x97\xce\x99|\xe1\xbf\x83|"
"\xe1\xbf\x92|\xce\xb9\xcc\x88\xcc\x80|\xce\x99\xcc\x88\xcc\x80||"
"\xe1\xbf\x93|\xce\xb9\xcc\x88\xcc\x81|\xce\x99\xcc\x88\xcc\x81||"
"\xe1\xbf\x96|\xce\xb9\xcd\x82|\xce\x99\xcd\x82||"
"\xe1\xbf\x97|\xce\xb9\xcc\x88\xcd\x82|\xce\x99\xcc\x88\xcd\x82||"
"\xe1\xbf\xa2|\xcf\x85\xcc\x88\xcc\x80|\xce\xa5\xcc\x88\xcc\x80||"
"\xe1\xbf\xa3|\xcf\x85\xcc\x88\xcc\x81|\xce\xa5\xcc\x88\xcc\x81||"
"\xe1\xbf\xa4|\xcf\x81\xcc\x93|\xce\xa1\xcc\x93||"
"\xe1\xbf\xa6|\xcf\x85\xcd\x82|\xce\xa5\xcd\x82||"
"\xe1\xbf\xa7|\xcf\x85\xcc\x88\xcd\x82|\xce\xa5\xcc\x88\xcd\x82||"
"\xe1\xbf\xb2|\xe1\xbd\xbc\xce\xb9|\xe1\xbf\xba\xce\x99||"
"\xe1\xbf\xb3|\xcf\x89\xce\xb9|\xce\xa9\xce\x99||"
"\xe1\xbf\xb4|\xcf\x8e\xce\xb9|\xce\x8f\xce\x99||"
"\xe1\xbf\xb6|\xcf\x89\xcd\x82|\xce\xa9\xcd\x82||"
"\xe1\xbf\xb7|\xcf\x89\xcd\x82\xce\xb9|\xce\xa9\xcd\x82\xce\x99||"
"\xe1\xbf\xbc|\xcf\x89\xce\xb9|\xce\xa9\xce\x99|\xe1\xbf\xb3|"
"\xe2\x84\xa6|\xcf\x89||\xcf\x89|"
"\xe2\x84\xaa|k||k|"
"\xe2\x84\xab|\xc3\xa5||\xc3\xa5|"
"\xea\xad\xb0|\xe1\x8e\xa0|\xe1\x8e\xa0||"
"\xea\xad\xb1|\xe1\x8e\xa1|\xe1\x8e\xa1||"
"\xea\xad\xb2|\xe1\x8e\xa2|\xe1\x8e\xa2||"
"\xea\xad\xb3|\xe1\x8e\xa3|\xe1\x8e\xa3||"
"\xea\xad\xb4|\xe1\x8e\xa4|\xe1\x8e\xa4||"
"\xea\xad\xb5|\xe1\x8e\xa5|\xe1\x8e\xa5||"
"\xea\xad\xb6|\xe1\x8e\xa6|\xe1\x8e\xa6||"
"\xea\xad\xb7|\xe1\x8e\xa7|\xe1\x8e\xa7||"
"\xea\xad\xb8|\xe1\x8e\xa8|\xe1\x8e\xa8||"
"\xea\xad\xb9|\xe1\x8e\xa9|\xe1\x8e\xa9||"
"\xea\xad\xba|\xe1\x8e\xaa|\xe1\x8e\xaa||"
"\xea\xad\xbb|\xe1\x8e\xab|\xe1\x8e\xab||"
"\xea\xad\xbc|\xe1\x8e\xac|\xe1\x8e\xac||"
"\xea\xad\xbd|\xe1\x8e\xad|\xe1\x8e\xad||"
"\xea\xad\xbe|\xe1\x8e\xae|\xe1\x8e\xae||"
"\xea\xad\xbf|\xe1\x8e\xaf|\xe1\x8e\xaf||"
"\xea\xae\x80|\xe1\x8e\xb0|\xe1\x8e\xb0||"
"\xea\xae\x81|\xe1\x8e\xb1|\xe1\x8e\xb1||"
"\xea\xae\x82|\xe1\x8e\xb2|\xe1\x8e\xb2||"
"\xea\xae\x83|\xe1\x8e\xb3|\xe1\x8e\xb3||"
"\xea\xae\x84|\xe1\x8e\xb4|\xe1\x8e\xb4||"
"\xea\xae\x85|\xe1\x8e\xb5|\xe1\x8e\xb5||"
"\xea\xae\x86|\xe1\x8e\xb6|\xe1\x8e\xb6||"
"\xea\xae\x87|\xe1\x8e\xb7|\xe1\x8e\xb7||"
"\xea\xae\x88|\xe1\x8e\xb8|\xe1\x8e\xb8||"
"\xea\xae\x89|\xe1\x8e\xb9|\xe1\x8e\xb9||"
"\xea\xae\x8a|\xe1\x8e\xba|\xe1\x8e\xba||"
"\xea\xae\x8b|\xe1\x8e\xbb|\xe1\x8e\xbb||"
"\xea\xae\x8c|\xe1\x8e\xbc|\xe1\x8e\xbc||"
"\xea\xae\x8d|\xe1\x8e\xbd|\xe1\x8e\xbd||"
"\xea\xae\x8e|\xe1\x8e\xbe|\xe1\x8e\xbe||"
"\xea\xae\x8f|\xe1\x8e\xbf|\xe1\x8e\xbf||"
"\xea\xae\x90|\xe1\x8f\x80|\xe1\x8f\x80||"
"\xea\xae\x91|\xe1\x8f\x81|\xe1\x8f\x81||"
"\xea\xae\x92|\xe1\x8f\x82|\xe1\x8f\x82||"
"\xea\xae\x93|\xe1\x8f\x83|\xe1\x8f\x83||"
"\xea\xae\x94|\xe1\x8f\x84|\xe1\x8f\x84||"
"\xea\xae\x95|\xe1\x8f\x85|\xe1\x8f\x85||"
"\xea\xae\x96|\xe1\x8f\x86|\xe1\x8f\x86||"
"\xea\xae\x97|\xe1\x8f\x87|\xe1\x8f\x87||"
"\xea\xae\x98|\xe1\x8f\x88|\xe1\x8f\x88||"
"\xea\xae\x99|\xe1\x8f\x89|\xe1\x8f\x89||"
"\xea\xae\x9a|\xe1\x8f\x8a|\xe1\x8f\x8a||"
"\xea\xae\x9b|\xe1\x8f\x8b|\xe1\x8f\x8b||"
"\xea\xae\x9c|\xe1\x8f\x8c|\xe1\x8f\x8c||"
"\xea\xae\x9d|\xe1\x8f\x8d|\xe1\x8f\x8d||"
"\xea\xae\x9e|\xe1\x8f\x8e|\xe1\x8f\x8e||"
"\xea\xae\x9f|\xe1\x8f\x8f|\xe1\x8f\x8f||"
"\xea\xae\xa0|\xe1\x8f\x90|\xe1\x8f\x90||"
"\xea\xae\xa1|\xe1\x8f\x91|\xe1\x8f\x91||"
"\xea\xae\xa2|\xe1\x8f\x92|\xe1\x8f\x92||"
"\xea\xae\xa3|\xe1\x8f\x93|\xe1\x8f\x93||"
"\xea\xae\xa4|\xe1\x8f\x94|\xe1\x8f\x94||"
"\xea\xae\xa5|\xe1\x8f\x95|\xe1\x8f\x95||"
"\xea\xae\xa6|\xe1\x8f\x96|\xe1\x8f\x96||"
"\xea\xae\xa7|\xe1\x8f\x97|\xe1\x8f\x97||"
"\xea\xae\xa8|\xe1\x8f\x98|\xe1\x8f\x98||"
"\xea\xae\xa9|\xe1\x8f\x99|\xe1\x8f\x99||"
"\xea\xae\xaa|\xe1\x8f\x9a|\xe1\x8f\x9a||"
"\xea\xae\xab|\xe1\x8f\x9b|\xe1\x8f\x9b||"
"\xea\xae\xac|\xe1\x8f\x9c|\xe1\x8f\x9c||"
"\xea\xae\xad|\xe1\x8f\x9d|\xe1\x8f\x9d||"
"\xea\xae\xae|\xe1\x8f\x9e|\xe1\x8f\x9e||"
"\xea\xae\xaf|\xe1\x8f\x9f|\xe1\x8f\x9f||"
"\xea\xae\xb0|\xe1\x8f\xa0|\xe1\x8f\xa0||"
"\xea\xae\xb1|\xe1\x8f\xa1|\xe1\x8f\xa1||"
"\xea\xae\xb2|\xe1\x8f\xa2|\xe1\x8f\xa2||"
"\xea\xae\xb3|\xe1\x8f\xa3|\xe1\x8f\xa3||"
"\xea\xae\xb4|\xe1\x8f\xa4|\xe1\x8f\xa4||"
"\xea\xae\xb5|\xe1\x8f\xa5|\xe1\x8f\xa5||"
"\xea\xae\xb6|\xe1\x8f\xa6|\xe1\x8f\xa6||"
"\xea\xae\xb7|\xe1\x8f\xa7|\xe1\x8f\xa7||"
"\xea\xae\xb8|\xe1\x8f\xa8|\xe1\x8f\xa8||"
"\xea\xae\xb9|\xe1\x8f\xa9|\xe1\x8f\xa9||"
"\xea\xae\xba|\xe1\x8f\xaa|\xe1\x8f\xaa||"
"\xea\xae\xbb|\xe1\x8f\xab|\xe1\x8f\xab||"
"\xea\xae\xbc|\xe1\x8f\xac|\xe1\x8f\xac||"
"\xea\xae\xbd|\xe1\x8f\xad|\xe1\x8f\xad||"
"\xea\xae\xbe|\xe1\x8f\xae|\xe1\x8f\xae||"
"\xea\xae\xbf|\xe1\x8f\xaf|\xe1\x8f\xaf||"
"\xef\xac\x80|ff|FF||"
"\xef\xac\x81|fi|FI||"
"\xef\xac\x82|fl|FL||"
"\xef\xac\x83|ffi|FFI||"
"\xef\xac\x84|ffl|FFL||"
"\xef\xac\x85|st|ST||"
"\xef\xac\x86|st|ST||"
"\xef\xac\x93|\xd5\xb4\xd5\xb6|\xd5\x84\xd5\x86||"
"\xef\xac\x94|\xd5\xb4\xd5\xa5|\xd5\x84\xd4\xb5||"
"\xef\xac\x95|\xd5\xb4\xd5\xab|\xd5\x84\xd4\xbb||"
"\xef\xac\x96|\xd5\xbe\xd5\xb6|\xd5\x8e\xd5\x86||"
"\xef\xac\x97|\xd5\xb4\xd5\xad|\xd5\x84\xd4\xbd||"

//--Autogenerated -- end of section automatically generated
;
570 
571 class CaseConverter : public ICaseConverter {
572 	// Maximum length of a case conversion result is 6 bytes in UTF-8
573 	enum { maxConversionLength=6 };
574 	struct ConversionString {
575 		char conversion[maxConversionLength+1];
ConversionString__anon6a72038d0111::CaseConverter::ConversionString576 		ConversionString() noexcept : conversion{} {
577 		}
578 	};
579 	// Conversions are initially store in a vector of structs but then decomposed into
580 	// parallel arrays as that is about 10% faster to search.
581 	struct CharacterConversion {
582 		int character;
583 		ConversionString conversion;
CharacterConversion__anon6a72038d0111::CaseConverter::CharacterConversion584 		CharacterConversion() noexcept : character(0) {
585 			// Empty case: NUL -> "".
586 		}
CharacterConversion__anon6a72038d0111::CaseConverter::CharacterConversion587 		CharacterConversion(int character_=0, const char *conversion_="") noexcept : character(character_) {
588 			StringCopy(conversion.conversion, conversion_);
589 		}
operator <__anon6a72038d0111::CaseConverter::CharacterConversion590 		bool operator<(const CharacterConversion &other) const noexcept {
591 			return character < other.character;
592 		}
593 	};
594 	typedef std::vector<CharacterConversion> CharacterToConversion;
595 	CharacterToConversion characterToConversion;
596 	// The parallel arrays
597 	std::vector<int> characters;
598 	std::vector<ConversionString> conversions;
599 
600 public:
CaseConverter()601 	CaseConverter() noexcept {
602 	}
603 	virtual ~CaseConverter() = default;
Initialised() const604 	bool Initialised() const noexcept {
605 		return !characters.empty();
606 	}
Add(int character,const char * conversion)607 	void Add(int character, const char *conversion) {
608 		characterToConversion.emplace_back(character, conversion);
609 	}
Find(int character)610 	const char *Find(int character) {
611 		const std::vector<int>::iterator it = std::lower_bound(characters.begin(), characters.end(), character);
612 		if (it == characters.end())
613 			return nullptr;
614 		else if (*it == character)
615 			return conversions[it - characters.begin()].conversion;
616 		else
617 			return nullptr;
618 	}
CaseConvertString(char * converted,size_t sizeConverted,const char * mixed,size_t lenMixed)619 	size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed) override {
620 		size_t lenConverted = 0;
621 		size_t mixedPos = 0;
622 		unsigned char bytes[UTF8MaxBytes + 1]{};
623 		while (mixedPos < lenMixed) {
624 			const unsigned char leadByte = mixed[mixedPos];
625 			const char *caseConverted = nullptr;
626 			size_t lenMixedChar = 1;
627 			if (UTF8IsAscii(leadByte)) {
628 				caseConverted = Find(leadByte);
629 			} else {
630 				bytes[0] = leadByte;
631 				const int widthCharBytes = UTF8BytesOfLead[leadByte];
632 				for (int b=1; b<widthCharBytes; b++) {
633 					bytes[b] = (mixedPos+b < lenMixed) ? mixed[mixedPos+b] : 0;
634 				}
635 				const int classified = UTF8Classify(bytes, widthCharBytes);
636 				if (!(classified & UTF8MaskInvalid)) {
637 					// valid UTF-8
638 					lenMixedChar = classified & UTF8MaskWidth;
639 					const int character = UnicodeFromUTF8(bytes);
640 					caseConverted = Find(character);
641 				}
642 			}
643 			if (caseConverted) {
644 				// Character has a conversion so copy that conversion in
645 				while (*caseConverted) {
646 					converted[lenConverted++] = *caseConverted++;
647 					if (lenConverted >= sizeConverted)
648 						return 0;
649 				}
650 			} else {
651 				// Character has no conversion so copy the input to output
652 				for (size_t i=0; i<lenMixedChar; i++) {
653 					converted[lenConverted++] = mixed[mixedPos+i];
654 					if (lenConverted >= sizeConverted)
655 						return 0;
656 				}
657 			}
658 			mixedPos += lenMixedChar;
659 		}
660 		return lenConverted;
661 	}
FinishedAdding()662 	void FinishedAdding() {
663 		std::sort(characterToConversion.begin(), characterToConversion.end());
664 		characters.reserve(characterToConversion.size());
665 		conversions.reserve(characterToConversion.size());
666 		for (const CharacterConversion &chConv : characterToConversion) {
667 			characters.push_back(chConv.character);
668 			conversions.push_back(chConv.conversion);
669 		}
670 		// Empty the original calculated data completely
671 		CharacterToConversion().swap(characterToConversion);
672 	}
673 };
674 
// One converter per conversion kind. Each starts empty and is populated by
// SetupConversions for the matching CaseConversion value.
CaseConverter caseConvFold;	// case folding
CaseConverter caseConvUp;	// conversion to upper case
CaseConverter caseConvLow;	// conversion to lower case
678 
AddSymmetric(enum CaseConversion conversion,int lower,int upper)679 void AddSymmetric(enum CaseConversion conversion, int lower,int upper) {
680 	char lowerUTF8[UTF8MaxBytes+1];
681 	UTF8FromUTF32Character(lower, lowerUTF8);
682 	char upperUTF8[UTF8MaxBytes+1];
683 	UTF8FromUTF32Character(upper, upperUTF8);
684 
685 	switch (conversion) {
686 	case CaseConversionFold:
687 		caseConvFold.Add(upper, lowerUTF8);
688 		break;
689 	case CaseConversionUpper:
690 		caseConvUp.Add(lower, upperUTF8);
691 		break;
692 	case CaseConversionLower:
693 		caseConvLow.Add(upper, lowerUTF8);
694 		break;
695 	}
696 }
697 
SetupConversions(enum CaseConversion conversion)698 void SetupConversions(enum CaseConversion conversion) {
699 	// First initialize for the symmetric ranges
700 	for (size_t i=0; i<Sci::size(symmetricCaseConversionRanges);) {
701 		const int lower = symmetricCaseConversionRanges[i++];
702 		const int upper = symmetricCaseConversionRanges[i++];
703 		const int length = symmetricCaseConversionRanges[i++];
704 		const int pitch = symmetricCaseConversionRanges[i++];
705 		for (int j=0; j<length*pitch; j+=pitch) {
706 			AddSymmetric(conversion, lower+j, upper+j);
707 		}
708 	}
709 	// Add the symmetric singletons
710 	for (size_t i=0; i<Sci::size(symmetricCaseConversions);) {
711 		const int lower = symmetricCaseConversions[i++];
712 		const int upper = symmetricCaseConversions[i++];
713 		AddSymmetric(conversion, lower, upper);
714 	}
715 	// Add the complex cases
716 	const char *sComplex = complexCaseConversions;
717 	while (*sComplex) {
718 		// Longest ligature is 3 character so 5 for safety
719 		constexpr size_t lenUTF8 = 5*UTF8MaxBytes+1;
720 		unsigned char originUTF8[lenUTF8]{};
721 		char foldedUTF8[lenUTF8]{};
722 		char lowerUTF8[lenUTF8]{};
723 		char upperUTF8[lenUTF8]{};
724 		size_t i = 0;
725 		while (*sComplex && *sComplex != '|') {
726 			originUTF8[i++] = *sComplex;
727 			sComplex++;
728 		}
729 		sComplex++;
730 		originUTF8[i] = 0;
731 		i = 0;
732 		while (*sComplex && *sComplex != '|') {
733 			foldedUTF8[i++] = *sComplex;
734 			sComplex++;
735 		}
736 		sComplex++;
737 		foldedUTF8[i] = 0;
738 		i = 0;
739 		while (*sComplex && *sComplex != '|') {
740 			upperUTF8[i++] = *sComplex;
741 			sComplex++;
742 		}
743 		sComplex++;
744 		upperUTF8[i] = 0;
745 		i = 0;
746 		while (*sComplex && *sComplex != '|') {
747 			lowerUTF8[i++] = *sComplex;
748 			sComplex++;
749 		}
750 		sComplex++;
751 		lowerUTF8[i] = 0;
752 
753 		const int character = UnicodeFromUTF8(originUTF8);
754 
755 		if (conversion == CaseConversionFold && foldedUTF8[0]) {
756 			caseConvFold.Add(character, foldedUTF8);
757 		}
758 
759 		if (conversion == CaseConversionUpper && upperUTF8[0]) {
760 			caseConvUp.Add(character, upperUTF8);
761 		}
762 
763 		if (conversion == CaseConversionLower && lowerUTF8[0]) {
764 			caseConvLow.Add(character, lowerUTF8);
765 		}
766 	}
767 
768 	switch (conversion) {
769 	case CaseConversionFold:
770 		caseConvFold.FinishedAdding();
771 		break;
772 	case CaseConversionUpper:
773 		caseConvUp.FinishedAdding();
774 		break;
775 	case CaseConversionLower:
776 		caseConvLow.FinishedAdding();
777 		break;
778 	}
779 }
780 
ConverterForConversion(enum CaseConversion conversion)781 CaseConverter *ConverterForConversion(enum CaseConversion conversion) noexcept {
782 	switch (conversion) {
783 	case CaseConversionFold:
784 		return &caseConvFold;
785 	case CaseConversionUpper:
786 		return &caseConvUp;
787 	case CaseConversionLower:
788 		return &caseConvLow;
789 	}
790 	return nullptr;
791 }
792 
793 }
794 
795 namespace Scintilla {
796 
ConverterFor(enum CaseConversion conversion)797 ICaseConverter *ConverterFor(enum CaseConversion conversion) {
798 	CaseConverter *pCaseConv = ConverterForConversion(conversion);
799 	if (!pCaseConv->Initialised())
800 		SetupConversions(conversion);
801 	return pCaseConv;
802 }
803 
CaseConvert(int character,enum CaseConversion conversion)804 const char *CaseConvert(int character, enum CaseConversion conversion) {
805 	CaseConverter *pCaseConv = ConverterForConversion(conversion);
806 	if (!pCaseConv->Initialised())
807 		SetupConversions(conversion);
808 	return pCaseConv->Find(character);
809 }
810 
CaseConvertString(char * converted,size_t sizeConverted,const char * mixed,size_t lenMixed,enum CaseConversion conversion)811 size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed, enum CaseConversion conversion) {
812 	CaseConverter *pCaseConv = ConverterForConversion(conversion);
813 	if (!pCaseConv->Initialised())
814 		SetupConversions(conversion);
815 	return pCaseConv->CaseConvertString(converted, sizeConverted, mixed, lenMixed);
816 }
817 
CaseConvertString(const std::string & s,enum CaseConversion conversion)818 std::string CaseConvertString(const std::string &s, enum CaseConversion conversion) {
819 	std::string retMapped(s.length() * maxExpansionCaseConversion, 0);
820 	const size_t lenMapped = CaseConvertString(&retMapped[0], retMapped.length(), s.c_str(), s.length(),
821 		conversion);
822 	retMapped.resize(lenMapped);
823 	return retMapped;
824 }
825 
826 }
827