1 // Scintilla source code edit control
2 // Encoding: UTF-8
3 /** @file CaseConvert.cxx
4  ** Case fold characters and convert them to upper or lower case.
5  ** Tables automatically regenerated by scripts/GenerateCaseConvert.py
6  ** Should only be rarely regenerated for new versions of Unicode.
7  **/
8 // Copyright 2013 by Neil Hodgson <neilh@scintilla.org>
9 // The License.txt file describes the conditions under which this software may be distributed.
10 
11 #include <cassert>
12 #include <cstring>
13 
14 #include <stdexcept>
15 #include <string>
16 #include <string_view>
17 #include <vector>
18 #include <algorithm>
19 
20 #include "CaseConvert.h"
21 #include "UniConversion.h"
22 
23 using namespace Scintilla;
24 
25 namespace {
26 	// Use an unnamed namespace to protect the declarations from name conflicts
27 
28 // Unicode code points are ordered by groups and follow patterns.
29 // Most characters (pitch==1) are in ranges for a particular alphabet and their
30 // upper case forms are a fixed distance away.
31 // Another pattern (pitch==2) is where each lower case letter is preceded by
32 // the upper case form. These are also grouped into ranges.
33 
// Table of symmetric case conversion ranges, stored as consecutive records of
// 4 ints: first lower case code point, first upper case code point, number of
// characters in the range, and pitch (distance between successive characters).
// The table is read-only, so it is constexpr.
constexpr int symmetricCaseConversionRanges[] = {
//lower, upper, range length, range pitch
//++Autogenerated -- start of section automatically generated
//**\(\*\n\)
97,65,26,1,
224,192,23,1,
248,216,7,1,
257,256,24,2,
314,313,8,2,
331,330,23,2,
462,461,8,2,
479,478,9,2,
505,504,20,2,
547,546,9,2,
583,582,5,2,
945,913,17,1,
963,931,9,1,
985,984,12,2,
1072,1040,32,1,
1104,1024,16,1,
1121,1120,17,2,
1163,1162,27,2,
1218,1217,7,2,
1233,1232,48,2,
1377,1329,38,1,
4304,7312,43,1,
7681,7680,75,2,
7841,7840,48,2,
7936,7944,8,1,
7952,7960,6,1,
7968,7976,8,1,
7984,7992,8,1,
8000,8008,6,1,
8032,8040,8,1,
8560,8544,16,1,
9424,9398,26,1,
11312,11264,47,1,
11393,11392,50,2,
11520,4256,38,1,
42561,42560,23,2,
42625,42624,14,2,
42787,42786,7,2,
42803,42802,31,2,
42879,42878,5,2,
42903,42902,10,2,
42933,42932,6,2,
65345,65313,26,1,
66600,66560,40,1,
66776,66736,36,1,
68800,68736,51,1,
71872,71840,32,1,
93792,93760,32,1,
125218,125184,34,1,

//--Autogenerated -- end of section automatically generated
};

// The table is consumed four values at a time, so a regenerated table with a
// partial record must fail to compile rather than be read out of bounds.
static_assert((sizeof(symmetricCaseConversionRanges) / sizeof(symmetricCaseConversionRanges[0])) % 4 == 0,
	"symmetricCaseConversionRanges must hold whole (lower, upper, length, pitch) records");
90 
91 // Code points that are symmetric but don't fit into a range of similar characters
92 // are listed here.
93 
// Table of individual symmetric case conversions, stored as consecutive
// (lower case code point, upper case code point) pairs.
// The table is read-only, so it is constexpr.
constexpr int symmetricCaseConversions[] = {
//lower, upper
//++Autogenerated -- start of section automatically generated
//**1 \(\*\n\)
255,376,
307,306,
309,308,
311,310,
378,377,
380,379,
382,381,
384,579,
387,386,
389,388,
392,391,
396,395,
402,401,
405,502,
409,408,
410,573,
414,544,
417,416,
419,418,
421,420,
424,423,
429,428,
432,431,
436,435,
438,437,
441,440,
445,444,
447,503,
454,452,
457,455,
460,458,
477,398,
499,497,
501,500,
572,571,
575,11390,
576,11391,
578,577,
592,11375,
593,11373,
594,11376,
595,385,
596,390,
598,393,
599,394,
601,399,
603,400,
604,42923,
608,403,
609,42924,
611,404,
613,42893,
614,42922,
616,407,
617,406,
618,42926,
619,11362,
620,42925,
623,412,
625,11374,
626,413,
629,415,
637,11364,
640,422,
642,42949,
643,425,
647,42929,
648,430,
649,580,
650,433,
651,434,
652,581,
658,439,
669,42930,
670,42928,
881,880,
883,882,
887,886,
891,1021,
892,1022,
893,1023,
940,902,
941,904,
942,905,
943,906,
972,908,
973,910,
974,911,
983,975,
1010,1017,
1011,895,
1016,1015,
1019,1018,
1231,1216,
4349,7357,
4350,7358,
4351,7359,
7545,42877,
7549,11363,
7566,42950,
8017,8025,
8019,8027,
8021,8029,
8023,8031,
8048,8122,
8049,8123,
8050,8136,
8051,8137,
8052,8138,
8053,8139,
8054,8154,
8055,8155,
8056,8184,
8057,8185,
8058,8170,
8059,8171,
8060,8186,
8061,8187,
8112,8120,
8113,8121,
8144,8152,
8145,8153,
8160,8168,
8161,8169,
8165,8172,
8526,8498,
8580,8579,
11361,11360,
11365,570,
11366,574,
11368,11367,
11370,11369,
11372,11371,
11379,11378,
11382,11381,
11500,11499,
11502,11501,
11507,11506,
11559,4295,
11565,4301,
42874,42873,
42876,42875,
42892,42891,
42897,42896,
42899,42898,
42900,42948,
42947,42946,
43859,42931,

//--Autogenerated -- end of section automatically generated
};

// The table is consumed two values at a time, so a regenerated table with an
// odd number of entries must fail to compile rather than be read out of bounds.
static_assert((sizeof(symmetricCaseConversions) / sizeof(symmetricCaseConversions[0])) % 2 == 0,
	"symmetricCaseConversions must hold whole (lower, upper) pairs");
249 
250 // Characters that have complex case conversions are listed here.
251 // This includes cases where more than one character is needed for a conversion,
252 // folding is different to lowering, or (as appropriate) upper(lower(x)) != x or
253 // lower(upper(x)) != x.
254 
// Complex case conversions as one concatenated string of UTF-8 records.
// Each record has four fields, each terminated by '|':
//   original character, folded form, upper case form, lower case form.
// An empty field means there is no entry of that kind for the character.
// Both the text and the pointer are const: the table is never modified or reseated.
const char *const complexCaseConversions =
// Original | Folded | Upper | Lower |
//++Autogenerated -- start of section automatically generated
//**2 \(\*\n\)
"\xc2\xb5|\xce\xbc|\xce\x9c||"
"\xc3\x9f|ss|SS||"
"\xc4\xb0|i\xcc\x87||i\xcc\x87|"
"\xc4\xb1||I||"
"\xc5\x89|\xca\xbcn|\xca\xbcN||"
"\xc5\xbf|s|S||"
"\xc7\x85|\xc7\x86|\xc7\x84|\xc7\x86|"
"\xc7\x88|\xc7\x89|\xc7\x87|\xc7\x89|"
"\xc7\x8b|\xc7\x8c|\xc7\x8a|\xc7\x8c|"
"\xc7\xb0|j\xcc\x8c|J\xcc\x8c||"
"\xc7\xb2|\xc7\xb3|\xc7\xb1|\xc7\xb3|"
"\xcd\x85|\xce\xb9|\xce\x99||"
"\xce\x90|\xce\xb9\xcc\x88\xcc\x81|\xce\x99\xcc\x88\xcc\x81||"
"\xce\xb0|\xcf\x85\xcc\x88\xcc\x81|\xce\xa5\xcc\x88\xcc\x81||"
"\xcf\x82|\xcf\x83|\xce\xa3||"
"\xcf\x90|\xce\xb2|\xce\x92||"
"\xcf\x91|\xce\xb8|\xce\x98||"
"\xcf\x95|\xcf\x86|\xce\xa6||"
"\xcf\x96|\xcf\x80|\xce\xa0||"
"\xcf\xb0|\xce\xba|\xce\x9a||"
"\xcf\xb1|\xcf\x81|\xce\xa1||"
"\xcf\xb4|\xce\xb8||\xce\xb8|"
"\xcf\xb5|\xce\xb5|\xce\x95||"
"\xd6\x87|\xd5\xa5\xd6\x82|\xd4\xb5\xd5\x92||"
"\xe1\x8e\xa0|||\xea\xad\xb0|"
"\xe1\x8e\xa1|||\xea\xad\xb1|"
"\xe1\x8e\xa2|||\xea\xad\xb2|"
"\xe1\x8e\xa3|||\xea\xad\xb3|"
"\xe1\x8e\xa4|||\xea\xad\xb4|"
"\xe1\x8e\xa5|||\xea\xad\xb5|"
"\xe1\x8e\xa6|||\xea\xad\xb6|"
"\xe1\x8e\xa7|||\xea\xad\xb7|"
"\xe1\x8e\xa8|||\xea\xad\xb8|"
"\xe1\x8e\xa9|||\xea\xad\xb9|"
"\xe1\x8e\xaa|||\xea\xad\xba|"
"\xe1\x8e\xab|||\xea\xad\xbb|"
"\xe1\x8e\xac|||\xea\xad\xbc|"
"\xe1\x8e\xad|||\xea\xad\xbd|"
"\xe1\x8e\xae|||\xea\xad\xbe|"
"\xe1\x8e\xaf|||\xea\xad\xbf|"
"\xe1\x8e\xb0|||\xea\xae\x80|"
"\xe1\x8e\xb1|||\xea\xae\x81|"
"\xe1\x8e\xb2|||\xea\xae\x82|"
"\xe1\x8e\xb3|||\xea\xae\x83|"
"\xe1\x8e\xb4|||\xea\xae\x84|"
"\xe1\x8e\xb5|||\xea\xae\x85|"
"\xe1\x8e\xb6|||\xea\xae\x86|"
"\xe1\x8e\xb7|||\xea\xae\x87|"
"\xe1\x8e\xb8|||\xea\xae\x88|"
"\xe1\x8e\xb9|||\xea\xae\x89|"
"\xe1\x8e\xba|||\xea\xae\x8a|"
"\xe1\x8e\xbb|||\xea\xae\x8b|"
"\xe1\x8e\xbc|||\xea\xae\x8c|"
"\xe1\x8e\xbd|||\xea\xae\x8d|"
"\xe1\x8e\xbe|||\xea\xae\x8e|"
"\xe1\x8e\xbf|||\xea\xae\x8f|"
"\xe1\x8f\x80|||\xea\xae\x90|"
"\xe1\x8f\x81|||\xea\xae\x91|"
"\xe1\x8f\x82|||\xea\xae\x92|"
"\xe1\x8f\x83|||\xea\xae\x93|"
"\xe1\x8f\x84|||\xea\xae\x94|"
"\xe1\x8f\x85|||\xea\xae\x95|"
"\xe1\x8f\x86|||\xea\xae\x96|"
"\xe1\x8f\x87|||\xea\xae\x97|"
"\xe1\x8f\x88|||\xea\xae\x98|"
"\xe1\x8f\x89|||\xea\xae\x99|"
"\xe1\x8f\x8a|||\xea\xae\x9a|"
"\xe1\x8f\x8b|||\xea\xae\x9b|"
"\xe1\x8f\x8c|||\xea\xae\x9c|"
"\xe1\x8f\x8d|||\xea\xae\x9d|"
"\xe1\x8f\x8e|||\xea\xae\x9e|"
"\xe1\x8f\x8f|||\xea\xae\x9f|"
"\xe1\x8f\x90|||\xea\xae\xa0|"
"\xe1\x8f\x91|||\xea\xae\xa1|"
"\xe1\x8f\x92|||\xea\xae\xa2|"
"\xe1\x8f\x93|||\xea\xae\xa3|"
"\xe1\x8f\x94|||\xea\xae\xa4|"
"\xe1\x8f\x95|||\xea\xae\xa5|"
"\xe1\x8f\x96|||\xea\xae\xa6|"
"\xe1\x8f\x97|||\xea\xae\xa7|"
"\xe1\x8f\x98|||\xea\xae\xa8|"
"\xe1\x8f\x99|||\xea\xae\xa9|"
"\xe1\x8f\x9a|||\xea\xae\xaa|"
"\xe1\x8f\x9b|||\xea\xae\xab|"
"\xe1\x8f\x9c|||\xea\xae\xac|"
"\xe1\x8f\x9d|||\xea\xae\xad|"
"\xe1\x8f\x9e|||\xea\xae\xae|"
"\xe1\x8f\x9f|||\xea\xae\xaf|"
"\xe1\x8f\xa0|||\xea\xae\xb0|"
"\xe1\x8f\xa1|||\xea\xae\xb1|"
"\xe1\x8f\xa2|||\xea\xae\xb2|"
"\xe1\x8f\xa3|||\xea\xae\xb3|"
"\xe1\x8f\xa4|||\xea\xae\xb4|"
"\xe1\x8f\xa5|||\xea\xae\xb5|"
"\xe1\x8f\xa6|||\xea\xae\xb6|"
"\xe1\x8f\xa7|||\xea\xae\xb7|"
"\xe1\x8f\xa8|||\xea\xae\xb8|"
"\xe1\x8f\xa9|||\xea\xae\xb9|"
"\xe1\x8f\xaa|||\xea\xae\xba|"
"\xe1\x8f\xab|||\xea\xae\xbb|"
"\xe1\x8f\xac|||\xea\xae\xbc|"
"\xe1\x8f\xad|||\xea\xae\xbd|"
"\xe1\x8f\xae|||\xea\xae\xbe|"
"\xe1\x8f\xaf|||\xea\xae\xbf|"
"\xe1\x8f\xb0|||\xe1\x8f\xb8|"
"\xe1\x8f\xb1|||\xe1\x8f\xb9|"
"\xe1\x8f\xb2|||\xe1\x8f\xba|"
"\xe1\x8f\xb3|||\xe1\x8f\xbb|"
"\xe1\x8f\xb4|||\xe1\x8f\xbc|"
"\xe1\x8f\xb5|||\xe1\x8f\xbd|"
"\xe1\x8f\xb8|\xe1\x8f\xb0|\xe1\x8f\xb0||"
"\xe1\x8f\xb9|\xe1\x8f\xb1|\xe1\x8f\xb1||"
"\xe1\x8f\xba|\xe1\x8f\xb2|\xe1\x8f\xb2||"
"\xe1\x8f\xbb|\xe1\x8f\xb3|\xe1\x8f\xb3||"
"\xe1\x8f\xbc|\xe1\x8f\xb4|\xe1\x8f\xb4||"
"\xe1\x8f\xbd|\xe1\x8f\xb5|\xe1\x8f\xb5||"
"\xe1\xb2\x80|\xd0\xb2|\xd0\x92||"
"\xe1\xb2\x81|\xd0\xb4|\xd0\x94||"
"\xe1\xb2\x82|\xd0\xbe|\xd0\x9e||"
"\xe1\xb2\x83|\xd1\x81|\xd0\xa1||"
"\xe1\xb2\x84|\xd1\x82|\xd0\xa2||"
"\xe1\xb2\x85|\xd1\x82|\xd0\xa2||"
"\xe1\xb2\x86|\xd1\x8a|\xd0\xaa||"
"\xe1\xb2\x87|\xd1\xa3|\xd1\xa2||"
"\xe1\xb2\x88|\xea\x99\x8b|\xea\x99\x8a||"
"\xe1\xba\x96|h\xcc\xb1|H\xcc\xb1||"
"\xe1\xba\x97|t\xcc\x88|T\xcc\x88||"
"\xe1\xba\x98|w\xcc\x8a|W\xcc\x8a||"
"\xe1\xba\x99|y\xcc\x8a|Y\xcc\x8a||"
"\xe1\xba\x9a|a\xca\xbe|A\xca\xbe||"
"\xe1\xba\x9b|\xe1\xb9\xa1|\xe1\xb9\xa0||"
"\xe1\xba\x9e|ss||\xc3\x9f|"
"\xe1\xbd\x90|\xcf\x85\xcc\x93|\xce\xa5\xcc\x93||"
"\xe1\xbd\x92|\xcf\x85\xcc\x93\xcc\x80|\xce\xa5\xcc\x93\xcc\x80||"
"\xe1\xbd\x94|\xcf\x85\xcc\x93\xcc\x81|\xce\xa5\xcc\x93\xcc\x81||"
"\xe1\xbd\x96|\xcf\x85\xcc\x93\xcd\x82|\xce\xa5\xcc\x93\xcd\x82||"
"\xe1\xbe\x80|\xe1\xbc\x80\xce\xb9|\xe1\xbc\x88\xce\x99||"
"\xe1\xbe\x81|\xe1\xbc\x81\xce\xb9|\xe1\xbc\x89\xce\x99||"
"\xe1\xbe\x82|\xe1\xbc\x82\xce\xb9|\xe1\xbc\x8a\xce\x99||"
"\xe1\xbe\x83|\xe1\xbc\x83\xce\xb9|\xe1\xbc\x8b\xce\x99||"
"\xe1\xbe\x84|\xe1\xbc\x84\xce\xb9|\xe1\xbc\x8c\xce\x99||"
"\xe1\xbe\x85|\xe1\xbc\x85\xce\xb9|\xe1\xbc\x8d\xce\x99||"
"\xe1\xbe\x86|\xe1\xbc\x86\xce\xb9|\xe1\xbc\x8e\xce\x99||"
"\xe1\xbe\x87|\xe1\xbc\x87\xce\xb9|\xe1\xbc\x8f\xce\x99||"
"\xe1\xbe\x88|\xe1\xbc\x80\xce\xb9|\xe1\xbc\x88\xce\x99|\xe1\xbe\x80|"
"\xe1\xbe\x89|\xe1\xbc\x81\xce\xb9|\xe1\xbc\x89\xce\x99|\xe1\xbe\x81|"
"\xe1\xbe\x8a|\xe1\xbc\x82\xce\xb9|\xe1\xbc\x8a\xce\x99|\xe1\xbe\x82|"
"\xe1\xbe\x8b|\xe1\xbc\x83\xce\xb9|\xe1\xbc\x8b\xce\x99|\xe1\xbe\x83|"
"\xe1\xbe\x8c|\xe1\xbc\x84\xce\xb9|\xe1\xbc\x8c\xce\x99|\xe1\xbe\x84|"
"\xe1\xbe\x8d|\xe1\xbc\x85\xce\xb9|\xe1\xbc\x8d\xce\x99|\xe1\xbe\x85|"
"\xe1\xbe\x8e|\xe1\xbc\x86\xce\xb9|\xe1\xbc\x8e\xce\x99|\xe1\xbe\x86|"
"\xe1\xbe\x8f|\xe1\xbc\x87\xce\xb9|\xe1\xbc\x8f\xce\x99|\xe1\xbe\x87|"
"\xe1\xbe\x90|\xe1\xbc\xa0\xce\xb9|\xe1\xbc\xa8\xce\x99||"
"\xe1\xbe\x91|\xe1\xbc\xa1\xce\xb9|\xe1\xbc\xa9\xce\x99||"
"\xe1\xbe\x92|\xe1\xbc\xa2\xce\xb9|\xe1\xbc\xaa\xce\x99||"
"\xe1\xbe\x93|\xe1\xbc\xa3\xce\xb9|\xe1\xbc\xab\xce\x99||"
"\xe1\xbe\x94|\xe1\xbc\xa4\xce\xb9|\xe1\xbc\xac\xce\x99||"
"\xe1\xbe\x95|\xe1\xbc\xa5\xce\xb9|\xe1\xbc\xad\xce\x99||"
"\xe1\xbe\x96|\xe1\xbc\xa6\xce\xb9|\xe1\xbc\xae\xce\x99||"
"\xe1\xbe\x97|\xe1\xbc\xa7\xce\xb9|\xe1\xbc\xaf\xce\x99||"
"\xe1\xbe\x98|\xe1\xbc\xa0\xce\xb9|\xe1\xbc\xa8\xce\x99|\xe1\xbe\x90|"
"\xe1\xbe\x99|\xe1\xbc\xa1\xce\xb9|\xe1\xbc\xa9\xce\x99|\xe1\xbe\x91|"
"\xe1\xbe\x9a|\xe1\xbc\xa2\xce\xb9|\xe1\xbc\xaa\xce\x99|\xe1\xbe\x92|"
"\xe1\xbe\x9b|\xe1\xbc\xa3\xce\xb9|\xe1\xbc\xab\xce\x99|\xe1\xbe\x93|"
"\xe1\xbe\x9c|\xe1\xbc\xa4\xce\xb9|\xe1\xbc\xac\xce\x99|\xe1\xbe\x94|"
"\xe1\xbe\x9d|\xe1\xbc\xa5\xce\xb9|\xe1\xbc\xad\xce\x99|\xe1\xbe\x95|"
"\xe1\xbe\x9e|\xe1\xbc\xa6\xce\xb9|\xe1\xbc\xae\xce\x99|\xe1\xbe\x96|"
"\xe1\xbe\x9f|\xe1\xbc\xa7\xce\xb9|\xe1\xbc\xaf\xce\x99|\xe1\xbe\x97|"
"\xe1\xbe\xa0|\xe1\xbd\xa0\xce\xb9|\xe1\xbd\xa8\xce\x99||"
"\xe1\xbe\xa1|\xe1\xbd\xa1\xce\xb9|\xe1\xbd\xa9\xce\x99||"
"\xe1\xbe\xa2|\xe1\xbd\xa2\xce\xb9|\xe1\xbd\xaa\xce\x99||"
"\xe1\xbe\xa3|\xe1\xbd\xa3\xce\xb9|\xe1\xbd\xab\xce\x99||"
"\xe1\xbe\xa4|\xe1\xbd\xa4\xce\xb9|\xe1\xbd\xac\xce\x99||"
"\xe1\xbe\xa5|\xe1\xbd\xa5\xce\xb9|\xe1\xbd\xad\xce\x99||"
"\xe1\xbe\xa6|\xe1\xbd\xa6\xce\xb9|\xe1\xbd\xae\xce\x99||"
"\xe1\xbe\xa7|\xe1\xbd\xa7\xce\xb9|\xe1\xbd\xaf\xce\x99||"
"\xe1\xbe\xa8|\xe1\xbd\xa0\xce\xb9|\xe1\xbd\xa8\xce\x99|\xe1\xbe\xa0|"
"\xe1\xbe\xa9|\xe1\xbd\xa1\xce\xb9|\xe1\xbd\xa9\xce\x99|\xe1\xbe\xa1|"
"\xe1\xbe\xaa|\xe1\xbd\xa2\xce\xb9|\xe1\xbd\xaa\xce\x99|\xe1\xbe\xa2|"
"\xe1\xbe\xab|\xe1\xbd\xa3\xce\xb9|\xe1\xbd\xab\xce\x99|\xe1\xbe\xa3|"
"\xe1\xbe\xac|\xe1\xbd\xa4\xce\xb9|\xe1\xbd\xac\xce\x99|\xe1\xbe\xa4|"
"\xe1\xbe\xad|\xe1\xbd\xa5\xce\xb9|\xe1\xbd\xad\xce\x99|\xe1\xbe\xa5|"
"\xe1\xbe\xae|\xe1\xbd\xa6\xce\xb9|\xe1\xbd\xae\xce\x99|\xe1\xbe\xa6|"
"\xe1\xbe\xaf|\xe1\xbd\xa7\xce\xb9|\xe1\xbd\xaf\xce\x99|\xe1\xbe\xa7|"
"\xe1\xbe\xb2|\xe1\xbd\xb0\xce\xb9|\xe1\xbe\xba\xce\x99||"
"\xe1\xbe\xb3|\xce\xb1\xce\xb9|\xce\x91\xce\x99||"
"\xe1\xbe\xb4|\xce\xac\xce\xb9|\xce\x86\xce\x99||"
"\xe1\xbe\xb6|\xce\xb1\xcd\x82|\xce\x91\xcd\x82||"
"\xe1\xbe\xb7|\xce\xb1\xcd\x82\xce\xb9|\xce\x91\xcd\x82\xce\x99||"
"\xe1\xbe\xbc|\xce\xb1\xce\xb9|\xce\x91\xce\x99|\xe1\xbe\xb3|"
"\xe1\xbe\xbe|\xce\xb9|\xce\x99||"
"\xe1\xbf\x82|\xe1\xbd\xb4\xce\xb9|\xe1\xbf\x8a\xce\x99||"
"\xe1\xbf\x83|\xce\xb7\xce\xb9|\xce\x97\xce\x99||"
"\xe1\xbf\x84|\xce\xae\xce\xb9|\xce\x89\xce\x99||"
"\xe1\xbf\x86|\xce\xb7\xcd\x82|\xce\x97\xcd\x82||"
"\xe1\xbf\x87|\xce\xb7\xcd\x82\xce\xb9|\xce\x97\xcd\x82\xce\x99||"
"\xe1\xbf\x8c|\xce\xb7\xce\xb9|\xce\x97\xce\x99|\xe1\xbf\x83|"
"\xe1\xbf\x92|\xce\xb9\xcc\x88\xcc\x80|\xce\x99\xcc\x88\xcc\x80||"
"\xe1\xbf\x93|\xce\xb9\xcc\x88\xcc\x81|\xce\x99\xcc\x88\xcc\x81||"
"\xe1\xbf\x96|\xce\xb9\xcd\x82|\xce\x99\xcd\x82||"
"\xe1\xbf\x97|\xce\xb9\xcc\x88\xcd\x82|\xce\x99\xcc\x88\xcd\x82||"
"\xe1\xbf\xa2|\xcf\x85\xcc\x88\xcc\x80|\xce\xa5\xcc\x88\xcc\x80||"
"\xe1\xbf\xa3|\xcf\x85\xcc\x88\xcc\x81|\xce\xa5\xcc\x88\xcc\x81||"
"\xe1\xbf\xa4|\xcf\x81\xcc\x93|\xce\xa1\xcc\x93||"
"\xe1\xbf\xa6|\xcf\x85\xcd\x82|\xce\xa5\xcd\x82||"
"\xe1\xbf\xa7|\xcf\x85\xcc\x88\xcd\x82|\xce\xa5\xcc\x88\xcd\x82||"
"\xe1\xbf\xb2|\xe1\xbd\xbc\xce\xb9|\xe1\xbf\xba\xce\x99||"
"\xe1\xbf\xb3|\xcf\x89\xce\xb9|\xce\xa9\xce\x99||"
"\xe1\xbf\xb4|\xcf\x8e\xce\xb9|\xce\x8f\xce\x99||"
"\xe1\xbf\xb6|\xcf\x89\xcd\x82|\xce\xa9\xcd\x82||"
"\xe1\xbf\xb7|\xcf\x89\xcd\x82\xce\xb9|\xce\xa9\xcd\x82\xce\x99||"
"\xe1\xbf\xbc|\xcf\x89\xce\xb9|\xce\xa9\xce\x99|\xe1\xbf\xb3|"
"\xe2\x84\xa6|\xcf\x89||\xcf\x89|"
"\xe2\x84\xaa|k||k|"
"\xe2\x84\xab|\xc3\xa5||\xc3\xa5|"
"\xea\xad\xb0|\xe1\x8e\xa0|\xe1\x8e\xa0||"
"\xea\xad\xb1|\xe1\x8e\xa1|\xe1\x8e\xa1||"
"\xea\xad\xb2|\xe1\x8e\xa2|\xe1\x8e\xa2||"
"\xea\xad\xb3|\xe1\x8e\xa3|\xe1\x8e\xa3||"
"\xea\xad\xb4|\xe1\x8e\xa4|\xe1\x8e\xa4||"
"\xea\xad\xb5|\xe1\x8e\xa5|\xe1\x8e\xa5||"
"\xea\xad\xb6|\xe1\x8e\xa6|\xe1\x8e\xa6||"
"\xea\xad\xb7|\xe1\x8e\xa7|\xe1\x8e\xa7||"
"\xea\xad\xb8|\xe1\x8e\xa8|\xe1\x8e\xa8||"
"\xea\xad\xb9|\xe1\x8e\xa9|\xe1\x8e\xa9||"
"\xea\xad\xba|\xe1\x8e\xaa|\xe1\x8e\xaa||"
"\xea\xad\xbb|\xe1\x8e\xab|\xe1\x8e\xab||"
"\xea\xad\xbc|\xe1\x8e\xac|\xe1\x8e\xac||"
"\xea\xad\xbd|\xe1\x8e\xad|\xe1\x8e\xad||"
"\xea\xad\xbe|\xe1\x8e\xae|\xe1\x8e\xae||"
"\xea\xad\xbf|\xe1\x8e\xaf|\xe1\x8e\xaf||"
"\xea\xae\x80|\xe1\x8e\xb0|\xe1\x8e\xb0||"
"\xea\xae\x81|\xe1\x8e\xb1|\xe1\x8e\xb1||"
"\xea\xae\x82|\xe1\x8e\xb2|\xe1\x8e\xb2||"
"\xea\xae\x83|\xe1\x8e\xb3|\xe1\x8e\xb3||"
"\xea\xae\x84|\xe1\x8e\xb4|\xe1\x8e\xb4||"
"\xea\xae\x85|\xe1\x8e\xb5|\xe1\x8e\xb5||"
"\xea\xae\x86|\xe1\x8e\xb6|\xe1\x8e\xb6||"
"\xea\xae\x87|\xe1\x8e\xb7|\xe1\x8e\xb7||"
"\xea\xae\x88|\xe1\x8e\xb8|\xe1\x8e\xb8||"
"\xea\xae\x89|\xe1\x8e\xb9|\xe1\x8e\xb9||"
"\xea\xae\x8a|\xe1\x8e\xba|\xe1\x8e\xba||"
"\xea\xae\x8b|\xe1\x8e\xbb|\xe1\x8e\xbb||"
"\xea\xae\x8c|\xe1\x8e\xbc|\xe1\x8e\xbc||"
"\xea\xae\x8d|\xe1\x8e\xbd|\xe1\x8e\xbd||"
"\xea\xae\x8e|\xe1\x8e\xbe|\xe1\x8e\xbe||"
"\xea\xae\x8f|\xe1\x8e\xbf|\xe1\x8e\xbf||"
"\xea\xae\x90|\xe1\x8f\x80|\xe1\x8f\x80||"
"\xea\xae\x91|\xe1\x8f\x81|\xe1\x8f\x81||"
"\xea\xae\x92|\xe1\x8f\x82|\xe1\x8f\x82||"
"\xea\xae\x93|\xe1\x8f\x83|\xe1\x8f\x83||"
"\xea\xae\x94|\xe1\x8f\x84|\xe1\x8f\x84||"
"\xea\xae\x95|\xe1\x8f\x85|\xe1\x8f\x85||"
"\xea\xae\x96|\xe1\x8f\x86|\xe1\x8f\x86||"
"\xea\xae\x97|\xe1\x8f\x87|\xe1\x8f\x87||"
"\xea\xae\x98|\xe1\x8f\x88|\xe1\x8f\x88||"
"\xea\xae\x99|\xe1\x8f\x89|\xe1\x8f\x89||"
"\xea\xae\x9a|\xe1\x8f\x8a|\xe1\x8f\x8a||"
"\xea\xae\x9b|\xe1\x8f\x8b|\xe1\x8f\x8b||"
"\xea\xae\x9c|\xe1\x8f\x8c|\xe1\x8f\x8c||"
"\xea\xae\x9d|\xe1\x8f\x8d|\xe1\x8f\x8d||"
"\xea\xae\x9e|\xe1\x8f\x8e|\xe1\x8f\x8e||"
"\xea\xae\x9f|\xe1\x8f\x8f|\xe1\x8f\x8f||"
"\xea\xae\xa0|\xe1\x8f\x90|\xe1\x8f\x90||"
"\xea\xae\xa1|\xe1\x8f\x91|\xe1\x8f\x91||"
"\xea\xae\xa2|\xe1\x8f\x92|\xe1\x8f\x92||"
"\xea\xae\xa3|\xe1\x8f\x93|\xe1\x8f\x93||"
"\xea\xae\xa4|\xe1\x8f\x94|\xe1\x8f\x94||"
"\xea\xae\xa5|\xe1\x8f\x95|\xe1\x8f\x95||"
"\xea\xae\xa6|\xe1\x8f\x96|\xe1\x8f\x96||"
"\xea\xae\xa7|\xe1\x8f\x97|\xe1\x8f\x97||"
"\xea\xae\xa8|\xe1\x8f\x98|\xe1\x8f\x98||"
"\xea\xae\xa9|\xe1\x8f\x99|\xe1\x8f\x99||"
"\xea\xae\xaa|\xe1\x8f\x9a|\xe1\x8f\x9a||"
"\xea\xae\xab|\xe1\x8f\x9b|\xe1\x8f\x9b||"
"\xea\xae\xac|\xe1\x8f\x9c|\xe1\x8f\x9c||"
"\xea\xae\xad|\xe1\x8f\x9d|\xe1\x8f\x9d||"
"\xea\xae\xae|\xe1\x8f\x9e|\xe1\x8f\x9e||"
"\xea\xae\xaf|\xe1\x8f\x9f|\xe1\x8f\x9f||"
"\xea\xae\xb0|\xe1\x8f\xa0|\xe1\x8f\xa0||"
"\xea\xae\xb1|\xe1\x8f\xa1|\xe1\x8f\xa1||"
"\xea\xae\xb2|\xe1\x8f\xa2|\xe1\x8f\xa2||"
"\xea\xae\xb3|\xe1\x8f\xa3|\xe1\x8f\xa3||"
"\xea\xae\xb4|\xe1\x8f\xa4|\xe1\x8f\xa4||"
"\xea\xae\xb5|\xe1\x8f\xa5|\xe1\x8f\xa5||"
"\xea\xae\xb6|\xe1\x8f\xa6|\xe1\x8f\xa6||"
"\xea\xae\xb7|\xe1\x8f\xa7|\xe1\x8f\xa7||"
"\xea\xae\xb8|\xe1\x8f\xa8|\xe1\x8f\xa8||"
"\xea\xae\xb9|\xe1\x8f\xa9|\xe1\x8f\xa9||"
"\xea\xae\xba|\xe1\x8f\xaa|\xe1\x8f\xaa||"
"\xea\xae\xbb|\xe1\x8f\xab|\xe1\x8f\xab||"
"\xea\xae\xbc|\xe1\x8f\xac|\xe1\x8f\xac||"
"\xea\xae\xbd|\xe1\x8f\xad|\xe1\x8f\xad||"
"\xea\xae\xbe|\xe1\x8f\xae|\xe1\x8f\xae||"
"\xea\xae\xbf|\xe1\x8f\xaf|\xe1\x8f\xaf||"
"\xef\xac\x80|ff|FF||"
"\xef\xac\x81|fi|FI||"
"\xef\xac\x82|fl|FL||"
"\xef\xac\x83|ffi|FFI||"
"\xef\xac\x84|ffl|FFL||"
"\xef\xac\x85|st|ST||"
"\xef\xac\x86|st|ST||"
"\xef\xac\x93|\xd5\xb4\xd5\xb6|\xd5\x84\xd5\x86||"
"\xef\xac\x94|\xd5\xb4\xd5\xa5|\xd5\x84\xd4\xb5||"
"\xef\xac\x95|\xd5\xb4\xd5\xab|\xd5\x84\xd4\xbb||"
"\xef\xac\x96|\xd5\xbe\xd5\xb6|\xd5\x8e\xd5\x86||"
"\xef\xac\x97|\xd5\xb4\xd5\xad|\xd5\x84\xd4\xbd||"

//--Autogenerated -- end of section automatically generated
;
569 
570 class CaseConverter : public ICaseConverter {
571 	// Maximum length of a case conversion result is 6 bytes in UTF-8
572 	enum { maxConversionLength=6 };
573 	struct ConversionString {
574 		char conversion[maxConversionLength+1];
ConversionString__anon08788c320111::CaseConverter::ConversionString575 		ConversionString() noexcept : conversion{} {
576 		}
577 	};
578 	// Conversions are initially store in a vector of structs but then decomposed into
579 	// parallel arrays as that is about 10% faster to search.
580 	struct CharacterConversion {
581 		int character;
582 		ConversionString conversion;
CharacterConversion__anon08788c320111::CaseConverter::CharacterConversion583 		CharacterConversion() noexcept : character(0) {
584 			// Empty case: NUL -> "".
585 		}
CharacterConversion__anon08788c320111::CaseConverter::CharacterConversion586 		CharacterConversion(int character_, std::string_view conversion_) noexcept : character(character_) {
587 			assert(conversion_.length() <= maxConversionLength);
588 			conversion_.copy(conversion.conversion, conversion_.length());
589 		}
operator <__anon08788c320111::CaseConverter::CharacterConversion590 		bool operator<(const CharacterConversion &other) const noexcept {
591 			return character < other.character;
592 		}
593 	};
594 	typedef std::vector<CharacterConversion> CharacterToConversion;
595 	CharacterToConversion characterToConversion;
596 	// The parallel arrays
597 	std::vector<int> characters;
598 	std::vector<ConversionString> conversions;
599 
600 public:
CaseConverter()601 	CaseConverter() noexcept {
602 	}
603 	virtual ~CaseConverter() = default;
Initialised() const604 	bool Initialised() const noexcept {
605 		return !characters.empty();
606 	}
Add(int character,const char * conversion)607 	void Add(int character, const char *conversion) {
608 		characterToConversion.emplace_back(character, conversion);
609 	}
Find(int character)610 	const char *Find(int character) {
611 		const std::vector<int>::iterator it = std::lower_bound(characters.begin(), characters.end(), character);
612 		if (it == characters.end())
613 			return nullptr;
614 		else if (*it == character)
615 			return conversions[it - characters.begin()].conversion;
616 		else
617 			return nullptr;
618 	}
CaseConvertString(char * converted,size_t sizeConverted,const char * mixed,size_t lenMixed)619 	size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed) override {
620 		size_t lenConverted = 0;
621 		size_t mixedPos = 0;
622 		unsigned char bytes[UTF8MaxBytes + 1]{};
623 		while (mixedPos < lenMixed) {
624 			const unsigned char leadByte = mixed[mixedPos];
625 			const char *caseConverted = nullptr;
626 			size_t lenMixedChar = 1;
627 			if (UTF8IsAscii(leadByte)) {
628 				caseConverted = Find(leadByte);
629 			} else {
630 				bytes[0] = leadByte;
631 				const int widthCharBytes = UTF8BytesOfLead[leadByte];
632 				for (int b=1; b<widthCharBytes; b++) {
633 					bytes[b] = (mixedPos+b < lenMixed) ? mixed[mixedPos+b] : 0;
634 				}
635 				const int classified = UTF8Classify(bytes, widthCharBytes);
636 				if (!(classified & UTF8MaskInvalid)) {
637 					// valid UTF-8
638 					lenMixedChar = classified & UTF8MaskWidth;
639 					const int character = UnicodeFromUTF8(bytes);
640 					caseConverted = Find(character);
641 				}
642 			}
643 			if (caseConverted) {
644 				// Character has a conversion so copy that conversion in
645 				while (*caseConverted) {
646 					converted[lenConverted++] = *caseConverted++;
647 					if (lenConverted >= sizeConverted)
648 						return 0;
649 				}
650 			} else {
651 				// Character has no conversion so copy the input to output
652 				for (size_t i=0; i<lenMixedChar; i++) {
653 					converted[lenConverted++] = mixed[mixedPos+i];
654 					if (lenConverted >= sizeConverted)
655 						return 0;
656 				}
657 			}
658 			mixedPos += lenMixedChar;
659 		}
660 		return lenConverted;
661 	}
FinishedAdding()662 	void FinishedAdding() {
663 		std::sort(characterToConversion.begin(), characterToConversion.end());
664 		characters.reserve(characterToConversion.size());
665 		conversions.reserve(characterToConversion.size());
666 		for (const CharacterConversion &chConv : characterToConversion) {
667 			characters.push_back(chConv.character);
668 			conversions.push_back(chConv.conversion);
669 		}
670 		// Empty the original calculated data completely
671 		CharacterToConversion().swap(characterToConversion);
672 	}
673 };
674 
// One converter per conversion kind. Each starts empty and is populated by
// SetupConversions the first time that kind is requested.
CaseConverter caseConvFold;
CaseConverter caseConvUp;
CaseConverter caseConvLow;
678 
AddSymmetric(enum CaseConversion conversion,int lower,int upper)679 void AddSymmetric(enum CaseConversion conversion, int lower,int upper) {
680 	char lowerUTF8[UTF8MaxBytes+1];
681 	UTF8FromUTF32Character(lower, lowerUTF8);
682 	char upperUTF8[UTF8MaxBytes+1];
683 	UTF8FromUTF32Character(upper, upperUTF8);
684 
685 	switch (conversion) {
686 	case CaseConversionFold:
687 		caseConvFold.Add(upper, lowerUTF8);
688 		break;
689 	case CaseConversionUpper:
690 		caseConvUp.Add(lower, upperUTF8);
691 		break;
692 	case CaseConversionLower:
693 		caseConvLow.Add(upper, lowerUTF8);
694 		break;
695 	}
696 }
697 
SetupConversions(enum CaseConversion conversion)698 void SetupConversions(enum CaseConversion conversion) {
699 	// First initialize for the symmetric ranges
700 	for (size_t i=0; i<std::size(symmetricCaseConversionRanges);) {
701 		const int lower = symmetricCaseConversionRanges[i++];
702 		const int upper = symmetricCaseConversionRanges[i++];
703 		const int length = symmetricCaseConversionRanges[i++];
704 		const int pitch = symmetricCaseConversionRanges[i++];
705 		for (int j=0; j<length*pitch; j+=pitch) {
706 			AddSymmetric(conversion, lower+j, upper+j);
707 		}
708 	}
709 	// Add the symmetric singletons
710 	for (size_t i=0; i<std::size(symmetricCaseConversions);) {
711 		const int lower = symmetricCaseConversions[i++];
712 		const int upper = symmetricCaseConversions[i++];
713 		AddSymmetric(conversion, lower, upper);
714 	}
715 	// Add the complex cases
716 	const char *sComplex = complexCaseConversions;
717 	while (*sComplex) {
718 		// Longest ligature is 3 character so 5 for safety
719 		constexpr size_t lenUTF8 = 5*UTF8MaxBytes+1;
720 		unsigned char originUTF8[lenUTF8]{};
721 		char foldedUTF8[lenUTF8]{};
722 		char lowerUTF8[lenUTF8]{};
723 		char upperUTF8[lenUTF8]{};
724 		size_t i = 0;
725 		while (*sComplex && *sComplex != '|') {
726 			originUTF8[i++] = *sComplex;
727 			sComplex++;
728 		}
729 		sComplex++;
730 		originUTF8[i] = 0;
731 		i = 0;
732 		while (*sComplex && *sComplex != '|') {
733 			foldedUTF8[i++] = *sComplex;
734 			sComplex++;
735 		}
736 		sComplex++;
737 		foldedUTF8[i] = 0;
738 		i = 0;
739 		while (*sComplex && *sComplex != '|') {
740 			upperUTF8[i++] = *sComplex;
741 			sComplex++;
742 		}
743 		sComplex++;
744 		upperUTF8[i] = 0;
745 		i = 0;
746 		while (*sComplex && *sComplex != '|') {
747 			lowerUTF8[i++] = *sComplex;
748 			sComplex++;
749 		}
750 		sComplex++;
751 		lowerUTF8[i] = 0;
752 
753 		const int character = UnicodeFromUTF8(originUTF8);
754 
755 		if (conversion == CaseConversionFold && foldedUTF8[0]) {
756 			caseConvFold.Add(character, foldedUTF8);
757 		}
758 
759 		if (conversion == CaseConversionUpper && upperUTF8[0]) {
760 			caseConvUp.Add(character, upperUTF8);
761 		}
762 
763 		if (conversion == CaseConversionLower && lowerUTF8[0]) {
764 			caseConvLow.Add(character, lowerUTF8);
765 		}
766 	}
767 
768 	switch (conversion) {
769 	case CaseConversionFold:
770 		caseConvFold.FinishedAdding();
771 		break;
772 	case CaseConversionUpper:
773 		caseConvUp.FinishedAdding();
774 		break;
775 	case CaseConversionLower:
776 		caseConvLow.FinishedAdding();
777 		break;
778 	}
779 }
780 
ConverterForConversion(enum CaseConversion conversion)781 CaseConverter *ConverterForConversion(enum CaseConversion conversion) noexcept {
782 	switch (conversion) {
783 	case CaseConversionFold:
784 		return &caseConvFold;
785 	case CaseConversionUpper:
786 		return &caseConvUp;
787 	case CaseConversionLower:
788 		return &caseConvLow;
789 	}
790 	return nullptr;
791 }
792 
793 }
794 
795 namespace Scintilla {
796 
ConverterFor(enum CaseConversion conversion)797 ICaseConverter *ConverterFor(enum CaseConversion conversion) {
798 	CaseConverter *pCaseConv = ConverterForConversion(conversion);
799 	if (!pCaseConv->Initialised())
800 		SetupConversions(conversion);
801 	return pCaseConv;
802 }
803 
CaseConvert(int character,enum CaseConversion conversion)804 const char *CaseConvert(int character, enum CaseConversion conversion) {
805 	CaseConverter *pCaseConv = ConverterForConversion(conversion);
806 	if (!pCaseConv->Initialised())
807 		SetupConversions(conversion);
808 	return pCaseConv->Find(character);
809 }
810 
CaseConvertString(char * converted,size_t sizeConverted,const char * mixed,size_t lenMixed,enum CaseConversion conversion)811 size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed, enum CaseConversion conversion) {
812 	CaseConverter *pCaseConv = ConverterForConversion(conversion);
813 	if (!pCaseConv->Initialised())
814 		SetupConversions(conversion);
815 	return pCaseConv->CaseConvertString(converted, sizeConverted, mixed, lenMixed);
816 }
817 
CaseConvertString(const std::string & s,enum CaseConversion conversion)818 std::string CaseConvertString(const std::string &s, enum CaseConversion conversion) {
819 	std::string retMapped(s.length() * maxExpansionCaseConversion, 0);
820 	const size_t lenMapped = CaseConvertString(&retMapped[0], retMapped.length(), s.c_str(), s.length(),
821 		conversion);
822 	retMapped.resize(lenMapped);
823 	return retMapped;
824 }
825 
826 }
827