1 // Scintilla source code edit control
2 // Encoding: UTF-8
3 /** @file CaseConvert.cxx
4 ** Case fold characters and convert them to upper or lower case.
5 ** Tables automatically regenerated by scripts/GenerateCaseConvert.py
6 ** Should only be rarely regenerated for new versions of Unicode.
7 **/
8 // Copyright 2013 by Neil Hodgson <neilh@scintilla.org>
9 // The License.txt file describes the conditions under which this software may be distributed.
10
11 #include <cstring>
12
13 #include <stdexcept>
14 #include <string>
15 #include <vector>
16 #include <algorithm>
17
18 #include "StringCopy.h"
19 #include "CaseConvert.h"
20 #include "UniConversion.h"
21
22 #include "Compat.h"
23
24 using namespace Scintilla;
25
26 namespace {
27 // Use an unnamed namespace to protect the declarations from name conflicts
28
29 // Unicode code points are ordered by groups and follow patterns.
30 // Most characters (pitch==1) are in ranges for a particular alphabet and their
31 // upper case forms are a fixed distance away.
32 // Another pattern (pitch==2) is where each lower case letter is preceded by
33 // the upper case form. These are also grouped into ranges.
34
// Table of symmetric case conversion ranges, four ints per entry.
// For each k in [0, range length) the code point lower + k*pitch pairs with upper + k*pitch.
// The table is never modified so it is a compile-time constant.
constexpr int symmetricCaseConversionRanges[] = {
//lower, upper, range length, range pitch
//++Autogenerated -- start of section automatically generated
//**\(\*\n\)
97,65,26,1,
224,192,23,1,
248,216,7,1,
257,256,24,2,
314,313,8,2,
331,330,23,2,
462,461,8,2,
479,478,9,2,
505,504,20,2,
547,546,9,2,
583,582,5,2,
945,913,17,1,
963,931,9,1,
985,984,12,2,
1072,1040,32,1,
1104,1024,16,1,
1121,1120,17,2,
1163,1162,27,2,
1218,1217,7,2,
1233,1232,48,2,
1377,1329,38,1,
4304,7312,43,1,
7681,7680,75,2,
7841,7840,48,2,
7936,7944,8,1,
7952,7960,6,1,
7968,7976,8,1,
7984,7992,8,1,
8000,8008,6,1,
8032,8040,8,1,
8560,8544,16,1,
9424,9398,26,1,
11312,11264,47,1,
11393,11392,50,2,
11520,4256,38,1,
42561,42560,23,2,
42625,42624,14,2,
42787,42786,7,2,
42803,42802,31,2,
42879,42878,5,2,
42903,42902,10,2,
42933,42932,6,2,
65345,65313,26,1,
66600,66560,40,1,
66776,66736,36,1,
68800,68736,51,1,
71872,71840,32,1,
93792,93760,32,1,
125218,125184,34,1,

//--Autogenerated -- end of section automatically generated
};
91
92 // Code points that are symmetric but don't fit into a range of similar characters
93 // are listed here.
94
// Table of individual symmetric case pairs, two ints per entry: lower code point, upper code point.
// The table is never modified so it is a compile-time constant.
constexpr int symmetricCaseConversions[] = {
//lower, upper
//++Autogenerated -- start of section automatically generated
//**1 \(\*\n\)
255,376,
307,306,
309,308,
311,310,
378,377,
380,379,
382,381,
384,579,
387,386,
389,388,
392,391,
396,395,
402,401,
405,502,
409,408,
410,573,
414,544,
417,416,
419,418,
421,420,
424,423,
429,428,
432,431,
436,435,
438,437,
441,440,
445,444,
447,503,
454,452,
457,455,
460,458,
477,398,
499,497,
501,500,
572,571,
575,11390,
576,11391,
578,577,
592,11375,
593,11373,
594,11376,
595,385,
596,390,
598,393,
599,394,
601,399,
603,400,
604,42923,
608,403,
609,42924,
611,404,
613,42893,
614,42922,
616,407,
617,406,
618,42926,
619,11362,
620,42925,
623,412,
625,11374,
626,413,
629,415,
637,11364,
640,422,
642,42949,
643,425,
647,42929,
648,430,
649,580,
650,433,
651,434,
652,581,
658,439,
669,42930,
670,42928,
881,880,
883,882,
887,886,
891,1021,
892,1022,
893,1023,
940,902,
941,904,
942,905,
943,906,
972,908,
973,910,
974,911,
983,975,
1010,1017,
1011,895,
1016,1015,
1019,1018,
1231,1216,
4349,7357,
4350,7358,
4351,7359,
7545,42877,
7549,11363,
7566,42950,
8017,8025,
8019,8027,
8021,8029,
8023,8031,
8048,8122,
8049,8123,
8050,8136,
8051,8137,
8052,8138,
8053,8139,
8054,8154,
8055,8155,
8056,8184,
8057,8185,
8058,8170,
8059,8171,
8060,8186,
8061,8187,
8112,8120,
8113,8121,
8144,8152,
8145,8153,
8160,8168,
8161,8169,
8165,8172,
8526,8498,
8580,8579,
11361,11360,
11365,570,
11366,574,
11368,11367,
11370,11369,
11372,11371,
11379,11378,
11382,11381,
11500,11499,
11502,11501,
11507,11506,
11559,4295,
11565,4301,
42874,42873,
42876,42875,
42892,42891,
42897,42896,
42899,42898,
42900,42948,
42947,42946,
43859,42931,

//--Autogenerated -- end of section automatically generated
};
250
251 // Characters that have complex case conversions are listed here.
252 // This includes cases where more than one character is needed for a conversion,
253 // folding is different to lowering, or (as appropriate) upper(lower(x)) != x or
254 // lower(upper(x)) != x.
255
// Records are concatenated with no separator between them.
// Each record holds four UTF-8 fields, every one terminated by '|'; an empty field means
// that there is no entry of that kind for the original character.
// Both the text and the pointer are constant.
const char *const complexCaseConversions =
// Original | Folded | Upper | Lower |
//++Autogenerated -- start of section automatically generated
//**2 \(\*\n\)
"\xc2\xb5|\xce\xbc|\xce\x9c||"
"\xc3\x9f|ss|SS||"
"\xc4\xb0|i\xcc\x87||i\xcc\x87|"
"\xc4\xb1||I||"
"\xc5\x89|\xca\xbcn|\xca\xbcN||"
"\xc5\xbf|s|S||"
"\xc7\x85|\xc7\x86|\xc7\x84|\xc7\x86|"
"\xc7\x88|\xc7\x89|\xc7\x87|\xc7\x89|"
"\xc7\x8b|\xc7\x8c|\xc7\x8a|\xc7\x8c|"
"\xc7\xb0|j\xcc\x8c|J\xcc\x8c||"
"\xc7\xb2|\xc7\xb3|\xc7\xb1|\xc7\xb3|"
"\xcd\x85|\xce\xb9|\xce\x99||"
"\xce\x90|\xce\xb9\xcc\x88\xcc\x81|\xce\x99\xcc\x88\xcc\x81||"
"\xce\xb0|\xcf\x85\xcc\x88\xcc\x81|\xce\xa5\xcc\x88\xcc\x81||"
"\xcf\x82|\xcf\x83|\xce\xa3||"
"\xcf\x90|\xce\xb2|\xce\x92||"
"\xcf\x91|\xce\xb8|\xce\x98||"
"\xcf\x95|\xcf\x86|\xce\xa6||"
"\xcf\x96|\xcf\x80|\xce\xa0||"
"\xcf\xb0|\xce\xba|\xce\x9a||"
"\xcf\xb1|\xcf\x81|\xce\xa1||"
"\xcf\xb4|\xce\xb8||\xce\xb8|"
"\xcf\xb5|\xce\xb5|\xce\x95||"
"\xd6\x87|\xd5\xa5\xd6\x82|\xd4\xb5\xd5\x92||"
"\xe1\x8e\xa0|||\xea\xad\xb0|"
"\xe1\x8e\xa1|||\xea\xad\xb1|"
"\xe1\x8e\xa2|||\xea\xad\xb2|"
"\xe1\x8e\xa3|||\xea\xad\xb3|"
"\xe1\x8e\xa4|||\xea\xad\xb4|"
"\xe1\x8e\xa5|||\xea\xad\xb5|"
"\xe1\x8e\xa6|||\xea\xad\xb6|"
"\xe1\x8e\xa7|||\xea\xad\xb7|"
"\xe1\x8e\xa8|||\xea\xad\xb8|"
"\xe1\x8e\xa9|||\xea\xad\xb9|"
"\xe1\x8e\xaa|||\xea\xad\xba|"
"\xe1\x8e\xab|||\xea\xad\xbb|"
"\xe1\x8e\xac|||\xea\xad\xbc|"
"\xe1\x8e\xad|||\xea\xad\xbd|"
"\xe1\x8e\xae|||\xea\xad\xbe|"
"\xe1\x8e\xaf|||\xea\xad\xbf|"
"\xe1\x8e\xb0|||\xea\xae\x80|"
"\xe1\x8e\xb1|||\xea\xae\x81|"
"\xe1\x8e\xb2|||\xea\xae\x82|"
"\xe1\x8e\xb3|||\xea\xae\x83|"
"\xe1\x8e\xb4|||\xea\xae\x84|"
"\xe1\x8e\xb5|||\xea\xae\x85|"
"\xe1\x8e\xb6|||\xea\xae\x86|"
"\xe1\x8e\xb7|||\xea\xae\x87|"
"\xe1\x8e\xb8|||\xea\xae\x88|"
"\xe1\x8e\xb9|||\xea\xae\x89|"
"\xe1\x8e\xba|||\xea\xae\x8a|"
"\xe1\x8e\xbb|||\xea\xae\x8b|"
"\xe1\x8e\xbc|||\xea\xae\x8c|"
"\xe1\x8e\xbd|||\xea\xae\x8d|"
"\xe1\x8e\xbe|||\xea\xae\x8e|"
"\xe1\x8e\xbf|||\xea\xae\x8f|"
"\xe1\x8f\x80|||\xea\xae\x90|"
"\xe1\x8f\x81|||\xea\xae\x91|"
"\xe1\x8f\x82|||\xea\xae\x92|"
"\xe1\x8f\x83|||\xea\xae\x93|"
"\xe1\x8f\x84|||\xea\xae\x94|"
"\xe1\x8f\x85|||\xea\xae\x95|"
"\xe1\x8f\x86|||\xea\xae\x96|"
"\xe1\x8f\x87|||\xea\xae\x97|"
"\xe1\x8f\x88|||\xea\xae\x98|"
"\xe1\x8f\x89|||\xea\xae\x99|"
"\xe1\x8f\x8a|||\xea\xae\x9a|"
"\xe1\x8f\x8b|||\xea\xae\x9b|"
"\xe1\x8f\x8c|||\xea\xae\x9c|"
"\xe1\x8f\x8d|||\xea\xae\x9d|"
"\xe1\x8f\x8e|||\xea\xae\x9e|"
"\xe1\x8f\x8f|||\xea\xae\x9f|"
"\xe1\x8f\x90|||\xea\xae\xa0|"
"\xe1\x8f\x91|||\xea\xae\xa1|"
"\xe1\x8f\x92|||\xea\xae\xa2|"
"\xe1\x8f\x93|||\xea\xae\xa3|"
"\xe1\x8f\x94|||\xea\xae\xa4|"
"\xe1\x8f\x95|||\xea\xae\xa5|"
"\xe1\x8f\x96|||\xea\xae\xa6|"
"\xe1\x8f\x97|||\xea\xae\xa7|"
"\xe1\x8f\x98|||\xea\xae\xa8|"
"\xe1\x8f\x99|||\xea\xae\xa9|"
"\xe1\x8f\x9a|||\xea\xae\xaa|"
"\xe1\x8f\x9b|||\xea\xae\xab|"
"\xe1\x8f\x9c|||\xea\xae\xac|"
"\xe1\x8f\x9d|||\xea\xae\xad|"
"\xe1\x8f\x9e|||\xea\xae\xae|"
"\xe1\x8f\x9f|||\xea\xae\xaf|"
"\xe1\x8f\xa0|||\xea\xae\xb0|"
"\xe1\x8f\xa1|||\xea\xae\xb1|"
"\xe1\x8f\xa2|||\xea\xae\xb2|"
"\xe1\x8f\xa3|||\xea\xae\xb3|"
"\xe1\x8f\xa4|||\xea\xae\xb4|"
"\xe1\x8f\xa5|||\xea\xae\xb5|"
"\xe1\x8f\xa6|||\xea\xae\xb6|"
"\xe1\x8f\xa7|||\xea\xae\xb7|"
"\xe1\x8f\xa8|||\xea\xae\xb8|"
"\xe1\x8f\xa9|||\xea\xae\xb9|"
"\xe1\x8f\xaa|||\xea\xae\xba|"
"\xe1\x8f\xab|||\xea\xae\xbb|"
"\xe1\x8f\xac|||\xea\xae\xbc|"
"\xe1\x8f\xad|||\xea\xae\xbd|"
"\xe1\x8f\xae|||\xea\xae\xbe|"
"\xe1\x8f\xaf|||\xea\xae\xbf|"
"\xe1\x8f\xb0|||\xe1\x8f\xb8|"
"\xe1\x8f\xb1|||\xe1\x8f\xb9|"
"\xe1\x8f\xb2|||\xe1\x8f\xba|"
"\xe1\x8f\xb3|||\xe1\x8f\xbb|"
"\xe1\x8f\xb4|||\xe1\x8f\xbc|"
"\xe1\x8f\xb5|||\xe1\x8f\xbd|"
"\xe1\x8f\xb8|\xe1\x8f\xb0|\xe1\x8f\xb0||"
"\xe1\x8f\xb9|\xe1\x8f\xb1|\xe1\x8f\xb1||"
"\xe1\x8f\xba|\xe1\x8f\xb2|\xe1\x8f\xb2||"
"\xe1\x8f\xbb|\xe1\x8f\xb3|\xe1\x8f\xb3||"
"\xe1\x8f\xbc|\xe1\x8f\xb4|\xe1\x8f\xb4||"
"\xe1\x8f\xbd|\xe1\x8f\xb5|\xe1\x8f\xb5||"
"\xe1\xb2\x80|\xd0\xb2|\xd0\x92||"
"\xe1\xb2\x81|\xd0\xb4|\xd0\x94||"
"\xe1\xb2\x82|\xd0\xbe|\xd0\x9e||"
"\xe1\xb2\x83|\xd1\x81|\xd0\xa1||"
"\xe1\xb2\x84|\xd1\x82|\xd0\xa2||"
"\xe1\xb2\x85|\xd1\x82|\xd0\xa2||"
"\xe1\xb2\x86|\xd1\x8a|\xd0\xaa||"
"\xe1\xb2\x87|\xd1\xa3|\xd1\xa2||"
"\xe1\xb2\x88|\xea\x99\x8b|\xea\x99\x8a||"
"\xe1\xba\x96|h\xcc\xb1|H\xcc\xb1||"
"\xe1\xba\x97|t\xcc\x88|T\xcc\x88||"
"\xe1\xba\x98|w\xcc\x8a|W\xcc\x8a||"
"\xe1\xba\x99|y\xcc\x8a|Y\xcc\x8a||"
"\xe1\xba\x9a|a\xca\xbe|A\xca\xbe||"
"\xe1\xba\x9b|\xe1\xb9\xa1|\xe1\xb9\xa0||"
"\xe1\xba\x9e|ss||\xc3\x9f|"
"\xe1\xbd\x90|\xcf\x85\xcc\x93|\xce\xa5\xcc\x93||"
"\xe1\xbd\x92|\xcf\x85\xcc\x93\xcc\x80|\xce\xa5\xcc\x93\xcc\x80||"
"\xe1\xbd\x94|\xcf\x85\xcc\x93\xcc\x81|\xce\xa5\xcc\x93\xcc\x81||"
"\xe1\xbd\x96|\xcf\x85\xcc\x93\xcd\x82|\xce\xa5\xcc\x93\xcd\x82||"
"\xe1\xbe\x80|\xe1\xbc\x80\xce\xb9|\xe1\xbc\x88\xce\x99||"
"\xe1\xbe\x81|\xe1\xbc\x81\xce\xb9|\xe1\xbc\x89\xce\x99||"
"\xe1\xbe\x82|\xe1\xbc\x82\xce\xb9|\xe1\xbc\x8a\xce\x99||"
"\xe1\xbe\x83|\xe1\xbc\x83\xce\xb9|\xe1\xbc\x8b\xce\x99||"
"\xe1\xbe\x84|\xe1\xbc\x84\xce\xb9|\xe1\xbc\x8c\xce\x99||"
"\xe1\xbe\x85|\xe1\xbc\x85\xce\xb9|\xe1\xbc\x8d\xce\x99||"
"\xe1\xbe\x86|\xe1\xbc\x86\xce\xb9|\xe1\xbc\x8e\xce\x99||"
"\xe1\xbe\x87|\xe1\xbc\x87\xce\xb9|\xe1\xbc\x8f\xce\x99||"
"\xe1\xbe\x88|\xe1\xbc\x80\xce\xb9|\xe1\xbc\x88\xce\x99|\xe1\xbe\x80|"
"\xe1\xbe\x89|\xe1\xbc\x81\xce\xb9|\xe1\xbc\x89\xce\x99|\xe1\xbe\x81|"
"\xe1\xbe\x8a|\xe1\xbc\x82\xce\xb9|\xe1\xbc\x8a\xce\x99|\xe1\xbe\x82|"
"\xe1\xbe\x8b|\xe1\xbc\x83\xce\xb9|\xe1\xbc\x8b\xce\x99|\xe1\xbe\x83|"
"\xe1\xbe\x8c|\xe1\xbc\x84\xce\xb9|\xe1\xbc\x8c\xce\x99|\xe1\xbe\x84|"
"\xe1\xbe\x8d|\xe1\xbc\x85\xce\xb9|\xe1\xbc\x8d\xce\x99|\xe1\xbe\x85|"
"\xe1\xbe\x8e|\xe1\xbc\x86\xce\xb9|\xe1\xbc\x8e\xce\x99|\xe1\xbe\x86|"
"\xe1\xbe\x8f|\xe1\xbc\x87\xce\xb9|\xe1\xbc\x8f\xce\x99|\xe1\xbe\x87|"
"\xe1\xbe\x90|\xe1\xbc\xa0\xce\xb9|\xe1\xbc\xa8\xce\x99||"
"\xe1\xbe\x91|\xe1\xbc\xa1\xce\xb9|\xe1\xbc\xa9\xce\x99||"
"\xe1\xbe\x92|\xe1\xbc\xa2\xce\xb9|\xe1\xbc\xaa\xce\x99||"
"\xe1\xbe\x93|\xe1\xbc\xa3\xce\xb9|\xe1\xbc\xab\xce\x99||"
"\xe1\xbe\x94|\xe1\xbc\xa4\xce\xb9|\xe1\xbc\xac\xce\x99||"
"\xe1\xbe\x95|\xe1\xbc\xa5\xce\xb9|\xe1\xbc\xad\xce\x99||"
"\xe1\xbe\x96|\xe1\xbc\xa6\xce\xb9|\xe1\xbc\xae\xce\x99||"
"\xe1\xbe\x97|\xe1\xbc\xa7\xce\xb9|\xe1\xbc\xaf\xce\x99||"
"\xe1\xbe\x98|\xe1\xbc\xa0\xce\xb9|\xe1\xbc\xa8\xce\x99|\xe1\xbe\x90|"
"\xe1\xbe\x99|\xe1\xbc\xa1\xce\xb9|\xe1\xbc\xa9\xce\x99|\xe1\xbe\x91|"
"\xe1\xbe\x9a|\xe1\xbc\xa2\xce\xb9|\xe1\xbc\xaa\xce\x99|\xe1\xbe\x92|"
"\xe1\xbe\x9b|\xe1\xbc\xa3\xce\xb9|\xe1\xbc\xab\xce\x99|\xe1\xbe\x93|"
"\xe1\xbe\x9c|\xe1\xbc\xa4\xce\xb9|\xe1\xbc\xac\xce\x99|\xe1\xbe\x94|"
"\xe1\xbe\x9d|\xe1\xbc\xa5\xce\xb9|\xe1\xbc\xad\xce\x99|\xe1\xbe\x95|"
"\xe1\xbe\x9e|\xe1\xbc\xa6\xce\xb9|\xe1\xbc\xae\xce\x99|\xe1\xbe\x96|"
"\xe1\xbe\x9f|\xe1\xbc\xa7\xce\xb9|\xe1\xbc\xaf\xce\x99|\xe1\xbe\x97|"
"\xe1\xbe\xa0|\xe1\xbd\xa0\xce\xb9|\xe1\xbd\xa8\xce\x99||"
"\xe1\xbe\xa1|\xe1\xbd\xa1\xce\xb9|\xe1\xbd\xa9\xce\x99||"
"\xe1\xbe\xa2|\xe1\xbd\xa2\xce\xb9|\xe1\xbd\xaa\xce\x99||"
"\xe1\xbe\xa3|\xe1\xbd\xa3\xce\xb9|\xe1\xbd\xab\xce\x99||"
"\xe1\xbe\xa4|\xe1\xbd\xa4\xce\xb9|\xe1\xbd\xac\xce\x99||"
"\xe1\xbe\xa5|\xe1\xbd\xa5\xce\xb9|\xe1\xbd\xad\xce\x99||"
"\xe1\xbe\xa6|\xe1\xbd\xa6\xce\xb9|\xe1\xbd\xae\xce\x99||"
"\xe1\xbe\xa7|\xe1\xbd\xa7\xce\xb9|\xe1\xbd\xaf\xce\x99||"
"\xe1\xbe\xa8|\xe1\xbd\xa0\xce\xb9|\xe1\xbd\xa8\xce\x99|\xe1\xbe\xa0|"
"\xe1\xbe\xa9|\xe1\xbd\xa1\xce\xb9|\xe1\xbd\xa9\xce\x99|\xe1\xbe\xa1|"
"\xe1\xbe\xaa|\xe1\xbd\xa2\xce\xb9|\xe1\xbd\xaa\xce\x99|\xe1\xbe\xa2|"
"\xe1\xbe\xab|\xe1\xbd\xa3\xce\xb9|\xe1\xbd\xab\xce\x99|\xe1\xbe\xa3|"
"\xe1\xbe\xac|\xe1\xbd\xa4\xce\xb9|\xe1\xbd\xac\xce\x99|\xe1\xbe\xa4|"
"\xe1\xbe\xad|\xe1\xbd\xa5\xce\xb9|\xe1\xbd\xad\xce\x99|\xe1\xbe\xa5|"
"\xe1\xbe\xae|\xe1\xbd\xa6\xce\xb9|\xe1\xbd\xae\xce\x99|\xe1\xbe\xa6|"
"\xe1\xbe\xaf|\xe1\xbd\xa7\xce\xb9|\xe1\xbd\xaf\xce\x99|\xe1\xbe\xa7|"
"\xe1\xbe\xb2|\xe1\xbd\xb0\xce\xb9|\xe1\xbe\xba\xce\x99||"
"\xe1\xbe\xb3|\xce\xb1\xce\xb9|\xce\x91\xce\x99||"
"\xe1\xbe\xb4|\xce\xac\xce\xb9|\xce\x86\xce\x99||"
"\xe1\xbe\xb6|\xce\xb1\xcd\x82|\xce\x91\xcd\x82||"
"\xe1\xbe\xb7|\xce\xb1\xcd\x82\xce\xb9|\xce\x91\xcd\x82\xce\x99||"
"\xe1\xbe\xbc|\xce\xb1\xce\xb9|\xce\x91\xce\x99|\xe1\xbe\xb3|"
"\xe1\xbe\xbe|\xce\xb9|\xce\x99||"
"\xe1\xbf\x82|\xe1\xbd\xb4\xce\xb9|\xe1\xbf\x8a\xce\x99||"
"\xe1\xbf\x83|\xce\xb7\xce\xb9|\xce\x97\xce\x99||"
"\xe1\xbf\x84|\xce\xae\xce\xb9|\xce\x89\xce\x99||"
"\xe1\xbf\x86|\xce\xb7\xcd\x82|\xce\x97\xcd\x82||"
"\xe1\xbf\x87|\xce\xb7\xcd\x82\xce\xb9|\xce\x97\xcd\x82\xce\x99||"
"\xe1\xbf\x8c|\xce\xb7\xce\xb9|\xce\x97\xce\x99|\xe1\xbf\x83|"
"\xe1\xbf\x92|\xce\xb9\xcc\x88\xcc\x80|\xce\x99\xcc\x88\xcc\x80||"
"\xe1\xbf\x93|\xce\xb9\xcc\x88\xcc\x81|\xce\x99\xcc\x88\xcc\x81||"
"\xe1\xbf\x96|\xce\xb9\xcd\x82|\xce\x99\xcd\x82||"
"\xe1\xbf\x97|\xce\xb9\xcc\x88\xcd\x82|\xce\x99\xcc\x88\xcd\x82||"
"\xe1\xbf\xa2|\xcf\x85\xcc\x88\xcc\x80|\xce\xa5\xcc\x88\xcc\x80||"
"\xe1\xbf\xa3|\xcf\x85\xcc\x88\xcc\x81|\xce\xa5\xcc\x88\xcc\x81||"
"\xe1\xbf\xa4|\xcf\x81\xcc\x93|\xce\xa1\xcc\x93||"
"\xe1\xbf\xa6|\xcf\x85\xcd\x82|\xce\xa5\xcd\x82||"
"\xe1\xbf\xa7|\xcf\x85\xcc\x88\xcd\x82|\xce\xa5\xcc\x88\xcd\x82||"
"\xe1\xbf\xb2|\xe1\xbd\xbc\xce\xb9|\xe1\xbf\xba\xce\x99||"
"\xe1\xbf\xb3|\xcf\x89\xce\xb9|\xce\xa9\xce\x99||"
"\xe1\xbf\xb4|\xcf\x8e\xce\xb9|\xce\x8f\xce\x99||"
"\xe1\xbf\xb6|\xcf\x89\xcd\x82|\xce\xa9\xcd\x82||"
"\xe1\xbf\xb7|\xcf\x89\xcd\x82\xce\xb9|\xce\xa9\xcd\x82\xce\x99||"
"\xe1\xbf\xbc|\xcf\x89\xce\xb9|\xce\xa9\xce\x99|\xe1\xbf\xb3|"
"\xe2\x84\xa6|\xcf\x89||\xcf\x89|"
"\xe2\x84\xaa|k||k|"
"\xe2\x84\xab|\xc3\xa5||\xc3\xa5|"
"\xea\xad\xb0|\xe1\x8e\xa0|\xe1\x8e\xa0||"
"\xea\xad\xb1|\xe1\x8e\xa1|\xe1\x8e\xa1||"
"\xea\xad\xb2|\xe1\x8e\xa2|\xe1\x8e\xa2||"
"\xea\xad\xb3|\xe1\x8e\xa3|\xe1\x8e\xa3||"
"\xea\xad\xb4|\xe1\x8e\xa4|\xe1\x8e\xa4||"
"\xea\xad\xb5|\xe1\x8e\xa5|\xe1\x8e\xa5||"
"\xea\xad\xb6|\xe1\x8e\xa6|\xe1\x8e\xa6||"
"\xea\xad\xb7|\xe1\x8e\xa7|\xe1\x8e\xa7||"
"\xea\xad\xb8|\xe1\x8e\xa8|\xe1\x8e\xa8||"
"\xea\xad\xb9|\xe1\x8e\xa9|\xe1\x8e\xa9||"
"\xea\xad\xba|\xe1\x8e\xaa|\xe1\x8e\xaa||"
"\xea\xad\xbb|\xe1\x8e\xab|\xe1\x8e\xab||"
"\xea\xad\xbc|\xe1\x8e\xac|\xe1\x8e\xac||"
"\xea\xad\xbd|\xe1\x8e\xad|\xe1\x8e\xad||"
"\xea\xad\xbe|\xe1\x8e\xae|\xe1\x8e\xae||"
"\xea\xad\xbf|\xe1\x8e\xaf|\xe1\x8e\xaf||"
"\xea\xae\x80|\xe1\x8e\xb0|\xe1\x8e\xb0||"
"\xea\xae\x81|\xe1\x8e\xb1|\xe1\x8e\xb1||"
"\xea\xae\x82|\xe1\x8e\xb2|\xe1\x8e\xb2||"
"\xea\xae\x83|\xe1\x8e\xb3|\xe1\x8e\xb3||"
"\xea\xae\x84|\xe1\x8e\xb4|\xe1\x8e\xb4||"
"\xea\xae\x85|\xe1\x8e\xb5|\xe1\x8e\xb5||"
"\xea\xae\x86|\xe1\x8e\xb6|\xe1\x8e\xb6||"
"\xea\xae\x87|\xe1\x8e\xb7|\xe1\x8e\xb7||"
"\xea\xae\x88|\xe1\x8e\xb8|\xe1\x8e\xb8||"
"\xea\xae\x89|\xe1\x8e\xb9|\xe1\x8e\xb9||"
"\xea\xae\x8a|\xe1\x8e\xba|\xe1\x8e\xba||"
"\xea\xae\x8b|\xe1\x8e\xbb|\xe1\x8e\xbb||"
"\xea\xae\x8c|\xe1\x8e\xbc|\xe1\x8e\xbc||"
"\xea\xae\x8d|\xe1\x8e\xbd|\xe1\x8e\xbd||"
"\xea\xae\x8e|\xe1\x8e\xbe|\xe1\x8e\xbe||"
"\xea\xae\x8f|\xe1\x8e\xbf|\xe1\x8e\xbf||"
"\xea\xae\x90|\xe1\x8f\x80|\xe1\x8f\x80||"
"\xea\xae\x91|\xe1\x8f\x81|\xe1\x8f\x81||"
"\xea\xae\x92|\xe1\x8f\x82|\xe1\x8f\x82||"
"\xea\xae\x93|\xe1\x8f\x83|\xe1\x8f\x83||"
"\xea\xae\x94|\xe1\x8f\x84|\xe1\x8f\x84||"
"\xea\xae\x95|\xe1\x8f\x85|\xe1\x8f\x85||"
"\xea\xae\x96|\xe1\x8f\x86|\xe1\x8f\x86||"
"\xea\xae\x97|\xe1\x8f\x87|\xe1\x8f\x87||"
"\xea\xae\x98|\xe1\x8f\x88|\xe1\x8f\x88||"
"\xea\xae\x99|\xe1\x8f\x89|\xe1\x8f\x89||"
"\xea\xae\x9a|\xe1\x8f\x8a|\xe1\x8f\x8a||"
"\xea\xae\x9b|\xe1\x8f\x8b|\xe1\x8f\x8b||"
"\xea\xae\x9c|\xe1\x8f\x8c|\xe1\x8f\x8c||"
"\xea\xae\x9d|\xe1\x8f\x8d|\xe1\x8f\x8d||"
"\xea\xae\x9e|\xe1\x8f\x8e|\xe1\x8f\x8e||"
"\xea\xae\x9f|\xe1\x8f\x8f|\xe1\x8f\x8f||"
"\xea\xae\xa0|\xe1\x8f\x90|\xe1\x8f\x90||"
"\xea\xae\xa1|\xe1\x8f\x91|\xe1\x8f\x91||"
"\xea\xae\xa2|\xe1\x8f\x92|\xe1\x8f\x92||"
"\xea\xae\xa3|\xe1\x8f\x93|\xe1\x8f\x93||"
"\xea\xae\xa4|\xe1\x8f\x94|\xe1\x8f\x94||"
"\xea\xae\xa5|\xe1\x8f\x95|\xe1\x8f\x95||"
"\xea\xae\xa6|\xe1\x8f\x96|\xe1\x8f\x96||"
"\xea\xae\xa7|\xe1\x8f\x97|\xe1\x8f\x97||"
"\xea\xae\xa8|\xe1\x8f\x98|\xe1\x8f\x98||"
"\xea\xae\xa9|\xe1\x8f\x99|\xe1\x8f\x99||"
"\xea\xae\xaa|\xe1\x8f\x9a|\xe1\x8f\x9a||"
"\xea\xae\xab|\xe1\x8f\x9b|\xe1\x8f\x9b||"
"\xea\xae\xac|\xe1\x8f\x9c|\xe1\x8f\x9c||"
"\xea\xae\xad|\xe1\x8f\x9d|\xe1\x8f\x9d||"
"\xea\xae\xae|\xe1\x8f\x9e|\xe1\x8f\x9e||"
"\xea\xae\xaf|\xe1\x8f\x9f|\xe1\x8f\x9f||"
"\xea\xae\xb0|\xe1\x8f\xa0|\xe1\x8f\xa0||"
"\xea\xae\xb1|\xe1\x8f\xa1|\xe1\x8f\xa1||"
"\xea\xae\xb2|\xe1\x8f\xa2|\xe1\x8f\xa2||"
"\xea\xae\xb3|\xe1\x8f\xa3|\xe1\x8f\xa3||"
"\xea\xae\xb4|\xe1\x8f\xa4|\xe1\x8f\xa4||"
"\xea\xae\xb5|\xe1\x8f\xa5|\xe1\x8f\xa5||"
"\xea\xae\xb6|\xe1\x8f\xa6|\xe1\x8f\xa6||"
"\xea\xae\xb7|\xe1\x8f\xa7|\xe1\x8f\xa7||"
"\xea\xae\xb8|\xe1\x8f\xa8|\xe1\x8f\xa8||"
"\xea\xae\xb9|\xe1\x8f\xa9|\xe1\x8f\xa9||"
"\xea\xae\xba|\xe1\x8f\xaa|\xe1\x8f\xaa||"
"\xea\xae\xbb|\xe1\x8f\xab|\xe1\x8f\xab||"
"\xea\xae\xbc|\xe1\x8f\xac|\xe1\x8f\xac||"
"\xea\xae\xbd|\xe1\x8f\xad|\xe1\x8f\xad||"
"\xea\xae\xbe|\xe1\x8f\xae|\xe1\x8f\xae||"
"\xea\xae\xbf|\xe1\x8f\xaf|\xe1\x8f\xaf||"
"\xef\xac\x80|ff|FF||"
"\xef\xac\x81|fi|FI||"
"\xef\xac\x82|fl|FL||"
"\xef\xac\x83|ffi|FFI||"
"\xef\xac\x84|ffl|FFL||"
"\xef\xac\x85|st|ST||"
"\xef\xac\x86|st|ST||"
"\xef\xac\x93|\xd5\xb4\xd5\xb6|\xd5\x84\xd5\x86||"
"\xef\xac\x94|\xd5\xb4\xd5\xa5|\xd5\x84\xd4\xb5||"
"\xef\xac\x95|\xd5\xb4\xd5\xab|\xd5\x84\xd4\xbb||"
"\xef\xac\x96|\xd5\xbe\xd5\xb6|\xd5\x8e\xd5\x86||"
"\xef\xac\x97|\xd5\xb4\xd5\xad|\xd5\x84\xd4\xbd||"

//--Autogenerated -- end of section automatically generated
;
570
571 class CaseConverter : public ICaseConverter {
572 // Maximum length of a case conversion result is 6 bytes in UTF-8
573 enum { maxConversionLength=6 };
574 struct ConversionString {
575 char conversion[maxConversionLength+1];
ConversionString__anon6a72038d0111::CaseConverter::ConversionString576 ConversionString() noexcept : conversion{} {
577 }
578 };
579 // Conversions are initially store in a vector of structs but then decomposed into
580 // parallel arrays as that is about 10% faster to search.
581 struct CharacterConversion {
582 int character;
583 ConversionString conversion;
CharacterConversion__anon6a72038d0111::CaseConverter::CharacterConversion584 CharacterConversion() noexcept : character(0) {
585 // Empty case: NUL -> "".
586 }
CharacterConversion__anon6a72038d0111::CaseConverter::CharacterConversion587 CharacterConversion(int character_=0, const char *conversion_="") noexcept : character(character_) {
588 StringCopy(conversion.conversion, conversion_);
589 }
operator <__anon6a72038d0111::CaseConverter::CharacterConversion590 bool operator<(const CharacterConversion &other) const noexcept {
591 return character < other.character;
592 }
593 };
594 typedef std::vector<CharacterConversion> CharacterToConversion;
595 CharacterToConversion characterToConversion;
596 // The parallel arrays
597 std::vector<int> characters;
598 std::vector<ConversionString> conversions;
599
600 public:
CaseConverter()601 CaseConverter() noexcept {
602 }
603 virtual ~CaseConverter() = default;
Initialised() const604 bool Initialised() const noexcept {
605 return !characters.empty();
606 }
Add(int character,const char * conversion)607 void Add(int character, const char *conversion) {
608 characterToConversion.emplace_back(character, conversion);
609 }
Find(int character)610 const char *Find(int character) {
611 const std::vector<int>::iterator it = std::lower_bound(characters.begin(), characters.end(), character);
612 if (it == characters.end())
613 return nullptr;
614 else if (*it == character)
615 return conversions[it - characters.begin()].conversion;
616 else
617 return nullptr;
618 }
CaseConvertString(char * converted,size_t sizeConverted,const char * mixed,size_t lenMixed)619 size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed) override {
620 size_t lenConverted = 0;
621 size_t mixedPos = 0;
622 unsigned char bytes[UTF8MaxBytes + 1]{};
623 while (mixedPos < lenMixed) {
624 const unsigned char leadByte = mixed[mixedPos];
625 const char *caseConverted = nullptr;
626 size_t lenMixedChar = 1;
627 if (UTF8IsAscii(leadByte)) {
628 caseConverted = Find(leadByte);
629 } else {
630 bytes[0] = leadByte;
631 const int widthCharBytes = UTF8BytesOfLead[leadByte];
632 for (int b=1; b<widthCharBytes; b++) {
633 bytes[b] = (mixedPos+b < lenMixed) ? mixed[mixedPos+b] : 0;
634 }
635 const int classified = UTF8Classify(bytes, widthCharBytes);
636 if (!(classified & UTF8MaskInvalid)) {
637 // valid UTF-8
638 lenMixedChar = classified & UTF8MaskWidth;
639 const int character = UnicodeFromUTF8(bytes);
640 caseConverted = Find(character);
641 }
642 }
643 if (caseConverted) {
644 // Character has a conversion so copy that conversion in
645 while (*caseConverted) {
646 converted[lenConverted++] = *caseConverted++;
647 if (lenConverted >= sizeConverted)
648 return 0;
649 }
650 } else {
651 // Character has no conversion so copy the input to output
652 for (size_t i=0; i<lenMixedChar; i++) {
653 converted[lenConverted++] = mixed[mixedPos+i];
654 if (lenConverted >= sizeConverted)
655 return 0;
656 }
657 }
658 mixedPos += lenMixedChar;
659 }
660 return lenConverted;
661 }
FinishedAdding()662 void FinishedAdding() {
663 std::sort(characterToConversion.begin(), characterToConversion.end());
664 characters.reserve(characterToConversion.size());
665 conversions.reserve(characterToConversion.size());
666 for (const CharacterConversion &chConv : characterToConversion) {
667 characters.push_back(chConv.character);
668 conversions.push_back(chConv.conversion);
669 }
670 // Empty the original calculated data completely
671 CharacterToConversion().swap(characterToConversion);
672 }
673 };
674
675 CaseConverter caseConvFold;
676 CaseConverter caseConvUp;
677 CaseConverter caseConvLow;
678
AddSymmetric(enum CaseConversion conversion,int lower,int upper)679 void AddSymmetric(enum CaseConversion conversion, int lower,int upper) {
680 char lowerUTF8[UTF8MaxBytes+1];
681 UTF8FromUTF32Character(lower, lowerUTF8);
682 char upperUTF8[UTF8MaxBytes+1];
683 UTF8FromUTF32Character(upper, upperUTF8);
684
685 switch (conversion) {
686 case CaseConversionFold:
687 caseConvFold.Add(upper, lowerUTF8);
688 break;
689 case CaseConversionUpper:
690 caseConvUp.Add(lower, upperUTF8);
691 break;
692 case CaseConversionLower:
693 caseConvLow.Add(upper, lowerUTF8);
694 break;
695 }
696 }
697
SetupConversions(enum CaseConversion conversion)698 void SetupConversions(enum CaseConversion conversion) {
699 // First initialize for the symmetric ranges
700 for (size_t i=0; i<Sci::size(symmetricCaseConversionRanges);) {
701 const int lower = symmetricCaseConversionRanges[i++];
702 const int upper = symmetricCaseConversionRanges[i++];
703 const int length = symmetricCaseConversionRanges[i++];
704 const int pitch = symmetricCaseConversionRanges[i++];
705 for (int j=0; j<length*pitch; j+=pitch) {
706 AddSymmetric(conversion, lower+j, upper+j);
707 }
708 }
709 // Add the symmetric singletons
710 for (size_t i=0; i<Sci::size(symmetricCaseConversions);) {
711 const int lower = symmetricCaseConversions[i++];
712 const int upper = symmetricCaseConversions[i++];
713 AddSymmetric(conversion, lower, upper);
714 }
715 // Add the complex cases
716 const char *sComplex = complexCaseConversions;
717 while (*sComplex) {
718 // Longest ligature is 3 character so 5 for safety
719 constexpr size_t lenUTF8 = 5*UTF8MaxBytes+1;
720 unsigned char originUTF8[lenUTF8]{};
721 char foldedUTF8[lenUTF8]{};
722 char lowerUTF8[lenUTF8]{};
723 char upperUTF8[lenUTF8]{};
724 size_t i = 0;
725 while (*sComplex && *sComplex != '|') {
726 originUTF8[i++] = *sComplex;
727 sComplex++;
728 }
729 sComplex++;
730 originUTF8[i] = 0;
731 i = 0;
732 while (*sComplex && *sComplex != '|') {
733 foldedUTF8[i++] = *sComplex;
734 sComplex++;
735 }
736 sComplex++;
737 foldedUTF8[i] = 0;
738 i = 0;
739 while (*sComplex && *sComplex != '|') {
740 upperUTF8[i++] = *sComplex;
741 sComplex++;
742 }
743 sComplex++;
744 upperUTF8[i] = 0;
745 i = 0;
746 while (*sComplex && *sComplex != '|') {
747 lowerUTF8[i++] = *sComplex;
748 sComplex++;
749 }
750 sComplex++;
751 lowerUTF8[i] = 0;
752
753 const int character = UnicodeFromUTF8(originUTF8);
754
755 if (conversion == CaseConversionFold && foldedUTF8[0]) {
756 caseConvFold.Add(character, foldedUTF8);
757 }
758
759 if (conversion == CaseConversionUpper && upperUTF8[0]) {
760 caseConvUp.Add(character, upperUTF8);
761 }
762
763 if (conversion == CaseConversionLower && lowerUTF8[0]) {
764 caseConvLow.Add(character, lowerUTF8);
765 }
766 }
767
768 switch (conversion) {
769 case CaseConversionFold:
770 caseConvFold.FinishedAdding();
771 break;
772 case CaseConversionUpper:
773 caseConvUp.FinishedAdding();
774 break;
775 case CaseConversionLower:
776 caseConvLow.FinishedAdding();
777 break;
778 }
779 }
780
ConverterForConversion(enum CaseConversion conversion)781 CaseConverter *ConverterForConversion(enum CaseConversion conversion) noexcept {
782 switch (conversion) {
783 case CaseConversionFold:
784 return &caseConvFold;
785 case CaseConversionUpper:
786 return &caseConvUp;
787 case CaseConversionLower:
788 return &caseConvLow;
789 }
790 return nullptr;
791 }
792
793 }
794
795 namespace Scintilla {
796
ConverterFor(enum CaseConversion conversion)797 ICaseConverter *ConverterFor(enum CaseConversion conversion) {
798 CaseConverter *pCaseConv = ConverterForConversion(conversion);
799 if (!pCaseConv->Initialised())
800 SetupConversions(conversion);
801 return pCaseConv;
802 }
803
CaseConvert(int character,enum CaseConversion conversion)804 const char *CaseConvert(int character, enum CaseConversion conversion) {
805 CaseConverter *pCaseConv = ConverterForConversion(conversion);
806 if (!pCaseConv->Initialised())
807 SetupConversions(conversion);
808 return pCaseConv->Find(character);
809 }
810
CaseConvertString(char * converted,size_t sizeConverted,const char * mixed,size_t lenMixed,enum CaseConversion conversion)811 size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed, enum CaseConversion conversion) {
812 CaseConverter *pCaseConv = ConverterForConversion(conversion);
813 if (!pCaseConv->Initialised())
814 SetupConversions(conversion);
815 return pCaseConv->CaseConvertString(converted, sizeConverted, mixed, lenMixed);
816 }
817
CaseConvertString(const std::string & s,enum CaseConversion conversion)818 std::string CaseConvertString(const std::string &s, enum CaseConversion conversion) {
819 std::string retMapped(s.length() * maxExpansionCaseConversion, 0);
820 const size_t lenMapped = CaseConvertString(&retMapped[0], retMapped.length(), s.c_str(), s.length(),
821 conversion);
822 retMapped.resize(lenMapped);
823 return retMapped;
824 }
825
826 }
827