1 // Scintilla source code edit control
2 // Encoding: UTF-8
3 /** @file CaseConvert.cxx
4  ** Case fold characters and convert them to upper or lower case.
5  ** Tables automatically regenerated by scripts/GenerateCaseConvert.py
6  ** Should only be rarely regenerated for new versions of Unicode.
7  **/
8 // Copyright 2013 by Neil Hodgson <neilh@scintilla.org>
9 // The License.txt file describes the conditions under which this software may be distributed.
10 
11 #include <cstring>
12 
13 #include <vector>
14 #include <algorithm>
15 
16 #include "CaseConvert.h"
17 #include "UniConversion.h"
18 #include "UnicodeFromUTF8.h"
19 
20 #ifdef SCI_NAMESPACE
21 using namespace Scintilla;
22 #endif
23 
24 namespace {
25 	// Use an unnamed namespace to protect the declarations from name conflicts
26 
// Unicode code points are ordered by groups and follow patterns.
// Most characters (pitch==1) are in ranges for a particular alphabet and their
// upper case forms are a fixed distance away.
// Another pattern (pitch==2) is where each lower case letter is preceded by
// the upper case form. These are also grouped into ranges.
//
// Each row holds four values: lower, upper, range length, range pitch.
// A row stands for 'length' pairs (lower + k*pitch, upper + k*pitch) with k in [0, length).
// The table is only ever read, so it is const.

const int symmetricCaseConversionRanges[] = {
//lower, upper, range length, range pitch
//++Autogenerated -- start of section automatically generated
//**\(\*\n\)
97,65,26,1,
224,192,23,1,
248,216,7,1,
257,256,24,2,
314,313,8,2,
331,330,23,2,
462,461,8,2,
479,478,9,2,
505,504,20,2,
547,546,9,2,
583,582,5,2,
945,913,17,1,
963,931,9,1,
985,984,12,2,
1072,1040,32,1,
1104,1024,16,1,
1121,1120,17,2,
1163,1162,27,2,
1218,1217,7,2,
1233,1232,44,2,
1377,1329,38,1,
7681,7680,75,2,
7841,7840,48,2,
7936,7944,8,1,
7952,7960,6,1,
7968,7976,8,1,
7984,7992,8,1,
8000,8008,6,1,
8032,8040,8,1,
8560,8544,16,1,
9424,9398,26,1,
11312,11264,47,1,
11393,11392,50,2,
11520,4256,38,1,
42561,42560,23,2,
42625,42624,12,2,
42787,42786,7,2,
42803,42802,31,2,
42879,42878,5,2,
42913,42912,5,2,
65345,65313,26,1,
66600,66560,40,1,

//--Autogenerated -- end of section automatically generated
};
82 
// Code points that are symmetric but don't fit into a range of similar characters
// are listed here.
// Each row is a single (lower, upper) pair of code points.
// The table is only ever read, so it is const.

const int symmetricCaseConversions[] = {
//lower, upper
//++Autogenerated -- start of section automatically generated
//**1 \(\*\n\)
255,376,
307,306,
309,308,
311,310,
378,377,
380,379,
382,381,
384,579,
387,386,
389,388,
392,391,
396,395,
402,401,
405,502,
409,408,
410,573,
414,544,
417,416,
419,418,
421,420,
424,423,
429,428,
432,431,
436,435,
438,437,
441,440,
445,444,
447,503,
454,452,
457,455,
460,458,
477,398,
499,497,
501,500,
572,571,
575,11390,
576,11391,
578,577,
592,11375,
593,11373,
594,11376,
595,385,
596,390,
598,393,
599,394,
601,399,
603,400,
608,403,
611,404,
613,42893,
614,42922,
616,407,
617,406,
619,11362,
623,412,
625,11374,
626,413,
629,415,
637,11364,
640,422,
643,425,
648,430,
649,580,
650,433,
651,434,
652,581,
658,439,
881,880,
883,882,
887,886,
891,1021,
892,1022,
893,1023,
940,902,
941,904,
942,905,
943,906,
972,908,
973,910,
974,911,
983,975,
1010,1017,
1016,1015,
1019,1018,
1231,1216,
7545,42877,
7549,11363,
8017,8025,
8019,8027,
8021,8029,
8023,8031,
8048,8122,
8049,8123,
8050,8136,
8051,8137,
8052,8138,
8053,8139,
8054,8154,
8055,8155,
8056,8184,
8057,8185,
8058,8170,
8059,8171,
8060,8186,
8061,8187,
8112,8120,
8113,8121,
8144,8152,
8145,8153,
8160,8168,
8161,8169,
8165,8172,
8526,8498,
8580,8579,
11361,11360,
11365,570,
11366,574,
11368,11367,
11370,11369,
11372,11371,
11379,11378,
11382,11381,
11500,11499,
11502,11501,
11507,11506,
11559,4295,
11565,4301,
42874,42873,
42876,42875,
42892,42891,
42897,42896,
42899,42898,

//--Autogenerated -- end of section automatically generated
};
225 
226 // Characters that have complex case conversions are listed here.
227 // This includes cases where more than one character is needed for a conversion,
228 // folding is different to lowering, or (as appropriate) upper(lower(x)) != x or
229 // lower(upper(x)) != x.
230 
231 const char *complexCaseConversions =
232 // Original | Folded | Upper | Lower |
233 //++Autogenerated -- start of section automatically generated
234 //**2 \(\*\n\)
235 "\xc2\xb5|\xce\xbc|\xce\x9c||"
236 "\xc3\x9f|ss|SS||"
237 "\xc4\xb0|i\xcc\x87||i\xcc\x87|"
238 "\xc4\xb1||I||"
239 "\xc5\x89|\xca\xbcn|\xca\xbcN||"
240 "\xc5\xbf|s|S||"
241 "\xc7\x85|\xc7\x86|\xc7\x84|\xc7\x86|"
242 "\xc7\x88|\xc7\x89|\xc7\x87|\xc7\x89|"
243 "\xc7\x8b|\xc7\x8c|\xc7\x8a|\xc7\x8c|"
244 "\xc7\xb0|j\xcc\x8c|J\xcc\x8c||"
245 "\xc7\xb2|\xc7\xb3|\xc7\xb1|\xc7\xb3|"
246 "\xcd\x85|\xce\xb9|\xce\x99||"
247 "\xce\x90|\xce\xb9\xcc\x88\xcc\x81|\xce\x99\xcc\x88\xcc\x81||"
248 "\xce\xb0|\xcf\x85\xcc\x88\xcc\x81|\xce\xa5\xcc\x88\xcc\x81||"
249 "\xcf\x82|\xcf\x83|\xce\xa3||"
250 "\xcf\x90|\xce\xb2|\xce\x92||"
251 "\xcf\x91|\xce\xb8|\xce\x98||"
252 "\xcf\x95|\xcf\x86|\xce\xa6||"
253 "\xcf\x96|\xcf\x80|\xce\xa0||"
254 "\xcf\xb0|\xce\xba|\xce\x9a||"
255 "\xcf\xb1|\xcf\x81|\xce\xa1||"
256 "\xcf\xb4|\xce\xb8||\xce\xb8|"
257 "\xcf\xb5|\xce\xb5|\xce\x95||"
258 "\xd6\x87|\xd5\xa5\xd6\x82|\xd4\xb5\xd5\x92||"
259 "\xe1\xba\x96|h\xcc\xb1|H\xcc\xb1||"
260 "\xe1\xba\x97|t\xcc\x88|T\xcc\x88||"
261 "\xe1\xba\x98|w\xcc\x8a|W\xcc\x8a||"
262 "\xe1\xba\x99|y\xcc\x8a|Y\xcc\x8a||"
263 "\xe1\xba\x9a|a\xca\xbe|A\xca\xbe||"
264 "\xe1\xba\x9b|\xe1\xb9\xa1|\xe1\xb9\xa0||"
265 "\xe1\xba\x9e|ss||\xc3\x9f|"
266 "\xe1\xbd\x90|\xcf\x85\xcc\x93|\xce\xa5\xcc\x93||"
267 "\xe1\xbd\x92|\xcf\x85\xcc\x93\xcc\x80|\xce\xa5\xcc\x93\xcc\x80||"
268 "\xe1\xbd\x94|\xcf\x85\xcc\x93\xcc\x81|\xce\xa5\xcc\x93\xcc\x81||"
269 "\xe1\xbd\x96|\xcf\x85\xcc\x93\xcd\x82|\xce\xa5\xcc\x93\xcd\x82||"
270 "\xe1\xbe\x80|\xe1\xbc\x80\xce\xb9|\xe1\xbc\x88\xce\x99||"
271 "\xe1\xbe\x81|\xe1\xbc\x81\xce\xb9|\xe1\xbc\x89\xce\x99||"
272 "\xe1\xbe\x82|\xe1\xbc\x82\xce\xb9|\xe1\xbc\x8a\xce\x99||"
273 "\xe1\xbe\x83|\xe1\xbc\x83\xce\xb9|\xe1\xbc\x8b\xce\x99||"
274 "\xe1\xbe\x84|\xe1\xbc\x84\xce\xb9|\xe1\xbc\x8c\xce\x99||"
275 "\xe1\xbe\x85|\xe1\xbc\x85\xce\xb9|\xe1\xbc\x8d\xce\x99||"
276 "\xe1\xbe\x86|\xe1\xbc\x86\xce\xb9|\xe1\xbc\x8e\xce\x99||"
277 "\xe1\xbe\x87|\xe1\xbc\x87\xce\xb9|\xe1\xbc\x8f\xce\x99||"
278 "\xe1\xbe\x88|\xe1\xbc\x80\xce\xb9|\xe1\xbc\x88\xce\x99|\xe1\xbe\x80|"
279 "\xe1\xbe\x89|\xe1\xbc\x81\xce\xb9|\xe1\xbc\x89\xce\x99|\xe1\xbe\x81|"
280 "\xe1\xbe\x8a|\xe1\xbc\x82\xce\xb9|\xe1\xbc\x8a\xce\x99|\xe1\xbe\x82|"
281 "\xe1\xbe\x8b|\xe1\xbc\x83\xce\xb9|\xe1\xbc\x8b\xce\x99|\xe1\xbe\x83|"
282 "\xe1\xbe\x8c|\xe1\xbc\x84\xce\xb9|\xe1\xbc\x8c\xce\x99|\xe1\xbe\x84|"
283 "\xe1\xbe\x8d|\xe1\xbc\x85\xce\xb9|\xe1\xbc\x8d\xce\x99|\xe1\xbe\x85|"
284 "\xe1\xbe\x8e|\xe1\xbc\x86\xce\xb9|\xe1\xbc\x8e\xce\x99|\xe1\xbe\x86|"
285 "\xe1\xbe\x8f|\xe1\xbc\x87\xce\xb9|\xe1\xbc\x8f\xce\x99|\xe1\xbe\x87|"
286 "\xe1\xbe\x90|\xe1\xbc\xa0\xce\xb9|\xe1\xbc\xa8\xce\x99||"
287 "\xe1\xbe\x91|\xe1\xbc\xa1\xce\xb9|\xe1\xbc\xa9\xce\x99||"
288 "\xe1\xbe\x92|\xe1\xbc\xa2\xce\xb9|\xe1\xbc\xaa\xce\x99||"
289 "\xe1\xbe\x93|\xe1\xbc\xa3\xce\xb9|\xe1\xbc\xab\xce\x99||"
290 "\xe1\xbe\x94|\xe1\xbc\xa4\xce\xb9|\xe1\xbc\xac\xce\x99||"
291 "\xe1\xbe\x95|\xe1\xbc\xa5\xce\xb9|\xe1\xbc\xad\xce\x99||"
292 "\xe1\xbe\x96|\xe1\xbc\xa6\xce\xb9|\xe1\xbc\xae\xce\x99||"
293 "\xe1\xbe\x97|\xe1\xbc\xa7\xce\xb9|\xe1\xbc\xaf\xce\x99||"
294 "\xe1\xbe\x98|\xe1\xbc\xa0\xce\xb9|\xe1\xbc\xa8\xce\x99|\xe1\xbe\x90|"
295 "\xe1\xbe\x99|\xe1\xbc\xa1\xce\xb9|\xe1\xbc\xa9\xce\x99|\xe1\xbe\x91|"
296 "\xe1\xbe\x9a|\xe1\xbc\xa2\xce\xb9|\xe1\xbc\xaa\xce\x99|\xe1\xbe\x92|"
297 "\xe1\xbe\x9b|\xe1\xbc\xa3\xce\xb9|\xe1\xbc\xab\xce\x99|\xe1\xbe\x93|"
298 "\xe1\xbe\x9c|\xe1\xbc\xa4\xce\xb9|\xe1\xbc\xac\xce\x99|\xe1\xbe\x94|"
299 "\xe1\xbe\x9d|\xe1\xbc\xa5\xce\xb9|\xe1\xbc\xad\xce\x99|\xe1\xbe\x95|"
300 "\xe1\xbe\x9e|\xe1\xbc\xa6\xce\xb9|\xe1\xbc\xae\xce\x99|\xe1\xbe\x96|"
301 "\xe1\xbe\x9f|\xe1\xbc\xa7\xce\xb9|\xe1\xbc\xaf\xce\x99|\xe1\xbe\x97|"
302 "\xe1\xbe\xa0|\xe1\xbd\xa0\xce\xb9|\xe1\xbd\xa8\xce\x99||"
303 "\xe1\xbe\xa1|\xe1\xbd\xa1\xce\xb9|\xe1\xbd\xa9\xce\x99||"
304 "\xe1\xbe\xa2|\xe1\xbd\xa2\xce\xb9|\xe1\xbd\xaa\xce\x99||"
305 "\xe1\xbe\xa3|\xe1\xbd\xa3\xce\xb9|\xe1\xbd\xab\xce\x99||"
306 "\xe1\xbe\xa4|\xe1\xbd\xa4\xce\xb9|\xe1\xbd\xac\xce\x99||"
307 "\xe1\xbe\xa5|\xe1\xbd\xa5\xce\xb9|\xe1\xbd\xad\xce\x99||"
308 "\xe1\xbe\xa6|\xe1\xbd\xa6\xce\xb9|\xe1\xbd\xae\xce\x99||"
309 "\xe1\xbe\xa7|\xe1\xbd\xa7\xce\xb9|\xe1\xbd\xaf\xce\x99||"
310 "\xe1\xbe\xa8|\xe1\xbd\xa0\xce\xb9|\xe1\xbd\xa8\xce\x99|\xe1\xbe\xa0|"
311 "\xe1\xbe\xa9|\xe1\xbd\xa1\xce\xb9|\xe1\xbd\xa9\xce\x99|\xe1\xbe\xa1|"
312 "\xe1\xbe\xaa|\xe1\xbd\xa2\xce\xb9|\xe1\xbd\xaa\xce\x99|\xe1\xbe\xa2|"
313 "\xe1\xbe\xab|\xe1\xbd\xa3\xce\xb9|\xe1\xbd\xab\xce\x99|\xe1\xbe\xa3|"
314 "\xe1\xbe\xac|\xe1\xbd\xa4\xce\xb9|\xe1\xbd\xac\xce\x99|\xe1\xbe\xa4|"
315 "\xe1\xbe\xad|\xe1\xbd\xa5\xce\xb9|\xe1\xbd\xad\xce\x99|\xe1\xbe\xa5|"
316 "\xe1\xbe\xae|\xe1\xbd\xa6\xce\xb9|\xe1\xbd\xae\xce\x99|\xe1\xbe\xa6|"
317 "\xe1\xbe\xaf|\xe1\xbd\xa7\xce\xb9|\xe1\xbd\xaf\xce\x99|\xe1\xbe\xa7|"
318 "\xe1\xbe\xb2|\xe1\xbd\xb0\xce\xb9|\xe1\xbe\xba\xce\x99||"
319 "\xe1\xbe\xb3|\xce\xb1\xce\xb9|\xce\x91\xce\x99||"
320 "\xe1\xbe\xb4|\xce\xac\xce\xb9|\xce\x86\xce\x99||"
321 "\xe1\xbe\xb6|\xce\xb1\xcd\x82|\xce\x91\xcd\x82||"
322 "\xe1\xbe\xb7|\xce\xb1\xcd\x82\xce\xb9|\xce\x91\xcd\x82\xce\x99||"
323 "\xe1\xbe\xbc|\xce\xb1\xce\xb9|\xce\x91\xce\x99|\xe1\xbe\xb3|"
324 "\xe1\xbe\xbe|\xce\xb9|\xce\x99||"
325 "\xe1\xbf\x82|\xe1\xbd\xb4\xce\xb9|\xe1\xbf\x8a\xce\x99||"
326 "\xe1\xbf\x83|\xce\xb7\xce\xb9|\xce\x97\xce\x99||"
327 "\xe1\xbf\x84|\xce\xae\xce\xb9|\xce\x89\xce\x99||"
328 "\xe1\xbf\x86|\xce\xb7\xcd\x82|\xce\x97\xcd\x82||"
329 "\xe1\xbf\x87|\xce\xb7\xcd\x82\xce\xb9|\xce\x97\xcd\x82\xce\x99||"
330 "\xe1\xbf\x8c|\xce\xb7\xce\xb9|\xce\x97\xce\x99|\xe1\xbf\x83|"
331 "\xe1\xbf\x92|\xce\xb9\xcc\x88\xcc\x80|\xce\x99\xcc\x88\xcc\x80||"
332 "\xe1\xbf\x93|\xce\xb9\xcc\x88\xcc\x81|\xce\x99\xcc\x88\xcc\x81||"
333 "\xe1\xbf\x96|\xce\xb9\xcd\x82|\xce\x99\xcd\x82||"
334 "\xe1\xbf\x97|\xce\xb9\xcc\x88\xcd\x82|\xce\x99\xcc\x88\xcd\x82||"
335 "\xe1\xbf\xa2|\xcf\x85\xcc\x88\xcc\x80|\xce\xa5\xcc\x88\xcc\x80||"
336 "\xe1\xbf\xa3|\xcf\x85\xcc\x88\xcc\x81|\xce\xa5\xcc\x88\xcc\x81||"
337 "\xe1\xbf\xa4|\xcf\x81\xcc\x93|\xce\xa1\xcc\x93||"
338 "\xe1\xbf\xa6|\xcf\x85\xcd\x82|\xce\xa5\xcd\x82||"
339 "\xe1\xbf\xa7|\xcf\x85\xcc\x88\xcd\x82|\xce\xa5\xcc\x88\xcd\x82||"
340 "\xe1\xbf\xb2|\xe1\xbd\xbc\xce\xb9|\xe1\xbf\xba\xce\x99||"
341 "\xe1\xbf\xb3|\xcf\x89\xce\xb9|\xce\xa9\xce\x99||"
342 "\xe1\xbf\xb4|\xcf\x8e\xce\xb9|\xce\x8f\xce\x99||"
343 "\xe1\xbf\xb6|\xcf\x89\xcd\x82|\xce\xa9\xcd\x82||"
344 "\xe1\xbf\xb7|\xcf\x89\xcd\x82\xce\xb9|\xce\xa9\xcd\x82\xce\x99||"
345 "\xe1\xbf\xbc|\xcf\x89\xce\xb9|\xce\xa9\xce\x99|\xe1\xbf\xb3|"
346 "\xe2\x84\xa6|\xcf\x89||\xcf\x89|"
347 "\xe2\x84\xaa|k||k|"
348 "\xe2\x84\xab|\xc3\xa5||\xc3\xa5|"
349 "\xef\xac\x80|ff|FF||"
350 "\xef\xac\x81|fi|FI||"
351 "\xef\xac\x82|fl|FL||"
352 "\xef\xac\x83|ffi|FFI||"
353 "\xef\xac\x84|ffl|FFL||"
354 "\xef\xac\x85|st|ST||"
355 "\xef\xac\x86|st|ST||"
356 "\xef\xac\x93|\xd5\xb4\xd5\xb6|\xd5\x84\xd5\x86||"
357 "\xef\xac\x94|\xd5\xb4\xd5\xa5|\xd5\x84\xd4\xb5||"
358 "\xef\xac\x95|\xd5\xb4\xd5\xab|\xd5\x84\xd4\xbb||"
359 "\xef\xac\x96|\xd5\xbe\xd5\xb6|\xd5\x8e\xd5\x86||"
360 "\xef\xac\x97|\xd5\xb4\xd5\xad|\xd5\x84\xd4\xbd||"
361 
362 //--Autogenerated -- end of section automatically generated
363 ;
364 
365 class CaseConverter : public ICaseConverter {
366 	// Maximum length of a case conversion result is 6 bytes in UTF-8
367 	enum { maxConversionLength=6 };
368 	struct ConversionString {
369 		char conversion[maxConversionLength+1];
370 	};
371 	// Conversions are initially store in a vector of structs but then decomposed into
372 	// parallel arrays as that is about 10% faster to search.
373 	struct CharacterConversion {
374 		int character;
375 		ConversionString conversion;
CharacterConversion__anon03f3b3130111::CaseConverter::CharacterConversion376 		CharacterConversion(int character_=0, const char *conversion_="") : character(character_) {
377 			strcpy(conversion.conversion, conversion_);
378 		}
operator <__anon03f3b3130111::CaseConverter::CharacterConversion379 		bool operator<(const CharacterConversion &other) const {
380 			return character < other.character;
381 		}
382 	};
383 	typedef std::vector<CharacterConversion> CharacterToConversion;
384 	CharacterToConversion characterToConversion;
385 	// The parallel arrays
386 	std::vector<int> characters;
387 	std::vector<ConversionString> conversions;
388 
389 public:
CaseConverter()390 	CaseConverter() {
391 	}
Initialised() const392 	bool Initialised() const {
393 		return characters.size() > 0;
394 	}
Add(int character,const char * conversion)395 	void Add(int character, const char *conversion) {
396 		characterToConversion.push_back(CharacterConversion(character, conversion));
397 	}
Find(int character)398 	const char *Find(int character) {
399 		const std::vector<int>::iterator it = std::lower_bound(characters.begin(), characters.end(), character);
400 		if (it == characters.end())
401 			return 0;
402 		else if (*it == character)
403 			return conversions[it - characters.begin()].conversion;
404 		else
405 			return 0;
406 	}
CaseConvertString(char * converted,size_t sizeConverted,const char * mixed,size_t lenMixed)407 	size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed) {
408 		size_t lenConverted = 0;
409 		size_t mixedPos = 0;
410 		unsigned char bytes[UTF8MaxBytes + 1];
411 		while (mixedPos < lenMixed) {
412 			const unsigned char leadByte = static_cast<unsigned char>(mixed[mixedPos]);
413 			const char *caseConverted = 0;
414 			size_t lenMixedChar = 1;
415 			if (UTF8IsAscii(leadByte)) {
416 				caseConverted = Find(leadByte);
417 			} else {
418 				bytes[0] = leadByte;
419 				const int widthCharBytes = UTF8BytesOfLead[leadByte];
420 				for (int b=1; b<widthCharBytes; b++) {
421 					bytes[b] = (mixedPos+b < lenMixed) ? mixed[mixedPos+b] : 0;
422 				}
423 				int classified = UTF8Classify(bytes, widthCharBytes);
424 				if (!(classified & UTF8MaskInvalid)) {
425 					// valid UTF-8
426 					lenMixedChar = classified & UTF8MaskWidth;
427 					int character = UnicodeFromUTF8(bytes);
428 					caseConverted = Find(character);
429 				}
430 			}
431 			if (caseConverted) {
432 				// Character has a conversion so copy that conversion in
433 				while (*caseConverted) {
434 					converted[lenConverted++] = *caseConverted++;
435 					if (lenConverted >= sizeConverted)
436 						return 0;
437 				}
438 			} else {
439 				// Character has no conversion so copy the input to output
440 				for (size_t i=0; i<lenMixedChar; i++) {
441 					converted[lenConverted++] = mixed[mixedPos+i];
442 					if (lenConverted >= sizeConverted)
443 						return 0;
444 				}
445 			}
446 			mixedPos += lenMixedChar;
447 		}
448 		return lenConverted;
449 	}
FinishedAdding()450 	void FinishedAdding() {
451 		std::sort(characterToConversion.begin(), characterToConversion.end());
452 		characters.reserve(characterToConversion.size());
453 		conversions.reserve(characterToConversion.size());
454 		for (CharacterToConversion::iterator it = characterToConversion.begin(); it != characterToConversion.end(); ++it) {
455 			characters.push_back(it->character);
456 			conversions.push_back(it->conversion);
457 		}
458 		// Empty the original calculated data completely
459 		CharacterToConversion().swap(characterToConversion);
460 	}
461 };
462 
463 CaseConverter caseConvFold;
464 CaseConverter caseConvUp;
465 CaseConverter caseConvLow;
466 
// Encode the code point uch as UTF-8 into putf and append a NUL terminator.
// putf must have room for up to 4 encoded bytes plus the terminator.
void UTF8FromUTF32Character(int uch, char *putf) {
	// Choose how many trail bytes follow the lead byte and which marker bits the lead byte carries
	int trailBytes = 0;
	int leadMarker = 0;
	if (uch >= 0x10000) {
		trailBytes = 3;
		leadMarker = 0xF0;
	} else if (uch >= 0x800) {
		trailBytes = 2;
		leadMarker = 0xE0;
	} else if (uch >= 0x80) {
		trailBytes = 1;
		leadMarker = 0xC0;
	}
	char *out = putf;
	// The lead byte holds the bits above those carried by the trail bytes
	*out++ = static_cast<char>(leadMarker | (uch >> (6 * trailBytes)));
	// Each trail byte carries 6 bits, most significant group first
	for (int shift = 6 * (trailBytes - 1); shift >= 0; shift -= 6) {
		*out++ = static_cast<char>(0x80 | ((uch >> shift) & 0x3f));
	}
	*out = 0;
}
486 
AddSymmetric(enum CaseConversion conversion,int lower,int upper)487 void AddSymmetric(enum CaseConversion conversion, int lower,int upper) {
488 	char lowerUTF8[UTF8MaxBytes+1];
489 	UTF8FromUTF32Character(lower, lowerUTF8);
490 	char upperUTF8[UTF8MaxBytes+1];
491 	UTF8FromUTF32Character(upper, upperUTF8);
492 
493 	switch (conversion) {
494 	case CaseConversionFold:
495 		caseConvFold.Add(upper, lowerUTF8);
496 		break;
497 	case CaseConversionUpper:
498 		caseConvUp.Add(lower, upperUTF8);
499 		break;
500 	case CaseConversionLower:
501 		caseConvLow.Add(upper, lowerUTF8);
502 		break;
503 	}
504 }
505 
SetupConversions(enum CaseConversion conversion)506 void SetupConversions(enum CaseConversion conversion) {
507 	// First initialize for the symmetric ranges
508 	for (size_t i=0; i<sizeof(symmetricCaseConversionRanges)/sizeof(symmetricCaseConversionRanges[0]);) {
509 		int lower = symmetricCaseConversionRanges[i++];
510 		int upper = symmetricCaseConversionRanges[i++];
511 		int length = symmetricCaseConversionRanges[i++];
512 		int pitch = symmetricCaseConversionRanges[i++];
513 		for (int j=0;j<length*pitch;j+=pitch) {
514 			AddSymmetric(conversion, lower+j, upper+j);
515 		}
516 	}
517 	// Add the symmetric singletons
518 	for (size_t i=0; i<sizeof(symmetricCaseConversions)/sizeof(symmetricCaseConversions[0]);) {
519 		int lower = symmetricCaseConversions[i++];
520 		int upper = symmetricCaseConversions[i++];
521 		AddSymmetric(conversion, lower, upper);
522 	}
523 	// Add the complex cases
524 	const char *sComplex = complexCaseConversions;
525 	while (*sComplex) {
526 		// Longest ligature is 3 character so 5 for safety
527 		const size_t lenUTF8 = 5*UTF8MaxBytes+1;
528 		char originUTF8[lenUTF8];
529 		char foldedUTF8[lenUTF8];
530 		char lowerUTF8[lenUTF8];
531 		char upperUTF8[lenUTF8];
532 		size_t i = 0;
533 		while (*sComplex && *sComplex != '|') {
534 			originUTF8[i++] = *sComplex;
535 			sComplex++;
536 		}
537 		sComplex++;
538 		originUTF8[i] = 0;
539 		i = 0;
540 		while (*sComplex && *sComplex != '|') {
541 			foldedUTF8[i++] = *sComplex;
542 			sComplex++;
543 		}
544 		sComplex++;
545 		foldedUTF8[i] = 0;
546 		i = 0;
547 		while (*sComplex && *sComplex != '|') {
548 			upperUTF8[i++] = *sComplex;
549 			sComplex++;
550 		}
551 		sComplex++;
552 		upperUTF8[i] = 0;
553 		i = 0;
554 		while (*sComplex && *sComplex != '|') {
555 			lowerUTF8[i++] = *sComplex;
556 			sComplex++;
557 		}
558 		sComplex++;
559 		lowerUTF8[i] = 0;
560 
561 		int character = UnicodeFromUTF8(reinterpret_cast<unsigned char *>(originUTF8));
562 
563 		if (conversion == CaseConversionFold && foldedUTF8[0]) {
564 			caseConvFold.Add(character, foldedUTF8);
565 		}
566 
567 		if (conversion == CaseConversionUpper && upperUTF8[0]) {
568 			caseConvUp.Add(character, upperUTF8);
569 		}
570 
571 		if (conversion == CaseConversionLower && lowerUTF8[0]) {
572 			caseConvLow.Add(character, lowerUTF8);
573 		}
574 	}
575 
576 	switch (conversion) {
577 	case CaseConversionFold:
578 		caseConvFold.FinishedAdding();
579 		break;
580 	case CaseConversionUpper:
581 		caseConvUp.FinishedAdding();
582 		break;
583 	case CaseConversionLower:
584 		caseConvLow.FinishedAdding();
585 		break;
586 	}
587 }
588 
ConverterForConversion(enum CaseConversion conversion)589 CaseConverter *ConverterForConversion(enum CaseConversion conversion) {
590 	switch (conversion) {
591 	case CaseConversionFold:
592 		return &caseConvFold;
593 	case CaseConversionUpper:
594 		return &caseConvUp;
595 	case CaseConversionLower:
596 		return &caseConvLow;
597 	}
598 	return 0;
599 }
600 
601 }
602 
603 #ifdef SCI_NAMESPACE
604 namespace Scintilla {
605 #endif
606 
ConverterFor(enum CaseConversion conversion)607 ICaseConverter *ConverterFor(enum CaseConversion conversion) {
608 	CaseConverter *pCaseConv = ConverterForConversion(conversion);
609 	if (!pCaseConv->Initialised())
610 		SetupConversions(conversion);
611 	return pCaseConv;
612 }
613 
CaseConvert(int character,enum CaseConversion conversion)614 const char *CaseConvert(int character, enum CaseConversion conversion) {
615 	CaseConverter *pCaseConv = ConverterForConversion(conversion);
616 	if (!pCaseConv->Initialised())
617 		SetupConversions(conversion);
618 	return pCaseConv->Find(character);
619 }
620 
CaseConvertString(char * converted,size_t sizeConverted,const char * mixed,size_t lenMixed,enum CaseConversion conversion)621 size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed, enum CaseConversion conversion) {
622 	CaseConverter *pCaseConv = ConverterForConversion(conversion);
623 	if (!pCaseConv->Initialised())
624 		SetupConversions(conversion);
625 	return pCaseConv->CaseConvertString(converted, sizeConverted, mixed, lenMixed);
626 }
627 
628 #ifdef SCI_NAMESPACE
629 }
630 #endif
631