1 {
2  /***************************************************************************
3                                   lazutf8.pas
4  ***************************************************************************/
5 
6  *****************************************************************************
7   This file is part of LazUtils
8 
9   See the file COPYING.modifiedLGPL.txt, included in this distribution,
10   for details about the license.
11  *****************************************************************************
12 
13   Useful routines for managing UTF-8 strings
14 
15   - all functions are thread safe unless explicitely stated
16 }
17 unit LazUTF8;
18 
19 {$mode objfpc}{$H+}{$inline on}
20 
21 {$i lazutils_defines.inc}
22 
23 interface
24 
25 uses
26   {$ifdef unix}
27   // WideCompare* functions on Unix requires this. Must be used although it pulls in clib.
28   cwstring,
29   {$endif}
30   {$IFDEF UTF8_RTL}
31   FPCAdds,
32   {$ENDIF}
33   {$ifdef windows}
34   Windows,
35   {$endif}
36   Classes, SysUtils, strutils;
37 
38 // AnsiToUTF8 and UTF8ToAnsi need a widestring manager under Linux, BSD, MacOSX
39 // but normally these OS use UTF-8 as system encoding so the widestringmanager
40 // is not needed.
NeedRTLAnsinull41 function NeedRTLAnsi: boolean;// true if system encoding is not UTF-8
42 procedure SetNeedRTLAnsi(NewValue: boolean);
43 
44 // UTF8ToSys works like UTF8ToAnsi but more independent of widestringmanager
UTF8ToSysnull45 function UTF8ToSys(const s: string): string; overload; {$IFDEF UTF8_RTL}inline;{$ENDIF}
UTF8ToSysnull46 function UTF8ToSys(const AFormatSettings: TFormatSettings): TFormatSettings; overload; {$IFDEF UTF8_RTL}inline;{$ENDIF}
47 
48 // SysToUTF8 works like AnsiToUTF8 but more independent of widestringmanager
SysToUTF8null49 function SysToUTF8(const s: string): string; overload; {$IFDEF UTF8_RTL}inline;{$ENDIF}
SysToUTF8null50 function SysToUTF8(const AFormatSettings: TFormatSettings): TFormatSettings; overload;
51 
52 // converts OEM encoded string to UTF8 (used with some Windows specific functions)
ConsoleToUTF8null53 function ConsoleToUTF8(const s: string): string; {$IFDEF UTF8_RTL}inline;{$ENDIF}
54 // converts UTF8 string to console encoding (used by Write, WriteLn)
UTF8ToConsolenull55 function UTF8ToConsole(const s: string): string; {$IFDEF UTF8_RTL}inline;{$ENDIF}
56 
57 // for all Windows supporting 8bit codepages (e.g. not WinCE)
58 // converts string in Windows code page to UTF8 (used with some Windows specific functions)
WinCPToUTF8null59 function WinCPToUTF8(const s: string): string; {$ifdef WinCe}inline;{$endif}
60 // converts UTF8 string to Windows code page encoding (used by Write, WriteLn)
UTF8ToWinCPnull61 function UTF8ToWinCP(const s: string): string; {$ifdef WinCe}inline;{$endif}
62 
ParamStrUTF8null63 function ParamStrUTF8(Param: Integer): string;
64 
65 {$ifdef windows}
66 procedure GetFormatSettingsUTF8;
67 procedure GetLocaleFormatSettingsUTF8(LCID: Integer; var aFormatSettings: TFormatSettings);
68 {$endif}
69 
GetEnvironmentVariableCountUTF8null70 Function GetEnvironmentVariableCountUTF8: Integer;
GetEnvironmentStringUTF8null71 function GetEnvironmentStringUTF8(Index: Integer): string;
GetEnvironmentVariableUTF8null72 function GetEnvironmentVariableUTF8(const EnvVar: string): String;
SysErrorMessageUTF8null73 function SysErrorMessageUTF8(ErrorCode: Integer): String;
74 
75 // Returns the size of one codepoint in bytes.
UTF8CodepointSizenull76 function UTF8CodepointSize(p: PChar): integer; inline;
UTF8CharacterLengthnull77 function UTF8CharacterLength(p: PChar): integer; deprecated 'Use UTF8CodepointSize instead.';
78 // Fast version of UTF8CodepointSize. Assumes the UTF-8 codepoint is valid.
UTF8CodepointSizeFastnull79 function UTF8CodepointSizeFast(p: PChar): integer; inline;
80 
UTF8Lengthnull81 function UTF8Length(const s: string): PtrInt; inline;
UTF8Lengthnull82 function UTF8Length(p: PChar; ByteCount: PtrInt): PtrInt;
83 // Fast versions of UTF8Length. They assume the UTF-8 data is valid.
UTF8LengthFastnull84 function UTF8LengthFast(const s: string): PtrInt; inline;
UTF8LengthFastnull85 function UTF8LengthFast(p: PChar; ByteCount: PtrInt): PtrInt;
86 
87 // Functions dealing with unicode number U+xxx.
UTF8CodepointToUnicodenull88 function UTF8CodepointToUnicode(p: PChar; out CodepointLen: integer): Cardinal;
UTF8CharacterToUnicodenull89 function UTF8CharacterToUnicode(p: PChar; out CharLen: integer): Cardinal; deprecated 'Use UTF8CodepointToUnicode instead.';
UnicodeToUTF8null90 function UnicodeToUTF8(CodePoint: cardinal): string; // UTF32 to UTF8
UnicodeToUTF8null91 function UnicodeToUTF8(CodePoint: cardinal; Buf: PChar): integer; // UTF32 to UTF8
UnicodeToUTF8SkipErrorsnull92 function UnicodeToUTF8SkipErrors(CodePoint: cardinal; Buf: PChar): integer; // UTF32 to UTF8
UnicodeToUTF8Inlinenull93 function UnicodeToUTF8Inline(CodePoint: cardinal; Buf: PChar): integer; inline; // UTF32 to UTF8
UTF8ToDoubleByteStringnull94 function UTF8ToDoubleByteString(const s: string): string;
UTF8ToDoubleBytenull95 function UTF8ToDoubleByte(UTF8Str: PChar; Len: PtrInt; DBStr: PByte): PtrInt;
UTF8FindNearestCharStartnull96 function UTF8FindNearestCharStart(UTF8Str: PChar; Len: SizeInt;
97                                   BytePos: SizeInt): SizeInt;
Utf8TryFindCodepointStartnull98 function Utf8TryFindCodepointStart(AString: PChar; var CurPos: PChar; out CodepointLen: Integer): Boolean;
Utf8TryFindCodepointStartnull99 function Utf8TryFindCodepointStart(const AString: String; var Index: Integer; out CharLen: Integer): Boolean;
100 // find the n-th UTF8 codepoint, ignoring BIDI
UTF8CodepointStartnull101 function UTF8CodepointStart(UTF8Str: PChar; Len, CodepointIndex: PtrInt): PChar;
UTF8CharStartnull102 function UTF8CharStart(UTF8Str: PChar; Len, CharIndex: PtrInt): PChar; deprecated 'Use UTF8CodepointStart instead.';
103 // find the byte index of the n-th UTF8 codepoint, ignoring BIDI (byte len of substr)
UTF8CodepointToByteIndexnull104 function UTF8CodepointToByteIndex(UTF8Str: PChar; Len, CodepointIndex: PtrInt): PtrInt;
UTF8CharToByteIndexnull105 function UTF8CharToByteIndex(UTF8Str: PChar; Len, CharIndex: PtrInt): PtrInt; deprecated 'Use UTF8CodepointToByteIndex instead.';
106 procedure UTF8FixBroken(P: PChar); overload;
107 procedure UTF8FixBroken(var S: string); overload;
UTF8CodepointStrictSizenull108 function UTF8CodepointStrictSize(P: PChar): integer;
UTF8CharacterStrictLengthnull109 function UTF8CharacterStrictLength(P: PChar): integer; deprecated 'Use UTF8CodepointStrictSize instead.';
UTF8CStringToUTF8Stringnull110 function UTF8CStringToUTF8String(SourceStart: PChar; SourceLen: PtrInt) : string;
111 
UTF8Posnull112 function UTF8Pos(const SearchForText, SearchInText: string; StartPos: SizeInt = 1): PtrInt;
UTF8PosPnull113 function UTF8PosP(SearchForText: PChar; SearchForTextLen: SizeInt;
114   SearchInText: PChar; SearchInTextLen: SizeInt): PChar;
UTF8Copynull115 function UTF8Copy(const s: string; StartCharIndex, CharCount: PtrInt): string;
116 {$IFnDEF NO_CP_RTL}
117 procedure UTF8Delete(var s: Utf8String; StartCharIndex, CharCount: PtrInt);
118 {$ENDIF}
119 procedure UTF8Delete(var s: String; StartCharIndex, CharCount: PtrInt);
120 {$IFnDEF NO_CP_RTL}
121 procedure UTF8Insert(const source: Utf8String; var s: Utf8String; StartCharIndex: PtrInt);
122 {$ENDIF}
123 procedure UTF8Insert(const source: String; var s: String; StartCharIndex: PtrInt);
UTF8StringReplacenull124 function UTF8StringReplace(const S, OldPattern, NewPattern: String;
125   Flags: TReplaceFlags; ALanguage: string=''): String;
126 
UTF8LowerCasenull127 function UTF8LowerCase(const AInStr: string; ALanguage: string=''): string;
UTF8LowerStringnull128 function UTF8LowerString(const s: string): string;
UTF8UpperCasenull129 function UTF8UpperCase(const AInStr: string; ALanguage: string=''): string;
UTF8UpperStringnull130 function UTF8UpperString(const s: string): string;
UTF8SwapCasenull131 function UTF8SwapCase(const AInStr: string; ALanguage: string=''): string;
132 // Capitalize the first letters of every word
UTF8ProperCasenull133 function UTF8ProperCase(const AInStr: string; const WordDelims: TSysCharSet): string;
FindInvalidUTF8Codepointnull134 function FindInvalidUTF8Codepoint(p: PChar; Count: PtrInt; StopOnNonUTF8: Boolean = true): PtrInt;
FindInvalidUTF8Characternull135 function FindInvalidUTF8Character(p: PChar; Count: PtrInt; StopOnNonUTF8: Boolean = true): PtrInt; deprecated 'Use FindInvalidUTF8Codepoint instead.';
UTF8StringOfCharnull136 function UTF8StringOfChar(AUtf8Char: String; N: Integer): String;
UTF8AddCharnull137 function UTF8AddChar(AUtf8Char: String; const S: String; N: Integer): String;
UTF8AddCharRnull138 function UTF8AddCharR(AUtf8Char: String; const S: String; N: Integer): String;
UTF8PadLeftnull139 function UTF8PadLeft(const S: String; const N: Integer; const AUtf8Char: String = #32): String;
UTF8PadRightnull140 function UTF8PadRight(const S: String; const N: Integer; const AUtf8Char: String = #32): String;
UTF8PadCenternull141 function UTF8PadCenter(const S: String; const N: Integer; const AUtf8Char: String = #32): String;
UTF8LeftStrnull142 function UTF8LeftStr(const AText: String; const ACount: Integer): String;
UTF8RightStrnull143 function UTF8RightStr(const AText: String; const ACount: Integer): String;
UTF8QuotedStrnull144 function UTF8QuotedStr(const S, Quote: string): string;
145 //Utf8 version of MidStr is just Utf8Copy with same parameters, so it is not implemented here
UTF8StartsTextnull146 function UTF8StartsText(const ASubText, AText: string): Boolean;
UTF8EndsTextnull147 function UTF8EndsText(const ASubText, AText: string): Boolean;
UTF8ReverseStringnull148 function UTF8ReverseString(p: PChar; const ByteCount: LongInt): string;
UTF8ReverseStringnull149 function UTF8ReverseString(const AText: string): string; inline;
UTF8RPosnull150 function UTF8RPos(const Substr, Source: string): PtrInt;
151 
UTF8WrapTextnull152 function UTF8WrapText(S, BreakStr: string; BreakChars: TSysCharSet; MaxCol: integer): string; overload;
UTF8WrapTextnull153 function UTF8WrapText(S: string; MaxCol: integer): string; overload;
154 
155 type
156   TEscapeMode = (emPascal, emHexPascal, emHexC, emC, emAsciiControlNames);
157 
Utf8EscapeControlCharsnull158 function Utf8EscapeControlChars(S: String; EscapeMode: TEscapeMode = emPascal): String;
159 
160 type
161   TUTF8TrimFlag = (
162     u8tKeepStart,
163     u8tKeepEnd,
164     u8tKeepTabs,
165     u8tKeepLineBreaks,
166     u8tKeepNoBreakSpaces,
167     u8tKeepControlCodes // excluding tabs and line breaks
168     );
169   TUTF8TrimFlags = set of TUTF8TrimFlag;
UTF8Trimnull170 function UTF8Trim(const s: string; Flags: TUTF8TrimFlags = []): string;
171 
172 //compare functions
173 
UTF8CompareStrnull174 function UTF8CompareStr(const S1, S2: string): PtrInt; inline;
UTF8CompareStrPnull175 function UTF8CompareStrP(S1, S2: PChar): PtrInt;
UTF8CompareStrnull176 function UTF8CompareStr(S1: PChar; Count1: SizeInt; S2: PChar; Count2: SizeInt): PtrInt;
UTF8CompareTextnull177 function UTF8CompareText(const S1, S2: string): PtrInt;
UTF8CompareStrCollatednull178 function UTF8CompareStrCollated(const S1, S2: string): PtrInt; {$IFnDEF ACP_RTL}inline;{$endif}
CompareStrListUTF8LowerCasenull179 function CompareStrListUTF8LowerCase(List: TStringList; Index1, Index2: Integer): Integer;
180 
181 type
182   TConvertResult = (trNoError, trNullSrc, trNullDest, trDestExhausted,
183     trInvalidChar, trUnfinishedChar);
184 
185   TConvertOption = (toInvalidCharError, toInvalidCharToSymbol,
186     toUnfinishedCharError, toUnfinishedCharToSymbol);
187   TConvertOptions = set of TConvertOption;
188 
ConvertUTF8ToUTF16null189 function ConvertUTF8ToUTF16(Dest: PWideChar; DestWideCharCount: SizeUInt;
190   Src: PChar; SrcCharCount: SizeUInt; Options: TConvertOptions;
191   out ActualWideCharCount: SizeUInt): TConvertResult;
192 
ConvertUTF16ToUTF8null193 function ConvertUTF16ToUTF8(Dest: PChar; DestCharCount: SizeUInt;
194   Src: PWideChar; SrcWideCharCount: SizeUInt; Options: TConvertOptions;
195   out ActualCharCount: SizeUInt): TConvertResult;
196 
UTF8ToUTF16null197 function UTF8ToUTF16(const S: AnsiString): UnicodeString; overload;
UTF8ToUTF16null198 function UTF8ToUTF16(const P: PChar; ByteCnt: SizeUInt): UnicodeString; overload;
UTF16ToUTF8null199 function UTF16ToUTF8(const S: UnicodeString): AnsiString; overload;
UTF16ToUTF8null200 function UTF16ToUTF8(const P: PWideChar; WideCnt: SizeUInt): AnsiString; overload;
201 
202 // locale
203 procedure LazGetLanguageIDs(var Lang, FallbackLang: String);
204 procedure LazGetShortLanguageID(var Lang: String);
205 
206 var
207   FPUpChars: array[char] of char;
208 
209 procedure ReplaceSubstring(var s: string; StartPos, Count: SizeInt;
210                            const Insertion: string);
211 
212 implementation
213 
214 uses
215   gettext
216 {$IFDEF Darwin}, MacOSAll{$ENDIF}
217   ;
218 
219 {$IFDEF WinCE}
220 // CP_UTF8 is missing in the windows unit of the Windows CE RTL
221 const
222   CP_UTF8 = 65001;
223 {$ENDIF}
224 
IsASCIInull225 function IsASCII(const s: string): boolean; inline;
226 var
227   i: Integer;
228 begin
229   for i:=1 to length(s) do if ord(s[i])>127 then exit(false);
230   Result:=true;
231 end;
232 
233 {$IFDEF windows}
234   {$i winlazutf8.inc}
235 {$ELSE}
236   {$IFDEF HASAMIGA}
237   {$i unixlazutf8.inc}   // Reuse UNIX code for Amiga
238   {$ELSE}
239   {$i unixlazutf8.inc}
240   {$ENDIF}
241 {$ENDIF}
242 
243 var
244   FNeedRTLAnsi: boolean = false;
245   FNeedRTLAnsiValid: boolean = false;
246 
NeedRTLAnsinull247 function NeedRTLAnsi: boolean;
248 {$IFNDEF Windows}
249 var
250   Lang: String;
251   i: LongInt;
252   Encoding: String;
253 {$ENDIF}
254 begin
255   if FNeedRTLAnsiValid then
256     exit(FNeedRTLAnsi);
257   {$IFDEF Windows}
258     {$IF FPC_FULLVERSION>=20701}
259     FNeedRTLAnsi:=DefaultSystemCodePage<>CP_UTF8;
260     {$ELSE}
261     FNeedRTLAnsi:=GetACP<>CP_UTF8;
262     {$ENDIF}
263   {$ELSE}
264   FNeedRTLAnsi:=false;
265   Lang := SysUtils.GetEnvironmentVariable('LC_ALL');
266   if lang = '' then
267   begin
268     Lang := SysUtils.GetEnvironmentVariable('LC_MESSAGES');
269     if Lang = '' then
270     begin
271       Lang := SysUtils.GetEnvironmentVariable('LANG');
272     end;
273   end;
274   i:=System.Pos('.',Lang);
275   if (i>0) then begin
276     Encoding:=copy(Lang,i+1,length(Lang)-i);
277     FNeedRTLAnsi:=(SysUtils.CompareText(Encoding,'UTF-8')<>0)
278               and (SysUtils.CompareText(Encoding,'UTF8')<>0);
279   end;
280   {$ENDIF}
281   FNeedRTLAnsiValid:=true;
282   Result:=FNeedRTLAnsi;
283 end;
284 
285 procedure SetNeedRTLAnsi(NewValue: boolean);
286 begin
287   FNeedRTLAnsi:=NewValue;
288   FNeedRTLAnsiValid:=true;
289 end;
290 
UTF8ToSysnull291 function UTF8ToSys(const s: string): string;
292 begin
293   {$IFDEF UTF8_RTL}
294   Result:=s;
295   {$ELSE}
296   if NeedRTLAnsi and (not IsASCII(s)) then
297     Result:=UTF8ToAnsi(s)
298   else
299     Result:=s;
300   {$ENDIF}
301 end;
302 
SysToUTF8null303 function SysToUTF8(const s: string): string;
304 begin
305   {$IFDEF UTF8_RTL}
306   Result:=s;
307   {$ELSE}
308   if NeedRTLAnsi and (not IsASCII(s)) then
309   begin
310     Result:=AnsiToUTF8(s);
311     {$ifdef FPC_HAS_CPSTRING}
312     // prevent UTF8 codepage appear in the strings - we don't need codepage
313     // conversion magic in LCL code
314     SetCodePage(RawByteString(Result), StringCodePage(s), False);
315     {$endif}
316   end
317   else
318     Result:=s;
319   {$ENDIF}
320 end;
321 
322 function SysToUTF8(const AFormatSettings: TFormatSettings): TFormatSettings;
323 {$IFNDEF UTF8_RTL}
324 var
325   i: Integer;
326 {$ENDIF}
327 begin
328   Result := AFormatSettings;
329   {$IFNDEF UTF8_RTL}
330   Result.CurrencyString := SysToUTF8(AFormatSettings.CurrencyString);
331   for i:=1 to 12 do begin
332     Result.LongMonthNames[i] := SysToUTF8(AFormatSettings.LongMonthNames[i]);
333     Result.ShortMonthNames[i] := SysToUTF8(AFormatSettings.ShortMonthNames[i]);
334   end;
335   for i:=1 to 7 do begin
336     Result.LongDayNames[i] := SysToUTF8(AFormatSettings.LongDayNames[i]);
337     Result.ShortDayNames[i] := SysToUTF8(AFormatSettings.ShortDayNames[i]);
338   end;
339   {$ENDIF}
340 end;
341 
342 function UTF8ToSys(const AFormatSettings: TFormatSettings): TFormatSettings;
343 {$IFnDEF UTF8_RTL}
344 var
345   i: Integer;
346 {$ENDIF}
347 begin
348   Result := AFormatSettings;
349   {$IFnDEF UTF8_RTL}
350   Result.CurrencyString := UTF8ToSys(AFormatSettings.CurrencyString);
351   for i:=1 to 12 do begin
352     Result.LongMonthNames[i] := UTF8ToSys(AFormatSettings.LongMonthNames[i]);
353     Result.ShortMonthNames[i] := UTF8ToSys(AFormatSettings.ShortMonthNames[i]);
354   end;
355   for i:=1 to 7 do begin
356     Result.LongDayNames[i] := UTF8ToSys(AFormatSettings.LongDayNames[i]);
357     Result.ShortDayNames[i] := UTF8ToSys(AFormatSettings.ShortDayNames[i]);
358   end;
359   {$ENDIF}
360 end;
361 
362 function GetEnvironmentVariableCountUTF8: Integer;
363 begin
364   {$IF defined(FPC_RTL_UNICODE) or not defined(MSWindows)} //also WinCE, issue #0031788
365   Result:=SysUtils.GetEnvironmentVariableCount;
366   {$ELSE}
367   Result:=GetGetEnvironmentVariableCountWide;
368   {$ENDIF}
369 end;
370 
371 function GetEnvironmentStringUTF8(Index: Integer): string;
372 begin
373   {$IFDEF FPC_RTL_UNICODE}
374   Result:=UTF16ToUTF8(SysUtils.GetEnvironmentString(Index));
375   {$ELSE}
376     {$IFDEF MSWindows} //not for WinCE, issue #0031788
377     Result:=UTF16ToUTF8(GetEnvironmentStringWide(Index));
378     {$ELSE}
379     // by default the environment is in console encoding
380     // see also RTL issue: http://bugs.freepascal.org/view.php?id=15233
381     Result:=ConsoleToUTF8(SysUtils.GetEnvironmentString(Index));
382     {$ENDIF}
383   {$ENDIF}
384 end;
385 
386 function GetEnvironmentVariableUTF8(const EnvVar: string): String;
387 begin
388   {$IFDEF FPC_RTL_UNICODE}
389   Result:=UTF16ToUTF8(SysUtils.GetEnvironmentVariable(UTF8ToUTF16(EnvVar)));
390   {$ELSE}
391     {$IFDEF Windows}
392     Result:=UTF16ToUTF8(GetEnvironmentVariableWide(EnvVar));
393     {$ELSE}
394     // by default the environment is in console encoding
395     // RTL issue: http://bugs.freepascal.org/view.php?id=15233
396     Result:=ConsoleToUTF8(SysUtils.GetEnvironmentVariable(UTF8ToSys(EnvVar)));
397     {$ENDIF}
398   {$ENDIF}
399 end;
400 
401 function SysErrorMessageUTF8(ErrorCode: Integer): String;
402 begin
403   Result := SysToUTF8(SysUtils.SysErrorMessage(ErrorCode));
404 end;
405 
406 function UTF8CodepointSizeFull(p: PChar): integer;
407 begin
408   case p^ of
409   #0..#191: // %11000000
410     // regular single byte character (#0 is a character, this is Pascal ;)
411     Result:=1;
412   #192..#223: // p^ and %11100000 = %11000000
413     begin
414       // could be 2 byte character
415       if (ord(p[1]) and %11000000) = %10000000 then
416         Result:=2
417       else
418         Result:=1;
419     end;
420   #224..#239: // p^ and %11110000 = %11100000
421     begin
422       // could be 3 byte character
423       if ((ord(p[1]) and %11000000) = %10000000)
424       and ((ord(p[2]) and %11000000) = %10000000) then
425         Result:=3
426       else
427         Result:=1;
428     end;
429   #240..#247: // p^ and %11111000 = %11110000
430     begin
431       // could be 4 byte character
432       if ((ord(p[1]) and %11000000) = %10000000)
433       and ((ord(p[2]) and %11000000) = %10000000)
434       and ((ord(p[3]) and %11000000) = %10000000) then
435         Result:=4
436       else
437         Result:=1;
438     end;
439   else
440     Result:=1;
441   end;
442 end;
443 
444 function UTF8CodepointSize(p: PChar): integer; inline;
445 begin
446   if p=nil then exit(0);
447   if p^<#192 then exit(1);
448   Result:=UTF8CodepointSizeFull(p);
449 end;
450 
451 function UTF8CharacterLength(p: PChar): integer;
452 begin
453   Result := UTF8CodepointSize(p);
454 end;
455 
456 function UTF8CodepointSizeFast(p: PChar): integer;
457 begin
458   case p^ of
459     #0..#191   : Result := 1;
460     #192..#223 : Result := 2;
461     #224..#239 : Result := 3;
462     #240..#247 : Result := 4;
463     //#248..#255 : Result := 1;
464     // Theoretically UTF-8 supports length 1-7, but since 2003, RFC 3629 limits
465     // it to 1-4 bytes.
466     // This is an inline function, so keep the function short.
467     //#248..#251   : Result := 5;
468     //#252, #253   : Result := 6;
469     //#254         : Result := 7;
470 
471     else Result := 1; // An optimization + prevents compiler warning about uninitialized Result.
472   end;
473 end;
474 
475 function UTF8Length(const s: string): PtrInt;
476 begin
477   Result:=UTF8Length(PChar(s),length(s));
478 end;
479 
480 function UTF8Length(p: PChar; ByteCount: PtrInt): PtrInt;
481 var
482   CharLen: LongInt;
483 begin
484   Result:=0;
485   while (ByteCount>0) do begin
486     inc(Result);
487     CharLen:=UTF8CodepointSize(p);
488     inc(p,CharLen);
489     dec(ByteCount,CharLen);
490   end;
491 end;
492 
493 function UTF8LengthFast(const s: string): PtrInt;
494 begin
495   Result := UTF8LengthFast(PChar(s), Length(s));
496 end;
497 
498 // Ported from:
499 //  http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html
500 // The code uses CPU's native data size. In a 64-bit CPU it means 8 bytes at once.
501 // The UTF-8 data is assumed to be valid.
UTF8LengthFastnull502 function UTF8LengthFast(p: PChar; ByteCount: PtrInt): PtrInt;
503 const
504 {$ifdef CPU32}
505   ONEMASK   =$01010101;
506   EIGHTYMASK=$80808080;
507 {$endif}
508 {$ifdef CPU64}
509   ONEMASK   =$0101010101010101;
510   EIGHTYMASK=$8080808080808080;
511 {$endif}
512 var
513   pnx: PPtrInt absolute p; // To get contents of text in PtrInt blocks. x refers to 32 or 64 bits
514   pn8: pint8 absolute pnx; // To read text as Int8 in the initial and final loops
515   ix: PtrInt absolute pnx; // To read text as PtrInt in the block loop
516   nx: PtrInt;              // values processed in block loop
517   i,cnt,e: PtrInt;
518 begin
519   Result := 0;
520   e := ix+ByteCount; // End marker
521   // Handle any initial misaligned bytes.
522   cnt := (not (ix-1)) and (sizeof(PtrInt)-1);
523   if cnt>ByteCount then
524     cnt := ByteCount;
525   for i := 1 to cnt do
526   begin
527     // Is this byte NOT the first byte of a character?
528     Result += (pn8^ shr 7) and ((not pn8^) shr 6);
529     inc(pn8);
530   end;
531   // Handle complete blocks
532   for i := 1 to (ByteCount-cnt) div sizeof(PtrInt) do
533   begin
534     // Count bytes which are NOT the first byte of a character.
535     nx := ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
536     {$push}{$overflowchecks off} // "nx * ONEMASK" causes an arithmetic overflow.
537     Result += (nx * ONEMASK) >> ((sizeof(PtrInt) - 1) * 8);
538     {$pop}
539     inc(pnx);
540   end;
541   // Take care of any left-over bytes.
542   while ix<e do
543   begin
544     // Is this byte NOT the first byte of a character?
545     Result += (pn8^ shr 7) and ((not pn8^) shr 6);
546     inc(pn8);
547   end;
548   Result := ByteCount - Result;
549 end;
550 
UTF8CodepointToUnicodenull551 function UTF8CodepointToUnicode(p: PChar; out CodepointLen: integer): Cardinal;
552 { if p=nil then CodepointLen=0 otherwise CodepointLen>0
553   If there is an encoding error the Result is 0 and CodepointLen=1.
554   Use UTF8FixBroken to fix UTF-8 encoding.
555   It does not check if the codepoint is defined in the Unicode tables.
556 }
557 begin
558   if p<>nil then begin
559     if ord(p^)<%11000000 then begin
560       // regular single byte character (#0 is a normal char, this is pascal ;)
561       Result:=ord(p^);
562       CodepointLen:=1;
563     end
564     else if ((ord(p^) and %11100000) = %11000000) then begin
565       // starts with %110 => could be double byte character
566       if (ord(p[1]) and %11000000) = %10000000 then begin
567         CodepointLen:=2;
568         Result:=((ord(p^) and %00011111) shl 6) or (ord(p[1]) and %00111111);
569         if Result<(1 shl 7) then begin
570           // wrong encoded, could be an XSS attack
571           Result:=0;
572         end;
573       end else begin
574         Result:=ord(p^);
575         CodepointLen:=1;
576       end;
577     end
578     else if ((ord(p^) and %11110000) = %11100000) then begin
579       // starts with %1110 => could be triple byte character
580       if ((ord(p[1]) and %11000000) = %10000000)
581       and ((ord(p[2]) and %11000000) = %10000000) then begin
582         CodepointLen:=3;
583         Result:=((ord(p^) and %00011111) shl 12)
584                 or ((ord(p[1]) and %00111111) shl 6)
585                 or (ord(p[2]) and %00111111);
586         if Result<(1 shl 11) then begin
587           // wrong encoded, could be an XSS attack
588           Result:=0;
589         end;
590       end else begin
591         Result:=ord(p^);
592         CodepointLen:=1;
593       end;
594     end
595     else if ((ord(p^) and %11111000) = %11110000) then begin
596       // starts with %11110 => could be 4 byte character
597       if ((ord(p[1]) and %11000000) = %10000000)
598       and ((ord(p[2]) and %11000000) = %10000000)
599       and ((ord(p[3]) and %11000000) = %10000000) then begin
600         CodepointLen:=4;
601         Result:=((ord(p^) and %00001111) shl 18)
602                 or ((ord(p[1]) and %00111111) shl 12)
603                 or ((ord(p[2]) and %00111111) shl 6)
604                 or (ord(p[3]) and %00111111);
605         if Result<(1 shl 16) then begin
606           // wrong encoded, could be an XSS attack
607           Result:=0;
608         end;
609       end else begin
610         Result:=ord(p^);
611         CodepointLen:=1;
612       end;
613     end
614     else begin
615       // invalid character
616       Result:=ord(p^);
617       CodepointLen:=1;
618     end;
619   end else begin
620     Result:=0;
621     CodepointLen:=0;
622   end;
623 end;
624 
UTF8CharacterToUnicodenull625 function UTF8CharacterToUnicode(p: PChar; out CharLen: integer): Cardinal;
626 begin
627   Result := UTF8CodepointToUnicode(p, CharLen);
628 end;
629 
UnicodeToUTF8null630 function UnicodeToUTF8(CodePoint: cardinal; Buf: PChar): integer;
631 
632   procedure RaiseInvalidUnicode;
633   begin
634     raise Exception.Create('UnicodeToUTF8: invalid unicode: '+IntToStr(CodePoint));
635   end;
636 
637 begin
638   Result:=UnicodeToUTF8Inline(CodePoint,Buf);
639   if Result=0 then
640     RaiseInvalidUnicode;
641 end;
642 
UnicodeToUTF8SkipErrorsnull643 function UnicodeToUTF8SkipErrors(CodePoint: cardinal; Buf: PChar): integer;
644 begin
645   Result:=UnicodeToUTF8Inline(CodePoint,Buf);
646 end;
647 
UnicodeToUTF8null648 function UnicodeToUTF8(CodePoint: cardinal): string;
649 var
650   Buf: array[0..6] of Char;
651   Len: Integer;
652 begin
653   if (CodePoint = 0) then
654     Result := #0 //StrPas does not like #0
655   else
656   begin
657     Len:=UnicodeToUTF8Inline(CodePoint, @Buf[0]);
658     Buf[Len]:=#0;
659     Result := StrPas(@Buf[0]);
660   end;
661 end;
662 
UnicodeToUTF8Inlinenull663 function UnicodeToUTF8Inline(CodePoint: cardinal; Buf: PChar): integer;
664 begin
665   case CodePoint of
666     0..$7f:
667       begin
668         Result:=1;
669         Buf[0]:=char(byte(CodePoint));
670       end;
671     $80..$7ff:
672       begin
673         Result:=2;
674         Buf[0]:=char(byte($c0 or (CodePoint shr 6)));
675         Buf[1]:=char(byte($80 or (CodePoint and $3f)));
676       end;
677     $800..$ffff:
678       begin
679         Result:=3;
680         Buf[0]:=char(byte($e0 or (CodePoint shr 12)));
681         Buf[1]:=char(byte((CodePoint shr 6) and $3f) or $80);
682         Buf[2]:=char(byte(CodePoint and $3f) or $80);
683       end;
684     $10000..$10ffff:
685       begin
686         Result:=4;
687         Buf[0]:=char(byte($f0 or (CodePoint shr 18)));
688         Buf[1]:=char(byte((CodePoint shr 12) and $3f) or $80);
689         Buf[2]:=char(byte((CodePoint shr 6) and $3f) or $80);
690         Buf[3]:=char(byte(CodePoint and $3f) or $80);
691       end;
692   else
693     Result:=0;
694   end;
695 end;
696 
UTF8ToDoubleByteStringnull697 function UTF8ToDoubleByteString(const s: string): string;
698 var
699   Len: Integer;
700 begin
701   Len:=UTF8Length(s);
702   SetLength(Result,Len*2);
703   if Len=0 then exit;
704   UTF8ToDoubleByte(PChar(s),length(s),PByte(Result));
705 end;
706 
707 { returns number of double bytes }
UTF8ToDoubleBytenull708 function UTF8ToDoubleByte(UTF8Str: PChar; Len: PtrInt; DBStr: PByte): PtrInt;
709 var
710   SrcPos: PChar;
711   CharLen: LongInt;
712   DestPos: PByte;
713   u: Cardinal;
714 begin
715   SrcPos:=UTF8Str;
716   DestPos:=DBStr;
717   Result:=0;
718   while Len>0 do begin
719     u:=UTF8CodepointToUnicode(SrcPos,CharLen);
720     DestPos^:=byte((u shr 8) and $ff);
721     inc(DestPos);
722     DestPos^:=byte(u and $ff);
723     inc(DestPos);
724     inc(SrcPos,CharLen);
725     dec(Len,CharLen);
726     inc(Result);
727   end;
728 end;
729 
730 
731 { Tries to find the start of a valid UTF8 codepoint that contains the character pointed to by CurPos
732   - AString: pointer to the (start of the) string
733   - CurPos: pointer to the character inside AString that we want to get the information off
734     * if the function succeeds, CurPos wil point to the start of the valid UTF8 codepoint
735     * if the function fails, CurPos will not be changed
736     Note: if CurPos points beyond the end of AString you will get a crash!
737   - CharLen: the length of the UTF8 codepoint in bytes, if the function succeeds
738   - Returns:
739     True if the character pointed to by Curpos is part of a valid UTF8 codepoint (1 to 4 bytes),
740     otherwise it returns False.                                                                          }
Utf8TryFindCodepointStartnull741 function Utf8TryFindCodepointStart(AString: PChar; var CurPos: PChar; out CodepointLen: Integer): Boolean;
742 var
743   SavedPos: PChar;
744 begin
745   Result := False;
746   CodepointLen := 0;
747   if (not (Assigned(AString) and Assigned(CurPos)))
748       or (CurPos < AString) then Exit;
749   SavedPos := CurPos;
750   //Note: UTF8CodepointStrictSize will NOT "look" beyond the terminating #0 of a PChar, so this is safe with AnsiStrings
751   CodepointLen := UTF8CodepointStrictSize(CurPos);
752   if (CodepointLen > 0) then Exit(True);
753   if (CurPos > AString) then
754   begin
755     Dec(CurPos);   //-1
756     //is it second byte of 2..4 byte codepoint?
757     CodepointLen := UTF8CodepointStrictSize(CurPos);
758     if (CodepointLen > 1) then Exit(True);
759     if (CurPos > AString) then
760     begin
761       Dec(CurPos);   //-2
762       //is it third byte of 3..4 byte codepoint?
763       CodepointLen := UTF8CodepointStrictSize(CurPos);
764       if (CodepointLen > 2) then Exit(True);
765       if (CurPos > AString) then
766       begin
767         Dec(CurPos);   //-3
768        //is it fouth byte of 4 byte codepoint?
769        CodepointLen := UTF8CodepointStrictSize(CurPos);
770        if (CodepointLen = 4) then Exit(True);
771       end;
772     end;
773   end;
774   //At this point we failed: we are NOT inside a valid UTF8 codepoint!
775   CurPos := SavedPos;
776 end;
777 
Utf8TryFindCodepointStartnull778 function Utf8TryFindCodepointStart(const AString: String; var Index: Integer; out CharLen: Integer): Boolean;
779 var
780   CurPos, SavedCurPos: PChar;
781 begin
782   CurPos := @AString[Index];
783   SavedCurPos := CurPos;
784   Result := Utf8TryFindCodepointStart(PChar(AString), CurPos, CharLen);
785   Index := Index - (SavedCurPos - CurPos);
786 end;
787 
788 { Find the start of the UTF8 character which contains BytePos,
789   if BytePos is not part of a valid Utf8 Codepoint the function returns BytePos
790   Len is length in byte, BytePos starts at 0 }
UTF8FindNearestCharStartnull791 function UTF8FindNearestCharStart(UTF8Str: PChar; Len: SizeInt; BytePos: SizeInt): SizeInt;
792 var
793   CurPos: PChar;
794   CharLen: Integer;
795 begin
796   if (BytePos > Len-1) then BytePos := Len - 1;
797   CurPos := Utf8Str + BytePos;
798   //No need to check the result value, since when it retuns False CurPos will be reset
799   //to it's original value, and that's what we want to return in that case
800   Utf8TryFindCodepointStart(Utf8Str, CurPos, CharLen);
801   Result := CurPos - Utf8Str;
802 end;
803 
804 
805 { Len is the length in bytes of UTF8Str
806   CodepointIndex is the position of the desired codepoint (starting at 0), in chars
807 }
UTF8CodepointStartnull808 function UTF8CodepointStart(UTF8Str: PChar; Len, CodepointIndex: PtrInt): PChar;
809 var
810   CharLen: LongInt;
811 begin
812   Result:=UTF8Str;
813   if Result<>nil then begin
814     while (CodepointIndex>0) and (Len>0) do begin
815       CharLen:=UTF8CodepointSize(Result);
816       dec(Len,CharLen);
817       dec(CodepointIndex);
818       inc(Result,CharLen);
819     end;
820     if (CodepointIndex<>0) or (Len<0) then
821       Result:=nil;
822   end;
823 end;
824 
UTF8CharStartnull825 function UTF8CharStart(UTF8Str: PChar; Len, CharIndex: PtrInt): PChar;
826 begin
827   Result := UTF8CodepointStart(UTF8Str, Len, CharIndex);
828 end;
829 
UTF8CodepointToByteIndexnull830 function UTF8CodepointToByteIndex(UTF8Str: PChar; Len, CodepointIndex: PtrInt): PtrInt;
831 var
832   p: PChar;
833 begin
834   p := UTF8CodepointStart(UTF8Str, Len, CodepointIndex);
835   if p = nil
836   then Result := -1
837   else Result := p - UTF8Str;
838 end;
839 
UTF8CharToByteIndexnull840 function UTF8CharToByteIndex(UTF8Str: PChar; Len, CharIndex: PtrInt): PtrInt;
841 begin
842   Result := UTF8CodepointToByteIndex(UTF8Str, Len, CharIndex);
843 end;
844 
845 { fix any broken UTF8 sequences with spaces }
846 procedure UTF8FixBroken(P: PChar);
847 var
848   c: cardinal;
849 begin
850   if p=nil then exit;
851   while p^<>#0 do begin
852     if ord(p^)<%10000000 then begin
853       // regular single byte character
854       inc(p);
855     end
856     else if ord(p^)<%11000000 then begin
857       // invalid
858       p^:=' ';
859       inc(p);
860     end
861     else if ((ord(p^) and %11100000) = %11000000) then begin
862       // starts with %110 => should be 2 byte character
863       if ((ord(p[1]) and %11000000) = %10000000) then begin
864         c:=((ord(p^) and %00011111) shl 6);
865            //or (ord(p[1]) and %00111111);
866         if c<(1 shl 7) then
867           p^:=' '  // fix XSS attack
868         else
869           inc(p,2)
870       end
871       else if p[1]<>#0 then
872         p^:=' ';
873     end
874     else if ((ord(p^) and %11110000) = %11100000) then begin
875       // starts with %1110 => should be 3 byte character
876       if ((ord(p[1]) and %11000000) = %10000000)
877       and ((ord(p[2]) and %11000000) = %10000000) then begin
878         c:=((ord(p^) and %00011111) shl 12)
879            or ((ord(p[1]) and %00111111) shl 6);
880            //or (ord(p[2]) and %00111111);
881         if c<(1 shl 11) then
882           p^:=' '  // fix XSS attack
883         else
884           inc(p,3);
885       end else
886         p^:=' ';
887     end
888     else if ((ord(p^) and %11111000) = %11110000) then begin
889       // starts with %11110 => should be 4 byte character
890       if ((ord(p[1]) and %11000000) = %10000000)
891       and ((ord(p[2]) and %11000000) = %10000000)
892       and ((ord(p[3]) and %11000000) = %10000000) then begin
893         c:=((ord(p^) and %00001111) shl 18)
894            or ((ord(p[1]) and %00111111) shl 12)
895            or ((ord(p[2]) and %00111111) shl 6);
896            //or (ord(p[3]) and %00111111);
897         if c<(1 shl 16) then
898           p^:=' ' // fix XSS attack
899         else
900           inc(p,4)
901       end else
902         p^:=' ';
903     end
904     else begin
905       p^:=' ';
906       inc(p);
907     end;
908   end;
909 end;
910 
911 procedure UTF8FixBroken(var S: string);
912 begin
913   if S='' then exit;
914   if FindInvalidUTF8Codepoint(PChar(S),length(S))<0 then exit;
915   UniqueString(S);
916   UTF8FixBroken(PChar(S));
917 end;
918 
UTF8CodepointStrictSizenull919 function UTF8CodepointStrictSize(P: PChar): integer;
920 var
921   c: Char;
922 begin
923   if p=nil then exit(0);
924   c:=p^;
925   if ord(c)<%10000000 then begin
926     // regular single byte character
927     exit(1);
928   end
929   else if ord(c)<%11000000 then begin
930     // invalid single byte character
931     exit(0);
932   end
933   else if ((ord(c) and %11100000) = %11000000) then begin
934     // should be 2 byte character
935     if (ord(p[1]) and %11000000) = %10000000 then
936       exit(2)
937     else
938       exit(0);
939   end
940   else if ((ord(c) and %11110000) = %11100000) then begin
941     // should be 3 byte character
942     if ((ord(p[1]) and %11000000) = %10000000)
943     and ((ord(p[2]) and %11000000) = %10000000) then
944       exit(3)
945     else
946       exit(0);
947   end
948   else if ((ord(c) and %11111000) = %11110000) then begin
949     // should be 4 byte character
950     if ((ord(p[1]) and %11000000) = %10000000)
951     and ((ord(p[2]) and %11000000) = %10000000)
952     and ((ord(p[3]) and %11000000) = %10000000) then
953       exit(4)
954     else
955       exit(0);
956   end else
957     exit(0);
958 end;
959 
UTF8CharacterStrictLengthnull960 function UTF8CharacterStrictLength(P: PChar): integer;
961 begin
962   Result := UTF8CodepointStrictSize(P);
963 end;
964 
UTF8CStringToUTF8Stringnull965 function UTF8CStringToUTF8String(SourceStart: PChar; SourceLen: PtrInt) : string;
966 var
967   Source: PChar;
968   Dest: PChar;
969   SourceEnd: PChar;
970   SourceCopied: PChar;
971 
972   // Copies from SourceStart till Source to Dest and updates Dest
973   procedure CopyPart; inline;
974   var
975     CopyLength: SizeInt;
976   begin
977     CopyLength := Source - SourceCopied;
978     if CopyLength=0 then exit;
979     System.move(SourceCopied^ , Dest^, CopyLength);
980     SourceCopied:=Source;
981     inc(Dest, CopyLength);
982   end;
983 
984 begin
985   SetLength(Result, SourceLen);
986   if SourceLen=0 then exit;
987   SourceCopied:=SourceStart;
988   Source:=SourceStart;
989   Dest:=PChar(Result);
990   SourceEnd := Source + SourceLen;
991   while Source<SourceEnd do begin
992     if (Source^='\') then begin
993       CopyPart;
994       inc(Source);
995       if Source^ in ['t', 'n', '"', '\'] then begin
996         case Source^ of
997          't' : Dest^ := #9;
998          '"' : Dest^ := '"';
999          '\' : Dest^ := '\';
1000          'n' :
1001          // fpc 2.1.1 stores string constants as array of char so maybe this
1002          // will work for without ifdef (once available in 2.0.x too):
1003          // move(lineending, dest^, sizeof(LineEnding));
1004 {$IFDEF WINDOWS}
1005                begin
1006                  move(lineending[1], dest^, length(LineEnding));
1007                  inc(dest, length(LineEnding)-1);
1008                end;
1009 {$ELSE}
1010                Dest^ := LineEnding;
1011 {$ENDIF}
1012         end;
1013         inc(Source);
1014         inc(Dest);
1015       end;
1016       SourceCopied := Source;
1017     end
1018     else
1019       Inc(Source); // no need for checking for UTF8, the / is never part of an UTF8 multibyte codepoint
1020   end;
1021   CopyPart;
1022   SetLength(Result, Dest - PChar(Result));
1023 end;
1024 
UTF8Posnull1025 function UTF8Pos(const SearchForText, SearchInText: string;
1026   StartPos: SizeInt = 1): PtrInt;
1027 // returns the character index, where the SearchForText starts in SearchInText
1028 // an optional StartPos can be given (in UTF-8 codepoints, not in byte)
1029 // returns 0 if not found
1030 var
1031   i: SizeInt;
1032   p: PChar;
1033   StartPosP: PChar;
1034 begin
1035   Result:=0;
1036   if StartPos=1 then
1037   begin
1038     i:=System.Pos(SearchForText,SearchInText);
1039     if i>0 then
1040       Result:=UTF8Length(PChar(SearchInText),i-1)+1;
1041   end
1042   else if StartPos>1 then
1043   begin
1044     // skip
1045     StartPosP:=UTF8CodepointStart(PChar(SearchInText),Length(SearchInText),StartPos-1);
1046     if StartPosP=nil then exit;
1047     // search
1048     p:=UTF8PosP(PChar(SearchForText),length(SearchForText),
1049                 StartPosP,length(SearchInText)+PChar(SearchInText)-StartPosP);
1050     // get UTF-8 position
1051     if p=nil then exit;
1052     Result:=StartPos+UTF8Length(StartPosP,p-StartPosP);
1053   end;
1054 end;
1055 
UTF8PosPnull1056 function UTF8PosP(SearchForText: PChar; SearchForTextLen: SizeInt;
1057   SearchInText: PChar; SearchInTextLen: SizeInt): PChar;
1058 // returns the position where SearchInText starts in SearchForText
1059 // returns nil if not found
1060 var
1061   p: SizeInt;
1062 begin
1063   Result:=nil;
1064   if (SearchForText=nil) or (SearchForTextLen=0) or (SearchInText=nil) then
1065     exit;
1066   while SearchInTextLen>0 do begin
1067     p:=IndexByte(SearchInText^,SearchInTextLen,PByte(SearchForText)^);
1068     if p<0 then exit;
1069     inc(SearchInText,p);
1070     dec(SearchInTextLen,p);
1071     if SearchInTextLen<SearchForTextLen then exit;
1072     if CompareMem(SearchInText,SearchForText,SearchForTextLen) then
1073       exit(SearchInText);
1074     inc(SearchInText);
1075     dec(SearchInTextLen);
1076   end;
1077 end;
1078 
UTF8Copynull1079 function UTF8Copy(const s: string; StartCharIndex, CharCount: PtrInt): string;
1080 // returns substring
1081 var
1082   StartBytePos: PChar;
1083   EndBytePos: PChar;
1084   MaxBytes: PtrInt;
1085 begin
1086   StartBytePos:=UTF8CodepointStart(PChar(s),length(s),StartCharIndex-1);
1087   if StartBytePos=nil then
1088     Result:=''
1089   else begin
1090     MaxBytes:=PtrInt(PChar(s)+length(s)-StartBytePos);
1091     EndBytePos:=UTF8CodepointStart(StartBytePos,MaxBytes,CharCount);
1092     if EndBytePos=nil then
1093       Result:=copy(s,StartBytePos-PChar(s)+1,MaxBytes)
1094     else
1095       Result:=copy(s,StartBytePos-PChar(s)+1,EndBytePos-StartBytePos);
1096   end;
1097 end;
1098 
1099 {$IFnDEF NO_CP_RTL}
1100 procedure UTF8Delete(var s: Utf8String; StartCharIndex, CharCount: PtrInt);
1101 var
1102   tmp: String;
1103 begin
1104   tmp := RawByteString(s);
1105   {.$IFDEF ACP_RTL}
1106   { change code page without converting the data }
1107   SetCodePage(RawByteString(tmp), CP_UTF8, False);
1108   {.$ENDIF}
1109   { keep refcount to 1 if it was 1, to avoid unnecessary copies }
1110   s := '';
1111   UTF8Delete(tmp,StartCharIndex,CharCount);
1112   { same as above }
1113   s := RawByteString(tmp);
1114   tmp := '';
1115   SetCodePage(RawByteString(s), CP_UTF8, False);
1116 end;
1117 {$ENDIF NO_ACP_RTL}
1118 
1119 procedure UTF8Delete(var s: String; StartCharIndex, CharCount: PtrInt);
1120 var
1121   StartBytePos: PChar;
1122   EndBytePos: PChar;
1123   MaxBytes: PtrInt;
1124 begin
1125   StartBytePos:=UTF8CodepointStart(PChar(s),length(s),StartCharIndex-1);
1126   if StartBytePos <> nil then
1127   begin
1128     MaxBytes:=PtrInt(PChar(s)+length(s)-StartBytePos);
1129     EndBytePos:=UTF8CodepointStart(StartBytePos,MaxBytes,CharCount);
1130     if EndBytePos=nil then
1131       Delete(s,StartBytePos-PChar(s)+1,MaxBytes)
1132     else
1133       Delete(s,StartBytePos-PChar(s)+1,EndBytePos-StartBytePos);
1134   end;
1135 end;
1136 
1137 {$IFnDEF NO_CP_RTL}
1138 {It's simper to copy the code from the variant with String parameters than writing a wrapper}
1139 procedure UTF8Insert(const source: UTF8String; var s: UTF8string;
1140   StartCharIndex: PtrInt);
1141 var
1142   StartBytePos: PChar;
1143 begin
1144   StartBytePos:=UTF8CodepointStart(PChar(s),length(s),StartCharIndex-1);
1145   if StartBytePos <> nil then
1146     Insert(source, s, StartBytePos-PChar(s)+1);
1147 end;
1148 {$ENDIF NO_CP_RTL}
1149 
1150 procedure UTF8Insert(const source: String; var s: String; StartCharIndex: PtrInt);
1151 var
1152   StartBytePos: PChar;
1153 begin
1154   StartBytePos:=UTF8CodepointStart(PChar(s),length(s),StartCharIndex-1);
1155   if StartBytePos <> nil then
1156     Insert(source, s, StartBytePos-PChar(s)+1);
1157 end;
1158 
UTF8StringReplacenull1159 function UTF8StringReplace(const S, OldPattern, NewPattern: String;
1160   Flags: TReplaceFlags; ALanguage: string): String;
1161 // same algorithm as StringReplace, but using UTF8LowerCase
1162 // for case insensitive search
1163 var
1164   Srch, OldP, RemS: string;
1165   P: Integer;
1166 begin
1167   Srch := S;
1168   OldP := OldPattern;
1169   if rfIgnoreCase in Flags then
1170   begin
1171     Srch := UTF8LowerCase(Srch,ALanguage);
1172     OldP := UTF8LowerCase(OldP,ALanguage);
1173   end;
1174   RemS := S;
1175   Result := '';
1176   while Length(Srch) <> 0 do
1177   begin
1178     P := Pos(OldP, Srch);
1179     if P = 0 then
1180     begin
1181       Result := Result + RemS;
1182       Srch := '';
1183     end
1184     else
1185     begin
1186       Result := Result + Copy(RemS,1,P-1) + NewPattern;
1187       P := P + Length(OldP);
1188       RemS := Copy(RemS, P, Length(RemS)-P+1);
1189       if not (rfReplaceAll in Flags) then
1190       begin
1191         Result := Result + RemS;
1192         Srch := '';
1193       end
1194       else
1195         Srch := Copy(Srch, P, Length(Srch)-P+1);
1196     end;
1197   end;
1198 end;
1199 
1200 {
1201   UTF8SwapCase - a "naive" implementation that uses UTF8UpperCase and UTF8LowerCase.
1202     It serves its purpose and performs OK for short and resonably long strings
1203     but it should be rewritten in the future if better performance and lower
1204     memory consumption is needed.
1205 
1206   AInStr - The input string.
1207   ALanguage - The language. Use '' for maximum speed if one desires to ignore the language
1208     (See UTF8LowerCase comment for more details on ALanguage parameter.)
1209 }
UTF8SwapCasenull1210 function UTF8SwapCase(const AInStr: string; ALanguage: string=''): string;
1211 var
1212   xUpperCase: string;
1213   xLowerCase: string;
1214   I: Integer;
1215 begin
1216   if AInStr = '' then
1217     Exit('');
1218 
1219   xUpperCase := UTF8UpperCase(AInStr, ALanguage);
1220   xLowerCase := UTF8LowerCase(AInStr, ALanguage);
1221   if (Length(xUpperCase) <> Length(AInStr)) or (Length(xLowerCase) <> Length(AInStr)) then
1222     Exit(AInStr);//something went wrong -> the lengths of utf8 strings changed
1223 
1224   SetLength(Result, Length(AInStr));
1225   for I := 1 to Length(AInStr) do
1226     if AInStr[I] <> xUpperCase[I] then
1227       Result[I] := xUpperCase[I]
1228     else
1229       Result[I] := xLowerCase[I];
1230 end;
1231 
UTF8ProperCasenull1232 function UTF8ProperCase(const AInStr: string; const WordDelims: TSysCharSet): string;
1233 var
1234   P, PE : PChar;
1235   CharLen: Integer;
1236   Capital: string;
1237 begin
1238   Result := UTF8LowerCase(AInStr);
1239   UniqueString(Result);
1240   P := PChar(Result);
1241   PE := P+Length(Result);
1242   while (P<PE) do
1243   begin
1244     while (P<PE) and (P^ in WordDelims) do
1245       inc(P);
1246     if (P<PE) then
1247     begin
1248       CharLen := UTF8CodepointSize(P);
1249       SetLength(Capital, CharLen);
1250       System.Move(P^, Capital[1], CharLen); // Copy one codepoint to Capital,
1251       Capital := UTF8UpperCase(Capital);    // UpperCase it
1252       System.Move(Capital[1], P^, CharLen); // and copy it back.
1253       Inc(P, CharLen);
1254     end;
1255     while (P<PE) and not (P^ in WordDelims) do
1256       inc(P);
1257   end;
1258 end;
1259 
1260 {
1261   AInStr - The input string
1262   ALanguage - The language. Use '' for maximum speed if one desires to ignore the language
1263               The language should be specified in the format from ISO 639-1,
1264               which uses 2 characters to represent each language.
1265               If the language has no code in ISO 639-1, then the 3-chars code
1266               from ISO 639-2 should be used.
1267               Example: "tr" - Turkish language locale
1268 
1269   Data from here: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
1270 
1271   The columns in the file UnicodeData.txt are explained here:
1272   http://www.ksu.ru/eng/departments/ktk/test/perl/lib/unicode/UCDFF301.html#Case Mappings
1273 }
UTF8LowerCasenull1274 function UTF8LowerCase(const AInStr: string; ALanguage: string=''): string;
1275 var
1276   CounterDiff: PtrInt;
1277   InStr, InStrEnd, OutStr: PChar;
1278   // Language identification
1279   IsTurkish: Boolean;
1280   c1, c2, c3, new_c1, new_c2, new_c3: Char;
1281   p: SizeInt;
1282 begin
1283   Result:=AInStr;
1284   InStr := PChar(AInStr);
1285   InStrEnd := InStr + length(AInStr); // points behind last char
1286 
1287   // Do a fast initial parsing of the string to maybe avoid doing
1288   // UniqueString if the resulting string will be identical
1289   while (InStr < InStrEnd) do
1290   begin
1291     c1 := InStr^;
1292     case c1 of
1293     'A'..'Z': Break;
1294     #$C3..#$FF:
1295       case c1 of
1296       #$C3..#$C9, #$CE, #$CF, #$D0..#$D5, #$E1..#$E2,#$E5:
1297         begin
1298           c2 := InStr[1];
1299           case c1 of
1300           #$C3: if c2 in [#$80..#$9E] then Break;
1301           #$C4:
1302           begin
1303             case c2 of
1304             #$80..#$AF, #$B2..#$B6: if ord(c2) mod 2 = 0 then Break;
1305             #$B8..#$FF: if ord(c2) mod 2 = 1 then Break;
1306             #$B0: Break;
1307             end;
1308           end;
1309           #$C5:
1310           begin
1311             case c2 of
1312               #$8A..#$B7: if ord(c2) mod 2 = 0 then Break;
1313               #$00..#$88, #$B9..#$FF: if ord(c2) mod 2 = 1 then Break;
1314               #$B8: Break;
1315             end;
1316           end;
1317           // Process E5 to avoid stopping on chinese chars
1318           #$E5: if (c2 = #$BC) and (InStr[2] in [#$A1..#$BA]) then Break;
1319           // Others are too complex, better not to pre-inspect them
1320           else
1321             Break;
1322           end;
1323           // already lower, or otherwhise not affected
1324         end;
1325       end;
1326     end;
1327     inc(InStr);
1328   end;
1329 
1330   if InStr >= InStrEnd then Exit;
1331 
1332   // Language identification
1333   IsTurkish := (ALanguage = 'tr') or (ALanguage = 'az'); // Turkish and Azeri have a special handling
1334 
1335   UniqueString(Result);
1336   OutStr := PChar(Result) + (InStr - PChar(AInStr));
1337   CounterDiff := 0;
1338 
1339   while InStr < InStrEnd do
1340   begin
1341     c1 := InStr^;
1342     case c1 of
1343       // codepoints      UTF-8 range           Description                Case change
1344       // $0041..$005A    $41..$5A              Capital ASCII              X+$20
1345       'A'..'Z':
1346       begin
1347         { First ASCII chars }
1348         // Special turkish handling
1349         // capital undotted I to small undotted i
1350         if IsTurkish and (c1 = 'I') then
1351         begin
1352           p:=OutStr - PChar(Result);
1353           SetLength(Result,Length(Result)+1);// Increase the buffer
1354           OutStr := PChar(Result)+p;
1355           OutStr^ := #$C4;
1356           inc(OutStr);
1357           OutStr^ := #$B1;
1358           dec(CounterDiff);
1359         end
1360         else
1361         begin
1362           OutStr^ := chr(ord(c1)+32);
1363         end;
1364         inc(InStr);
1365         inc(OutStr);
1366       end;
1367 
1368       // Chars with 2-bytes which might be modified
1369       #$C3..#$D5:
1370       begin
1371         c2 := InStr[1];
1372         new_c1 := c1;
1373         new_c2 := c2;
1374         case c1 of
1375         // Latin Characters 0000–0FFF http://en.wikibooks.org/wiki/Unicode/Character_reference/0000-0FFF
1376         // codepoints      UTF-8 range           Description                Case change
1377         // $00C0..$00D6    C3 80..C3 96          Capital Latin with accents X+$20
1378         // $D7             C3 97                 Multiplication Sign        N/A
1379         // $00D8..$00DE    C3 98..C3 9E          Capital Latin with accents X+$20
1380         // $DF             C3 9F                 German beta ß              already lowercase
1381         #$C3:
1382         begin
1383           case c2 of
1384           #$80..#$96, #$98..#$9E: new_c2 := chr(ord(c2) + $20)
1385           end;
1386         end;
1387         // $0100..$012F    C4 80..C4 AF        Capital/Small Latin accents  if mod 2 = 0 then X+1
1388         // $0130..$0131    C4 B0..C4 B1        Turkish
1389         //  C4 B0 turkish uppercase dotted i -> 'i'
1390         //  C4 B1 turkish lowercase undotted ı
1391         // $0132..$0137    C4 B2..C4 B7        Capital/Small Latin accents  if mod 2 = 0 then X+1
1392         // $0138           C4 B8               ĸ                            N/A
1393         // $0139..$024F    C4 B9..C5 88        Capital/Small Latin accents  if mod 2 = 1 then X+1
1394         #$C4:
1395         begin
1396           case c2 of
1397             #$80..#$AF, #$B2..#$B7: if ord(c2) mod 2 = 0 then new_c2 := chr(ord(c2) + 1);
1398             #$B0: // Turkish
1399             begin
1400               OutStr^ := 'i';
1401               inc(InStr, 2);
1402               inc(OutStr);
1403               inc(CounterDiff, 1);
1404               Continue;
1405             end;
1406             #$B9..#$BE: if ord(c2) mod 2 = 1 then new_c2 := chr(ord(c2) + 1);
1407             #$BF: // This crosses the borders between the first byte of the UTF-8 char
1408             begin
1409               new_c1 := #$C5;
1410               new_c2 := #$80;
1411             end;
1412           end;
1413         end;
1414         // $C589 ʼn
1415         // $C58A..$C5B7: if OldChar mod 2 = 0 then NewChar := OldChar + 1;
1416         // $C5B8:        NewChar := $C3BF; // Ÿ
1417         // $C5B9..$C8B3: if OldChar mod 2 = 1 then NewChar := OldChar + 1;
1418         #$C5:
1419         begin
1420           case c2 of
1421             #$8A..#$B7: //0
1422             begin
1423               if ord(c2) mod 2 = 0 then
1424                 new_c2 := chr(ord(c2) + 1);
1425             end;
1426             #$00..#$88, #$B9..#$BE: //1
1427             begin
1428               if ord(c2) mod 2 = 1 then
1429                 new_c2 := chr(ord(c2) + 1);
1430             end;
1431             #$B8:  // Ÿ
1432             begin
1433               new_c1 := #$C3;
1434               new_c2 := #$BF;
1435             end;
1436           end;
1437         end;
1438         {A convoluted part: C6 80..C6 8F
1439 
1440         0180;LATIN SMALL LETTER B WITH STROKE;Ll;0;L;;;;;N;LATIN SMALL LETTER B BAR;;0243;;0243
1441         0181;LATIN CAPITAL LETTER B WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER B HOOK;;;0253; => C6 81=>C9 93
1442         0182;LATIN CAPITAL LETTER B WITH TOPBAR;Lu;0;L;;;;;N;LATIN CAPITAL LETTER B TOPBAR;;;0183;
1443         0183;LATIN SMALL LETTER B WITH TOPBAR;Ll;0;L;;;;;N;LATIN SMALL LETTER B TOPBAR;;0182;;0182
1444         0184;LATIN CAPITAL LETTER TONE SIX;Lu;0;L;;;;;N;;;;0185;
1445         0185;LATIN SMALL LETTER TONE SIX;Ll;0;L;;;;;N;;;0184;;0184
1446         0186;LATIN CAPITAL LETTER OPEN O;Lu;0;L;;;;;N;;;;0254; ==> C9 94
1447         0187;LATIN CAPITAL LETTER C WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER C HOOK;;;0188;
1448         0188;LATIN SMALL LETTER C WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER C HOOK;;0187;;0187
1449         0189;LATIN CAPITAL LETTER AFRICAN D;Lu;0;L;;;;;N;;;;0256; => C9 96
1450         018A;LATIN CAPITAL LETTER D WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER D HOOK;;;0257; => C9 97
1451         018B;LATIN CAPITAL LETTER D WITH TOPBAR;Lu;0;L;;;;;N;LATIN CAPITAL LETTER D TOPBAR;;;018C;
1452         018C;LATIN SMALL LETTER D WITH TOPBAR;Ll;0;L;;;;;N;LATIN SMALL LETTER D TOPBAR;;018B;;018B
1453         018D;LATIN SMALL LETTER TURNED DELTA;Ll;0;L;;;;;N;;;;;
1454         018E;LATIN CAPITAL LETTER REVERSED E;Lu;0;L;;;;;N;LATIN CAPITAL LETTER TURNED E;;;01DD; => C7 9D
1455         018F;LATIN CAPITAL LETTER SCHWA;Lu;0;L;;;;;N;;;;0259; => C9 99
1456         }
1457         #$C6:
1458         begin
1459           case c2 of
1460             #$81:
1461             begin
1462               new_c1 := #$C9;
1463               new_c2 := #$93;
1464             end;
1465             #$82..#$85:
1466             begin
1467               if ord(c2) mod 2 = 0 then
1468                 new_c2 := chr(ord(c2) + 1);
1469             end;
1470             #$87..#$88,#$8B..#$8C:
1471             begin
1472               if ord(c2) mod 2 = 1 then
1473                 new_c2 := chr(ord(c2) + 1);
1474             end;
1475             #$86:
1476             begin
1477               new_c1 := #$C9;
1478               new_c2 := #$94;
1479             end;
1480             #$89:
1481             begin
1482               new_c1 := #$C9;
1483               new_c2 := #$96;
1484             end;
1485             #$8A:
1486             begin
1487               new_c1 := #$C9;
1488               new_c2 := #$97;
1489             end;
1490             #$8E:
1491             begin
1492               new_c1 := #$C7;
1493               new_c2 := #$9D;
1494             end;
1495             #$8F:
1496             begin
1497               new_c1 := #$C9;
1498               new_c2 := #$99;
1499             end;
1500           {
1501           And also C6 90..C6 9F
1502 
1503           0190;LATIN CAPITAL LETTER OPEN E;Lu;0;L;;;;;N;LATIN CAPITAL LETTER EPSILON;;;025B; => C9 9B
1504           0191;LATIN CAPITAL LETTER F WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER F HOOK;;;0192; => +1
1505           0192;LATIN SMALL LETTER F WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER SCRIPT F;;0191;;0191 <=
1506           0193;LATIN CAPITAL LETTER G WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER G HOOK;;;0260; => C9 A0
1507           0194;LATIN CAPITAL LETTER GAMMA;Lu;0;L;;;;;N;;;;0263; => C9 A3
1508           0195;LATIN SMALL LETTER HV;Ll;0;L;;;;;N;LATIN SMALL LETTER H V;;01F6;;01F6 <=
1509           0196;LATIN CAPITAL LETTER IOTA;Lu;0;L;;;;;N;;;;0269; => C9 A9
1510           0197;LATIN CAPITAL LETTER I WITH STROKE;Lu;0;L;;;;;N;LATIN CAPITAL LETTER BARRED I;;;0268; => C9 A8
1511           0198;LATIN CAPITAL LETTER K WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER K HOOK;;;0199; => +1
1512           0199;LATIN SMALL LETTER K WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER K HOOK;;0198;;0198 <=
1513           019A;LATIN SMALL LETTER L WITH BAR;Ll;0;L;;;;;N;LATIN SMALL LETTER BARRED L;;023D;;023D <=
1514           019B;LATIN SMALL LETTER LAMBDA WITH STROKE;Ll;0;L;;;;;N;LATIN SMALL LETTER BARRED LAMBDA;;;; <=
1515           019C;LATIN CAPITAL LETTER TURNED M;Lu;0;L;;;;;N;;;;026F; => C9 AF
1516           019D;LATIN CAPITAL LETTER N WITH LEFT HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER N HOOK;;;0272; => C9 B2
1517           019E;LATIN SMALL LETTER N WITH LONG RIGHT LEG;Ll;0;L;;;;;N;;;0220;;0220 <=
1518           019F;LATIN CAPITAL LETTER O WITH MIDDLE TILDE;Lu;0;L;;;;;N;LATIN CAPITAL LETTER BARRED O;;;0275; => C9 B5
1519           }
1520           #$90:
1521           begin
1522             new_c1 := #$C9;
1523             new_c2 := #$9B;
1524           end;
1525           #$91, #$98: new_c2 := chr(ord(c2)+1);
1526           #$93:
1527           begin
1528             new_c1 := #$C9;
1529             new_c2 := #$A0;
1530           end;
1531           #$94:
1532           begin
1533             new_c1 := #$C9;
1534             new_c2 := #$A3;
1535           end;
1536           #$96:
1537           begin
1538             new_c1 := #$C9;
1539             new_c2 := #$A9;
1540           end;
1541           #$97:
1542           begin
1543             new_c1 := #$C9;
1544             new_c2 := #$A8;
1545           end;
1546           #$9C:
1547           begin
1548             new_c1 := #$C9;
1549             new_c2 := #$AF;
1550           end;
1551           #$9D:
1552           begin
1553             new_c1 := #$C9;
1554             new_c2 := #$B2;
1555           end;
1556           #$9F:
1557           begin
1558             new_c1 := #$C9;
1559             new_c2 := #$B5;
1560           end;
1561           {
1562           And also C6 A0..C6 AF
1563 
1564           01A0;LATIN CAPITAL LETTER O WITH HORN;Lu;0;L;004F 031B;;;;N;LATIN CAPITAL LETTER O HORN;;;01A1; => +1
1565           01A1;LATIN SMALL LETTER O WITH HORN;Ll;0;L;006F 031B;;;;N;LATIN SMALL LETTER O HORN;;01A0;;01A0 <=
1566           01A2;LATIN CAPITAL LETTER OI;Lu;0;L;;;;;N;LATIN CAPITAL LETTER O I;;;01A3; => +1
1567           01A3;LATIN SMALL LETTER OI;Ll;0;L;;;;;N;LATIN SMALL LETTER O I;;01A2;;01A2 <=
1568           01A4;LATIN CAPITAL LETTER P WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER P HOOK;;;01A5; => +1
1569           01A5;LATIN SMALL LETTER P WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER P HOOK;;01A4;;01A4 <=
1570           01A6;LATIN LETTER YR;Lu;0;L;;;;;N;LATIN LETTER Y R;;;0280; => CA 80
1571           01A7;LATIN CAPITAL LETTER TONE TWO;Lu;0;L;;;;;N;;;;01A8; => +1
1572           01A8;LATIN SMALL LETTER TONE TWO;Ll;0;L;;;;;N;;;01A7;;01A7 <=
1573           01A9;LATIN CAPITAL LETTER ESH;Lu;0;L;;;;;N;;;;0283; => CA 83
1574           01AA;LATIN LETTER REVERSED ESH LOOP;Ll;0;L;;;;;N;;;;;
1575           01AB;LATIN SMALL LETTER T WITH PALATAL HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER T PALATAL HOOK;;;; <=
1576           01AC;LATIN CAPITAL LETTER T WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER T HOOK;;;01AD; => +1
1577           01AD;LATIN SMALL LETTER T WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER T HOOK;;01AC;;01AC <=
1578           01AE;LATIN CAPITAL LETTER T WITH RETROFLEX HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER T RETROFLEX HOOK;;;0288; => CA 88
1579           01AF;LATIN CAPITAL LETTER U WITH HORN;Lu;0;L;0055 031B;;;;N;LATIN CAPITAL LETTER U HORN;;;01B0; => +1
1580           }
1581           #$A0..#$A5,#$AC:
1582           begin
1583             if ord(c2) mod 2 = 0 then
1584               new_c2 := chr(ord(c2) + 1);
1585           end;
1586           #$A7,#$AF:
1587           begin
1588             if ord(c2) mod 2 = 1 then
1589               new_c2 := chr(ord(c2) + 1);
1590           end;
1591           #$A6:
1592           begin
1593             new_c1 := #$CA;
1594             new_c2 := #$80;
1595           end;
1596           #$A9:
1597           begin
1598             new_c1 := #$CA;
1599             new_c2 := #$83;
1600           end;
1601           #$AE:
1602           begin
1603             new_c1 := #$CA;
1604             new_c2 := #$88;
1605           end;
1606           {
1607           And also C6 B0..C6 BF
1608 
1609           01B0;LATIN SMALL LETTER U WITH HORN;Ll;0;L;0075 031B;;;;N;LATIN SMALL LETTER U HORN;;01AF;;01AF <= -1
1610           01B1;LATIN CAPITAL LETTER UPSILON;Lu;0;L;;;;;N;;;;028A; => CA 8A
1611           01B2;LATIN CAPITAL LETTER V WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER SCRIPT V;;;028B; => CA 8B
1612           01B3;LATIN CAPITAL LETTER Y WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER Y HOOK;;;01B4; => +1
1613           01B4;LATIN SMALL LETTER Y WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER Y HOOK;;01B3;;01B3 <=
1614           01B5;LATIN CAPITAL LETTER Z WITH STROKE;Lu;0;L;;;;;N;LATIN CAPITAL LETTER Z BAR;;;01B6; => +1
1615           01B6;LATIN SMALL LETTER Z WITH STROKE;Ll;0;L;;;;;N;LATIN SMALL LETTER Z BAR;;01B5;;01B5 <=
1616           01B7;LATIN CAPITAL LETTER EZH;Lu;0;L;;;;;N;LATIN CAPITAL LETTER YOGH;;;0292; => CA 92
1617           01B8;LATIN CAPITAL LETTER EZH REVERSED;Lu;0;L;;;;;N;LATIN CAPITAL LETTER REVERSED YOGH;;;01B9; => +1
1618           01B9;LATIN SMALL LETTER EZH REVERSED;Ll;0;L;;;;;N;LATIN SMALL LETTER REVERSED YOGH;;01B8;;01B8 <=
1619           01BA;LATIN SMALL LETTER EZH WITH TAIL;Ll;0;L;;;;;N;LATIN SMALL LETTER YOGH WITH TAIL;;;; <=
1620           01BB;LATIN LETTER TWO WITH STROKE;Lo;0;L;;;;;N;LATIN LETTER TWO BAR;;;; X
1621           01BC;LATIN CAPITAL LETTER TONE FIVE;Lu;0;L;;;;;N;;;;01BD; => +1
1622           01BD;LATIN SMALL LETTER TONE FIVE;Ll;0;L;;;;;N;;;01BC;;01BC <=
1623           01BE;LATIN LETTER INVERTED GLOTTAL STOP WITH STROKE;Ll;0;L;;;;;N;LATIN LETTER INVERTED GLOTTAL STOP BAR;;;; X
1624           01BF;LATIN LETTER WYNN;Ll;0;L;;;;;N;;;01F7;;01F7  <=
1625           }
1626           #$B8,#$BC:
1627           begin
1628             if ord(c2) mod 2 = 0 then
1629               new_c2 := chr(ord(c2) + 1);
1630           end;
1631           #$B3..#$B6:
1632           begin
1633             if ord(c2) mod 2 = 1 then
1634               new_c2 := chr(ord(c2) + 1);
1635           end;
1636           #$B1:
1637           begin
1638             new_c1 := #$CA;
1639             new_c2 := #$8A;
1640           end;
1641           #$B2:
1642           begin
1643             new_c1 := #$CA;
1644             new_c2 := #$8B;
1645           end;
1646           #$B7:
1647           begin
1648             new_c1 := #$CA;
1649             new_c2 := #$92;
1650           end;
1651           end;
1652         end;
1653         #$C7:
1654         begin
1655           case c2 of
1656           #$84..#$8C,#$B1..#$B3:
1657           begin
1658             if (ord(c2) and $F) mod 3 = 1 then new_c2 := chr(ord(c2) + 2)
1659             else if (ord(c2) and $F) mod 3 = 2 then new_c2 := chr(ord(c2) + 1);
1660           end;
1661           #$8D..#$9C:
1662           begin
1663             if ord(c2) mod 2 = 1 then
1664               new_c2 := chr(ord(c2) + 1);
1665           end;
1666           #$9E..#$AF,#$B4..#$B5,#$B8..#$BF:
1667           begin
1668             if ord(c2) mod 2 = 0 then
1669               new_c2 := chr(ord(c2) + 1);
1670           end;
1671           {
1672           01F6;LATIN CAPITAL LETTER HWAIR;Lu;0;L;;;;;N;;;;0195;
1673           01F7;LATIN CAPITAL LETTER WYNN;Lu;0;L;;;;;N;;;;01BF;
1674           }
1675           #$B6:
1676           begin
1677             new_c1 := #$C6;
1678             new_c2 := #$95;
1679           end;
1680           #$B7:
1681           begin
1682             new_c1 := #$C6;
1683             new_c2 := #$BF;
1684           end;
1685           end;
1686         end;
1687         {
1688         Codepoints 0200 to 023F
1689         }
1690         #$C8:
1691         begin
1692           // For this one we can simply start with a default and override for some specifics
1693           if (c2 in [#$80..#$B3]) and (ord(c2) mod 2 = 0) then new_c2 := chr(ord(c2) + 1);
1694 
1695           case c2 of
1696           #$A0:
1697           begin
1698             new_c1 := #$C6;
1699             new_c2 := #$9E;
1700           end;
1701           #$A1: new_c2 := c2;
1702           {
1703           023A;LATIN CAPITAL LETTER A WITH STROKE;Lu;0;L;;;;;N;;;;2C65; => E2 B1 A5
1704           023B;LATIN CAPITAL LETTER C WITH STROKE;Lu;0;L;;;;;N;;;;023C; => +1
1705           023C;LATIN SMALL LETTER C WITH STROKE;Ll;0;L;;;;;N;;;023B;;023B <=
1706           023D;LATIN CAPITAL LETTER L WITH BAR;Lu;0;L;;;;;N;;;;019A; => C6 9A
1707           023E;LATIN CAPITAL LETTER T WITH DIAGONAL STROKE;Lu;0;L;;;;;N;;;;2C66; => E2 B1 A6
1708           023F;LATIN SMALL LETTER S WITH SWASH TAIL;Ll;0;L;;;;;N;;;2C7E;;2C7E <=
1709           0240;LATIN SMALL LETTER Z WITH SWASH TAIL;Ll;0;L;;;;;N;;;2C7F;;2C7F <=
1710           }
1711           #$BA,#$BE:
1712           begin
1713             p:= OutStr - PChar(Result);
1714             SetLength(Result,Length(Result)+1);// Increase the buffer
1715             OutStr := PChar(Result)+p;
1716             OutStr^ := #$E2;
1717             inc(OutStr);
1718             OutStr^ := #$B1;
1719             inc(OutStr);
1720             if c2 = #$BA then OutStr^ := #$A5
1721             else OutStr^ := #$A6;
1722             dec(CounterDiff);
1723             inc(OutStr);
1724             inc(InStr, 2);
1725             Continue;
1726           end;
1727           #$BD:
1728           begin
1729             new_c1 := #$C6;
1730             new_c2 := #$9A;
1731           end;
1732           #$BB: new_c2 := chr(ord(c2) + 1);
1733           end;
1734         end;
1735         {
1736         Codepoints 0240 to 027F
1737 
1738         Here only 0240..024F needs lowercase
1739         }
1740         #$C9:
1741         begin
1742           case c2 of
1743           #$81..#$82:
1744           begin
1745             if ord(c2) mod 2 = 1 then
1746               new_c2 := chr(ord(c2) + 1);
1747           end;
1748           #$86..#$8F:
1749           begin
1750             if ord(c2) mod 2 = 0 then
1751               new_c2 := chr(ord(c2) + 1);
1752           end;
1753           #$83:
1754           begin
1755             new_c1 := #$C6;
1756             new_c2 := #$80;
1757           end;
1758           #$84:
1759           begin
1760             new_c1 := #$CA;
1761             new_c2 := #$89;
1762           end;
1763           #$85:
1764           begin
1765             new_c1 := #$CA;
1766             new_c2 := #$8C;
1767           end;
1768           end;
1769         end;
1770         // $CE91..$CE9F: NewChar := OldChar + $20; // Greek Characters
1771         // $CEA0..$CEA9: NewChar := OldChar + $E0; // Greek Characters
1772         #$CE:
1773         begin
1774           case c2 of
1775             // 0380 = CE 80
1776             #$86: new_c2 := #$AC;
1777             #$88: new_c2 := #$AD;
1778             #$89: new_c2 := #$AE;
1779             #$8A: new_c2 := #$AF;
1780             #$8C: new_c1 := #$CF; // By coincidence new_c2 remains the same
1781             #$8E:
1782             begin
1783               new_c1 := #$CF;
1784               new_c2 := #$8D;
1785             end;
1786             #$8F:
1787             begin
1788               new_c1 := #$CF;
1789               new_c2 := #$8E;
1790             end;
1791             // 0390 = CE 90
1792             #$91..#$9F:
1793             begin
1794               new_c2 := chr(ord(c2) + $20);
1795             end;
1796             // 03A0 = CE A0
1797             #$A0..#$AB:
1798             begin
1799               new_c1 := #$CF;
1800               new_c2 := chr(ord(c2) - $20);
1801             end;
1802           end;
1803         end;
1804         // 03C0 = CF 80
1805         // 03D0 = CF 90
1806         // 03E0 = CF A0
1807         // 03F0 = CF B0
1808         #$CF:
1809         begin
1810           case c2 of
1811             // 03CF;GREEK CAPITAL KAI SYMBOL;Lu;0;L;;;;;N;;;;03D7; CF 8F => CF 97
1812             #$8F: new_c2 := #$97;
1813             // 03D8;GREEK LETTER ARCHAIC KOPPA;Lu;0;L;;;;;N;;;;03D9;
1814             #$98: new_c2 := #$99;
1815             // 03DA;GREEK LETTER STIGMA;Lu;0;L;;;;;N;GREEK CAPITAL LETTER STIGMA;;;03DB;
1816             #$9A: new_c2 := #$9B;
1817             // 03DC;GREEK LETTER DIGAMMA;Lu;0;L;;;;;N;GREEK CAPITAL LETTER DIGAMMA;;;03DD;
1818             #$9C: new_c2 := #$9D;
1819             // 03DE;GREEK LETTER KOPPA;Lu;0;L;;;;;N;GREEK CAPITAL LETTER KOPPA;;;03DF;
1820             #$9E: new_c2 := #$9F;
1821             {
1822             03E0;GREEK LETTER SAMPI;Lu;0;L;;;;;N;GREEK CAPITAL LETTER SAMPI;;;03E1;
1823             03E1;GREEK SMALL LETTER SAMPI;Ll;0;L;;;;;N;;;03E0;;03E0
1824             03E2;COPTIC CAPITAL LETTER SHEI;Lu;0;L;;;;;N;GREEK CAPITAL LETTER SHEI;;;03E3;
1825             03E3;COPTIC SMALL LETTER SHEI;Ll;0;L;;;;;N;GREEK SMALL LETTER SHEI;;03E2;;03E2
1826             ...
1827             03EE;COPTIC CAPITAL LETTER DEI;Lu;0;L;;;;;N;GREEK CAPITAL LETTER DEI;;;03EF;
1828             03EF;COPTIC SMALL LETTER DEI;Ll;0;L;;;;;N;GREEK SMALL LETTER DEI;;03EE;;03EE
1829             }
1830             #$A0..#$AF: if ord(c2) mod 2 = 0 then
1831                           new_c2 := chr(ord(c2) + 1);
1832             // 03F4;GREEK CAPITAL THETA SYMBOL;Lu;0;L;<compat> 0398;;;;N;;;;03B8;
1833             #$B4:
1834             begin
1835               new_c1 := #$CE;
1836               new_c2 := #$B8;
1837             end;
1838             // 03F7;GREEK CAPITAL LETTER SHO;Lu;0;L;;;;;N;;;;03F8;
1839             #$B7: new_c2 := #$B8;
1840             // 03F9;GREEK CAPITAL LUNATE SIGMA SYMBOL;Lu;0;L;<compat> 03A3;;;;N;;;;03F2;
1841             #$B9: new_c2 := #$B2;
1842             // 03FA;GREEK CAPITAL LETTER SAN;Lu;0;L;;;;;N;;;;03FB;
1843             #$BA: new_c2 := #$BB;
1844             // 03FD;GREEK CAPITAL REVERSED LUNATE SIGMA SYMBOL;Lu;0;L;;;;;N;;;;037B;
1845             #$BD:
1846             begin
1847               new_c1 := #$CD;
1848               new_c2 := #$BB;
1849             end;
1850             // 03FE;GREEK CAPITAL DOTTED LUNATE SIGMA SYMBOL;Lu;0;L;;;;;N;;;;037C;
1851             #$BE:
1852             begin
1853               new_c1 := #$CD;
1854               new_c2 := #$BC;
1855             end;
1856             // 03FF;GREEK CAPITAL REVERSED DOTTED LUNATE SIGMA SYMBOL;Lu;0;L;;;;;N;;;;037D;
1857             #$BF:
1858             begin
1859               new_c1 := #$CD;
1860               new_c2 := #$BD;
1861             end;
1862           end;
1863         end;
1864         // $D080..$D08F: NewChar := OldChar + $110; // Cyrillic alphabet
1865         // $D090..$D09F: NewChar := OldChar + $20; // Cyrillic alphabet
1866         // $D0A0..$D0AF: NewChar := OldChar + $E0; // Cyrillic alphabet
1867         #$D0:
1868         begin
1869           c2 := InStr[1];
1870           case c2 of
1871             #$80..#$8F:
1872             begin
1873               new_c1 := chr(ord(c1)+1);
1874               new_c2  := chr(ord(c2) + $10);
1875             end;
1876             #$90..#$9F:
1877             begin
1878               new_c2 := chr(ord(c2) + $20);
1879             end;
1880             #$A0..#$AF:
1881             begin
1882               new_c1 := chr(ord(c1)+1);
1883               new_c2 := chr(ord(c2) - $20);
1884             end;
1885           end;
1886         end;
1887         // Archaic and non-slavic cyrillic 460-47F = D1A0-D1BF
1888         // These require just adding 1 to get the lowercase
1889         #$D1:
1890         begin
1891           if (c2 in [#$A0..#$BF]) and (ord(c2) mod 2 = 0) then
1892             new_c2 := chr(ord(c2) + 1);
1893         end;
1894         // Archaic and non-slavic cyrillic 480-4BF = D280-D2BF
1895         // These mostly require just adding 1 to get the lowercase
1896         #$D2:
1897         begin
1898           case c2 of
1899             #$80:
1900             begin
1901               new_c2 := chr(ord(c2) + 1);
1902             end;
1903             // #$81 is already lowercase
1904             // #$82-#$89 ???
1905             #$8A..#$BF:
1906             begin
1907               if ord(c2) mod 2 = 0 then
1908                 new_c2 := chr(ord(c2) + 1);
1909             end;
1910           end;
1911         end;
1912         {
1913         Codepoints  04C0..04FF
1914         }
1915         #$D3:
1916         begin
1917           case c2 of
1918             #$80: new_c2 := #$8F;
1919             #$81..#$8E:
1920             begin
1921               if ord(c2) mod 2 = 1 then
1922                 new_c2 := chr(ord(c2) + 1);
1923             end;
1924             #$90..#$BF:
1925             begin
1926               if ord(c2) mod 2 = 0 then
1927                 new_c2 := chr(ord(c2) + 1);
1928             end;
1929           end;
1930         end;
1931         {
1932         Codepoints  0500..053F
1933 
1934         Armenian starts in 0531
1935         }
1936         #$D4:
1937         begin
1938           if ord(c2) mod 2 = 0 then
1939             new_c2 := chr(ord(c2) + 1);
1940 
1941           // Armenian
1942           if c2 in [#$B1..#$BF] then
1943           begin
1944             new_c1 := #$D5;
1945             new_c2 := chr(ord(c2) - $10);
1946           end;
1947         end;
1948         {
1949         Codepoints  0540..057F
1950 
1951         Armenian
1952         }
1953         #$D5:
1954         begin
1955           case c2 of
1956             #$80..#$8F:
1957             begin
1958               new_c2 := chr(ord(c2) + $30);
1959             end;
1960             #$90..#$96:
1961             begin
1962               new_c1 := #$D6;
1963               new_c2 := chr(ord(c2) - $10);
1964             end;
1965           end;
1966         end;
1967         end;
1968         // Common code 2-byte modifiable chars
1969         if (CounterDiff <> 0) then
1970         begin
1971           OutStr^ := new_c1;
1972           OutStr[1] := new_c2;
1973         end
1974         else
1975         begin
1976           if (new_c1 <> c1) then OutStr^ := new_c1;
1977           if (new_c2 <> c2) then OutStr[1] := new_c2;
1978         end;
1979         inc(InStr, 2);
1980         inc(OutStr, 2);
1981       end;
1982       {
1983       Characters with 3 bytes
1984       }
1985       #$E1:
1986       begin
1987         new_c1 := c1;
1988         c2 := InStr[1];
1989         c3 := InStr[2];
1990         new_c2 := c2;
1991         new_c3 := c3;
1992         {
1993         Georgian codepoints 10A0-10C5 => 2D00-2D25
1994 
1995         In UTF-8 this is:
1996         E1 82 A0 - E1 82 BF => E2 B4 80 - E2 B4 9F
1997         E1 83 80 - E1 83 85 => E2 B4 A0 - E2 B4 A5
1998         }
1999         case c2 of
2000         #$82:
2001         if (c3 in [#$A0..#$BF]) then
2002         begin
2003           new_c1 := #$E2;
2004           new_c2 := #$B4;
2005           new_c3 := chr(ord(c3) - $20);
2006         end;
2007         #$83:
2008         if (c3 in [#$80..#$85]) then
2009         begin
2010           new_c1 := #$E2;
2011           new_c2 := #$B4;
2012           new_c3 := chr(ord(c3) + $20);
2013         end;
2014         {
2015         Extra chars between 1E00..1EFF
2016 
2017         Blocks of chars:
2018           1E00..1E3F    E1 B8 80..E1 B8 BF
2019           1E40..1E7F    E1 B9 80..E1 B9 BF
2020           1E80..1EBF    E1 BA 80..E1 BA BF
2021           1EC0..1EFF    E1 BB 80..E1 BB BF
2022         }
2023         #$B8..#$BB:
2024         begin
2025           // Start with a default and change for some particular chars
2026           if ord(c3) mod 2 = 0 then
2027             new_c3 := chr(ord(c3) + 1);
2028 
2029           { Only 1E96..1E9F are different E1 BA 96..E1 BA 9F
2030 
2031           1E96;LATIN SMALL LETTER H WITH LINE BELOW;Ll;0;L;0068 0331;;;;N;;;;;
2032           1E97;LATIN SMALL LETTER T WITH DIAERESIS;Ll;0;L;0074 0308;;;;N;;;;;
2033           1E98;LATIN SMALL LETTER W WITH RING ABOVE;Ll;0;L;0077 030A;;;;N;;;;;
2034           1E99;LATIN SMALL LETTER Y WITH RING ABOVE;Ll;0;L;0079 030A;;;;N;;;;;
2035           1E9A;LATIN SMALL LETTER A WITH RIGHT HALF RING;Ll;0;L;<compat> 0061 02BE;;;;N;;;;;
2036           1E9B;LATIN SMALL LETTER LONG S WITH DOT ABOVE;Ll;0;L;017F 0307;;;;N;;;1E60;;1E60
2037           1E9C;LATIN SMALL LETTER LONG S WITH DIAGONAL STROKE;Ll;0;L;;;;;N;;;;;
2038           1E9D;LATIN SMALL LETTER LONG S WITH HIGH STROKE;Ll;0;L;;;;;N;;;;;
2039           1E9E;LATIN CAPITAL LETTER SHARP S;Lu;0;L;;;;;N;;;;00DF; => C3 9F
2040           1E9F;LATIN SMALL LETTER DELTA;Ll;0;L;;;;;N;;;;;
2041           }
2042           if (c2 = #$BA) and (c3 in [#$96..#$9F]) then new_c3 := c3;
2043           // LATIN CAPITAL LETTER SHARP S => to german Beta
2044           if (c2 = #$BA) and (c3 = #$9E) then
2045           begin
2046             inc(InStr, 3);
2047             OutStr^ := #$C3;
2048             inc(OutStr);
2049             OutStr^ := #$9F;
2050             inc(OutStr);
2051             inc(CounterDiff, 1);
2052             Continue;
2053           end;
2054         end;
2055         {
2056         Extra chars between 1F00..1FFF
2057 
2058         Blocks of chars:
2059           1E00..1E3F    E1 BC 80..E1 BC BF
2060           1E40..1E7F    E1 BD 80..E1 BD BF
2061           1E80..1EBF    E1 BE 80..E1 BE BF
2062           1EC0..1EFF    E1 BF 80..E1 BF BF
2063         }
2064         #$BC:
2065         begin
2066           // Start with a default and change for some particular chars
2067           if (ord(c3) mod $10) div 8 = 1 then
2068             new_c3 := chr(ord(c3) - 8);
2069         end;
2070         #$BD:
2071         begin
2072           // Start with a default and change for some particular chars
2073           case c3 of
2074           #$80..#$8F, #$A0..#$AF: if (ord(c3) mod $10) div 8 = 1 then
2075                         new_c3 := chr(ord(c3) - 8);
2076           {
2077           1F50;GREEK SMALL LETTER UPSILON WITH PSILI;Ll;0;L;03C5 0313;;;;N;;;;;
2078           1F51;GREEK SMALL LETTER UPSILON WITH DASIA;Ll;0;L;03C5 0314;;;;N;;;1F59;;1F59
2079           1F52;GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA;Ll;0;L;1F50 0300;;;;N;;;;;
2080           1F53;GREEK SMALL LETTER UPSILON WITH DASIA AND VARIA;Ll;0;L;1F51 0300;;;;N;;;1F5B;;1F5B
2081           1F54;GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA;Ll;0;L;1F50 0301;;;;N;;;;;
2082           1F55;GREEK SMALL LETTER UPSILON WITH DASIA AND OXIA;Ll;0;L;1F51 0301;;;;N;;;1F5D;;1F5D
2083           1F56;GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI;Ll;0;L;1F50 0342;;;;N;;;;;
2084           1F57;GREEK SMALL LETTER UPSILON WITH DASIA AND PERISPOMENI;Ll;0;L;1F51 0342;;;;N;;;1F5F;;1F5F
2085           1F59;GREEK CAPITAL LETTER UPSILON WITH DASIA;Lu;0;L;03A5 0314;;;;N;;;;1F51;
2086           1F5B;GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA;Lu;0;L;1F59 0300;;;;N;;;;1F53;
2087           1F5D;GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA;Lu;0;L;1F59 0301;;;;N;;;;1F55;
2088           1F5F;GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI;Lu;0;L;1F59 0342;;;;N;;;;1F57;
2089           }
2090           #$99,#$9B,#$9D,#$9F: new_c3 := chr(ord(c3) - 8);
2091           end;
2092         end;
2093         #$BE:
2094         begin
2095           // Start with a default and change for some particular chars
2096           case c3 of
2097           #$80..#$B9: if (ord(c3) mod $10) div 8 = 1 then
2098                         new_c3 := chr(ord(c3) - 8);
2099           {
2100           1FB0;GREEK SMALL LETTER ALPHA WITH VRACHY;Ll;0;L;03B1 0306;;;;N;;;1FB8;;1FB8
2101           1FB1;GREEK SMALL LETTER ALPHA WITH MACRON;Ll;0;L;03B1 0304;;;;N;;;1FB9;;1FB9
2102           1FB2;GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI;Ll;0;L;1F70 0345;;;;N;;;;;
2103           1FB3;GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI;Ll;0;L;03B1 0345;;;;N;;;1FBC;;1FBC
2104           1FB4;GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI;Ll;0;L;03AC 0345;;;;N;;;;;
2105           1FB6;GREEK SMALL LETTER ALPHA WITH PERISPOMENI;Ll;0;L;03B1 0342;;;;N;;;;;
2106           1FB7;GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI;Ll;0;L;1FB6 0345;;;;N;;;;;
2107           1FB8;GREEK CAPITAL LETTER ALPHA WITH VRACHY;Lu;0;L;0391 0306;;;;N;;;;1FB0;
2108           1FB9;GREEK CAPITAL LETTER ALPHA WITH MACRON;Lu;0;L;0391 0304;;;;N;;;;1FB1;
2109           1FBA;GREEK CAPITAL LETTER ALPHA WITH VARIA;Lu;0;L;0391 0300;;;;N;;;;1F70;
2110           1FBB;GREEK CAPITAL LETTER ALPHA WITH OXIA;Lu;0;L;0386;;;;N;;;;1F71;
2111           1FBC;GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI;Lt;0;L;0391 0345;;;;N;;;;1FB3;
2112           1FBD;GREEK KORONIS;Sk;0;ON;<compat> 0020 0313;;;;N;;;;;
2113           1FBE;GREEK PROSGEGRAMMENI;Ll;0;L;03B9;;;;N;;;0399;;0399
2114           1FBF;GREEK PSILI;Sk;0;ON;<compat> 0020 0313;;;;N;;;;;
2115           }
2116           #$BA:
2117           begin
2118             new_c2 := #$BD;
2119             new_c3 := #$B0;
2120           end;
2121           #$BB:
2122           begin
2123             new_c2 := #$BD;
2124             new_c3 := #$B1;
2125           end;
2126           #$BC: new_c3 := #$B3;
2127           end;
2128         end;
2129         end;
2130 
2131         if (CounterDiff <> 0) then
2132         begin
2133           OutStr^ := new_c1;
2134           OutStr[1] := new_c2;
2135           OutStr[2] := new_c3;
2136         end
2137         else
2138         begin
2139           if c1 <> new_c1 then OutStr^ := new_c1;
2140           if c2 <> new_c2 then OutStr[1] := new_c2;
2141           if c3 <> new_c3 then OutStr[2] := new_c3;
2142         end;
2143 
2144         inc(InStr, 3);
2145         inc(OutStr, 3);
2146       end;
2147       {
2148       More Characters with 3 bytes, so exotic stuff between:
2149       $2126..$2183                    E2 84 A6..E2 86 83
2150       $24B6..$24CF    Result:=u+26;   E2 92 B6..E2 93 8F
2151       $2C00..$2C2E    Result:=u+48;   E2 B0 80..E2 B0 AE
2152       $2C60..$2CE2                    E2 B1 A0..E2 B3 A2
2153       }
2154       #$E2:
2155       begin
2156         new_c1 := c1;
2157         c2 := InStr[1];
2158         c3 := InStr[2];
2159         new_c2 := c2;
2160         new_c3 := c3;
2161         // 2126;OHM SIGN;Lu;0;L;03A9;;;;N;OHM;;;03C9; E2 84 A6 => CF 89
2162         if (c2 = #$84) and (c3 = #$A6) then
2163         begin
2164           inc(InStr, 3);
2165           OutStr^ := #$CF;
2166           inc(OutStr);
2167           OutStr^ := #$89;
2168           inc(OutStr);
2169           inc(CounterDiff, 1);
2170           Continue;
2171         end
2172         {
2173         212A;KELVIN SIGN;Lu;0;L;004B;;;;N;DEGREES KELVIN;;;006B; E2 84 AA => 6B
2174         }
2175         else if (c2 = #$84) and (c3 = #$AA) then
2176         begin
2177           inc(InStr, 3);
2178           OutStr^ := #$6B;
2179           inc(OutStr);
2180           inc(CounterDiff, 2);
2181           Continue;
2182         end
2183         {
2184         212B;ANGSTROM SIGN;Lu;0;L;00C5;;;;N;ANGSTROM UNIT;;;00E5; E2 84 AB => C3 A5
2185         }
2186         else if (c2 = #$84) and (c3 = #$AB) then
2187         begin
2188           inc(InStr, 3);
2189           OutStr^ := #$C3;
2190           inc(OutStr);
2191           OutStr^ := #$A5;
2192           inc(OutStr);
2193           inc(CounterDiff, 1);
2194           Continue;
2195         end
2196         {
2197         2160;ROMAN NUMERAL ONE;Nl;0;L;<compat> 0049;;;1;N;;;;2170; E2 85 A0 => E2 85 B0
2198         2161;ROMAN NUMERAL TWO;Nl;0;L;<compat> 0049 0049;;;2;N;;;;2171;
2199         2162;ROMAN NUMERAL THREE;Nl;0;L;<compat> 0049 0049 0049;;;3;N;;;;2172;
2200         2163;ROMAN NUMERAL FOUR;Nl;0;L;<compat> 0049 0056;;;4;N;;;;2173;
2201         2164;ROMAN NUMERAL FIVE;Nl;0;L;<compat> 0056;;;5;N;;;;2174;
2202         2165;ROMAN NUMERAL SIX;Nl;0;L;<compat> 0056 0049;;;6;N;;;;2175;
2203         2166;ROMAN NUMERAL SEVEN;Nl;0;L;<compat> 0056 0049 0049;;;7;N;;;;2176;
2204         2167;ROMAN NUMERAL EIGHT;Nl;0;L;<compat> 0056 0049 0049 0049;;;8;N;;;;2177;
2205         2168;ROMAN NUMERAL NINE;Nl;0;L;<compat> 0049 0058;;;9;N;;;;2178;
2206         2169;ROMAN NUMERAL TEN;Nl;0;L;<compat> 0058;;;10;N;;;;2179;
2207         216A;ROMAN NUMERAL ELEVEN;Nl;0;L;<compat> 0058 0049;;;11;N;;;;217A;
2208         216B;ROMAN NUMERAL TWELVE;Nl;0;L;<compat> 0058 0049 0049;;;12;N;;;;217B;
2209         216C;ROMAN NUMERAL FIFTY;Nl;0;L;<compat> 004C;;;50;N;;;;217C;
2210         216D;ROMAN NUMERAL ONE HUNDRED;Nl;0;L;<compat> 0043;;;100;N;;;;217D;
2211         216E;ROMAN NUMERAL FIVE HUNDRED;Nl;0;L;<compat> 0044;;;500;N;;;;217E;
2212         216F;ROMAN NUMERAL ONE THOUSAND;Nl;0;L;<compat> 004D;;;1000;N;;;;217F;
2213         }
2214         else if (c2 = #$85) and (c3 in [#$A0..#$AF]) then new_c3 := chr(ord(c3) + $10)
2215         {
2216         2183;ROMAN NUMERAL REVERSED ONE HUNDRED;Lu;0;L;;;;;N;;;;2184; E2 86 83 => E2 86 84
2217         }
2218         else if (c2 = #$86) and (c3 = #$83) then new_c3 := chr(ord(c3) + 1)
2219         {
2220         $24B6..$24CF    Result:=u+26;   E2 92 B6..E2 93 8F
2221 
2222         Ex: 24B6;CIRCLED LATIN CAPITAL LETTER A;So;0;L;<circle> 0041;;;;N;;;;24D0; E2 92 B6 => E2 93 90
2223         }
2224         else if (c2 = #$92) and (c3 in [#$B6..#$BF]) then
2225         begin
2226           new_c2 := #$93;
2227           new_c3 := chr(ord(c3) - $26);
2228         end
2229         // CIRCLED LATIN CAPITAL LETTER K  $24C0 -> $24DA
2230         else if (c2 = #$93) and (c3 in [#$80..#$8F]) then new_c3 := chr(ord(c3) + $1A)
2231         {
2232         $2C00..$2C2E    Result:=u+48;   E2 B0 80..E2 B0 AE
2233 
2234         2C00;GLAGOLITIC CAPITAL LETTER AZU;Lu;0;L;;;;;N;;;;2C30; E2 B0 80 => E2 B0 B0
2235 
2236         2C10;GLAGOLITIC CAPITAL LETTER NASHI;Lu;0;L;;;;;N;;;;2C40; E2 B0 90 => E2 B1 80
2237         }
2238         else if (c2 = #$B0) and (c3 in [#$80..#$8F]) then new_c3 := chr(ord(c3) + $30)
2239         else if (c2 = #$B0) and (c3 in [#$90..#$AE]) then
2240         begin
2241           new_c2 := #$B1;
2242           new_c3 := chr(ord(c3) - $10);
2243         end
2244         {
2245         $2C60..$2CE2                    E2 B1 A0..E2 B3 A2
2246 
2247         2C60;LATIN CAPITAL LETTER L WITH DOUBLE BAR;Lu;0;L;;;;;N;;;;2C61; E2 B1 A0 => +1
2248         2C61;LATIN SMALL LETTER L WITH DOUBLE BAR;Ll;0;L;;;;;N;;;2C60;;2C60
2249         2C62;LATIN CAPITAL LETTER L WITH MIDDLE TILDE;Lu;0;L;;;;;N;;;;026B; => 	C9 AB
2250         2C63;LATIN CAPITAL LETTER P WITH STROKE;Lu;0;L;;;;;N;;;;1D7D; => E1 B5 BD
2251         2C64;LATIN CAPITAL LETTER R WITH TAIL;Lu;0;L;;;;;N;;;;027D; => 	C9 BD
2252         2C65;LATIN SMALL LETTER A WITH STROKE;Ll;0;L;;;;;N;;;023A;;023A
2253         2C66;LATIN SMALL LETTER T WITH DIAGONAL STROKE;Ll;0;L;;;;;N;;;023E;;023E
2254         2C67;LATIN CAPITAL LETTER H WITH DESCENDER;Lu;0;L;;;;;N;;;;2C68; => E2 B1 A8
2255         2C68;LATIN SMALL LETTER H WITH DESCENDER;Ll;0;L;;;;;N;;;2C67;;2C67
2256         2C69;LATIN CAPITAL LETTER K WITH DESCENDER;Lu;0;L;;;;;N;;;;2C6A; => E2 B1 AA
2257         2C6A;LATIN SMALL LETTER K WITH DESCENDER;Ll;0;L;;;;;N;;;2C69;;2C69
2258         2C6B;LATIN CAPITAL LETTER Z WITH DESCENDER;Lu;0;L;;;;;N;;;;2C6C; => E2 B1 AC
2259         2C6C;LATIN SMALL LETTER Z WITH DESCENDER;Ll;0;L;;;;;N;;;2C6B;;2C6B
2260         2C6D;LATIN CAPITAL LETTER ALPHA;Lu;0;L;;;;;N;;;;0251; => C9 91
2261         2C6E;LATIN CAPITAL LETTER M WITH HOOK;Lu;0;L;;;;;N;;;;0271; => C9 B1
2262         2C6F;LATIN CAPITAL LETTER TURNED A;Lu;0;L;;;;;N;;;;0250; => C9 90
2263 
2264         2C70;LATIN CAPITAL LETTER TURNED ALPHA;Lu;0;L;;;;;N;;;;0252; => C9 92
2265         }
2266         else if (c2 = #$B1) then
2267         begin
2268           case c3 of
2269           #$A0: new_c3 := chr(ord(c3)+1);
2270           #$A2,#$A4,#$AD..#$AF,#$B0:
2271           begin
2272             inc(InStr, 3);
2273             OutStr^ := #$C9;
2274             inc(OutStr);
2275             case c3 of
2276             #$A2: OutStr^ := #$AB;
2277             #$A4: OutStr^ := #$BD;
2278             #$AD: OutStr^ := #$91;
2279             #$AE: OutStr^ := #$B1;
2280             #$AF: OutStr^ := #$90;
2281             #$B0: OutStr^ := #$92;
2282             end;
2283             inc(OutStr);
2284             inc(CounterDiff, 1);
2285             Continue;
2286           end;
2287           #$A3:
2288           begin
2289             new_c2 := #$B5;
2290             new_c3 := #$BD;
2291           end;
2292           #$A7,#$A9,#$AB: new_c3 := chr(ord(c3)+1);
2293           {
2294           2C71;LATIN SMALL LETTER V WITH RIGHT HOOK;Ll;0;L;;;;;N;;;;;
2295           2C72;LATIN CAPITAL LETTER W WITH HOOK;Lu;0;L;;;;;N;;;;2C73;
2296           2C73;LATIN SMALL LETTER W WITH HOOK;Ll;0;L;;;;;N;;;2C72;;2C72
2297           2C74;LATIN SMALL LETTER V WITH CURL;Ll;0;L;;;;;N;;;;;
2298           2C75;LATIN CAPITAL LETTER HALF H;Lu;0;L;;;;;N;;;;2C76;
2299           2C76;LATIN SMALL LETTER HALF H;Ll;0;L;;;;;N;;;2C75;;2C75
2300           2C77;LATIN SMALL LETTER TAILLESS PHI;Ll;0;L;;;;;N;;;;;
2301           2C78;LATIN SMALL LETTER E WITH NOTCH;Ll;0;L;;;;;N;;;;;
2302           2C79;LATIN SMALL LETTER TURNED R WITH TAIL;Ll;0;L;;;;;N;;;;;
2303           2C7A;LATIN SMALL LETTER O WITH LOW RING INSIDE;Ll;0;L;;;;;N;;;;;
2304           2C7B;LATIN LETTER SMALL CAPITAL TURNED E;Ll;0;L;;;;;N;;;;;
2305           2C7C;LATIN SUBSCRIPT SMALL LETTER J;Ll;0;L;<sub> 006A;;;;N;;;;;
2306           2C7D;MODIFIER LETTER CAPITAL V;Lm;0;L;<super> 0056;;;;N;;;;;
2307           2C7E;LATIN CAPITAL LETTER S WITH SWASH TAIL;Lu;0;L;;;;;N;;;;023F; => C8 BF
2308           2C7F;LATIN CAPITAL LETTER Z WITH SWASH TAIL;Lu;0;L;;;;;N;;;;0240; => C9 80
2309           }
2310           #$B2,#$B5: new_c3 := chr(ord(c3)+1);
2311           #$BE,#$BF:
2312           begin
2313             inc(InStr, 3);
2314             case c3 of
2315             #$BE: OutStr^ := #$C8;
2316             #$BF: OutStr^ := #$C9;
2317             end;
2318             OutStr^ := #$C8;
2319             inc(OutStr);
2320             case c3 of
2321             #$BE: OutStr^ := #$BF;
2322             #$BF: OutStr^ := #$80;
2323             end;
2324             inc(OutStr);
2325             inc(CounterDiff, 1);
2326             Continue;
2327           end;
2328           end;
2329         end
2330         {
2331         2C80;COPTIC CAPITAL LETTER ALFA;Lu;0;L;;;;;N;;;;2C81; E2 B2 80 => E2 B2 81
2332         ...
2333         2CBE;COPTIC CAPITAL LETTER OLD COPTIC OOU;Lu;0;L;;;;;N;;;;2CBF; E2 B2 BE => E2 B2 BF
2334         2CBF;COPTIC SMALL LETTER OLD COPTIC OOU;Ll;0;L;;;;;N;;;2CBE;;2CBE
2335         ...
2336         2CC0;COPTIC CAPITAL LETTER SAMPI;Lu;0;L;;;;;N;;;;2CC1; E2 B3 80 => E2 B2 81
2337         2CC1;COPTIC SMALL LETTER SAMPI;Ll;0;L;;;;;N;;;2CC0;;2CC0
2338         ...
2339         2CE2;COPTIC CAPITAL LETTER OLD NUBIAN WAU;Lu;0;L;;;;;N;;;;2CE3; E2 B3 A2 => E2 B3 A3
2340         2CE3;COPTIC SMALL LETTER OLD NUBIAN WAU;Ll;0;L;;;;;N;;;2CE2;;2CE2 <=
2341         }
2342         else if (c2 = #$B2) then
2343         begin
2344           if ord(c3) mod 2 = 0 then new_c3 := chr(ord(c3) + 1);
2345         end
2346         else if (c2 = #$B3) and (c3 in [#$80..#$A3]) then
2347         begin
2348           if ord(c3) mod 2 = 0 then new_c3 := chr(ord(c3) + 1);
2349         end;
2350 
2351         if (CounterDiff <> 0) then
2352         begin
2353           OutStr^ := new_c1;
2354           OutStr[1] := new_c2;
2355           OutStr[2] := new_c3;
2356         end
2357         else
2358         begin
2359           if c1 <> new_c1 then OutStr^ := new_c1;
2360           if c2 <> new_c2 then OutStr[1] := new_c2;
2361           if c3 <> new_c3 then OutStr[2] := new_c3;
2362         end;
2363 
2364         inc(InStr, 3);
2365         inc(OutStr, 3);
2366       end;
2367       {
2368       FF21;FULLWIDTH LATIN CAPITAL LETTER A;Lu;0;L;<wide> 0041;;;;N;;;;FF41; EF BC A1 => EF BD 81
2369       ...
2370       FF3A;FULLWIDTH LATIN CAPITAL LETTER Z;Lu;0;L;<wide> 005A;;;;N;;;;FF5A; EF BC BA => EF BD 9A
2371       }
2372       #$EF:
2373       begin
2374         c2 := InStr[1];
2375         c3 := InStr[2];
2376 
2377         if (c2 = #$BC) and (c3 in [#$A1..#$BA]) then
2378         begin
2379           OutStr^ := c1;
2380           OutStr[1] := #$BD;
2381           OutStr[2] := chr(ord(c3) - $20);
2382         end;
2383 
2384         if (CounterDiff <> 0) then
2385         begin
2386           OutStr^ := c1;
2387           OutStr[1] := c2;
2388           OutStr[2] := c3;
2389         end;
2390 
2391         inc(InStr, 3);
2392         inc(OutStr, 3);
2393       end;
2394     else
2395       // Copy the character if the string was disaligned by previous changes
2396       if (CounterDiff <> 0) then OutStr^:= c1;
2397       inc(InStr);
2398       inc(OutStr);
2399     end; // Case InStr^
2400   end; // while
2401 
2402   // Final correction of the buffer size
2403   SetLength(Result,OutStr - PChar(Result));
2404 end;
2405 
UTF8LowerStringnull2406 function UTF8LowerString(const s: string): string;
2407 begin
2408   Result:=UTF8LowerCase(s);
2409 end;
2410 
2411 
2412 {
2413   AInStr - The input string
2414   ALanguage - The language. Use '' for maximum speed if one desires to ignore the language
2415               The language should be specified in the format from ISO 639-1,
2416               which uses 2 characters to represent each language.
2417               If the language has no code in ISO 639-1, then the 3-chars code
2418               from ISO 639-2 should be used.
2419               Example: "tr" - Turkish language locale
2420 
2421   Data from here: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
2422 
2423   The columns in the file UnicodeData.txt are explained here:
2424   http://www.ksu.ru/eng/departments/ktk/test/perl/lib/unicode/UCDFF301.html#Case Mappings
2425 }
UTF8UpperCasenull2426 function UTF8UpperCase(const AInStr: string; ALanguage: string=''): string;
2427 var
2428   i, InCounter, OutCounter: PtrInt;
2429   OutStr: PChar;
2430   CharLen: integer;
2431   CharProcessed: Boolean;
2432   NewCharLen: integer;
2433   NewChar, OldChar: Word;
2434   // Language identification
2435   IsTurkish: Boolean;
2436 
2437   procedure CorrectOutStrSize(AOldCharSize, ANewCharSize: Integer);
2438   begin
2439     if not (ANewCharSize > AOldCharSize) then Exit; // no correction needed
2440     if (ANewCharSize > 20) or (AOldCharSize > 20) then Exit; // sanity check
2441     // Fix for bug 23428
2442     // If the string wasn't decreased by previous char changes,
2443     // and our current operation will make it bigger, then for safety
2444     // increase the buffer
2445     if (ANewCharSize > AOldCharSize) and (OutCounter >= InCounter-1) then
2446     begin
2447       SetLength(Result, Length(Result)+ANewCharSize-AOldCharSize);
2448       OutStr := PChar(Result);
2449     end;
2450   end;
2451 
2452 begin
2453   // Start with the same string, and progressively modify
2454   Result:=AInStr;
2455   UniqueString(Result);
2456   OutStr := PChar(Result);
2457 
2458   // Language identification
2459   IsTurkish := (ALanguage = 'tr') or (ALanguage = 'az'); // Turkish and Azeri have a special handling
2460 
2461   InCounter:=1; // for AInStr
2462   OutCounter := 0; // for Result
2463   while InCounter<=length(AInStr) do
2464   begin
2465     { First ASCII chars }
2466     if (AInStr[InCounter] <= 'z') and (AInStr[InCounter] >= 'a') then
2467     begin
2468       // Special turkish handling
2469       // small dotted i to capital dotted i
2470       if IsTurkish and (AInStr[InCounter] = 'i') then
2471       begin
2472         SetLength(Result,Length(Result)+1);// Increase the buffer
2473         OutStr := PChar(Result);
2474         OutStr[OutCounter]:=#$C4;
2475         OutStr[OutCounter+1]:=#$B0;
2476         inc(InCounter);
2477         inc(OutCounter,2);
2478       end
2479       else
2480       begin
2481         OutStr[OutCounter]:=chr(ord(AInStr[InCounter])-32);
2482         inc(InCounter);
2483         inc(OutCounter);
2484       end;
2485     end
2486     { Now everything else }
2487     else
2488     begin
2489       CharLen := UTF8CodepointSize(@AInStr[InCounter]);
2490       CharProcessed := False;
2491       NewCharLen := CharLen;
2492 
2493       if CharLen = 2 then
2494       begin
2495         OldChar := (Ord(AInStr[InCounter]) shl 8) or Ord(AInStr[InCounter+1]);
2496         NewChar := 0;
2497 
2498         // Major processing
2499         case OldChar of
2500         // Latin Characters 0000–0FFF http://en.wikibooks.org/wiki/Unicode/Character_reference/0000-0FFF
2501         $C39F:        NewChar := $5353; // ß => SS
2502         $C3A0..$C3B6,$C3B8..$C3BE: NewChar := OldChar - $20;
2503         $C3BF:        NewChar := $C5B8; // ÿ
2504         $C481..$C4B0: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
2505         // 0130 = C4 B0
2506         // turkish small undotted i to capital undotted i
2507         $C4B1:
2508         begin
2509           OutStr[OutCounter]:='I';
2510           NewCharLen := 1;
2511           CharProcessed := True;
2512         end;
2513         $C4B2..$C4B7: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
2514         // $C4B8: ĸ without upper/lower
2515         $C4B9..$C4BF: if OldChar mod 2 = 0 then NewChar := OldChar - 1;
2516         $C580: NewChar := $C4BF; // border between bytes
2517         $C581..$C588: if OldChar mod 2 = 0 then NewChar := OldChar - 1;
2518         // $C589 ʼn => ?
2519         $C58A..$C5B7: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
2520         // $C5B8: // Ÿ already uppercase
2521         $C5B9..$C5BE: if OldChar mod 2 = 0 then NewChar := OldChar - 1;
2522         $C5BF: // 017F
2523         begin
2524           OutStr[OutCounter]:='S';
2525           NewCharLen := 1;
2526           CharProcessed := True;
2527         end;
2528         // 0180 = C6 80 -> A convoluted part
2529         $C680: NewChar := $C983;
2530         $C682..$C685: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
2531         $C688: NewChar := $C687;
2532         $C68C: NewChar := $C68B;
2533         // 0190 = C6 90 -> A convoluted part
2534         $C692: NewChar := $C691;
2535         $C695: NewChar := $C7B6;
2536         $C699: NewChar := $C698;
2537         $C69A: NewChar := $C8BD;
2538         $C69E: NewChar := $C8A0;
2539         // 01A0 = C6 A0 -> A convoluted part
2540         $C6A0..$C6A5: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
2541         $C6A8: NewChar := $C6A7;
2542         $C6AD: NewChar := $C6AC;
2543         // 01B0 = C6 B0
2544         $C6B0: NewChar := $C6AF;
2545         $C6B3..$C6B6: if OldChar mod 2 = 0 then NewChar := OldChar - 1;
2546         $C6B9: NewChar := $C6B8;
2547         $C6BD: NewChar := $C6BC;
2548         $C6BF: NewChar := $C7B7;
2549         // 01C0 = C7 80
2550         $C784..$C786: NewChar := $C784;
2551         $C787..$C789: NewChar := $C787;
2552         $C78A..$C78C: NewChar := $C78A;
2553         $C78E: NewChar := $C78D;
2554         // 01D0 = C7 90
2555         $C790: NewChar := $C78F;
2556         $C791..$C79C: if OldChar mod 2 = 0 then NewChar := OldChar - 1;
2557         $C79D: NewChar := $C68E;
2558         $C79F: NewChar := $C79E;
2559         // 01E0 = C7 A0
2560         $C7A0..$C7AF: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
2561         // 01F0 = C7 B0
2562         $C7B2..$C7B3: NewChar := $C7B1;
2563         $C7B5: NewChar := $C7B4;
2564         $C7B8..$C7BF: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
2565         // 0200 = C8 80
2566         // 0210 = C8 90
2567         $C880..$C89F: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
2568         // 0220 = C8 A0
2569         // 0230 = C8 B0
2570         $C8A2..$C8B3: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
2571         $C8BC: NewChar := $C8BB;
2572         $C8BF:
2573         begin
2574           CorrectOutStrSize(2, 3);
2575           OutStr[OutCounter]  := #$E2;
2576           OutStr[OutCounter+1]:= #$B1;
2577           OutStr[OutCounter+2]:= #$BE;
2578           NewCharLen := 3;
2579           CharProcessed := True;
2580         end;
2581         // 0240 = C9 80
2582         $C980:
2583         begin
2584           CorrectOutStrSize(2, 3);
2585           OutStr[OutCounter]  := #$E2;
2586           OutStr[OutCounter+1]:= #$B1;
2587           OutStr[OutCounter+2]:= #$BF;
2588           NewCharLen := 3;
2589           CharProcessed := True;
2590         end;
2591         $C982: NewChar := $C981;
2592         $C986..$C98F: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
2593         // 0250 = C9 90
2594         $C990:
2595         begin
2596           CorrectOutStrSize(2, 3);
2597           OutStr[OutCounter]  := #$E2;
2598           OutStr[OutCounter+1]:= #$B1;
2599           OutStr[OutCounter+2]:= #$AF;
2600           NewCharLen := 3;
2601           CharProcessed := True;
2602         end;
2603         $C991:
2604         begin
2605           CorrectOutStrSize(2, 3);
2606           OutStr[OutCounter]  := #$E2;
2607           OutStr[OutCounter+1]:= #$B1;
2608           OutStr[OutCounter+2]:= #$AD;
2609           NewCharLen := 3;
2610           CharProcessed := True;
2611         end;
2612         $C992:
2613         begin
2614           CorrectOutStrSize(2, 3);
2615           OutStr[OutCounter]  := #$E2;
2616           OutStr[OutCounter+1]:= #$B1;
2617           OutStr[OutCounter+2]:= #$B0;
2618           NewCharLen := 3;
2619           CharProcessed := True;
2620         end;
2621         $C993: NewChar := $C681;
2622         $C994: NewChar := $C686;
2623         $C996: NewChar := $C689;
2624         $C997: NewChar := $C68A;
2625         $C999: NewChar := $C68F;
2626         $C99B: NewChar := $C690;
2627         // 0260 = C9 A0
2628         $C9A0: NewChar := $C693;
2629         $C9A3: NewChar := $C694;
2630         $C9A5:
2631         begin
2632           CorrectOutStrSize(2, 3);
2633           OutStr[OutCounter]  := #$EA;
2634           OutStr[OutCounter+1]:= #$9E;
2635           OutStr[OutCounter+2]:= #$8D;
2636           NewCharLen := 3;
2637           CharProcessed := True;
2638         end;
2639         $C9A8: NewChar := $C697;
2640         $C9A9: NewChar := $C696;
2641         $C9AB:
2642         begin
2643           CorrectOutStrSize(2, 3);
2644           OutStr[OutCounter]  := #$E2;
2645           OutStr[OutCounter+1]:= #$B1;
2646           OutStr[OutCounter+2]:= #$A2;
2647           NewCharLen := 3;
2648           CharProcessed := True;
2649         end;
2650         $C9AF: NewChar := $C69C;
2651         // 0270 = C9 B0
2652         $C9B1:
2653         begin
2654           CorrectOutStrSize(2, 3);
2655           OutStr[OutCounter]  := #$E2;
2656           OutStr[OutCounter+1]:= #$B1;
2657           OutStr[OutCounter+2]:= #$AE;
2658           NewCharLen := 3;
2659           CharProcessed := True;
2660         end;
2661         $C9B2: NewChar := $C69D;
2662         $C9B5: NewChar := $C69F;
2663         $C9BD:
2664         begin
2665           CorrectOutStrSize(2, 3);
2666           OutStr[OutCounter]  := #$E2;
2667           OutStr[OutCounter+1]:= #$B1;
2668           OutStr[OutCounter+2]:= #$A4;
2669           NewCharLen := 3;
2670           CharProcessed := True;
2671         end;
2672         // 0280 = CA 80
2673         $CA80: NewChar := $C6A6;
2674         $CA83: NewChar := $C6A9;
2675         $CA88: NewChar := $C6AE;
2676         $CA89: NewChar := $C984;
2677         $CA8A: NewChar := $C6B1;
2678         $CA8B: NewChar := $C6B2;
2679         $CA8C: NewChar := $C985;
2680         // 0290 = CA 90
2681         $CA92: NewChar := $C6B7;
2682         {
2683         03A0 = CE A0
2684 
2685         03AC;GREEK SMALL LETTER ALPHA WITH TONOS;Ll;0;L;03B1 0301;;;;N;GREEK SMALL LETTER ALPHA TONOS;;0386;;0386
2686         03AD;GREEK SMALL LETTER EPSILON WITH TONOS;Ll;0;L;03B5 0301;;;;N;GREEK SMALL LETTER EPSILON TONOS;;0388;;0388
2687         03AE;GREEK SMALL LETTER ETA WITH TONOS;Ll;0;L;03B7 0301;;;;N;GREEK SMALL LETTER ETA TONOS;;0389;;0389
2688         03AF;GREEK SMALL LETTER IOTA WITH TONOS;Ll;0;L;03B9 0301;;;;N;GREEK SMALL LETTER IOTA TONOS;;038A;;038A
2689         }
2690         $CEAC: NewChar := $CE86;
2691         $CEAD: NewChar := $CE88;
2692         $CEAE: NewChar := $CE89;
2693         $CEAF: NewChar := $CE8A;
2694         {
2695         03B0 = CE B0
2696 
2697         03B0;GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS;Ll;0;L;03CB 0301;;;;N;GREEK SMALL LETTER UPSILON DIAERESIS TONOS;;;;
2698         03B1;GREEK SMALL LETTER ALPHA;Ll;0;L;;;;;N;;;0391;;0391
2699         ...
2700         03BF;GREEK SMALL LETTER OMICRON;Ll;0;L;;;;;N;;;039F;;039F
2701         }
2702         $CEB1..$CEBF: NewChar := OldChar - $20; // Greek Characters
2703         {
2704         03C0 = CF 80
2705 
2706         03C0;GREEK SMALL LETTER PI;Ll;0;L;;;;;N;;;03A0;;03A0 CF 80 => CE A0
2707         03C1;GREEK SMALL LETTER RHO;Ll;0;L;;;;;N;;;03A1;;03A1
2708         03C2;GREEK SMALL LETTER FINAL SIGMA;Ll;0;L;;;;;N;;;03A3;;03A3
2709         03C3;GREEK SMALL LETTER SIGMA;Ll;0;L;;;;;N;;;03A3;;03A3
2710         03C4;GREEK SMALL LETTER TAU;Ll;0;L;;;;;N;;;03A4;;03A4
2711         ....
2712         03CB;GREEK SMALL LETTER UPSILON WITH DIALYTIKA;Ll;0;L;03C5 0308;;;;N;GREEK SMALL LETTER UPSILON DIAERESIS;;03AB;;03AB
2713         03CC;GREEK SMALL LETTER OMICRON WITH TONOS;Ll;0;L;03BF 0301;;;;N;GREEK SMALL LETTER OMICRON TONOS;;038C;;038C
2714         03CD;GREEK SMALL LETTER UPSILON WITH TONOS;Ll;0;L;03C5 0301;;;;N;GREEK SMALL LETTER UPSILON TONOS;;038E;;038E
2715         03CE;GREEK SMALL LETTER OMEGA WITH TONOS;Ll;0;L;03C9 0301;;;;N;GREEK SMALL LETTER OMEGA TONOS;;038F;;038F
2716         03CF;GREEK CAPITAL KAI SYMBOL;Lu;0;L;;;;;N;;;;03D7;
2717         }
2718         $CF80,$CF81,$CF83..$CF8B: NewChar := OldChar - $E0; // Greek Characters
2719         $CF82: NewChar := $CEA3;
2720         $CF8C: NewChar := $CE8C;
2721         $CF8D: NewChar := $CE8E;
2722         $CF8E: NewChar := $CE8F;
2723         {
2724         03D0 = CF 90
2725 
2726         03D0;GREEK BETA SYMBOL;Ll;0;L;<compat> 03B2;;;;N;GREEK SMALL LETTER CURLED BETA;;0392;;0392 CF 90 => CE 92
2727         03D1;GREEK THETA SYMBOL;Ll;0;L;<compat> 03B8;;;;N;GREEK SMALL LETTER SCRIPT THETA;;0398;;0398 => CE 98
2728         03D5;GREEK PHI SYMBOL;Ll;0;L;<compat> 03C6;;;;N;GREEK SMALL LETTER SCRIPT PHI;;03A6;;03A6 => CE A6
2729         03D6;GREEK PI SYMBOL;Ll;0;L;<compat> 03C0;;;;N;GREEK SMALL LETTER OMEGA PI;;03A0;;03A0 => CE A0
2730         03D7;GREEK KAI SYMBOL;Ll;0;L;;;;;N;;;03CF;;03CF => CF 8F
2731         03D9;GREEK SMALL LETTER ARCHAIC KOPPA;Ll;0;L;;;;;N;;;03D8;;03D8
2732         03DB;GREEK SMALL LETTER STIGMA;Ll;0;L;;;;;N;;;03DA;;03DA
2733         03DD;GREEK SMALL LETTER DIGAMMA;Ll;0;L;;;;;N;;;03DC;;03DC
2734         03DF;GREEK SMALL LETTER KOPPA;Ll;0;L;;;;;N;;;03DE;;03DE
2735         }
2736         $CF90: NewChar := $CE92;
2737         $CF91: NewChar := $CE98;
2738         $CF95: NewChar := $CEA6;
2739         $CF96: NewChar := $CEA0;
2740         $CF97: NewChar := $CF8F;
2741         $CF99..$CF9F: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
2742         // 03E0 = CF A0
2743         $CFA0..$CFAF: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
2744         {
2745         03F0 = CF B0
2746 
2747         03F0;GREEK KAPPA SYMBOL;Ll;0;L;<compat> 03BA;;;;N;GREEK SMALL LETTER SCRIPT KAPPA;;039A;;039A => CE 9A
2748         03F1;GREEK RHO SYMBOL;Ll;0;L;<compat> 03C1;;;;N;GREEK SMALL LETTER TAILED RHO;;03A1;;03A1 => CE A1
2749         03F2;GREEK LUNATE SIGMA SYMBOL;Ll;0;L;<compat> 03C2;;;;N;GREEK SMALL LETTER LUNATE SIGMA;;03F9;;03F9
2750         03F5;GREEK LUNATE EPSILON SYMBOL;Ll;0;L;<compat> 03B5;;;;N;;;0395;;0395 => CE 95
2751         03F8;GREEK SMALL LETTER SHO;Ll;0;L;;;;;N;;;03F7;;03F7
2752         03FB;GREEK SMALL LETTER SAN;Ll;0;L;;;;;N;;;03FA;;03FA
2753         }
2754         $CFB0: NewChar := $CE9A;
2755         $CFB1: NewChar := $CEA1;
2756         $CFB2: NewChar := $CFB9;
2757         $CFB5: NewChar := $CE95;
2758         $CFB8: NewChar := $CFB7;
2759         $CFBB: NewChar := $CFBA;
2760         // 0400 = D0 80 ... 042F everything already uppercase
2761         // 0430 = D0 B0
2762         $D0B0..$D0BF: NewChar := OldChar - $20; // Cyrillic alphabet
2763         // 0440 = D1 80
2764         $D180..$D18F: NewChar := OldChar - $E0; // Cyrillic alphabet
2765         // 0450 = D1 90
2766         $D190..$D19F: NewChar := OldChar - $110; // Cyrillic alphabet
2767         end;
2768 
2769         if NewChar <> 0 then
2770         begin
2771           OutStr[OutCounter]  := Chr(Hi(NewChar));
2772           OutStr[OutCounter+1]:= Chr(Lo(NewChar));
2773           CharProcessed := True;
2774         end;
2775       end;
2776 
2777       // Copy the character if the string was disaligned by previous changed
2778       // and no processing was done in this character
2779       if (InCounter <> OutCounter+1) and (not CharProcessed) then
2780       begin
2781         for i := 0 to CharLen-1 do
2782           OutStr[OutCounter+i]  :=AInStr[InCounter+i];
2783       end;
2784 
2785       inc(InCounter, CharLen);
2786       inc(OutCounter, NewCharLen);
2787     end;
2788   end; // while
2789 
2790   // Final correction of the buffer size
2791   SetLength(Result,OutCounter);
2792 end;
2793 
2794 function UTF8UpperString(const s: string): string;
2795 begin
2796   Result:=UTF8UpperCase(s);
2797 end;
2798 
2799 
2800 function FindInvalidUTF8Codepoint(p: PChar; Count: PtrInt; StopOnNonUTF8: Boolean): PtrInt;
2801 // return -1 if ok
2802 var
2803   CharLen: Integer;
2804   c: Char;
2805 begin
2806   if (p<>nil) then begin
2807     Result:=0;
2808     while Result<Count do begin
2809       c:=p^;
2810       if ord(c)<%10000000 then begin
2811         // regular single byte ASCII character (#0 is a character, this is Pascal ;)
2812         CharLen:=1;
2813       end else if ord(c)<=%11000001 then begin
2814         // single byte character, between valid UTF-8 encodings
2815         // %11000000 and %11000001 map 2 byte to #0..#128, which is invalid and used for XSS attacks
2816         if StopOnNonUTF8 or (ord(c)>=192) then
2817           exit;
2818         CharLen:=1;
2819       end else if ord(c)<=%11011111 then begin
2820         // could be 2 byte character (%110xxxxx %10xxxxxx)
2821         if (Result<Count-1)
2822         and ((ord(p[1]) and %11000000) = %10000000) then
2823           CharLen:=2
2824         else
2825           exit; // missing following bytes
2826       end
2827       else if ord(c)<=%11101111 then begin
2828         // could be 3 byte character (%1110xxxx %10xxxxxx %10xxxxxx)
2829         if (Result<Count-2)
2830         and ((ord(p[1]) and %11000000) = %10000000)
2831         and ((ord(p[2]) and %11000000) = %10000000) then begin
2832           if (ord(c)=%11100000) and (ord(p[1])<=%10011111) then
2833             exit; // XSS attack: 3 bytes are mapped to the 1 or 2 byte codes
2834           CharLen:=3;
2835         end else
2836           exit; // missing following bytes
2837       end
2838       else if ord(c)<=%11110111 then begin
2839         // could be 4 byte character (%11110xxx %10xxxxxx %10xxxxxx %10xxxxxx)
2840         if (Result<Count-3)
2841         and ((ord(p[1]) and %11000000) = %10000000)
2842         and ((ord(p[2]) and %11000000) = %10000000)
2843         and ((ord(p[3]) and %11000000) = %10000000) then begin
2844           if (ord(c)=%11110000) and (ord(p[1])<=%10001111) then
2845             exit; // XSS attack: 4 bytes are mapped to the 1-3 byte codes
2846           CharLen:=4;
2847         end else
2848           exit; // missing following bytes
2849       end
2850       else begin
2851         if StopOnNonUTF8 then
2852           exit;
2853         CharLen:=1;
2854       end;
2855       inc(Result,CharLen);
2856       inc(p,CharLen);
2857       if Result>Count then begin
2858         dec(Result,CharLen);
2859         exit; // missing following bytes
2860       end;
2861     end;
2862   end;
2863   // ok
2864   Result:=-1;
2865 end;
2866 
2867 function FindInvalidUTF8Character(p: PChar; Count: PtrInt; StopOnNonUTF8: Boolean = true): PtrInt;
2868 begin
2869   Result := FindInvalidUTF8Codepoint(p, Count, StopOnNonUTF8);
2870 end;
2871 
2872 {
2873   Translates escape characters inside an UTF8 encoded string into
2874   human readable format.
2875   Mainly used for logging purposes.
2876   Parameters:
2877     S         : Input string. Must be UTF8 encoded.
2878     EscapeMode: controls the human readable format for escape characters.
2879 }
2880 function Utf8EscapeControlChars(S: String; EscapeMode: TEscapeMode = emPascal): String;
2881 const
2882   //lookuptables are about 1.8 to 1.3 times faster than a function using IntToStr or IntToHex
2883   PascalEscapeStrings: Array[#0..#31] of string = (
2884     '#00', '#01', '#02', '#03', '#04', '#05', '#06', '#07',
2885     '#08', '#09', '#10', '#11', '#12', '#13', '#14', '#15',
2886     '#16', '#17', '#18', '#19', '#20', '#21', '#22', '#23',
2887     '#24', '#25', '#26', '#27', '#28', '#29', '#30', '#31');
2888   CEscapeStrings: Array[#0..#31] of string = (
2889     '\0'   , '\0x01', '\0x02', '\0x03', '\0x04', '\0x05', '\0x06', '\0x07',
2890     '\0x08', '\t'   , '\r'   , '\0x0B', '\0x0C', '\n'   , '\0x0E', '\0x0F',
2891     '\0x10', '\0x11', '\0x12', '\0x13', '\0x14', '\0x15', '\0x16', '\0x17',
2892     '\0x18', '\0x19', '\0x1A', '\0x1B', '\0x1C', '\0x1D', '\0x1E', '\0x1F');
2893   HexEscapeCStrings: Array[#0..#31] of string = (
2894     '\0x00', '\0x01', '\0x02', '\0x03', '\0x04', '\0x05', '\0x06', '\0x07',
2895     '\0x08', '\0x09', '\0x0A', '\0x0B', '\0x0C', '\0x0D', '\0x0E', '\0x0F',
2896     '\0x10', '\0x11', '\0x12', '\0x13', '\0x14', '\0x15', '\0x16', '\0x17',
2897     '\0x18', '\0x19', '\0x1A', '\0x1B', '\0x1C', '\0x1D', '\0x1E', '\0x1F');
2898   HexEscapePascalStrings: Array[#0..#31] of string = (
2899     '#$00', '#$01', '#$02', '#$03', '#$04', '#$05', '#$06', '#$07',
2900     '#$08', '#$09', '#$0A', '#$0B', '#$0C', '#$0D', '#$0E', '#$0F',
2901     '#$10', '#$11', '#$12', '#$13', '#$14', '#$15', '#$16', '#$17',
2902     '#$18', '#$19', '#$1A', '#$1B', '#$1C', '#$1D', '#$1E', '#$1F');
2903   AsciiControlStrings: Array[#0..#31] of string = (
2904     '[NUL]', '[SOH]', '[STX]', '[ETX]', '[EOT]', '[ENQ]', '[ACK]', '[BEL]',
2905     '[BS]' , '[HT]' , '[LF]' , '[VT]' , '[FF]' , '[CR]' , '[SO]' , '[SI]' ,
2906     '[DLE]', '[DC1]', '[DC2]', '[DC3]', '[DC4]', '[NAK]', '[SYN]', '[ETB]',
2907     '[CAN]', '[EM]' , '[SUB]', '[ESC]', '[FS]' , '[GS]' , '[RS]' , '[US]');
2908 var
2909   Ch: Char;
2910   i: Integer;
2911 begin
2912   if FindInvalidUTF8Codepoint(PChar(S), Length(S)) <> -1 then
2913   begin
2914     UTF8FixBroken(S);
2915   end;
2916   Result := '';
2917   //a byte < 127 cannot be part of a multi-byte codepoint, so this is safe
2918   for i := 1 to Length(S) do
2919   begin
2920     Ch := S[i];
2921     if (Ch < #32) then
2922     begin
2923       case EscapeMode of
2924         emPascal: Result := Result + PascalEscapeStrings[Ch];
2925         emHexPascal: Result := Result + HexEscapePascalStrings[Ch];
2926         emHexC: Result := Result + HexEscapeCStrings[Ch];
2927         emC: Result := Result + CEscapeStrings[Ch];
2928         emAsciiControlNames: Result := Result + AsciiControlStrings[Ch];
2929       end;//case
2930     end
2931     else
2932       Result := Result + Ch;
2933   end;
2934 end;
2935 
2936 function UTF8StringOfChar(AUtf8Char: String; N: Integer): String;
2937 var
2938   UCharLen, i: Integer;
2939   C1, C2, C3: Char;
2940   PC: PChar;
2941 begin
2942   Result := '';
2943   if (N <= 0) or (Utf8Length(AUtf8Char) <> 1) then Exit;
2944   UCharLen := Length(AUtf8Char);
2945   Case UCharLen of
2946     1: Result := StringOfChar(AUtf8Char[1], N);
2947     2:
2948     begin
2949       SetLength(Result, 2 * N);
2950       System.FillWord(Result[1], N, PWord(Pointer(AUtf8Char))^);
2951      end;
2952     3:
2953     begin
2954       SetLength(Result, 3 * N);
2955       C1 := AUtf8Char[1];
2956       C2 := AUtf8Char[2];
2957       C3 := AUtf8Char[3];
2958       PC := PChar(Result);
2959       for i:=1 to N do
2960       begin
2961         PC[0] := C1;
2962         PC[1] := C2;
2963         PC[2] := C3;
2964         inc(PC,3);
2965       end;
2966     end;
2967     4:
2968     begin
2969       SetLength(Result, 4 * N);
2970       System.FillDWord(Result[1], N, PDWord(Pointer(AUtf8Char))^);
2971     end;
2972     else
2973     begin
2974       //In November 2003 UTF-8 was restricted by RFC 3629 to four bytes to match
2975       //the constraints of the UTF-16 character encoding.
2976       //http://en.wikipedia.org/wiki/UTF-8
2977       Result := StringOfChar('?', N);
2978     end;
2979   end;
2980 end;
2981 
2982 function UTF8AddChar(AUtf8Char: String; const S: String; N: Integer): String;
2983 var
2984   L : Integer;
2985 begin
2986   Result := S;
2987   if Utf8Length(AUtf8Char) <> 1 then Exit;
2988   L := Utf8Length(Result);
2989   if L < N then
2990     Result := Utf8StringOfChar(AUtf8Char, N-l) + Result;
2991 end;
2992 
2993 function UTF8AddCharR(AUtf8Char: String; const S: String; N: Integer): String;
2994 var
2995   L : Integer;
2996 begin
2997   Result := S;
2998   if Utf8Length(AUtf8Char) <> 1 then Exit;
2999   L := Utf8Length(Result);
3000   if L < N then
3001     Result := Result + Utf8StringOfChar(AUtf8Char, N-l);
3002 end;
3003 
3004 function UTF8PadLeft(const S: String; const N: Integer; const AUtf8Char: String = #32): String;
3005 begin
3006   Result := Utf8AddChar(AUtf8Char, S, N);
3007 end;
3008 
3009 function UTF8PadRight(const S: String; const N: Integer; const AUtf8Char: String = #32): String;
3010 begin
3011   Result := Utf8AddCharR(AUtf8Char, S, N);
3012 end;
3013 
3014 function UTF8PadCenter(const S: String; const N: Integer; const AUtf8Char: String = #32): String;
3015 var
3016   ULen: PtrInt;
3017 begin
3018   ULen := Utf8Length(S);
3019   if ULen < N then
3020     begin
3021       Result := Utf8StringOfChar(AUtf8Char,(N div 2) - (ULen div 2)) + S;
3022       Result := Result + Utf8StringOfChar(AUtf8Char, N - Utf8Length(Result));
3023     end
3024   else
3025     Result := S;
3026 end;
3027 
3028 function UTF8LeftStr(const AText: String; const ACount: Integer): String;
3029 begin
3030   Result := Utf8Copy(AText,1,ACount);
3031 end;
3032 
3033 function UTF8RightStr(const AText: String; const ACount: Integer): String;
3034 var
3035   j,l:integer;
3036 begin
3037   l := Utf8Length(AText);
3038   j := ACount;
3039   if (j > l) then j := l;
3040   Result := Utf8Copy(AText,l-j+1,j);
3041 end;
3042 
3043 function UTF8QuotedStr(const S, Quote: string): string;
3044 // replace all Quote in S with double Quote and enclose the result in Quote.
3045 var
3046   QuoteC: Char;
3047   p, QuoteP, CopyPos: PChar;
3048   QuoteLen: SizeInt;
3049 begin
3050   Result:=Quote;
3051   p:=PChar(S);
3052   CopyPos:=p;
3053   QuoteC:=Quote[1];
3054   QuoteP:=PChar(Quote);
3055   QuoteLen:=length(Quote);
3056   repeat
3057     if (p^=#0) and (p-PChar(S)=length(S)) then
3058       break;
3059     if (p^=QuoteC) and CompareMem(p,QuoteP,QuoteLen) then begin
3060       inc(p,QuoteLen);
3061       Result+=copy(S,CopyPos-PChar(S)+1,p-CopyPos)+Quote;
3062       CopyPos:=p;
3063     end else
3064       inc(p);
3065   until false;
3066   Result+=copy(S,CopyPos-PChar(S)+1,p-CopyPos)+Quote;
3067 end;
3068 
3069 function UTF8StartsText(const ASubText, AText: string): Boolean;
3070 var
3071   TextLen, SubTextLen: PtrInt;
3072 begin
3073   Result := False;
3074   if (ASubText <> '') then
3075   begin
3076     TextLen := Utf8Length(AText);
3077     SubTextLen := Utf8Length(ASubText);
3078     if (TextLen >= SubTextLen) then
3079       Result := (Utf8CompareText(Utf8Copy(AText,1,SubTextLen),ASubText) = 0);
3080   end;
3081 end;
3082 
3083 function UTF8EndsText(const ASubText, AText: string): Boolean;
3084 var
3085   TextLen, SubTextLen: PtrInt;
3086 begin
3087   Result := False;
3088   if (ASubText <> '') then
3089   begin
3090     TextLen := Utf8Length(AText);
3091     SubTextLen := Utf8Length(ASubText);
3092     if (TextLen >= SubTextLen) then
3093       Result := (Utf8CompareText(Utf8Copy(AText,TextLen-SubTextLen+1,SubTextLen),ASubText) = 0);
3094   end;
3095 end;
3096 
3097 function UTF8ReverseString(p: PChar; const ByteCount: LongInt): string;
3098 var
3099   CharLen, rBytePos: LongInt;
3100 begin
3101   SetLength(Result, ByteCount);
3102   rBytePos := ByteCount + 1;
3103   while (rBytePos > 1) do
3104   begin
3105     CharLen:=UTF8CodepointSize(p);
3106     Dec(rBytePos, CharLen);
3107     System.Move(p^, Result[rBytePos], CharLen);
3108     Inc(p, CharLen);
3109   end;
3110 end;
3111 
3112 function UTF8ReverseString(const AText: string): string; inline;
3113 begin
3114   Result := UTF8ReverseString(PChar(AText), length(AText));
3115 end;
3116 
3117 function UTF8RPos(const Substr, Source: string): PtrInt;
3118 var
3119   pRev: PtrInt;
3120 begin
3121   pRev := RPos(Substr, Source);              // Scan from the end.
3122   Result := UTF8Length(PChar(Source), pRev); // Length of the leading part.
3123 end;
3124 
3125 function UTF8WrapText(S, BreakStr: string; BreakChars: TSysCharSet; MaxCol: integer): string;
3126 var
3127   P : PChar;
3128   RightSpace : integer = 0;
3129   N : integer = 0;
3130   Len : integer = 0;
3131   i, j : integer;
3132   CharLen, ResultLen, RP : integer;
3133 begin
3134   Result := '';
3135   if (S = '') or (MaxCol = 0) or (BreakStr = '') or (BreakChars = []) then Exit;
3136   P := PChar(S);
3137   while P^ <> #0 do
3138   begin
3139     CharLen := UTF8CodepointSize(P);
3140     i := 1;
3141     j := 0;
3142     ResultLen := Length(Result);
3143     SetLength(Result, ResultLen + CharLen);
3144     while i <= CharLen do
3145     begin
3146       Result[ResultLen + i] := (P + J)^;
3147       Inc(i);
3148       Inc(j);
3149     end;
3150     Inc(N);
3151     if P^ = BreakStr[Length(BreakStr)] then
3152       N := 0;
3153     if N > MaxCol then
3154     begin
3155       Len := Length(Result);
3156       RP := Len;
3157       while not (Result[RP] in BreakChars) do
3158         Dec(RP);
3159       RightSpace := Len - RP;
3160       if (RightSpace > 0) and (RightSpace < MaxCol) then
3161       begin
3162         Dec(P, RightSpace);
3163         SetLength(Result, Len - RightSpace);
3164       end;
3165       Result := Result + BreakStr;
3166       N := 0;
3167     end;
3168     Inc(P, CharLen);
3169   end;
3170 end;
3171 
3172 function UTF8WrapText(S: string; MaxCol: integer): string;
3173 begin
3174   Result := UTF8WrapText(S, LineEnding, [' ', '-', #9], MaxCol);
3175 end;
3176 
3177 function UTF8Trim(const s: string; Flags: TUTF8TrimFlags): string;
3178 var
3179   p: PChar;
3180   u: Cardinal;
3181   StartP: PtrUInt;
3182   l: Integer;
3183   KeepAllNonASCII: boolean;
3184 begin
3185   Result:=s;
3186   if Result='' then exit;
3187   KeepAllNonASCII:=[u8tKeepControlCodes,u8tKeepNoBreakSpaces]*Flags=[u8tKeepControlCodes,u8tKeepNoBreakSpaces];
3188   if not (u8tKeepStart in Flags) then begin
3189     // trim start
3190     p:=PChar(Result);
3191     repeat
3192       l:=1;
3193       case p^ of
3194       #0:
3195         if p-PChar(Result)=length(Result) then
3196         begin
3197           // everything was trimmed
3198           exit('')
3199         end else if u8tKeepControlCodes in Flags then
3200           break;
3201       ' ': ;
3202       #10,#13:
3203         if u8tKeepLineBreaks in Flags then
3204           break;
3205       #9:
3206         if u8tKeepTabs in Flags then
3207           break;
3208       #1..#8,#11,#12,#14..#31,#127:
3209         if u8tKeepControlCodes in Flags then
3210           break;
3211       #128..#255:
3212         begin
3213           if KeepAllNonASCII then break;
3214           u:=UTF8CodepointToUnicode(p,l);
3215           if (l<=1) then break; // invalid character
3216           case u of
3217           128..159, // C1 set of control codes
3218           8206, 8207: // left-to-right, right-to-left mark
3219             if u8tKeepControlCodes in Flags then break;
3220           160,   // no break space
3221           $2007, // figure space
3222           $2026, // narrow no-break space
3223           $FEFF: // zero with no-break space
3224             if u8tKeepNoBreakSpaces in Flags then break;
3225           else
3226             break;
3227           end;
3228         end;
3229       else
3230         break;
3231       end;
3232       inc(p,l);
3233     until false;
3234     if p>PChar(Result) then begin
3235       Result:=copy(Result,p-PChar(Result)+1,length(Result));
3236       if Result='' then exit;
3237     end;
3238   end;
3239 
3240   if not (u8tKeepEnd in Flags) then begin
3241     // trim end
3242     p:=@Result[length(Result)];
3243     repeat
3244       case p^ of
3245       #0:
3246         if u8tKeepControlCodes in Flags then
3247           break;
3248       ' ': ;
3249       #10,#13:
3250         if u8tKeepLineBreaks in Flags then
3251           break;
3252       #9:
3253         if u8tKeepTabs in Flags then
3254           break;
3255       #1..#8,#11,#12,#14..#31,#127:
3256         if u8tKeepControlCodes in Flags then
3257           break;
3258       #128..#255:
3259         begin
3260           if KeepAllNonASCII then break;
3261           StartP:=UTF8FindNearestCharStart(PChar(Result),length(Result),p-PChar(Result));
3262           u:=UTF8CodepointToUnicode(PChar(Result)+StartP,l);
3263           if (l<=1) then break; // invalid character
3264           case u of
3265           128..159, // C1 set of control codes
3266           8206, 8207: // left-to-right, right-to-left mark
3267             if u8tKeepControlCodes in Flags then break;
3268           160,   // no break space
3269           $2007, // figure space
3270           $2026, // narrow no-break space
3271           $FEFF: // zero with no-break space
3272             if u8tKeepNoBreakSpaces in Flags then break;
3273           else
3274             break;
3275           end;
3276           p:=PChar(Result)+StartP;
3277         end;
3278       else
3279         break;
3280       end;
3281       dec(p);
3282     until p<PChar(Result);
3283     // p is on last good byte
3284     SetLength(Result,p+1-PChar(Result));
3285   end;
3286 end;
3287 
3288 {------------------------------------------------------------------------------
3289   Name:    UTF8CompareStr
3290   Params:  S1, S2 - UTF8 encoded strings
3291   Compares UTF8 encoded strings
3292   Returns
3293      0: if S1 = S2
3294     -1: if S1 < S2 ("alphabetically")
3295     +1: if S1 > S2
3296     -2: if S1 < S2, comparison ended at a different byte in an invalid UTF8 codepoint in either S1 or S2 (byte at S1 > byte at S2)
3297     +2: if S1 > S2, comparison ended at a different byte in an invalid UTF8 codepoint in either S1 or S2
3298 
3299   Compare two UTF8 encoded strings, case sensitive.
3300 
3301   Internally it uses WideCompareStr on the first Utf8 codepoint that differs between S1 and S2
3302   and therefor has proper colation on platforms where the WidestringManager supports this
3303   (Windows, *nix with cwstring unit)
3304 ------------------------------------------------------------------------------}
3305 function UTF8CompareStr(const S1, S2: string): PtrInt;
3306 begin
3307   Result := UTF8CompareStr(PChar(Pointer(S1)),length(S1),
3308                            PChar(Pointer(S2)),length(S2));
3309 end;
3310 
3311 function UTF8CompareStrP(S1, S2: PChar): PtrInt;
3312 begin
3313   Result:=UTF8CompareStr(S1,StrLen(S1),S2,StrLen(S2));
3314 end;
3315 
3316 
3317 function UTF8CompareStr(S1: PChar; Count1: SizeInt; S2: PChar; Count2: SizeInt): PtrInt;
3318 var
3319   Count: SizeInt;
3320   i, CL1, CL2: Integer;
3321   B1, B2: Byte;
3322   W1, W2: WideString;
3323   Org1, Org2: PChar;
3324 begin
3325   Result := 0;
3326   Org1 := S1;
3327   Org2 := S2;
3328   if (Count1 > Count2) then
3329     Count := Count2
3330   else
3331     Count := Count1;
3332 
3333   i := 0;
3334   if (Count > 0) then
3335   begin
3336    //unfortunately we cannot use CompareByte here, so we have to iterate ourselves
3337     while (i < Count) do
3338     begin
3339       B1 := byte(S1^);
3340       B2 := byte(S2^);
3341       if (B1 <> B2) then
3342       begin
3343         //writeln('UCS: B1=',IntToHex(B1,2),', B2=',IntToHex(B2,2));
3344         Break;
3345       end;
3346       Inc(S1); Inc(S2); Inc(I);
3347     end;
3348   end;
3349   if (i < Count) then
3350   begin
3351     //Fallback result
3352     Result := B1 - B2;
3353     if (Result < 0) then
3354       Result := -2
3355     else
3356       Result := 2;
3357     //writeln('UCS: FallBack Result = ',Result);
3358     //Try t find start of valid UTF8 codepoints
3359     if (not Utf8TryFindCodepointStart(Org1, S1, CL1)) or
3360         not Utf8TryFindCodepointStart(Org2, S2, CL2) then
3361       Exit;
3362 
3363     //writeln('UCS: CL1=',CL1,', CL2=',CL2);
3364     //writeln('S1 = "',S1,'"');
3365     //writeln('S2 = "',S2,'"');
3366     W1 := Utf8ToUtf16(S1, CL1);
3367     W2 := Utf8ToUtf16(S2, CL2);
3368     //writeln('UCS: W1 = ',Word(W1[1]),' W2 = ',Word(W2[1]));
3369     Result := WideCompareStr(W1, W2);
3370   end
3371   else
3372     //Strings are the same up and until size of smallest one
3373     Result := Count1 - Count2;
3374   if (Result > 1) then
3375     Result := 1
3376   else if (Result < -1) then
3377     Result := -1;
3378 end;
3379 
3380 {------------------------------------------------------------------------------
3381   Name:    UTF8CompareText
3382   Params: S1, S2 - UTF8 encoded strings
3383   Returns: < 0 if S1 < S2, 0 if S1 = S2, > 0 if S1 > S2.
3384   Compare two UTF8 encoded strings, case insensitive.
3385   Note: Use this function instead of AnsiCompareText.
3386   This function guarantees proper collation on all supported platforms.
3387   Internally it uses WideCompareText.
3388  ------------------------------------------------------------------------------}
3389  function UTF8CompareText(const S1, S2: String): PtrInt;
3390  begin
3391    Result := WideCompareText(Utf8ToUtf16(S1),Utf8ToUtf16(S2));
3392  end;
3393 
3394 function UTF8CompareStrCollated(const S1, S2: string): PtrInt; {$IFnDEF ACP_RTL}inline;{$endif}
3395 begin
3396   {$IFDEF ACP_RTL}
3397     //Only with this define AnsiCompareStr does not point to Utf8CompareStr
3398     Result := AnsiCompareStr(UTF8ToSys(S1), UTF8ToSys(S2));
3399   {$ELSE}
3400     Result := Utf8CompareStr(S1,S2);
3401   {$ENDIF}
3402 end;
3403 
3404 function CompareStrListUTF8LowerCase(List: TStringList; Index1, Index2: Integer
3405   ): Integer;
3406 begin
3407   Result:=UTF8CompareText(List[Index1],List[Index2]);
3408 end;
3409 
3410 {------------------------------------------------------------------------------
3411   Name:    ConvertUTF8ToUTF16
3412   Params:  Dest                - Pointer to destination string
3413            DestWideCharCount   - Wide char count allocated in destination string
3414            Src                 - Pointer to source string
3415            SrcCharCount        - Char count allocated in source string
3416            Options             - Conversion options, if none is set, both
3417              invalid and unfinished source chars are skipped
3418 
3419              toInvalidCharError       - Stop on invalid source char and report
3420                                       error
3421              toInvalidCharToSymbol    - Replace invalid source chars with '?'
3422              toUnfinishedCharError    - Stop on unfinished source char and
3423                                       report error
3424              toUnfinishedCharToSymbol - Replace unfinished source char with '?'
3425 
3426            ActualWideCharCount - Actual wide char count converted from source
3427                                string to destination string
3428   Returns:
3429     trNoError        - The string was successfully converted without
3430                      any error
3431     trNullSrc        - Pointer to source string is nil
3432     trNullDest       - Pointer to destination string is nil
3433     trDestExhausted  - Destination buffer size is not big enough to hold
3434                      converted string
3435     trInvalidChar    - Invalid source char has occured
3436     trUnfinishedChar - Unfinished source char has occured
3437 
3438   Converts the specified UTF-8 encoded string to UTF-16 encoded (system endian)
3439  ------------------------------------------------------------------------------}
3440 function ConvertUTF8ToUTF16(Dest: PWideChar; DestWideCharCount: SizeUInt;
3441   Src: PChar; SrcCharCount: SizeUInt; Options: TConvertOptions;
3442   out ActualWideCharCount: SizeUInt): TConvertResult;
3443 var
3444   DestI, SrcI: SizeUInt;
3445   B1, B2, B3, B4: Byte;
3446   W: Word;
3447   C: Cardinal;
3448 
3449   function UnfinishedCharError: Boolean;
3450   begin
3451     if toUnfinishedCharToSymbol in Options then
3452     begin
3453       Dest[DestI] := System.WideChar('?');
3454       Inc(DestI);
3455       Result := False;
3456     end
3457     else
3458       if toUnfinishedCharError in Options then
3459       begin
3460         ConvertUTF8ToUTF16 := trUnfinishedChar;
3461         Result := True;
3462       end
3463       else Result := False;
3464   end;
3465 
3466   function InvalidCharError(Count: SizeUInt): Boolean; inline;
3467   begin
3468     if not (toInvalidCharError in Options) then
3469     begin
3470       if toInvalidCharToSymbol in Options then
3471       begin
3472         Dest[DestI] := System.WideChar('?');
3473         Inc(DestI);
3474       end;
3475 
3476       Dec(SrcI, Count);
3477 
3478       // skip trailing UTF-8 char bytes
3479       while (Count > 0) do
3480       begin
3481         if (Byte(Src[SrcI]) and %11000000) <> %10000000 then Break;
3482         Inc(SrcI);
3483         Dec(Count);
3484       end;
3485 
3486       Result := False;
3487     end
3488     else
3489       if toInvalidCharError in Options then
3490       begin
3491         ConvertUTF8ToUTF16 := trUnfinishedChar;
3492         Result := True;
3493       end;
3494   end;
3495 
3496 begin
3497   ActualWideCharCount := 0;
3498 
3499   if not Assigned(Src) then
3500   begin
3501     Result := trNullSrc;
3502     Exit;
3503   end;
3504 
3505   if not Assigned(Dest) then
3506   begin
3507     Result := trNullDest;
3508     Exit;
3509   end;
3510   SrcI := 0;
3511   DestI := 0;
3512 
3513   while (DestI < DestWideCharCount) and (SrcI < SrcCharCount) do
3514   begin
3515     B1 := Byte(Src[SrcI]);
3516     Inc(SrcI);
3517 
3518     if B1 < 128 then // single byte UTF-8 char
3519     begin
3520       Dest[DestI] := System.WideChar(B1);
3521       Inc(DestI);
3522     end
3523     else
3524     begin
3525       if SrcI >= SrcCharCount then
3526         if UnfinishedCharError then Exit(trInvalidChar)
3527         else Break;
3528 
3529       B2 := Byte(Src[SrcI]);
3530       Inc(SrcI);
3531 
3532       if (B1 and %11100000) = %11000000 then // double byte UTF-8 char
3533       begin
3534         if (B2 and %11000000) = %10000000 then
3535         begin
3536           Dest[DestI] := System.WideChar(((B1 and %00011111) shl 6) or (B2 and %00111111));
3537           Inc(DestI);
3538         end
3539         else // invalid character, assume single byte UTF-8 char
3540           if InvalidCharError(1) then Exit(trInvalidChar);
3541       end
3542       else
3543       begin
3544         if SrcI >= SrcCharCount then
3545           if UnfinishedCharError then Exit(trInvalidChar)
3546           else Break;
3547 
3548         B3 := Byte(Src[SrcI]);
3549         Inc(SrcI);
3550 
3551         if (B1 and %11110000) = %11100000 then // triple byte UTF-8 char
3552         begin
3553           if ((B2 and %11000000) = %10000000) and ((B3 and %11000000) = %10000000) then
3554           begin
3555             W := ((B1 and %00011111) shl 12) or ((B2 and %00111111) shl 6) or (B3 and %00111111);
3556             if (W < $D800) or (W > $DFFF) then // to single wide char UTF-16 char
3557             begin
3558               Dest[DestI] := System.WideChar(W);
3559               Inc(DestI);
3560             end
3561             else // invalid UTF-16 character, assume double byte UTF-8 char
3562               if InvalidCharError(2) then Exit(trInvalidChar);
3563           end
3564           else // invalid character, assume double byte UTF-8 char
3565             if InvalidCharError(2) then Exit(trInvalidChar);
3566         end
3567         else
3568         begin
3569           if SrcI >= SrcCharCount then
3570             if UnfinishedCharError then Exit(trInvalidChar)
3571             else Break;
3572 
3573           B4 := Byte(Src[SrcI]);
3574           Inc(SrcI);
3575 
3576           if ((B1 and %11111000) = %11110000) and ((B2 and %11000000) = %10000000)
3577             and ((B3 and %11000000) = %10000000) and ((B4 and %11000000) = %10000000) then
3578           begin // 4 byte UTF-8 char
3579             C := ((B1 and %00011111) shl 18) or ((B2 and %00111111) shl 12)
3580               or ((B3 and %00111111) shl 6)  or (B4 and %00111111);
3581             // to double wide char UTF-16 char
3582             Dest[DestI] := System.WideChar($D800 or ((C - $10000) shr 10));
3583             Inc(DestI);
3584             if DestI >= DestWideCharCount then Break;
3585             Dest[DestI] := System.WideChar($DC00 or ((C - $10000) and %0000001111111111));
3586             Inc(DestI);
3587           end
3588           else // invalid character, assume triple byte UTF-8 char
3589             if InvalidCharError(3) then Exit(trInvalidChar);
3590         end;
3591       end;
3592     end;
3593   end;
3594 
3595   if DestI >= DestWideCharCount then
3596   begin
3597     DestI := DestWideCharCount - 1;
3598     Result := trDestExhausted;
3599   end
3600   else
3601     Result := trNoError;
3602 
3603   Dest[DestI] := #0;
3604   ActualWideCharCount := DestI + 1;
3605 end;
3606 
3607 {------------------------------------------------------------------------------
3608   Name:    ConvertUTF16ToUTF8
3609   Params:  Dest             - Pointer to destination string
3610            DestCharCount    - Char count allocated in destination string
3611            Src              - Pointer to source string
3612            SrcWideCharCount - Wide char count allocated in source string
3613            Options          - Conversion options, if none is set, both
3614              invalid and unfinished source chars are skipped.
3615              See ConvertUTF8ToUTF16 for details.
3616 
3617            ActualCharCount  - Actual char count converted from source
3618                             string to destination string
3619   Returns: See ConvertUTF8ToUTF16
3620 
3621   Converts the specified UTF-16 encoded string (system endian) to UTF-8 encoded
3622  ------------------------------------------------------------------------------}
3623 function ConvertUTF16ToUTF8(Dest: PChar; DestCharCount: SizeUInt;
3624   Src: PWideChar; SrcWideCharCount: SizeUInt; Options: TConvertOptions;
3625   out ActualCharCount: SizeUInt): TConvertResult;
3626 var
3627   DestI, SrcI: SizeUInt;
3628   W1, W2: Word;
3629   C: Cardinal;
3630 
3631   function UnfinishedCharError: Boolean;
3632   begin
3633     if toUnfinishedCharToSymbol in Options then
3634     begin
3635       Dest[DestI] := Char('?');
3636       Inc(DestI);
3637       Result := False;
3638     end
3639     else
3640       if toUnfinishedCharError in Options then
3641       begin
3642         ConvertUTF16ToUTF8 := trUnfinishedChar;
3643         Result := True;
3644       end
3645       else Result := False;
3646   end;
3647 
3648   function InvalidCharError(Count: SizeUInt): Boolean; inline;
3649   begin
3650     if not (toInvalidCharError in Options) then
3651     begin
3652       if toInvalidCharToSymbol in Options then
3653       begin
3654         Dest[DestI] := Char('?');
3655         Inc(DestI);
3656       end;
3657 
3658       Dec(SrcI, Count);
3659       // skip trailing UTF-16 wide char
3660       if (Word(Src[SrcI]) and $FC00) = $DC00 then Inc(SrcI);
3661 
3662       Result := False;
3663     end
3664     else
3665       if toInvalidCharError in Options then
3666       begin
3667         ConvertUTF16ToUTF8 := trUnfinishedChar;
3668         Result := True;
3669       end;
3670   end;
3671 
3672 begin
3673   ActualCharCount := 0;
3674 
3675   if not Assigned(Src) then
3676   begin
3677     Result := trNullSrc;
3678     Exit;
3679   end;
3680 
3681   if not Assigned(Dest) then
3682   begin
3683     Result := trNullDest;
3684     Exit;
3685   end;
3686   SrcI := 0;
3687   DestI := 0;
3688 
3689   while (DestI < DestCharCount) and (SrcI < SrcWideCharCount) do
3690   begin
3691     W1 := Word(Src[SrcI]);
3692     Inc(SrcI);
3693 
3694     if (W1 < $D800) or (W1 > $DFFF) then // single wide char UTF-16 char
3695     begin
3696       if W1 < $0080 then // to single byte UTF-8 char
3697       begin
3698         Dest[DestI] := Char(W1);
3699         Inc(DestI);
3700       end
3701       else
3702         if W1 < $0800 then // to double byte UTF-8 char
3703         begin
3704           Dest[DestI] := Char(%11000000 or ((W1 and %11111000000) shr 6));
3705           Inc(DestI);
3706           if DestI >= DestCharCount then Break;
3707           Dest[DestI] := Char(%10000000 or (W1 and %111111));
3708           Inc(DestI);
3709         end
3710         else
3711         begin // to triple byte UTF-8 char
3712           Dest[DestI] := Char(%11100000 or ((W1 and %1111000000000000) shr 12));
3713           Inc(DestI);
3714           if DestI >= DestCharCount then Break;
3715           Dest[DestI] := Char(%10000000 or ((W1 and %111111000000) shr 6));
3716           Inc(DestI);
3717           if DestI >= DestCharCount then Break;
3718           Dest[DestI] := Char(%10000000 or (W1 and %111111));
3719           Inc(DestI);
3720         end;
3721     end
3722     else
3723     begin
3724       if SrcI >= SrcWideCharCount then
3725         if UnfinishedCharError then Exit(trInvalidChar)
3726         else Break;
3727 
3728       W2 := Word(Src[SrcI]);
3729       Inc(SrcI);
3730 
3731       if (W1 and $F800) = $D800 then // double wide char UTF-16 char
3732       begin
3733         if (W2 and $FC00) = $DC00 then
3734         begin
3735           C := (W1 - $D800) shl 10 + (W2 - $DC00) + $10000;
3736 
3737           // to 4 byte UTF-8 char
3738           Dest[DestI] := Char(%11110000 or (C shr 18));
3739           Inc(DestI);
3740           if DestI >= DestCharCount then Break;
3741           Dest[DestI] := Char(%10000000 or ((C and $3F000) shr 12));
3742           Inc(DestI);
3743           if DestI >= DestCharCount then Break;
3744           Dest[DestI] := Char(%10000000 or ((C and %111111000000) shr 6));
3745           Inc(DestI);
3746           if DestI >= DestCharCount then Break;
3747           Dest[DestI] := Char(%10000000 or (C and %111111));
3748           Inc(DestI);
3749         end
3750         else // invalid character, assume single wide char UTF-16 char
3751           if InvalidCharError(1) then Exit(trInvalidChar);
3752       end
3753       else // invalid character, assume single wide char UTF-16 char
3754         if InvalidCharError(1) then Exit(trInvalidChar);
3755     end;
3756   end;
3757 
3758   if DestI >= DestCharCount then
3759   begin
3760     DestI := DestCharCount - 1;
3761     Result := trDestExhausted;
3762   end
3763   else
3764     Result := trNoError;
3765 
3766   Dest[DestI] := #0;
3767   ActualCharCount := DestI + 1;
3768 end;
3769 
3770 {------------------------------------------------------------------------------
3771   Name:    UTF8ToUTF16
3772   Params:  S - Source UTF-8 string
3773   Returns: UTF-16 encoded string
3774 
3775   Converts the specified UTF-8 encoded string to UTF-16 encoded (system endian)
3776   Avoid copying the result string since on windows a widestring requires a full
3777   copy
3778  ------------------------------------------------------------------------------}
3779 function UTF8ToUTF16(const S: AnsiString): UnicodeString;
3780 begin
3781   Result:=UTF8ToUTF16(PChar(S),length(S));
3782 end;
3783 
3784 function UTF8ToUTF16(const P: PChar; ByteCnt: SizeUInt): UnicodeString;
3785 var
3786   L: SizeUInt;
3787 begin
3788   if ByteCnt=0 then
3789     exit('');
3790   SetLength(Result, ByteCnt);
3791   // wide chars of UTF-16 <= bytes of UTF-8 string
3792   if ConvertUTF8ToUTF16(PWideChar(Result), Length(Result) + 1, P, ByteCnt,
3793     [toInvalidCharToSymbol], L) = trNoError
3794   then SetLength(Result, L - 1)
3795   else Result := '';
3796 end;
3797 
3798 {------------------------------------------------------------------------------
3799   Name:    UTF16ToUTF8
3800   Params:  S - Source UTF-16 string (system endian)
3801   Returns: UTF-8 encoded string
3802 
3803   Converts the specified UTF-16 encoded string (system endian) to UTF-8 encoded
3804  ------------------------------------------------------------------------------}
3805 function UTF16ToUTF8(const S: UnicodeString): AnsiString;
3806 begin
3807   Result := UTF16ToUTF8(PWideChar(S),length(S));
3808 end;
3809 
3810 function UTF16ToUTF8(const P: PWideChar; WideCnt: SizeUInt): AnsiString;
3811 var
3812   L: SizeUInt;
3813 begin
3814   if WideCnt=0 then
3815     exit('');
3816 
3817   SetLength(Result, WideCnt * 3);
3818   // bytes of UTF-8 <= 3 * wide chars of UTF-16 string
3819   // e.g. %11100000 10100000 10000000 (UTF-8) is $0800 (UTF-16)
3820   if ConvertUTF16ToUTF8(PChar(Result), Length(Result) + 1, P, WideCnt,
3821     [toInvalidCharToSymbol], L) = trNoError then
3822   begin
3823     SetLength(Result, L - 1);
3824   end else
3825     Result := '';
3826 end;
3827 
3828 procedure LazGetLanguageIDs(var Lang, FallbackLang: String);
3829 
3830   {$IFDEF DARWIN}
3831   function GetLanguage: boolean;
3832   var
3833     Ref: CFStringRef;
3834     LangArray: CFMutableArrayRef;
3835     StrSize: CFIndex;
3836     StrRange: CFRange;
3837     Locals: CFArrayRef;
3838     Bundle: CFBundleRef;
3839   begin
3840     Result := false;
3841     Bundle:=CFBundleGetMainBundle;
3842     if Bundle=nil then exit;
3843     Locals:=CFBundleCopyBundleLocalizations(Bundle);
3844     if Locals=nil then exit;
3845     LangArray := CFBundleCopyLocalizationsForPreferences(Locals, nil);
3846     try
3847       if CFArrayGetCount(LangArray) > 0 then
3848       begin
3849         Ref := CFArrayGetValueAtIndex(LangArray, 0);
3850         StrRange.location := 0;
3851         StrRange.length := CFStringGetLength(Ref);
3852 
3853         StrSize:=0;
3854         CFStringGetBytes(Ref, StrRange, kCFStringEncodingUTF8,
3855           Ord('?'), False, nil, 0, StrSize);
3856         SetLength(Lang, StrSize);
3857 
3858         if StrSize > 0 then
3859         begin
3860           CFStringGetBytes(Ref, StrRange, kCFStringEncodingUTF8,
3861             Ord('?'), False, @Lang[1], StrSize, StrSize);
3862           Result:=true;
3863           FallbackLang := Copy(Lang, 1, 2);
3864         end;
3865       end;
3866     finally
3867       CFRelease(LangArray);
3868       CFRelease(Locals);
3869     end;
3870   end;
3871   {$ENDIF}
3872 begin
3873 {$IFDEF DARWIN}
3874   if not GetLanguage then
3875     GetLanguageIDs(Lang, FallbackLang);
3876 {$ELSE}
3877   GetLanguageIDs(Lang, FallbackLang);
3878 {$ENDIF}
3879 end;
3880 
3881 {
3882 This routine will strip country information from the language ID
3883 making it more simple
3884 
3885 Ideally the resulting ID from here should conform to ISO 639-1
3886 or ISO 639-2, if the language has no code in ISO 639-1
3887 }
3888 procedure LazGetShortLanguageID(var Lang: String);
3889 var
3890   FallbackLang: String;
3891 begin
3892   FallbackLang:='';
3893   LazGetLanguageIDs(Lang, FallbackLang);
3894 
3895   // Simply making sure its length is at most 2 should be enough for most languages
3896   if Length(Lang) > 2 then Lang := Lang[1] + Lang[2];
3897 end;
3898 
3899 procedure ReplaceSubstring(var s: string; StartPos, Count: SizeInt;
3900   const Insertion: string);
3901 var
3902   MaxCount: SizeInt;
3903   InsertionLen: SizeInt;
3904   SLen: SizeInt;
3905   RestLen: SizeInt;
3906   p: PByte;
3907 begin
3908   SLen:=length(s);
3909   if StartPos>SLen then begin
3910     s:=s+Insertion;
3911     exit;
3912   end;
3913   if StartPos<1 then StartPos:=1;
3914   if Count<0 then Count:=0;
3915   MaxCount:=SLen-StartPos+1;
3916   if Count>MaxCount then
3917     Count:=MaxCount;
3918   InsertionLen:=length(Insertion);
3919   if (Count=0) and (InsertionLen=0) then
3920     exit; // nothing to do
3921   if (Count=InsertionLen) then begin
3922     if CompareMem(PByte(s)+StartPos-1,Pointer(Insertion),Count) then
3923       // already the same content
3924       exit;
3925     UniqueString(s);
3926   end else begin
3927     RestLen:=SLen-StartPos-Count+1;
3928     if InsertionLen<Count then begin
3929       // shorten
3930       if RestLen>0 then begin
3931         UniqueString(s);
3932         p:=PByte(s)+StartPos-1;
3933         System.Move((p+Count)^,(p+InsertionLen)^,RestLen);
3934       end;
3935       Setlength(s,SLen-Count+InsertionLen);
3936     end else begin
3937       // longen
3938       Setlength(s,SLen-Count+InsertionLen);
3939       if RestLen>0 then begin
3940         p:=PByte(s)+StartPos-1;
3941         System.Move((p+Count)^,(p+InsertionLen)^,RestLen);
3942       end;
3943     end;
3944   end;
3945   if InsertionLen>0 then
3946     System.Move(PByte(Insertion)^,(PByte(s)+StartPos-1)^,InsertionLen);
3947 end;
3948 
3949 procedure InitFPUpchars;
3950 var
3951   c: Char;
3952 begin
3953   for c:=Low(char) to High(char) do begin
3954     FPUpChars[c]:=upcase(c);
3955   end;
3956 end;
3957 
3958 initialization
3959   InitFPUpchars;
3960   InitLazUtf8;
3961 finalization
3962   FinalizeLazUTF8;
3963 
3964 end.
3965 
3966