1 {
2 /***************************************************************************
3 lazutf8.pas
4 ***************************************************************************/
5
6 *****************************************************************************
7 This file is part of LazUtils
8
9 See the file COPYING.modifiedLGPL.txt, included in this distribution,
10 for details about the license.
11 *****************************************************************************
12
13 Useful routines for managing UTF-8 strings
14
15 - all functions are thread safe unless explicitely stated
16 }
17 unit LazUTF8;
18
19 {$mode objfpc}{$H+}{$inline on}
20
21 {$i lazutils_defines.inc}
22
23 interface
24
25 uses
26 {$ifdef unix}
27 // WideCompare* functions on Unix requires this. Must be used although it pulls in clib.
28 cwstring,
29 {$endif}
30 {$IFDEF UTF8_RTL}
31 FPCAdds,
32 {$ENDIF}
33 {$ifdef windows}
34 Windows,
35 {$endif}
36 Classes, SysUtils, strutils;
37
38 // AnsiToUTF8 and UTF8ToAnsi need a widestring manager under Linux, BSD, MacOSX
39 // but normally these OS use UTF-8 as system encoding so the widestringmanager
40 // is not needed.
NeedRTLAnsinull41 function NeedRTLAnsi: boolean;// true if system encoding is not UTF-8
42 procedure SetNeedRTLAnsi(NewValue: boolean);
43
44 // UTF8ToSys works like UTF8ToAnsi but more independent of widestringmanager
UTF8ToSysnull45 function UTF8ToSys(const s: string): string; overload; {$IFDEF UTF8_RTL}inline;{$ENDIF}
UTF8ToSysnull46 function UTF8ToSys(const AFormatSettings: TFormatSettings): TFormatSettings; overload; {$IFDEF UTF8_RTL}inline;{$ENDIF}
47
48 // SysToUTF8 works like AnsiToUTF8 but more independent of widestringmanager
SysToUTF8null49 function SysToUTF8(const s: string): string; overload; {$IFDEF UTF8_RTL}inline;{$ENDIF}
SysToUTF8null50 function SysToUTF8(const AFormatSettings: TFormatSettings): TFormatSettings; overload;
51
52 // converts OEM encoded string to UTF8 (used with some Windows specific functions)
ConsoleToUTF8null53 function ConsoleToUTF8(const s: string): string; {$IFDEF UTF8_RTL}inline;{$ENDIF}
54 // converts UTF8 string to console encoding (used by Write, WriteLn)
UTF8ToConsolenull55 function UTF8ToConsole(const s: string): string; {$IFDEF UTF8_RTL}inline;{$ENDIF}
56
57 // for all Windows supporting 8bit codepages (e.g. not WinCE)
58 // converts string in Windows code page to UTF8 (used with some Windows specific functions)
WinCPToUTF8null59 function WinCPToUTF8(const s: string): string; {$ifdef WinCe}inline;{$endif}
60 // converts UTF8 string to Windows code page encoding (used by Write, WriteLn)
UTF8ToWinCPnull61 function UTF8ToWinCP(const s: string): string; {$ifdef WinCe}inline;{$endif}
62
ParamStrUTF8null63 function ParamStrUTF8(Param: Integer): string;
64
65 {$ifdef windows}
66 procedure GetFormatSettingsUTF8;
67 procedure GetLocaleFormatSettingsUTF8(LCID: Integer; var aFormatSettings: TFormatSettings);
68 {$endif}
69
GetEnvironmentVariableCountUTF8null70 Function GetEnvironmentVariableCountUTF8: Integer;
GetEnvironmentStringUTF8null71 function GetEnvironmentStringUTF8(Index: Integer): string;
GetEnvironmentVariableUTF8null72 function GetEnvironmentVariableUTF8(const EnvVar: string): String;
SysErrorMessageUTF8null73 function SysErrorMessageUTF8(ErrorCode: Integer): String;
74
75 // Returns the size of one codepoint in bytes.
UTF8CodepointSizenull76 function UTF8CodepointSize(p: PChar): integer; inline;
UTF8CharacterLengthnull77 function UTF8CharacterLength(p: PChar): integer; deprecated 'Use UTF8CodepointSize instead.';
78 // Fast version of UTF8CodepointSize. Assumes the UTF-8 codepoint is valid.
UTF8CodepointSizeFastnull79 function UTF8CodepointSizeFast(p: PChar): integer; inline;
80
UTF8Lengthnull81 function UTF8Length(const s: string): PtrInt; inline;
UTF8Lengthnull82 function UTF8Length(p: PChar; ByteCount: PtrInt): PtrInt;
83 // Fast versions of UTF8Length. They assume the UTF-8 data is valid.
UTF8LengthFastnull84 function UTF8LengthFast(const s: string): PtrInt; inline;
UTF8LengthFastnull85 function UTF8LengthFast(p: PChar; ByteCount: PtrInt): PtrInt;
86
87 // Functions dealing with unicode number U+xxx.
UTF8CodepointToUnicodenull88 function UTF8CodepointToUnicode(p: PChar; out CodepointLen: integer): Cardinal;
UTF8CharacterToUnicodenull89 function UTF8CharacterToUnicode(p: PChar; out CharLen: integer): Cardinal; deprecated 'Use UTF8CodepointToUnicode instead.';
UnicodeToUTF8null90 function UnicodeToUTF8(CodePoint: cardinal): string; // UTF32 to UTF8
UnicodeToUTF8null91 function UnicodeToUTF8(CodePoint: cardinal; Buf: PChar): integer; // UTF32 to UTF8
UnicodeToUTF8SkipErrorsnull92 function UnicodeToUTF8SkipErrors(CodePoint: cardinal; Buf: PChar): integer; // UTF32 to UTF8
UnicodeToUTF8Inlinenull93 function UnicodeToUTF8Inline(CodePoint: cardinal; Buf: PChar): integer; inline; // UTF32 to UTF8
UTF8ToDoubleByteStringnull94 function UTF8ToDoubleByteString(const s: string): string;
UTF8ToDoubleBytenull95 function UTF8ToDoubleByte(UTF8Str: PChar; Len: PtrInt; DBStr: PByte): PtrInt;
UTF8FindNearestCharStartnull96 function UTF8FindNearestCharStart(UTF8Str: PChar; Len: SizeInt;
97 BytePos: SizeInt): SizeInt;
Utf8TryFindCodepointStartnull98 function Utf8TryFindCodepointStart(AString: PChar; var CurPos: PChar; out CodepointLen: Integer): Boolean;
Utf8TryFindCodepointStartnull99 function Utf8TryFindCodepointStart(const AString: String; var Index: Integer; out CharLen: Integer): Boolean;
100 // find the n-th UTF8 codepoint, ignoring BIDI
UTF8CodepointStartnull101 function UTF8CodepointStart(UTF8Str: PChar; Len, CodepointIndex: PtrInt): PChar;
UTF8CharStartnull102 function UTF8CharStart(UTF8Str: PChar; Len, CharIndex: PtrInt): PChar; deprecated 'Use UTF8CodepointStart instead.';
103 // find the byte index of the n-th UTF8 codepoint, ignoring BIDI (byte len of substr)
UTF8CodepointToByteIndexnull104 function UTF8CodepointToByteIndex(UTF8Str: PChar; Len, CodepointIndex: PtrInt): PtrInt;
UTF8CharToByteIndexnull105 function UTF8CharToByteIndex(UTF8Str: PChar; Len, CharIndex: PtrInt): PtrInt; deprecated 'Use UTF8CodepointToByteIndex instead.';
106 procedure UTF8FixBroken(P: PChar); overload;
107 procedure UTF8FixBroken(var S: string); overload;
UTF8CodepointStrictSizenull108 function UTF8CodepointStrictSize(P: PChar): integer;
UTF8CharacterStrictLengthnull109 function UTF8CharacterStrictLength(P: PChar): integer; deprecated 'Use UTF8CodepointStrictSize instead.';
UTF8CStringToUTF8Stringnull110 function UTF8CStringToUTF8String(SourceStart: PChar; SourceLen: PtrInt) : string;
111
UTF8Posnull112 function UTF8Pos(const SearchForText, SearchInText: string; StartPos: SizeInt = 1): PtrInt;
UTF8PosPnull113 function UTF8PosP(SearchForText: PChar; SearchForTextLen: SizeInt;
114 SearchInText: PChar; SearchInTextLen: SizeInt): PChar;
UTF8Copynull115 function UTF8Copy(const s: string; StartCharIndex, CharCount: PtrInt): string;
116 {$IFnDEF NO_CP_RTL}
117 procedure UTF8Delete(var s: Utf8String; StartCharIndex, CharCount: PtrInt);
118 {$ENDIF}
119 procedure UTF8Delete(var s: String; StartCharIndex, CharCount: PtrInt);
120 {$IFnDEF NO_CP_RTL}
121 procedure UTF8Insert(const source: Utf8String; var s: Utf8String; StartCharIndex: PtrInt);
122 {$ENDIF}
123 procedure UTF8Insert(const source: String; var s: String; StartCharIndex: PtrInt);
UTF8StringReplacenull124 function UTF8StringReplace(const S, OldPattern, NewPattern: String;
125 Flags: TReplaceFlags; ALanguage: string=''): String;
126
UTF8LowerCasenull127 function UTF8LowerCase(const AInStr: string; ALanguage: string=''): string;
UTF8LowerStringnull128 function UTF8LowerString(const s: string): string;
UTF8UpperCasenull129 function UTF8UpperCase(const AInStr: string; ALanguage: string=''): string;
UTF8UpperStringnull130 function UTF8UpperString(const s: string): string;
UTF8SwapCasenull131 function UTF8SwapCase(const AInStr: string; ALanguage: string=''): string;
132 // Capitalize the first letters of every word
UTF8ProperCasenull133 function UTF8ProperCase(const AInStr: string; const WordDelims: TSysCharSet): string;
FindInvalidUTF8Codepointnull134 function FindInvalidUTF8Codepoint(p: PChar; Count: PtrInt; StopOnNonUTF8: Boolean = true): PtrInt;
FindInvalidUTF8Characternull135 function FindInvalidUTF8Character(p: PChar; Count: PtrInt; StopOnNonUTF8: Boolean = true): PtrInt; deprecated 'Use FindInvalidUTF8Codepoint instead.';
UTF8StringOfCharnull136 function UTF8StringOfChar(AUtf8Char: String; N: Integer): String;
UTF8AddCharnull137 function UTF8AddChar(AUtf8Char: String; const S: String; N: Integer): String;
UTF8AddCharRnull138 function UTF8AddCharR(AUtf8Char: String; const S: String; N: Integer): String;
UTF8PadLeftnull139 function UTF8PadLeft(const S: String; const N: Integer; const AUtf8Char: String = #32): String;
UTF8PadRightnull140 function UTF8PadRight(const S: String; const N: Integer; const AUtf8Char: String = #32): String;
UTF8PadCenternull141 function UTF8PadCenter(const S: String; const N: Integer; const AUtf8Char: String = #32): String;
UTF8LeftStrnull142 function UTF8LeftStr(const AText: String; const ACount: Integer): String;
UTF8RightStrnull143 function UTF8RightStr(const AText: String; const ACount: Integer): String;
UTF8QuotedStrnull144 function UTF8QuotedStr(const S, Quote: string): string;
145 //Utf8 version of MidStr is just Utf8Copy with same parameters, so it is not implemented here
UTF8StartsTextnull146 function UTF8StartsText(const ASubText, AText: string): Boolean;
UTF8EndsTextnull147 function UTF8EndsText(const ASubText, AText: string): Boolean;
UTF8ReverseStringnull148 function UTF8ReverseString(p: PChar; const ByteCount: LongInt): string;
UTF8ReverseStringnull149 function UTF8ReverseString(const AText: string): string; inline;
UTF8RPosnull150 function UTF8RPos(const Substr, Source: string): PtrInt;
151
UTF8WrapTextnull152 function UTF8WrapText(S, BreakStr: string; BreakChars: TSysCharSet; MaxCol: integer): string; overload;
UTF8WrapTextnull153 function UTF8WrapText(S: string; MaxCol: integer): string; overload;
154
155 type
156 TEscapeMode = (emPascal, emHexPascal, emHexC, emC, emAsciiControlNames);
157
Utf8EscapeControlCharsnull158 function Utf8EscapeControlChars(S: String; EscapeMode: TEscapeMode = emPascal): String;
159
160 type
161 TUTF8TrimFlag = (
162 u8tKeepStart,
163 u8tKeepEnd,
164 u8tKeepTabs,
165 u8tKeepLineBreaks,
166 u8tKeepNoBreakSpaces,
167 u8tKeepControlCodes // excluding tabs and line breaks
168 );
169 TUTF8TrimFlags = set of TUTF8TrimFlag;
UTF8Trimnull170 function UTF8Trim(const s: string; Flags: TUTF8TrimFlags = []): string;
171
172 //compare functions
173
UTF8CompareStrnull174 function UTF8CompareStr(const S1, S2: string): PtrInt; inline;
UTF8CompareStrPnull175 function UTF8CompareStrP(S1, S2: PChar): PtrInt;
UTF8CompareStrnull176 function UTF8CompareStr(S1: PChar; Count1: SizeInt; S2: PChar; Count2: SizeInt): PtrInt;
UTF8CompareTextnull177 function UTF8CompareText(const S1, S2: string): PtrInt;
UTF8CompareStrCollatednull178 function UTF8CompareStrCollated(const S1, S2: string): PtrInt; {$IFnDEF ACP_RTL}inline;{$endif}
CompareStrListUTF8LowerCasenull179 function CompareStrListUTF8LowerCase(List: TStringList; Index1, Index2: Integer): Integer;
180
181 type
182 TConvertResult = (trNoError, trNullSrc, trNullDest, trDestExhausted,
183 trInvalidChar, trUnfinishedChar);
184
185 TConvertOption = (toInvalidCharError, toInvalidCharToSymbol,
186 toUnfinishedCharError, toUnfinishedCharToSymbol);
187 TConvertOptions = set of TConvertOption;
188
ConvertUTF8ToUTF16null189 function ConvertUTF8ToUTF16(Dest: PWideChar; DestWideCharCount: SizeUInt;
190 Src: PChar; SrcCharCount: SizeUInt; Options: TConvertOptions;
191 out ActualWideCharCount: SizeUInt): TConvertResult;
192
ConvertUTF16ToUTF8null193 function ConvertUTF16ToUTF8(Dest: PChar; DestCharCount: SizeUInt;
194 Src: PWideChar; SrcWideCharCount: SizeUInt; Options: TConvertOptions;
195 out ActualCharCount: SizeUInt): TConvertResult;
196
UTF8ToUTF16null197 function UTF8ToUTF16(const S: AnsiString): UnicodeString; overload;
UTF8ToUTF16null198 function UTF8ToUTF16(const P: PChar; ByteCnt: SizeUInt): UnicodeString; overload;
UTF16ToUTF8null199 function UTF16ToUTF8(const S: UnicodeString): AnsiString; overload;
UTF16ToUTF8null200 function UTF16ToUTF8(const P: PWideChar; WideCnt: SizeUInt): AnsiString; overload;
201
202 // locale
203 procedure LazGetLanguageIDs(var Lang, FallbackLang: String);
204 procedure LazGetShortLanguageID(var Lang: String);
205
206 var
207 FPUpChars: array[char] of char;
208
209 procedure ReplaceSubstring(var s: string; StartPos, Count: SizeInt;
210 const Insertion: string);
211
212 implementation
213
214 uses
215 gettext
216 {$IFDEF Darwin}, MacOSAll{$ENDIF}
217 ;
218
219 {$IFDEF WinCE}
220 // CP_UTF8 is missing in the windows unit of the Windows CE RTL
221 const
222 CP_UTF8 = 65001;
223 {$ENDIF}
224
IsASCIInull225 function IsASCII(const s: string): boolean; inline;
226 var
227 i: Integer;
228 begin
229 for i:=1 to length(s) do if ord(s[i])>127 then exit(false);
230 Result:=true;
231 end;
232
233 {$IFDEF windows}
234 {$i winlazutf8.inc}
235 {$ELSE}
236 {$IFDEF HASAMIGA}
237 {$i unixlazutf8.inc} // Reuse UNIX code for Amiga
238 {$ELSE}
239 {$i unixlazutf8.inc}
240 {$ENDIF}
241 {$ENDIF}
242
243 var
244 FNeedRTLAnsi: boolean = false;
245 FNeedRTLAnsiValid: boolean = false;
246
NeedRTLAnsinull247 function NeedRTLAnsi: boolean;
248 {$IFNDEF Windows}
249 var
250 Lang: String;
251 i: LongInt;
252 Encoding: String;
253 {$ENDIF}
254 begin
255 if FNeedRTLAnsiValid then
256 exit(FNeedRTLAnsi);
257 {$IFDEF Windows}
258 {$IF FPC_FULLVERSION>=20701}
259 FNeedRTLAnsi:=DefaultSystemCodePage<>CP_UTF8;
260 {$ELSE}
261 FNeedRTLAnsi:=GetACP<>CP_UTF8;
262 {$ENDIF}
263 {$ELSE}
264 FNeedRTLAnsi:=false;
265 Lang := SysUtils.GetEnvironmentVariable('LC_ALL');
266 if lang = '' then
267 begin
268 Lang := SysUtils.GetEnvironmentVariable('LC_MESSAGES');
269 if Lang = '' then
270 begin
271 Lang := SysUtils.GetEnvironmentVariable('LANG');
272 end;
273 end;
274 i:=System.Pos('.',Lang);
275 if (i>0) then begin
276 Encoding:=copy(Lang,i+1,length(Lang)-i);
277 FNeedRTLAnsi:=(SysUtils.CompareText(Encoding,'UTF-8')<>0)
278 and (SysUtils.CompareText(Encoding,'UTF8')<>0);
279 end;
280 {$ENDIF}
281 FNeedRTLAnsiValid:=true;
282 Result:=FNeedRTLAnsi;
283 end;
284
285 procedure SetNeedRTLAnsi(NewValue: boolean);
286 begin
287 FNeedRTLAnsi:=NewValue;
288 FNeedRTLAnsiValid:=true;
289 end;
290
UTF8ToSysnull291 function UTF8ToSys(const s: string): string;
292 begin
293 {$IFDEF UTF8_RTL}
294 Result:=s;
295 {$ELSE}
296 if NeedRTLAnsi and (not IsASCII(s)) then
297 Result:=UTF8ToAnsi(s)
298 else
299 Result:=s;
300 {$ENDIF}
301 end;
302
SysToUTF8null303 function SysToUTF8(const s: string): string;
304 begin
305 {$IFDEF UTF8_RTL}
306 Result:=s;
307 {$ELSE}
308 if NeedRTLAnsi and (not IsASCII(s)) then
309 begin
310 Result:=AnsiToUTF8(s);
311 {$ifdef FPC_HAS_CPSTRING}
312 // prevent UTF8 codepage appear in the strings - we don't need codepage
313 // conversion magic in LCL code
314 SetCodePage(RawByteString(Result), StringCodePage(s), False);
315 {$endif}
316 end
317 else
318 Result:=s;
319 {$ENDIF}
320 end;
321
322 function SysToUTF8(const AFormatSettings: TFormatSettings): TFormatSettings;
323 {$IFNDEF UTF8_RTL}
324 var
325 i: Integer;
326 {$ENDIF}
327 begin
328 Result := AFormatSettings;
329 {$IFNDEF UTF8_RTL}
330 Result.CurrencyString := SysToUTF8(AFormatSettings.CurrencyString);
331 for i:=1 to 12 do begin
332 Result.LongMonthNames[i] := SysToUTF8(AFormatSettings.LongMonthNames[i]);
333 Result.ShortMonthNames[i] := SysToUTF8(AFormatSettings.ShortMonthNames[i]);
334 end;
335 for i:=1 to 7 do begin
336 Result.LongDayNames[i] := SysToUTF8(AFormatSettings.LongDayNames[i]);
337 Result.ShortDayNames[i] := SysToUTF8(AFormatSettings.ShortDayNames[i]);
338 end;
339 {$ENDIF}
340 end;
341
342 function UTF8ToSys(const AFormatSettings: TFormatSettings): TFormatSettings;
343 {$IFnDEF UTF8_RTL}
344 var
345 i: Integer;
346 {$ENDIF}
347 begin
348 Result := AFormatSettings;
349 {$IFnDEF UTF8_RTL}
350 Result.CurrencyString := UTF8ToSys(AFormatSettings.CurrencyString);
351 for i:=1 to 12 do begin
352 Result.LongMonthNames[i] := UTF8ToSys(AFormatSettings.LongMonthNames[i]);
353 Result.ShortMonthNames[i] := UTF8ToSys(AFormatSettings.ShortMonthNames[i]);
354 end;
355 for i:=1 to 7 do begin
356 Result.LongDayNames[i] := UTF8ToSys(AFormatSettings.LongDayNames[i]);
357 Result.ShortDayNames[i] := UTF8ToSys(AFormatSettings.ShortDayNames[i]);
358 end;
359 {$ENDIF}
360 end;
361
362 function GetEnvironmentVariableCountUTF8: Integer;
363 begin
364 {$IF defined(FPC_RTL_UNICODE) or not defined(MSWindows)} //also WinCE, issue #0031788
365 Result:=SysUtils.GetEnvironmentVariableCount;
366 {$ELSE}
367 Result:=GetGetEnvironmentVariableCountWide;
368 {$ENDIF}
369 end;
370
371 function GetEnvironmentStringUTF8(Index: Integer): string;
372 begin
373 {$IFDEF FPC_RTL_UNICODE}
374 Result:=UTF16ToUTF8(SysUtils.GetEnvironmentString(Index));
375 {$ELSE}
376 {$IFDEF MSWindows} //not for WinCE, issue #0031788
377 Result:=UTF16ToUTF8(GetEnvironmentStringWide(Index));
378 {$ELSE}
379 // by default the environment is in console encoding
380 // see also RTL issue: http://bugs.freepascal.org/view.php?id=15233
381 Result:=ConsoleToUTF8(SysUtils.GetEnvironmentString(Index));
382 {$ENDIF}
383 {$ENDIF}
384 end;
385
386 function GetEnvironmentVariableUTF8(const EnvVar: string): String;
387 begin
388 {$IFDEF FPC_RTL_UNICODE}
389 Result:=UTF16ToUTF8(SysUtils.GetEnvironmentVariable(UTF8ToUTF16(EnvVar)));
390 {$ELSE}
391 {$IFDEF Windows}
392 Result:=UTF16ToUTF8(GetEnvironmentVariableWide(EnvVar));
393 {$ELSE}
394 // by default the environment is in console encoding
395 // RTL issue: http://bugs.freepascal.org/view.php?id=15233
396 Result:=ConsoleToUTF8(SysUtils.GetEnvironmentVariable(UTF8ToSys(EnvVar)));
397 {$ENDIF}
398 {$ENDIF}
399 end;
400
401 function SysErrorMessageUTF8(ErrorCode: Integer): String;
402 begin
403 Result := SysToUTF8(SysUtils.SysErrorMessage(ErrorCode));
404 end;
405
406 function UTF8CodepointSizeFull(p: PChar): integer;
407 begin
408 case p^ of
409 #0..#191: // %11000000
410 // regular single byte character (#0 is a character, this is Pascal ;)
411 Result:=1;
412 #192..#223: // p^ and %11100000 = %11000000
413 begin
414 // could be 2 byte character
415 if (ord(p[1]) and %11000000) = %10000000 then
416 Result:=2
417 else
418 Result:=1;
419 end;
420 #224..#239: // p^ and %11110000 = %11100000
421 begin
422 // could be 3 byte character
423 if ((ord(p[1]) and %11000000) = %10000000)
424 and ((ord(p[2]) and %11000000) = %10000000) then
425 Result:=3
426 else
427 Result:=1;
428 end;
429 #240..#247: // p^ and %11111000 = %11110000
430 begin
431 // could be 4 byte character
432 if ((ord(p[1]) and %11000000) = %10000000)
433 and ((ord(p[2]) and %11000000) = %10000000)
434 and ((ord(p[3]) and %11000000) = %10000000) then
435 Result:=4
436 else
437 Result:=1;
438 end;
439 else
440 Result:=1;
441 end;
442 end;
443
444 function UTF8CodepointSize(p: PChar): integer; inline;
445 begin
446 if p=nil then exit(0);
447 if p^<#192 then exit(1);
448 Result:=UTF8CodepointSizeFull(p);
449 end;
450
451 function UTF8CharacterLength(p: PChar): integer;
452 begin
453 Result := UTF8CodepointSize(p);
454 end;
455
456 function UTF8CodepointSizeFast(p: PChar): integer;
457 begin
458 case p^ of
459 #0..#191 : Result := 1;
460 #192..#223 : Result := 2;
461 #224..#239 : Result := 3;
462 #240..#247 : Result := 4;
463 //#248..#255 : Result := 1;
464 // Theoretically UTF-8 supports length 1-7, but since 2003, RFC 3629 limits
465 // it to 1-4 bytes.
466 // This is an inline function, so keep the function short.
467 //#248..#251 : Result := 5;
468 //#252, #253 : Result := 6;
469 //#254 : Result := 7;
470
471 else Result := 1; // An optimization + prevents compiler warning about uninitialized Result.
472 end;
473 end;
474
475 function UTF8Length(const s: string): PtrInt;
476 begin
477 Result:=UTF8Length(PChar(s),length(s));
478 end;
479
480 function UTF8Length(p: PChar; ByteCount: PtrInt): PtrInt;
481 var
482 CharLen: LongInt;
483 begin
484 Result:=0;
485 while (ByteCount>0) do begin
486 inc(Result);
487 CharLen:=UTF8CodepointSize(p);
488 inc(p,CharLen);
489 dec(ByteCount,CharLen);
490 end;
491 end;
492
493 function UTF8LengthFast(const s: string): PtrInt;
494 begin
495 Result := UTF8LengthFast(PChar(s), Length(s));
496 end;
497
498 // Ported from:
499 // http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html
500 // The code uses CPU's native data size. In a 64-bit CPU it means 8 bytes at once.
501 // The UTF-8 data is assumed to be valid.
UTF8LengthFastnull502 function UTF8LengthFast(p: PChar; ByteCount: PtrInt): PtrInt;
503 const
504 {$ifdef CPU32}
505 ONEMASK =$01010101;
506 EIGHTYMASK=$80808080;
507 {$endif}
508 {$ifdef CPU64}
509 ONEMASK =$0101010101010101;
510 EIGHTYMASK=$8080808080808080;
511 {$endif}
512 var
513 pnx: PPtrInt absolute p; // To get contents of text in PtrInt blocks. x refers to 32 or 64 bits
514 pn8: pint8 absolute pnx; // To read text as Int8 in the initial and final loops
515 ix: PtrInt absolute pnx; // To read text as PtrInt in the block loop
516 nx: PtrInt; // values processed in block loop
517 i,cnt,e: PtrInt;
518 begin
519 Result := 0;
520 e := ix+ByteCount; // End marker
521 // Handle any initial misaligned bytes.
522 cnt := (not (ix-1)) and (sizeof(PtrInt)-1);
523 if cnt>ByteCount then
524 cnt := ByteCount;
525 for i := 1 to cnt do
526 begin
527 // Is this byte NOT the first byte of a character?
528 Result += (pn8^ shr 7) and ((not pn8^) shr 6);
529 inc(pn8);
530 end;
531 // Handle complete blocks
532 for i := 1 to (ByteCount-cnt) div sizeof(PtrInt) do
533 begin
534 // Count bytes which are NOT the first byte of a character.
535 nx := ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
536 {$push}{$overflowchecks off} // "nx * ONEMASK" causes an arithmetic overflow.
537 Result += (nx * ONEMASK) >> ((sizeof(PtrInt) - 1) * 8);
538 {$pop}
539 inc(pnx);
540 end;
541 // Take care of any left-over bytes.
542 while ix<e do
543 begin
544 // Is this byte NOT the first byte of a character?
545 Result += (pn8^ shr 7) and ((not pn8^) shr 6);
546 inc(pn8);
547 end;
548 Result := ByteCount - Result;
549 end;
550
UTF8CodepointToUnicodenull551 function UTF8CodepointToUnicode(p: PChar; out CodepointLen: integer): Cardinal;
552 { if p=nil then CodepointLen=0 otherwise CodepointLen>0
553 If there is an encoding error the Result is 0 and CodepointLen=1.
554 Use UTF8FixBroken to fix UTF-8 encoding.
555 It does not check if the codepoint is defined in the Unicode tables.
556 }
557 begin
558 if p<>nil then begin
559 if ord(p^)<%11000000 then begin
560 // regular single byte character (#0 is a normal char, this is pascal ;)
561 Result:=ord(p^);
562 CodepointLen:=1;
563 end
564 else if ((ord(p^) and %11100000) = %11000000) then begin
565 // starts with %110 => could be double byte character
566 if (ord(p[1]) and %11000000) = %10000000 then begin
567 CodepointLen:=2;
568 Result:=((ord(p^) and %00011111) shl 6) or (ord(p[1]) and %00111111);
569 if Result<(1 shl 7) then begin
570 // wrong encoded, could be an XSS attack
571 Result:=0;
572 end;
573 end else begin
574 Result:=ord(p^);
575 CodepointLen:=1;
576 end;
577 end
578 else if ((ord(p^) and %11110000) = %11100000) then begin
579 // starts with %1110 => could be triple byte character
580 if ((ord(p[1]) and %11000000) = %10000000)
581 and ((ord(p[2]) and %11000000) = %10000000) then begin
582 CodepointLen:=3;
583 Result:=((ord(p^) and %00011111) shl 12)
584 or ((ord(p[1]) and %00111111) shl 6)
585 or (ord(p[2]) and %00111111);
586 if Result<(1 shl 11) then begin
587 // wrong encoded, could be an XSS attack
588 Result:=0;
589 end;
590 end else begin
591 Result:=ord(p^);
592 CodepointLen:=1;
593 end;
594 end
595 else if ((ord(p^) and %11111000) = %11110000) then begin
596 // starts with %11110 => could be 4 byte character
597 if ((ord(p[1]) and %11000000) = %10000000)
598 and ((ord(p[2]) and %11000000) = %10000000)
599 and ((ord(p[3]) and %11000000) = %10000000) then begin
600 CodepointLen:=4;
601 Result:=((ord(p^) and %00001111) shl 18)
602 or ((ord(p[1]) and %00111111) shl 12)
603 or ((ord(p[2]) and %00111111) shl 6)
604 or (ord(p[3]) and %00111111);
605 if Result<(1 shl 16) then begin
606 // wrong encoded, could be an XSS attack
607 Result:=0;
608 end;
609 end else begin
610 Result:=ord(p^);
611 CodepointLen:=1;
612 end;
613 end
614 else begin
615 // invalid character
616 Result:=ord(p^);
617 CodepointLen:=1;
618 end;
619 end else begin
620 Result:=0;
621 CodepointLen:=0;
622 end;
623 end;
624
UTF8CharacterToUnicodenull625 function UTF8CharacterToUnicode(p: PChar; out CharLen: integer): Cardinal;
626 begin
627 Result := UTF8CodepointToUnicode(p, CharLen);
628 end;
629
UnicodeToUTF8null630 function UnicodeToUTF8(CodePoint: cardinal; Buf: PChar): integer;
631
632 procedure RaiseInvalidUnicode;
633 begin
634 raise Exception.Create('UnicodeToUTF8: invalid unicode: '+IntToStr(CodePoint));
635 end;
636
637 begin
638 Result:=UnicodeToUTF8Inline(CodePoint,Buf);
639 if Result=0 then
640 RaiseInvalidUnicode;
641 end;
642
UnicodeToUTF8SkipErrorsnull643 function UnicodeToUTF8SkipErrors(CodePoint: cardinal; Buf: PChar): integer;
644 begin
645 Result:=UnicodeToUTF8Inline(CodePoint,Buf);
646 end;
647
UnicodeToUTF8null648 function UnicodeToUTF8(CodePoint: cardinal): string;
649 var
650 Buf: array[0..6] of Char;
651 Len: Integer;
652 begin
653 if (CodePoint = 0) then
654 Result := #0 //StrPas does not like #0
655 else
656 begin
657 Len:=UnicodeToUTF8Inline(CodePoint, @Buf[0]);
658 Buf[Len]:=#0;
659 Result := StrPas(@Buf[0]);
660 end;
661 end;
662
UnicodeToUTF8Inlinenull663 function UnicodeToUTF8Inline(CodePoint: cardinal; Buf: PChar): integer;
664 begin
665 case CodePoint of
666 0..$7f:
667 begin
668 Result:=1;
669 Buf[0]:=char(byte(CodePoint));
670 end;
671 $80..$7ff:
672 begin
673 Result:=2;
674 Buf[0]:=char(byte($c0 or (CodePoint shr 6)));
675 Buf[1]:=char(byte($80 or (CodePoint and $3f)));
676 end;
677 $800..$ffff:
678 begin
679 Result:=3;
680 Buf[0]:=char(byte($e0 or (CodePoint shr 12)));
681 Buf[1]:=char(byte((CodePoint shr 6) and $3f) or $80);
682 Buf[2]:=char(byte(CodePoint and $3f) or $80);
683 end;
684 $10000..$10ffff:
685 begin
686 Result:=4;
687 Buf[0]:=char(byte($f0 or (CodePoint shr 18)));
688 Buf[1]:=char(byte((CodePoint shr 12) and $3f) or $80);
689 Buf[2]:=char(byte((CodePoint shr 6) and $3f) or $80);
690 Buf[3]:=char(byte(CodePoint and $3f) or $80);
691 end;
692 else
693 Result:=0;
694 end;
695 end;
696
UTF8ToDoubleByteStringnull697 function UTF8ToDoubleByteString(const s: string): string;
698 var
699 Len: Integer;
700 begin
701 Len:=UTF8Length(s);
702 SetLength(Result,Len*2);
703 if Len=0 then exit;
704 UTF8ToDoubleByte(PChar(s),length(s),PByte(Result));
705 end;
706
707 { returns number of double bytes }
UTF8ToDoubleBytenull708 function UTF8ToDoubleByte(UTF8Str: PChar; Len: PtrInt; DBStr: PByte): PtrInt;
709 var
710 SrcPos: PChar;
711 CharLen: LongInt;
712 DestPos: PByte;
713 u: Cardinal;
714 begin
715 SrcPos:=UTF8Str;
716 DestPos:=DBStr;
717 Result:=0;
718 while Len>0 do begin
719 u:=UTF8CodepointToUnicode(SrcPos,CharLen);
720 DestPos^:=byte((u shr 8) and $ff);
721 inc(DestPos);
722 DestPos^:=byte(u and $ff);
723 inc(DestPos);
724 inc(SrcPos,CharLen);
725 dec(Len,CharLen);
726 inc(Result);
727 end;
728 end;
729
730
731 { Tries to find the start of a valid UTF8 codepoint that contains the character pointed to by CurPos
732 - AString: pointer to the (start of the) string
733 - CurPos: pointer to the character inside AString that we want to get the information off
734 * if the function succeeds, CurPos wil point to the start of the valid UTF8 codepoint
735 * if the function fails, CurPos will not be changed
736 Note: if CurPos points beyond the end of AString you will get a crash!
737 - CharLen: the length of the UTF8 codepoint in bytes, if the function succeeds
738 - Returns:
739 True if the character pointed to by Curpos is part of a valid UTF8 codepoint (1 to 4 bytes),
740 otherwise it returns False. }
Utf8TryFindCodepointStartnull741 function Utf8TryFindCodepointStart(AString: PChar; var CurPos: PChar; out CodepointLen: Integer): Boolean;
742 var
743 SavedPos: PChar;
744 begin
745 Result := False;
746 CodepointLen := 0;
747 if (not (Assigned(AString) and Assigned(CurPos)))
748 or (CurPos < AString) then Exit;
749 SavedPos := CurPos;
750 //Note: UTF8CodepointStrictSize will NOT "look" beyond the terminating #0 of a PChar, so this is safe with AnsiStrings
751 CodepointLen := UTF8CodepointStrictSize(CurPos);
752 if (CodepointLen > 0) then Exit(True);
753 if (CurPos > AString) then
754 begin
755 Dec(CurPos); //-1
756 //is it second byte of 2..4 byte codepoint?
757 CodepointLen := UTF8CodepointStrictSize(CurPos);
758 if (CodepointLen > 1) then Exit(True);
759 if (CurPos > AString) then
760 begin
761 Dec(CurPos); //-2
762 //is it third byte of 3..4 byte codepoint?
763 CodepointLen := UTF8CodepointStrictSize(CurPos);
764 if (CodepointLen > 2) then Exit(True);
765 if (CurPos > AString) then
766 begin
767 Dec(CurPos); //-3
768 //is it fouth byte of 4 byte codepoint?
769 CodepointLen := UTF8CodepointStrictSize(CurPos);
770 if (CodepointLen = 4) then Exit(True);
771 end;
772 end;
773 end;
774 //At this point we failed: we are NOT inside a valid UTF8 codepoint!
775 CurPos := SavedPos;
776 end;
777
Utf8TryFindCodepointStartnull778 function Utf8TryFindCodepointStart(const AString: String; var Index: Integer; out CharLen: Integer): Boolean;
779 var
780 CurPos, SavedCurPos: PChar;
781 begin
782 CurPos := @AString[Index];
783 SavedCurPos := CurPos;
784 Result := Utf8TryFindCodepointStart(PChar(AString), CurPos, CharLen);
785 Index := Index - (SavedCurPos - CurPos);
786 end;
787
788 { Find the start of the UTF8 character which contains BytePos,
789 if BytePos is not part of a valid Utf8 Codepoint the function returns BytePos
790 Len is length in byte, BytePos starts at 0 }
UTF8FindNearestCharStartnull791 function UTF8FindNearestCharStart(UTF8Str: PChar; Len: SizeInt; BytePos: SizeInt): SizeInt;
792 var
793 CurPos: PChar;
794 CharLen: Integer;
795 begin
796 if (BytePos > Len-1) then BytePos := Len - 1;
797 CurPos := Utf8Str + BytePos;
798 //No need to check the result value, since when it retuns False CurPos will be reset
799 //to it's original value, and that's what we want to return in that case
800 Utf8TryFindCodepointStart(Utf8Str, CurPos, CharLen);
801 Result := CurPos - Utf8Str;
802 end;
803
804
805 { Len is the length in bytes of UTF8Str
806 CodepointIndex is the position of the desired codepoint (starting at 0), in chars
807 }
UTF8CodepointStartnull808 function UTF8CodepointStart(UTF8Str: PChar; Len, CodepointIndex: PtrInt): PChar;
809 var
810 CharLen: LongInt;
811 begin
812 Result:=UTF8Str;
813 if Result<>nil then begin
814 while (CodepointIndex>0) and (Len>0) do begin
815 CharLen:=UTF8CodepointSize(Result);
816 dec(Len,CharLen);
817 dec(CodepointIndex);
818 inc(Result,CharLen);
819 end;
820 if (CodepointIndex<>0) or (Len<0) then
821 Result:=nil;
822 end;
823 end;
824
UTF8CharStartnull825 function UTF8CharStart(UTF8Str: PChar; Len, CharIndex: PtrInt): PChar;
826 begin
827 Result := UTF8CodepointStart(UTF8Str, Len, CharIndex);
828 end;
829
UTF8CodepointToByteIndexnull830 function UTF8CodepointToByteIndex(UTF8Str: PChar; Len, CodepointIndex: PtrInt): PtrInt;
831 var
832 p: PChar;
833 begin
834 p := UTF8CodepointStart(UTF8Str, Len, CodepointIndex);
835 if p = nil
836 then Result := -1
837 else Result := p - UTF8Str;
838 end;
839
UTF8CharToByteIndexnull840 function UTF8CharToByteIndex(UTF8Str: PChar; Len, CharIndex: PtrInt): PtrInt;
841 begin
842 Result := UTF8CodepointToByteIndex(UTF8Str, Len, CharIndex);
843 end;
844
845 { fix any broken UTF8 sequences with spaces }
846 procedure UTF8FixBroken(P: PChar);
847 var
848 c: cardinal;
849 begin
850 if p=nil then exit;
851 while p^<>#0 do begin
852 if ord(p^)<%10000000 then begin
853 // regular single byte character
854 inc(p);
855 end
856 else if ord(p^)<%11000000 then begin
857 // invalid
858 p^:=' ';
859 inc(p);
860 end
861 else if ((ord(p^) and %11100000) = %11000000) then begin
862 // starts with %110 => should be 2 byte character
863 if ((ord(p[1]) and %11000000) = %10000000) then begin
864 c:=((ord(p^) and %00011111) shl 6);
865 //or (ord(p[1]) and %00111111);
866 if c<(1 shl 7) then
867 p^:=' ' // fix XSS attack
868 else
869 inc(p,2)
870 end
871 else if p[1]<>#0 then
872 p^:=' ';
873 end
874 else if ((ord(p^) and %11110000) = %11100000) then begin
875 // starts with %1110 => should be 3 byte character
876 if ((ord(p[1]) and %11000000) = %10000000)
877 and ((ord(p[2]) and %11000000) = %10000000) then begin
878 c:=((ord(p^) and %00011111) shl 12)
879 or ((ord(p[1]) and %00111111) shl 6);
880 //or (ord(p[2]) and %00111111);
881 if c<(1 shl 11) then
882 p^:=' ' // fix XSS attack
883 else
884 inc(p,3);
885 end else
886 p^:=' ';
887 end
888 else if ((ord(p^) and %11111000) = %11110000) then begin
889 // starts with %11110 => should be 4 byte character
890 if ((ord(p[1]) and %11000000) = %10000000)
891 and ((ord(p[2]) and %11000000) = %10000000)
892 and ((ord(p[3]) and %11000000) = %10000000) then begin
893 c:=((ord(p^) and %00001111) shl 18)
894 or ((ord(p[1]) and %00111111) shl 12)
895 or ((ord(p[2]) and %00111111) shl 6);
896 //or (ord(p[3]) and %00111111);
897 if c<(1 shl 16) then
898 p^:=' ' // fix XSS attack
899 else
900 inc(p,4)
901 end else
902 p^:=' ';
903 end
904 else begin
905 p^:=' ';
906 inc(p);
907 end;
908 end;
909 end;
910
911 procedure UTF8FixBroken(var S: string);
912 begin
913 if S='' then exit;
914 if FindInvalidUTF8Codepoint(PChar(S),length(S))<0 then exit;
915 UniqueString(S);
916 UTF8FixBroken(PChar(S));
917 end;
918
UTF8CodepointStrictSizenull919 function UTF8CodepointStrictSize(P: PChar): integer;
920 var
921 c: Char;
922 begin
923 if p=nil then exit(0);
924 c:=p^;
925 if ord(c)<%10000000 then begin
926 // regular single byte character
927 exit(1);
928 end
929 else if ord(c)<%11000000 then begin
930 // invalid single byte character
931 exit(0);
932 end
933 else if ((ord(c) and %11100000) = %11000000) then begin
934 // should be 2 byte character
935 if (ord(p[1]) and %11000000) = %10000000 then
936 exit(2)
937 else
938 exit(0);
939 end
940 else if ((ord(c) and %11110000) = %11100000) then begin
941 // should be 3 byte character
942 if ((ord(p[1]) and %11000000) = %10000000)
943 and ((ord(p[2]) and %11000000) = %10000000) then
944 exit(3)
945 else
946 exit(0);
947 end
948 else if ((ord(c) and %11111000) = %11110000) then begin
949 // should be 4 byte character
950 if ((ord(p[1]) and %11000000) = %10000000)
951 and ((ord(p[2]) and %11000000) = %10000000)
952 and ((ord(p[3]) and %11000000) = %10000000) then
953 exit(4)
954 else
955 exit(0);
956 end else
957 exit(0);
958 end;
959
UTF8CharacterStrictLengthnull960 function UTF8CharacterStrictLength(P: PChar): integer;
961 begin
962 Result := UTF8CodepointStrictSize(P);
963 end;
964
UTF8CStringToUTF8Stringnull965 function UTF8CStringToUTF8String(SourceStart: PChar; SourceLen: PtrInt) : string;
966 var
967 Source: PChar;
968 Dest: PChar;
969 SourceEnd: PChar;
970 SourceCopied: PChar;
971
972 // Copies from SourceStart till Source to Dest and updates Dest
973 procedure CopyPart; inline;
974 var
975 CopyLength: SizeInt;
976 begin
977 CopyLength := Source - SourceCopied;
978 if CopyLength=0 then exit;
979 System.move(SourceCopied^ , Dest^, CopyLength);
980 SourceCopied:=Source;
981 inc(Dest, CopyLength);
982 end;
983
984 begin
985 SetLength(Result, SourceLen);
986 if SourceLen=0 then exit;
987 SourceCopied:=SourceStart;
988 Source:=SourceStart;
989 Dest:=PChar(Result);
990 SourceEnd := Source + SourceLen;
991 while Source<SourceEnd do begin
992 if (Source^='\') then begin
993 CopyPart;
994 inc(Source);
995 if Source^ in ['t', 'n', '"', '\'] then begin
996 case Source^ of
997 't' : Dest^ := #9;
998 '"' : Dest^ := '"';
999 '\' : Dest^ := '\';
1000 'n' :
1001 // fpc 2.1.1 stores string constants as array of char so maybe this
1002 // will work for without ifdef (once available in 2.0.x too):
1003 // move(lineending, dest^, sizeof(LineEnding));
1004 {$IFDEF WINDOWS}
1005 begin
1006 move(lineending[1], dest^, length(LineEnding));
1007 inc(dest, length(LineEnding)-1);
1008 end;
1009 {$ELSE}
1010 Dest^ := LineEnding;
1011 {$ENDIF}
1012 end;
1013 inc(Source);
1014 inc(Dest);
1015 end;
1016 SourceCopied := Source;
1017 end
1018 else
1019 Inc(Source); // no need for checking for UTF8, the / is never part of an UTF8 multibyte codepoint
1020 end;
1021 CopyPart;
1022 SetLength(Result, Dest - PChar(Result));
1023 end;
1024
UTF8Posnull1025 function UTF8Pos(const SearchForText, SearchInText: string;
1026 StartPos: SizeInt = 1): PtrInt;
1027 // returns the character index, where the SearchForText starts in SearchInText
1028 // an optional StartPos can be given (in UTF-8 codepoints, not in byte)
1029 // returns 0 if not found
1030 var
1031 i: SizeInt;
1032 p: PChar;
1033 StartPosP: PChar;
1034 begin
1035 Result:=0;
1036 if StartPos=1 then
1037 begin
1038 i:=System.Pos(SearchForText,SearchInText);
1039 if i>0 then
1040 Result:=UTF8Length(PChar(SearchInText),i-1)+1;
1041 end
1042 else if StartPos>1 then
1043 begin
1044 // skip
1045 StartPosP:=UTF8CodepointStart(PChar(SearchInText),Length(SearchInText),StartPos-1);
1046 if StartPosP=nil then exit;
1047 // search
1048 p:=UTF8PosP(PChar(SearchForText),length(SearchForText),
1049 StartPosP,length(SearchInText)+PChar(SearchInText)-StartPosP);
1050 // get UTF-8 position
1051 if p=nil then exit;
1052 Result:=StartPos+UTF8Length(StartPosP,p-StartPosP);
1053 end;
1054 end;
1055
UTF8PosPnull1056 function UTF8PosP(SearchForText: PChar; SearchForTextLen: SizeInt;
1057 SearchInText: PChar; SearchInTextLen: SizeInt): PChar;
1058 // returns the position where SearchInText starts in SearchForText
1059 // returns nil if not found
1060 var
1061 p: SizeInt;
1062 begin
1063 Result:=nil;
1064 if (SearchForText=nil) or (SearchForTextLen=0) or (SearchInText=nil) then
1065 exit;
1066 while SearchInTextLen>0 do begin
1067 p:=IndexByte(SearchInText^,SearchInTextLen,PByte(SearchForText)^);
1068 if p<0 then exit;
1069 inc(SearchInText,p);
1070 dec(SearchInTextLen,p);
1071 if SearchInTextLen<SearchForTextLen then exit;
1072 if CompareMem(SearchInText,SearchForText,SearchForTextLen) then
1073 exit(SearchInText);
1074 inc(SearchInText);
1075 dec(SearchInTextLen);
1076 end;
1077 end;
1078
UTF8Copynull1079 function UTF8Copy(const s: string; StartCharIndex, CharCount: PtrInt): string;
1080 // returns substring
1081 var
1082 StartBytePos: PChar;
1083 EndBytePos: PChar;
1084 MaxBytes: PtrInt;
1085 begin
1086 StartBytePos:=UTF8CodepointStart(PChar(s),length(s),StartCharIndex-1);
1087 if StartBytePos=nil then
1088 Result:=''
1089 else begin
1090 MaxBytes:=PtrInt(PChar(s)+length(s)-StartBytePos);
1091 EndBytePos:=UTF8CodepointStart(StartBytePos,MaxBytes,CharCount);
1092 if EndBytePos=nil then
1093 Result:=copy(s,StartBytePos-PChar(s)+1,MaxBytes)
1094 else
1095 Result:=copy(s,StartBytePos-PChar(s)+1,EndBytePos-StartBytePos);
1096 end;
1097 end;
1098
1099 {$IFnDEF NO_CP_RTL}
1100 procedure UTF8Delete(var s: Utf8String; StartCharIndex, CharCount: PtrInt);
1101 var
1102 tmp: String;
1103 begin
1104 tmp := RawByteString(s);
1105 {.$IFDEF ACP_RTL}
1106 { change code page without converting the data }
1107 SetCodePage(RawByteString(tmp), CP_UTF8, False);
1108 {.$ENDIF}
1109 { keep refcount to 1 if it was 1, to avoid unnecessary copies }
1110 s := '';
1111 UTF8Delete(tmp,StartCharIndex,CharCount);
1112 { same as above }
1113 s := RawByteString(tmp);
1114 tmp := '';
1115 SetCodePage(RawByteString(s), CP_UTF8, False);
1116 end;
1117 {$ENDIF NO_ACP_RTL}
1118
1119 procedure UTF8Delete(var s: String; StartCharIndex, CharCount: PtrInt);
1120 var
1121 StartBytePos: PChar;
1122 EndBytePos: PChar;
1123 MaxBytes: PtrInt;
1124 begin
1125 StartBytePos:=UTF8CodepointStart(PChar(s),length(s),StartCharIndex-1);
1126 if StartBytePos <> nil then
1127 begin
1128 MaxBytes:=PtrInt(PChar(s)+length(s)-StartBytePos);
1129 EndBytePos:=UTF8CodepointStart(StartBytePos,MaxBytes,CharCount);
1130 if EndBytePos=nil then
1131 Delete(s,StartBytePos-PChar(s)+1,MaxBytes)
1132 else
1133 Delete(s,StartBytePos-PChar(s)+1,EndBytePos-StartBytePos);
1134 end;
1135 end;
1136
1137 {$IFnDEF NO_CP_RTL}
1138 {It's simper to copy the code from the variant with String parameters than writing a wrapper}
1139 procedure UTF8Insert(const source: UTF8String; var s: UTF8string;
1140 StartCharIndex: PtrInt);
1141 var
1142 StartBytePos: PChar;
1143 begin
1144 StartBytePos:=UTF8CodepointStart(PChar(s),length(s),StartCharIndex-1);
1145 if StartBytePos <> nil then
1146 Insert(source, s, StartBytePos-PChar(s)+1);
1147 end;
1148 {$ENDIF NO_CP_RTL}
1149
1150 procedure UTF8Insert(const source: String; var s: String; StartCharIndex: PtrInt);
1151 var
1152 StartBytePos: PChar;
1153 begin
1154 StartBytePos:=UTF8CodepointStart(PChar(s),length(s),StartCharIndex-1);
1155 if StartBytePos <> nil then
1156 Insert(source, s, StartBytePos-PChar(s)+1);
1157 end;
1158
UTF8StringReplacenull1159 function UTF8StringReplace(const S, OldPattern, NewPattern: String;
1160 Flags: TReplaceFlags; ALanguage: string): String;
1161 // same algorithm as StringReplace, but using UTF8LowerCase
1162 // for case insensitive search
1163 var
1164 Srch, OldP, RemS: string;
1165 P: Integer;
1166 begin
1167 Srch := S;
1168 OldP := OldPattern;
1169 if rfIgnoreCase in Flags then
1170 begin
1171 Srch := UTF8LowerCase(Srch,ALanguage);
1172 OldP := UTF8LowerCase(OldP,ALanguage);
1173 end;
1174 RemS := S;
1175 Result := '';
1176 while Length(Srch) <> 0 do
1177 begin
1178 P := Pos(OldP, Srch);
1179 if P = 0 then
1180 begin
1181 Result := Result + RemS;
1182 Srch := '';
1183 end
1184 else
1185 begin
1186 Result := Result + Copy(RemS,1,P-1) + NewPattern;
1187 P := P + Length(OldP);
1188 RemS := Copy(RemS, P, Length(RemS)-P+1);
1189 if not (rfReplaceAll in Flags) then
1190 begin
1191 Result := Result + RemS;
1192 Srch := '';
1193 end
1194 else
1195 Srch := Copy(Srch, P, Length(Srch)-P+1);
1196 end;
1197 end;
1198 end;
1199
1200 {
1201 UTF8SwapCase - a "naive" implementation that uses UTF8UpperCase and UTF8LowerCase.
1202 It serves its purpose and performs OK for short and resonably long strings
1203 but it should be rewritten in the future if better performance and lower
1204 memory consumption is needed.
1205
1206 AInStr - The input string.
1207 ALanguage - The language. Use '' for maximum speed if one desires to ignore the language
1208 (See UTF8LowerCase comment for more details on ALanguage parameter.)
1209 }
UTF8SwapCasenull1210 function UTF8SwapCase(const AInStr: string; ALanguage: string=''): string;
1211 var
1212 xUpperCase: string;
1213 xLowerCase: string;
1214 I: Integer;
1215 begin
1216 if AInStr = '' then
1217 Exit('');
1218
1219 xUpperCase := UTF8UpperCase(AInStr, ALanguage);
1220 xLowerCase := UTF8LowerCase(AInStr, ALanguage);
1221 if (Length(xUpperCase) <> Length(AInStr)) or (Length(xLowerCase) <> Length(AInStr)) then
1222 Exit(AInStr);//something went wrong -> the lengths of utf8 strings changed
1223
1224 SetLength(Result, Length(AInStr));
1225 for I := 1 to Length(AInStr) do
1226 if AInStr[I] <> xUpperCase[I] then
1227 Result[I] := xUpperCase[I]
1228 else
1229 Result[I] := xLowerCase[I];
1230 end;
1231
UTF8ProperCasenull1232 function UTF8ProperCase(const AInStr: string; const WordDelims: TSysCharSet): string;
1233 var
1234 P, PE : PChar;
1235 CharLen: Integer;
1236 Capital: string;
1237 begin
1238 Result := UTF8LowerCase(AInStr);
1239 UniqueString(Result);
1240 P := PChar(Result);
1241 PE := P+Length(Result);
1242 while (P<PE) do
1243 begin
1244 while (P<PE) and (P^ in WordDelims) do
1245 inc(P);
1246 if (P<PE) then
1247 begin
1248 CharLen := UTF8CodepointSize(P);
1249 SetLength(Capital, CharLen);
1250 System.Move(P^, Capital[1], CharLen); // Copy one codepoint to Capital,
1251 Capital := UTF8UpperCase(Capital); // UpperCase it
1252 System.Move(Capital[1], P^, CharLen); // and copy it back.
1253 Inc(P, CharLen);
1254 end;
1255 while (P<PE) and not (P^ in WordDelims) do
1256 inc(P);
1257 end;
1258 end;
1259
1260 {
1261 AInStr - The input string
1262 ALanguage - The language. Use '' for maximum speed if one desires to ignore the language
1263 The language should be specified in the format from ISO 639-1,
1264 which uses 2 characters to represent each language.
1265 If the language has no code in ISO 639-1, then the 3-chars code
1266 from ISO 639-2 should be used.
1267 Example: "tr" - Turkish language locale
1268
1269 Data from here: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
1270
1271 The columns in the file UnicodeData.txt are explained here:
1272 http://www.ksu.ru/eng/departments/ktk/test/perl/lib/unicode/UCDFF301.html#Case Mappings
1273 }
UTF8LowerCasenull1274 function UTF8LowerCase(const AInStr: string; ALanguage: string=''): string;
1275 var
1276 CounterDiff: PtrInt;
1277 InStr, InStrEnd, OutStr: PChar;
1278 // Language identification
1279 IsTurkish: Boolean;
1280 c1, c2, c3, new_c1, new_c2, new_c3: Char;
1281 p: SizeInt;
1282 begin
1283 Result:=AInStr;
1284 InStr := PChar(AInStr);
1285 InStrEnd := InStr + length(AInStr); // points behind last char
1286
1287 // Do a fast initial parsing of the string to maybe avoid doing
1288 // UniqueString if the resulting string will be identical
1289 while (InStr < InStrEnd) do
1290 begin
1291 c1 := InStr^;
1292 case c1 of
1293 'A'..'Z': Break;
1294 #$C3..#$FF:
1295 case c1 of
1296 #$C3..#$C9, #$CE, #$CF, #$D0..#$D5, #$E1..#$E2,#$E5:
1297 begin
1298 c2 := InStr[1];
1299 case c1 of
1300 #$C3: if c2 in [#$80..#$9E] then Break;
1301 #$C4:
1302 begin
1303 case c2 of
1304 #$80..#$AF, #$B2..#$B6: if ord(c2) mod 2 = 0 then Break;
1305 #$B8..#$FF: if ord(c2) mod 2 = 1 then Break;
1306 #$B0: Break;
1307 end;
1308 end;
1309 #$C5:
1310 begin
1311 case c2 of
1312 #$8A..#$B7: if ord(c2) mod 2 = 0 then Break;
1313 #$00..#$88, #$B9..#$FF: if ord(c2) mod 2 = 1 then Break;
1314 #$B8: Break;
1315 end;
1316 end;
1317 // Process E5 to avoid stopping on chinese chars
1318 #$E5: if (c2 = #$BC) and (InStr[2] in [#$A1..#$BA]) then Break;
1319 // Others are too complex, better not to pre-inspect them
1320 else
1321 Break;
1322 end;
1323 // already lower, or otherwhise not affected
1324 end;
1325 end;
1326 end;
1327 inc(InStr);
1328 end;
1329
1330 if InStr >= InStrEnd then Exit;
1331
1332 // Language identification
1333 IsTurkish := (ALanguage = 'tr') or (ALanguage = 'az'); // Turkish and Azeri have a special handling
1334
1335 UniqueString(Result);
1336 OutStr := PChar(Result) + (InStr - PChar(AInStr));
1337 CounterDiff := 0;
1338
1339 while InStr < InStrEnd do
1340 begin
1341 c1 := InStr^;
1342 case c1 of
1343 // codepoints UTF-8 range Description Case change
1344 // $0041..$005A $41..$5A Capital ASCII X+$20
1345 'A'..'Z':
1346 begin
1347 { First ASCII chars }
1348 // Special turkish handling
1349 // capital undotted I to small undotted i
1350 if IsTurkish and (c1 = 'I') then
1351 begin
1352 p:=OutStr - PChar(Result);
1353 SetLength(Result,Length(Result)+1);// Increase the buffer
1354 OutStr := PChar(Result)+p;
1355 OutStr^ := #$C4;
1356 inc(OutStr);
1357 OutStr^ := #$B1;
1358 dec(CounterDiff);
1359 end
1360 else
1361 begin
1362 OutStr^ := chr(ord(c1)+32);
1363 end;
1364 inc(InStr);
1365 inc(OutStr);
1366 end;
1367
1368 // Chars with 2-bytes which might be modified
1369 #$C3..#$D5:
1370 begin
1371 c2 := InStr[1];
1372 new_c1 := c1;
1373 new_c2 := c2;
1374 case c1 of
1375 // Latin Characters 0000–0FFF http://en.wikibooks.org/wiki/Unicode/Character_reference/0000-0FFF
1376 // codepoints UTF-8 range Description Case change
1377 // $00C0..$00D6 C3 80..C3 96 Capital Latin with accents X+$20
1378 // $D7 C3 97 Multiplication Sign N/A
1379 // $00D8..$00DE C3 98..C3 9E Capital Latin with accents X+$20
1380 // $DF C3 9F German beta ß already lowercase
1381 #$C3:
1382 begin
1383 case c2 of
1384 #$80..#$96, #$98..#$9E: new_c2 := chr(ord(c2) + $20)
1385 end;
1386 end;
1387 // $0100..$012F C4 80..C4 AF Capital/Small Latin accents if mod 2 = 0 then X+1
1388 // $0130..$0131 C4 B0..C4 B1 Turkish
1389 // C4 B0 turkish uppercase dotted i -> 'i'
1390 // C4 B1 turkish lowercase undotted ı
1391 // $0132..$0137 C4 B2..C4 B7 Capital/Small Latin accents if mod 2 = 0 then X+1
1392 // $0138 C4 B8 ĸ N/A
1393 // $0139..$024F C4 B9..C5 88 Capital/Small Latin accents if mod 2 = 1 then X+1
1394 #$C4:
1395 begin
1396 case c2 of
1397 #$80..#$AF, #$B2..#$B7: if ord(c2) mod 2 = 0 then new_c2 := chr(ord(c2) + 1);
1398 #$B0: // Turkish
1399 begin
1400 OutStr^ := 'i';
1401 inc(InStr, 2);
1402 inc(OutStr);
1403 inc(CounterDiff, 1);
1404 Continue;
1405 end;
1406 #$B9..#$BE: if ord(c2) mod 2 = 1 then new_c2 := chr(ord(c2) + 1);
1407 #$BF: // This crosses the borders between the first byte of the UTF-8 char
1408 begin
1409 new_c1 := #$C5;
1410 new_c2 := #$80;
1411 end;
1412 end;
1413 end;
1414 // $C589 ʼn
1415 // $C58A..$C5B7: if OldChar mod 2 = 0 then NewChar := OldChar + 1;
1416 // $C5B8: NewChar := $C3BF; // Ÿ
1417 // $C5B9..$C8B3: if OldChar mod 2 = 1 then NewChar := OldChar + 1;
1418 #$C5:
1419 begin
1420 case c2 of
1421 #$8A..#$B7: //0
1422 begin
1423 if ord(c2) mod 2 = 0 then
1424 new_c2 := chr(ord(c2) + 1);
1425 end;
1426 #$00..#$88, #$B9..#$BE: //1
1427 begin
1428 if ord(c2) mod 2 = 1 then
1429 new_c2 := chr(ord(c2) + 1);
1430 end;
1431 #$B8: // Ÿ
1432 begin
1433 new_c1 := #$C3;
1434 new_c2 := #$BF;
1435 end;
1436 end;
1437 end;
1438 {A convoluted part: C6 80..C6 8F
1439
1440 0180;LATIN SMALL LETTER B WITH STROKE;Ll;0;L;;;;;N;LATIN SMALL LETTER B BAR;;0243;;0243
1441 0181;LATIN CAPITAL LETTER B WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER B HOOK;;;0253; => C6 81=>C9 93
1442 0182;LATIN CAPITAL LETTER B WITH TOPBAR;Lu;0;L;;;;;N;LATIN CAPITAL LETTER B TOPBAR;;;0183;
1443 0183;LATIN SMALL LETTER B WITH TOPBAR;Ll;0;L;;;;;N;LATIN SMALL LETTER B TOPBAR;;0182;;0182
1444 0184;LATIN CAPITAL LETTER TONE SIX;Lu;0;L;;;;;N;;;;0185;
1445 0185;LATIN SMALL LETTER TONE SIX;Ll;0;L;;;;;N;;;0184;;0184
1446 0186;LATIN CAPITAL LETTER OPEN O;Lu;0;L;;;;;N;;;;0254; ==> C9 94
1447 0187;LATIN CAPITAL LETTER C WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER C HOOK;;;0188;
1448 0188;LATIN SMALL LETTER C WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER C HOOK;;0187;;0187
1449 0189;LATIN CAPITAL LETTER AFRICAN D;Lu;0;L;;;;;N;;;;0256; => C9 96
1450 018A;LATIN CAPITAL LETTER D WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER D HOOK;;;0257; => C9 97
1451 018B;LATIN CAPITAL LETTER D WITH TOPBAR;Lu;0;L;;;;;N;LATIN CAPITAL LETTER D TOPBAR;;;018C;
1452 018C;LATIN SMALL LETTER D WITH TOPBAR;Ll;0;L;;;;;N;LATIN SMALL LETTER D TOPBAR;;018B;;018B
1453 018D;LATIN SMALL LETTER TURNED DELTA;Ll;0;L;;;;;N;;;;;
1454 018E;LATIN CAPITAL LETTER REVERSED E;Lu;0;L;;;;;N;LATIN CAPITAL LETTER TURNED E;;;01DD; => C7 9D
1455 018F;LATIN CAPITAL LETTER SCHWA;Lu;0;L;;;;;N;;;;0259; => C9 99
1456 }
1457 #$C6:
1458 begin
1459 case c2 of
1460 #$81:
1461 begin
1462 new_c1 := #$C9;
1463 new_c2 := #$93;
1464 end;
1465 #$82..#$85:
1466 begin
1467 if ord(c2) mod 2 = 0 then
1468 new_c2 := chr(ord(c2) + 1);
1469 end;
1470 #$87..#$88,#$8B..#$8C:
1471 begin
1472 if ord(c2) mod 2 = 1 then
1473 new_c2 := chr(ord(c2) + 1);
1474 end;
1475 #$86:
1476 begin
1477 new_c1 := #$C9;
1478 new_c2 := #$94;
1479 end;
1480 #$89:
1481 begin
1482 new_c1 := #$C9;
1483 new_c2 := #$96;
1484 end;
1485 #$8A:
1486 begin
1487 new_c1 := #$C9;
1488 new_c2 := #$97;
1489 end;
1490 #$8E:
1491 begin
1492 new_c1 := #$C7;
1493 new_c2 := #$9D;
1494 end;
1495 #$8F:
1496 begin
1497 new_c1 := #$C9;
1498 new_c2 := #$99;
1499 end;
1500 {
1501 And also C6 90..C6 9F
1502
1503 0190;LATIN CAPITAL LETTER OPEN E;Lu;0;L;;;;;N;LATIN CAPITAL LETTER EPSILON;;;025B; => C9 9B
1504 0191;LATIN CAPITAL LETTER F WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER F HOOK;;;0192; => +1
1505 0192;LATIN SMALL LETTER F WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER SCRIPT F;;0191;;0191 <=
1506 0193;LATIN CAPITAL LETTER G WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER G HOOK;;;0260; => C9 A0
1507 0194;LATIN CAPITAL LETTER GAMMA;Lu;0;L;;;;;N;;;;0263; => C9 A3
1508 0195;LATIN SMALL LETTER HV;Ll;0;L;;;;;N;LATIN SMALL LETTER H V;;01F6;;01F6 <=
1509 0196;LATIN CAPITAL LETTER IOTA;Lu;0;L;;;;;N;;;;0269; => C9 A9
1510 0197;LATIN CAPITAL LETTER I WITH STROKE;Lu;0;L;;;;;N;LATIN CAPITAL LETTER BARRED I;;;0268; => C9 A8
1511 0198;LATIN CAPITAL LETTER K WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER K HOOK;;;0199; => +1
1512 0199;LATIN SMALL LETTER K WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER K HOOK;;0198;;0198 <=
1513 019A;LATIN SMALL LETTER L WITH BAR;Ll;0;L;;;;;N;LATIN SMALL LETTER BARRED L;;023D;;023D <=
1514 019B;LATIN SMALL LETTER LAMBDA WITH STROKE;Ll;0;L;;;;;N;LATIN SMALL LETTER BARRED LAMBDA;;;; <=
1515 019C;LATIN CAPITAL LETTER TURNED M;Lu;0;L;;;;;N;;;;026F; => C9 AF
1516 019D;LATIN CAPITAL LETTER N WITH LEFT HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER N HOOK;;;0272; => C9 B2
1517 019E;LATIN SMALL LETTER N WITH LONG RIGHT LEG;Ll;0;L;;;;;N;;;0220;;0220 <=
1518 019F;LATIN CAPITAL LETTER O WITH MIDDLE TILDE;Lu;0;L;;;;;N;LATIN CAPITAL LETTER BARRED O;;;0275; => C9 B5
1519 }
1520 #$90:
1521 begin
1522 new_c1 := #$C9;
1523 new_c2 := #$9B;
1524 end;
1525 #$91, #$98: new_c2 := chr(ord(c2)+1);
1526 #$93:
1527 begin
1528 new_c1 := #$C9;
1529 new_c2 := #$A0;
1530 end;
1531 #$94:
1532 begin
1533 new_c1 := #$C9;
1534 new_c2 := #$A3;
1535 end;
1536 #$96:
1537 begin
1538 new_c1 := #$C9;
1539 new_c2 := #$A9;
1540 end;
1541 #$97:
1542 begin
1543 new_c1 := #$C9;
1544 new_c2 := #$A8;
1545 end;
1546 #$9C:
1547 begin
1548 new_c1 := #$C9;
1549 new_c2 := #$AF;
1550 end;
1551 #$9D:
1552 begin
1553 new_c1 := #$C9;
1554 new_c2 := #$B2;
1555 end;
1556 #$9F:
1557 begin
1558 new_c1 := #$C9;
1559 new_c2 := #$B5;
1560 end;
1561 {
1562 And also C6 A0..C6 AF
1563
1564 01A0;LATIN CAPITAL LETTER O WITH HORN;Lu;0;L;004F 031B;;;;N;LATIN CAPITAL LETTER O HORN;;;01A1; => +1
1565 01A1;LATIN SMALL LETTER O WITH HORN;Ll;0;L;006F 031B;;;;N;LATIN SMALL LETTER O HORN;;01A0;;01A0 <=
1566 01A2;LATIN CAPITAL LETTER OI;Lu;0;L;;;;;N;LATIN CAPITAL LETTER O I;;;01A3; => +1
1567 01A3;LATIN SMALL LETTER OI;Ll;0;L;;;;;N;LATIN SMALL LETTER O I;;01A2;;01A2 <=
1568 01A4;LATIN CAPITAL LETTER P WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER P HOOK;;;01A5; => +1
1569 01A5;LATIN SMALL LETTER P WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER P HOOK;;01A4;;01A4 <=
1570 01A6;LATIN LETTER YR;Lu;0;L;;;;;N;LATIN LETTER Y R;;;0280; => CA 80
1571 01A7;LATIN CAPITAL LETTER TONE TWO;Lu;0;L;;;;;N;;;;01A8; => +1
1572 01A8;LATIN SMALL LETTER TONE TWO;Ll;0;L;;;;;N;;;01A7;;01A7 <=
1573 01A9;LATIN CAPITAL LETTER ESH;Lu;0;L;;;;;N;;;;0283; => CA 83
1574 01AA;LATIN LETTER REVERSED ESH LOOP;Ll;0;L;;;;;N;;;;;
1575 01AB;LATIN SMALL LETTER T WITH PALATAL HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER T PALATAL HOOK;;;; <=
1576 01AC;LATIN CAPITAL LETTER T WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER T HOOK;;;01AD; => +1
1577 01AD;LATIN SMALL LETTER T WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER T HOOK;;01AC;;01AC <=
1578 01AE;LATIN CAPITAL LETTER T WITH RETROFLEX HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER T RETROFLEX HOOK;;;0288; => CA 88
1579 01AF;LATIN CAPITAL LETTER U WITH HORN;Lu;0;L;0055 031B;;;;N;LATIN CAPITAL LETTER U HORN;;;01B0; => +1
1580 }
1581 #$A0..#$A5,#$AC:
1582 begin
1583 if ord(c2) mod 2 = 0 then
1584 new_c2 := chr(ord(c2) + 1);
1585 end;
1586 #$A7,#$AF:
1587 begin
1588 if ord(c2) mod 2 = 1 then
1589 new_c2 := chr(ord(c2) + 1);
1590 end;
1591 #$A6:
1592 begin
1593 new_c1 := #$CA;
1594 new_c2 := #$80;
1595 end;
1596 #$A9:
1597 begin
1598 new_c1 := #$CA;
1599 new_c2 := #$83;
1600 end;
1601 #$AE:
1602 begin
1603 new_c1 := #$CA;
1604 new_c2 := #$88;
1605 end;
1606 {
1607 And also C6 B0..C6 BF
1608
1609 01B0;LATIN SMALL LETTER U WITH HORN;Ll;0;L;0075 031B;;;;N;LATIN SMALL LETTER U HORN;;01AF;;01AF <= -1
1610 01B1;LATIN CAPITAL LETTER UPSILON;Lu;0;L;;;;;N;;;;028A; => CA 8A
1611 01B2;LATIN CAPITAL LETTER V WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER SCRIPT V;;;028B; => CA 8B
1612 01B3;LATIN CAPITAL LETTER Y WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER Y HOOK;;;01B4; => +1
1613 01B4;LATIN SMALL LETTER Y WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER Y HOOK;;01B3;;01B3 <=
1614 01B5;LATIN CAPITAL LETTER Z WITH STROKE;Lu;0;L;;;;;N;LATIN CAPITAL LETTER Z BAR;;;01B6; => +1
1615 01B6;LATIN SMALL LETTER Z WITH STROKE;Ll;0;L;;;;;N;LATIN SMALL LETTER Z BAR;;01B5;;01B5 <=
1616 01B7;LATIN CAPITAL LETTER EZH;Lu;0;L;;;;;N;LATIN CAPITAL LETTER YOGH;;;0292; => CA 92
1617 01B8;LATIN CAPITAL LETTER EZH REVERSED;Lu;0;L;;;;;N;LATIN CAPITAL LETTER REVERSED YOGH;;;01B9; => +1
1618 01B9;LATIN SMALL LETTER EZH REVERSED;Ll;0;L;;;;;N;LATIN SMALL LETTER REVERSED YOGH;;01B8;;01B8 <=
1619 01BA;LATIN SMALL LETTER EZH WITH TAIL;Ll;0;L;;;;;N;LATIN SMALL LETTER YOGH WITH TAIL;;;; <=
1620 01BB;LATIN LETTER TWO WITH STROKE;Lo;0;L;;;;;N;LATIN LETTER TWO BAR;;;; X
1621 01BC;LATIN CAPITAL LETTER TONE FIVE;Lu;0;L;;;;;N;;;;01BD; => +1
1622 01BD;LATIN SMALL LETTER TONE FIVE;Ll;0;L;;;;;N;;;01BC;;01BC <=
1623 01BE;LATIN LETTER INVERTED GLOTTAL STOP WITH STROKE;Ll;0;L;;;;;N;LATIN LETTER INVERTED GLOTTAL STOP BAR;;;; X
1624 01BF;LATIN LETTER WYNN;Ll;0;L;;;;;N;;;01F7;;01F7 <=
1625 }
1626 #$B8,#$BC:
1627 begin
1628 if ord(c2) mod 2 = 0 then
1629 new_c2 := chr(ord(c2) + 1);
1630 end;
1631 #$B3..#$B6:
1632 begin
1633 if ord(c2) mod 2 = 1 then
1634 new_c2 := chr(ord(c2) + 1);
1635 end;
1636 #$B1:
1637 begin
1638 new_c1 := #$CA;
1639 new_c2 := #$8A;
1640 end;
1641 #$B2:
1642 begin
1643 new_c1 := #$CA;
1644 new_c2 := #$8B;
1645 end;
1646 #$B7:
1647 begin
1648 new_c1 := #$CA;
1649 new_c2 := #$92;
1650 end;
1651 end;
1652 end;
1653 #$C7:
1654 begin
1655 case c2 of
1656 #$84..#$8C,#$B1..#$B3:
1657 begin
1658 if (ord(c2) and $F) mod 3 = 1 then new_c2 := chr(ord(c2) + 2)
1659 else if (ord(c2) and $F) mod 3 = 2 then new_c2 := chr(ord(c2) + 1);
1660 end;
1661 #$8D..#$9C:
1662 begin
1663 if ord(c2) mod 2 = 1 then
1664 new_c2 := chr(ord(c2) + 1);
1665 end;
1666 #$9E..#$AF,#$B4..#$B5,#$B8..#$BF:
1667 begin
1668 if ord(c2) mod 2 = 0 then
1669 new_c2 := chr(ord(c2) + 1);
1670 end;
1671 {
1672 01F6;LATIN CAPITAL LETTER HWAIR;Lu;0;L;;;;;N;;;;0195;
1673 01F7;LATIN CAPITAL LETTER WYNN;Lu;0;L;;;;;N;;;;01BF;
1674 }
1675 #$B6:
1676 begin
1677 new_c1 := #$C6;
1678 new_c2 := #$95;
1679 end;
1680 #$B7:
1681 begin
1682 new_c1 := #$C6;
1683 new_c2 := #$BF;
1684 end;
1685 end;
1686 end;
1687 {
1688 Codepoints 0200 to 023F
1689 }
1690 #$C8:
1691 begin
1692 // For this one we can simply start with a default and override for some specifics
1693 if (c2 in [#$80..#$B3]) and (ord(c2) mod 2 = 0) then new_c2 := chr(ord(c2) + 1);
1694
1695 case c2 of
1696 #$A0:
1697 begin
1698 new_c1 := #$C6;
1699 new_c2 := #$9E;
1700 end;
1701 #$A1: new_c2 := c2;
1702 {
1703 023A;LATIN CAPITAL LETTER A WITH STROKE;Lu;0;L;;;;;N;;;;2C65; => E2 B1 A5
1704 023B;LATIN CAPITAL LETTER C WITH STROKE;Lu;0;L;;;;;N;;;;023C; => +1
1705 023C;LATIN SMALL LETTER C WITH STROKE;Ll;0;L;;;;;N;;;023B;;023B <=
1706 023D;LATIN CAPITAL LETTER L WITH BAR;Lu;0;L;;;;;N;;;;019A; => C6 9A
1707 023E;LATIN CAPITAL LETTER T WITH DIAGONAL STROKE;Lu;0;L;;;;;N;;;;2C66; => E2 B1 A6
1708 023F;LATIN SMALL LETTER S WITH SWASH TAIL;Ll;0;L;;;;;N;;;2C7E;;2C7E <=
1709 0240;LATIN SMALL LETTER Z WITH SWASH TAIL;Ll;0;L;;;;;N;;;2C7F;;2C7F <=
1710 }
1711 #$BA,#$BE:
1712 begin
1713 p:= OutStr - PChar(Result);
1714 SetLength(Result,Length(Result)+1);// Increase the buffer
1715 OutStr := PChar(Result)+p;
1716 OutStr^ := #$E2;
1717 inc(OutStr);
1718 OutStr^ := #$B1;
1719 inc(OutStr);
1720 if c2 = #$BA then OutStr^ := #$A5
1721 else OutStr^ := #$A6;
1722 dec(CounterDiff);
1723 inc(OutStr);
1724 inc(InStr, 2);
1725 Continue;
1726 end;
1727 #$BD:
1728 begin
1729 new_c1 := #$C6;
1730 new_c2 := #$9A;
1731 end;
1732 #$BB: new_c2 := chr(ord(c2) + 1);
1733 end;
1734 end;
1735 {
1736 Codepoints 0240 to 027F
1737
1738 Here only 0240..024F needs lowercase
1739 }
1740 #$C9:
1741 begin
1742 case c2 of
1743 #$81..#$82:
1744 begin
1745 if ord(c2) mod 2 = 1 then
1746 new_c2 := chr(ord(c2) + 1);
1747 end;
1748 #$86..#$8F:
1749 begin
1750 if ord(c2) mod 2 = 0 then
1751 new_c2 := chr(ord(c2) + 1);
1752 end;
1753 #$83:
1754 begin
1755 new_c1 := #$C6;
1756 new_c2 := #$80;
1757 end;
1758 #$84:
1759 begin
1760 new_c1 := #$CA;
1761 new_c2 := #$89;
1762 end;
1763 #$85:
1764 begin
1765 new_c1 := #$CA;
1766 new_c2 := #$8C;
1767 end;
1768 end;
1769 end;
1770 // $CE91..$CE9F: NewChar := OldChar + $20; // Greek Characters
1771 // $CEA0..$CEA9: NewChar := OldChar + $E0; // Greek Characters
1772 #$CE:
1773 begin
1774 case c2 of
1775 // 0380 = CE 80
1776 #$86: new_c2 := #$AC;
1777 #$88: new_c2 := #$AD;
1778 #$89: new_c2 := #$AE;
1779 #$8A: new_c2 := #$AF;
1780 #$8C: new_c1 := #$CF; // By coincidence new_c2 remains the same
1781 #$8E:
1782 begin
1783 new_c1 := #$CF;
1784 new_c2 := #$8D;
1785 end;
1786 #$8F:
1787 begin
1788 new_c1 := #$CF;
1789 new_c2 := #$8E;
1790 end;
1791 // 0390 = CE 90
1792 #$91..#$9F:
1793 begin
1794 new_c2 := chr(ord(c2) + $20);
1795 end;
1796 // 03A0 = CE A0
1797 #$A0..#$AB:
1798 begin
1799 new_c1 := #$CF;
1800 new_c2 := chr(ord(c2) - $20);
1801 end;
1802 end;
1803 end;
1804 // 03C0 = CF 80
1805 // 03D0 = CF 90
1806 // 03E0 = CF A0
1807 // 03F0 = CF B0
1808 #$CF:
1809 begin
1810 case c2 of
1811 // 03CF;GREEK CAPITAL KAI SYMBOL;Lu;0;L;;;;;N;;;;03D7; CF 8F => CF 97
1812 #$8F: new_c2 := #$97;
1813 // 03D8;GREEK LETTER ARCHAIC KOPPA;Lu;0;L;;;;;N;;;;03D9;
1814 #$98: new_c2 := #$99;
1815 // 03DA;GREEK LETTER STIGMA;Lu;0;L;;;;;N;GREEK CAPITAL LETTER STIGMA;;;03DB;
1816 #$9A: new_c2 := #$9B;
1817 // 03DC;GREEK LETTER DIGAMMA;Lu;0;L;;;;;N;GREEK CAPITAL LETTER DIGAMMA;;;03DD;
1818 #$9C: new_c2 := #$9D;
1819 // 03DE;GREEK LETTER KOPPA;Lu;0;L;;;;;N;GREEK CAPITAL LETTER KOPPA;;;03DF;
1820 #$9E: new_c2 := #$9F;
1821 {
1822 03E0;GREEK LETTER SAMPI;Lu;0;L;;;;;N;GREEK CAPITAL LETTER SAMPI;;;03E1;
1823 03E1;GREEK SMALL LETTER SAMPI;Ll;0;L;;;;;N;;;03E0;;03E0
1824 03E2;COPTIC CAPITAL LETTER SHEI;Lu;0;L;;;;;N;GREEK CAPITAL LETTER SHEI;;;03E3;
1825 03E3;COPTIC SMALL LETTER SHEI;Ll;0;L;;;;;N;GREEK SMALL LETTER SHEI;;03E2;;03E2
1826 ...
1827 03EE;COPTIC CAPITAL LETTER DEI;Lu;0;L;;;;;N;GREEK CAPITAL LETTER DEI;;;03EF;
1828 03EF;COPTIC SMALL LETTER DEI;Ll;0;L;;;;;N;GREEK SMALL LETTER DEI;;03EE;;03EE
1829 }
1830 #$A0..#$AF: if ord(c2) mod 2 = 0 then
1831 new_c2 := chr(ord(c2) + 1);
1832 // 03F4;GREEK CAPITAL THETA SYMBOL;Lu;0;L;<compat> 0398;;;;N;;;;03B8;
1833 #$B4:
1834 begin
1835 new_c1 := #$CE;
1836 new_c2 := #$B8;
1837 end;
1838 // 03F7;GREEK CAPITAL LETTER SHO;Lu;0;L;;;;;N;;;;03F8;
1839 #$B7: new_c2 := #$B8;
1840 // 03F9;GREEK CAPITAL LUNATE SIGMA SYMBOL;Lu;0;L;<compat> 03A3;;;;N;;;;03F2;
1841 #$B9: new_c2 := #$B2;
1842 // 03FA;GREEK CAPITAL LETTER SAN;Lu;0;L;;;;;N;;;;03FB;
1843 #$BA: new_c2 := #$BB;
1844 // 03FD;GREEK CAPITAL REVERSED LUNATE SIGMA SYMBOL;Lu;0;L;;;;;N;;;;037B;
1845 #$BD:
1846 begin
1847 new_c1 := #$CD;
1848 new_c2 := #$BB;
1849 end;
1850 // 03FE;GREEK CAPITAL DOTTED LUNATE SIGMA SYMBOL;Lu;0;L;;;;;N;;;;037C;
1851 #$BE:
1852 begin
1853 new_c1 := #$CD;
1854 new_c2 := #$BC;
1855 end;
1856 // 03FF;GREEK CAPITAL REVERSED DOTTED LUNATE SIGMA SYMBOL;Lu;0;L;;;;;N;;;;037D;
1857 #$BF:
1858 begin
1859 new_c1 := #$CD;
1860 new_c2 := #$BD;
1861 end;
1862 end;
1863 end;
1864 // $D080..$D08F: NewChar := OldChar + $110; // Cyrillic alphabet
1865 // $D090..$D09F: NewChar := OldChar + $20; // Cyrillic alphabet
1866 // $D0A0..$D0AF: NewChar := OldChar + $E0; // Cyrillic alphabet
1867 #$D0:
1868 begin
1869 c2 := InStr[1];
1870 case c2 of
1871 #$80..#$8F:
1872 begin
1873 new_c1 := chr(ord(c1)+1);
1874 new_c2 := chr(ord(c2) + $10);
1875 end;
1876 #$90..#$9F:
1877 begin
1878 new_c2 := chr(ord(c2) + $20);
1879 end;
1880 #$A0..#$AF:
1881 begin
1882 new_c1 := chr(ord(c1)+1);
1883 new_c2 := chr(ord(c2) - $20);
1884 end;
1885 end;
1886 end;
1887 // Archaic and non-slavic cyrillic 460-47F = D1A0-D1BF
1888 // These require just adding 1 to get the lowercase
1889 #$D1:
1890 begin
1891 if (c2 in [#$A0..#$BF]) and (ord(c2) mod 2 = 0) then
1892 new_c2 := chr(ord(c2) + 1);
1893 end;
1894 // Archaic and non-slavic cyrillic 480-4BF = D280-D2BF
1895 // These mostly require just adding 1 to get the lowercase
1896 #$D2:
1897 begin
1898 case c2 of
1899 #$80:
1900 begin
1901 new_c2 := chr(ord(c2) + 1);
1902 end;
1903 // #$81 is already lowercase
1904 // #$82-#$89 ???
1905 #$8A..#$BF:
1906 begin
1907 if ord(c2) mod 2 = 0 then
1908 new_c2 := chr(ord(c2) + 1);
1909 end;
1910 end;
1911 end;
1912 {
1913 Codepoints 04C0..04FF
1914 }
1915 #$D3:
1916 begin
1917 case c2 of
1918 #$80: new_c2 := #$8F;
1919 #$81..#$8E:
1920 begin
1921 if ord(c2) mod 2 = 1 then
1922 new_c2 := chr(ord(c2) + 1);
1923 end;
1924 #$90..#$BF:
1925 begin
1926 if ord(c2) mod 2 = 0 then
1927 new_c2 := chr(ord(c2) + 1);
1928 end;
1929 end;
1930 end;
1931 {
1932 Codepoints 0500..053F
1933
1934 Armenian starts in 0531
1935 }
1936 #$D4:
1937 begin
1938 if ord(c2) mod 2 = 0 then
1939 new_c2 := chr(ord(c2) + 1);
1940
1941 // Armenian
1942 if c2 in [#$B1..#$BF] then
1943 begin
1944 new_c1 := #$D5;
1945 new_c2 := chr(ord(c2) - $10);
1946 end;
1947 end;
1948 {
1949 Codepoints 0540..057F
1950
1951 Armenian
1952 }
1953 #$D5:
1954 begin
1955 case c2 of
1956 #$80..#$8F:
1957 begin
1958 new_c2 := chr(ord(c2) + $30);
1959 end;
1960 #$90..#$96:
1961 begin
1962 new_c1 := #$D6;
1963 new_c2 := chr(ord(c2) - $10);
1964 end;
1965 end;
1966 end;
1967 end;
1968 // Common code 2-byte modifiable chars
1969 if (CounterDiff <> 0) then
1970 begin
1971 OutStr^ := new_c1;
1972 OutStr[1] := new_c2;
1973 end
1974 else
1975 begin
1976 if (new_c1 <> c1) then OutStr^ := new_c1;
1977 if (new_c2 <> c2) then OutStr[1] := new_c2;
1978 end;
1979 inc(InStr, 2);
1980 inc(OutStr, 2);
1981 end;
1982 {
1983 Characters with 3 bytes
1984 }
1985 #$E1:
1986 begin
1987 new_c1 := c1;
1988 c2 := InStr[1];
1989 c3 := InStr[2];
1990 new_c2 := c2;
1991 new_c3 := c3;
1992 {
1993 Georgian codepoints 10A0-10C5 => 2D00-2D25
1994
1995 In UTF-8 this is:
1996 E1 82 A0 - E1 82 BF => E2 B4 80 - E2 B4 9F
1997 E1 83 80 - E1 83 85 => E2 B4 A0 - E2 B4 A5
1998 }
1999 case c2 of
2000 #$82:
2001 if (c3 in [#$A0..#$BF]) then
2002 begin
2003 new_c1 := #$E2;
2004 new_c2 := #$B4;
2005 new_c3 := chr(ord(c3) - $20);
2006 end;
2007 #$83:
2008 if (c3 in [#$80..#$85]) then
2009 begin
2010 new_c1 := #$E2;
2011 new_c2 := #$B4;
2012 new_c3 := chr(ord(c3) + $20);
2013 end;
2014 {
2015 Extra chars between 1E00..1EFF
2016
2017 Blocks of chars:
2018 1E00..1E3F E1 B8 80..E1 B8 BF
2019 1E40..1E7F E1 B9 80..E1 B9 BF
2020 1E80..1EBF E1 BA 80..E1 BA BF
2021 1EC0..1EFF E1 BB 80..E1 BB BF
2022 }
2023 #$B8..#$BB:
2024 begin
2025 // Start with a default and change for some particular chars
2026 if ord(c3) mod 2 = 0 then
2027 new_c3 := chr(ord(c3) + 1);
2028
2029 { Only 1E96..1E9F are different E1 BA 96..E1 BA 9F
2030
2031 1E96;LATIN SMALL LETTER H WITH LINE BELOW;Ll;0;L;0068 0331;;;;N;;;;;
2032 1E97;LATIN SMALL LETTER T WITH DIAERESIS;Ll;0;L;0074 0308;;;;N;;;;;
2033 1E98;LATIN SMALL LETTER W WITH RING ABOVE;Ll;0;L;0077 030A;;;;N;;;;;
2034 1E99;LATIN SMALL LETTER Y WITH RING ABOVE;Ll;0;L;0079 030A;;;;N;;;;;
2035 1E9A;LATIN SMALL LETTER A WITH RIGHT HALF RING;Ll;0;L;<compat> 0061 02BE;;;;N;;;;;
2036 1E9B;LATIN SMALL LETTER LONG S WITH DOT ABOVE;Ll;0;L;017F 0307;;;;N;;;1E60;;1E60
2037 1E9C;LATIN SMALL LETTER LONG S WITH DIAGONAL STROKE;Ll;0;L;;;;;N;;;;;
2038 1E9D;LATIN SMALL LETTER LONG S WITH HIGH STROKE;Ll;0;L;;;;;N;;;;;
2039 1E9E;LATIN CAPITAL LETTER SHARP S;Lu;0;L;;;;;N;;;;00DF; => C3 9F
2040 1E9F;LATIN SMALL LETTER DELTA;Ll;0;L;;;;;N;;;;;
2041 }
2042 if (c2 = #$BA) and (c3 in [#$96..#$9F]) then new_c3 := c3;
2043 // LATIN CAPITAL LETTER SHARP S => to german Beta
2044 if (c2 = #$BA) and (c3 = #$9E) then
2045 begin
2046 inc(InStr, 3);
2047 OutStr^ := #$C3;
2048 inc(OutStr);
2049 OutStr^ := #$9F;
2050 inc(OutStr);
2051 inc(CounterDiff, 1);
2052 Continue;
2053 end;
2054 end;
2055 {
2056 Extra chars between 1F00..1FFF
2057
2058 Blocks of chars:
2059 1E00..1E3F E1 BC 80..E1 BC BF
2060 1E40..1E7F E1 BD 80..E1 BD BF
2061 1E80..1EBF E1 BE 80..E1 BE BF
2062 1EC0..1EFF E1 BF 80..E1 BF BF
2063 }
2064 #$BC:
2065 begin
2066 // Start with a default and change for some particular chars
2067 if (ord(c3) mod $10) div 8 = 1 then
2068 new_c3 := chr(ord(c3) - 8);
2069 end;
2070 #$BD:
2071 begin
2072 // Start with a default and change for some particular chars
2073 case c3 of
2074 #$80..#$8F, #$A0..#$AF: if (ord(c3) mod $10) div 8 = 1 then
2075 new_c3 := chr(ord(c3) - 8);
2076 {
2077 1F50;GREEK SMALL LETTER UPSILON WITH PSILI;Ll;0;L;03C5 0313;;;;N;;;;;
2078 1F51;GREEK SMALL LETTER UPSILON WITH DASIA;Ll;0;L;03C5 0314;;;;N;;;1F59;;1F59
2079 1F52;GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA;Ll;0;L;1F50 0300;;;;N;;;;;
2080 1F53;GREEK SMALL LETTER UPSILON WITH DASIA AND VARIA;Ll;0;L;1F51 0300;;;;N;;;1F5B;;1F5B
2081 1F54;GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA;Ll;0;L;1F50 0301;;;;N;;;;;
2082 1F55;GREEK SMALL LETTER UPSILON WITH DASIA AND OXIA;Ll;0;L;1F51 0301;;;;N;;;1F5D;;1F5D
2083 1F56;GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI;Ll;0;L;1F50 0342;;;;N;;;;;
2084 1F57;GREEK SMALL LETTER UPSILON WITH DASIA AND PERISPOMENI;Ll;0;L;1F51 0342;;;;N;;;1F5F;;1F5F
2085 1F59;GREEK CAPITAL LETTER UPSILON WITH DASIA;Lu;0;L;03A5 0314;;;;N;;;;1F51;
2086 1F5B;GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA;Lu;0;L;1F59 0300;;;;N;;;;1F53;
2087 1F5D;GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA;Lu;0;L;1F59 0301;;;;N;;;;1F55;
2088 1F5F;GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI;Lu;0;L;1F59 0342;;;;N;;;;1F57;
2089 }
2090 #$99,#$9B,#$9D,#$9F: new_c3 := chr(ord(c3) - 8);
2091 end;
2092 end;
2093 #$BE:
2094 begin
2095 // Start with a default and change for some particular chars
2096 case c3 of
2097 #$80..#$B9: if (ord(c3) mod $10) div 8 = 1 then
2098 new_c3 := chr(ord(c3) - 8);
2099 {
2100 1FB0;GREEK SMALL LETTER ALPHA WITH VRACHY;Ll;0;L;03B1 0306;;;;N;;;1FB8;;1FB8
2101 1FB1;GREEK SMALL LETTER ALPHA WITH MACRON;Ll;0;L;03B1 0304;;;;N;;;1FB9;;1FB9
2102 1FB2;GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI;Ll;0;L;1F70 0345;;;;N;;;;;
2103 1FB3;GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI;Ll;0;L;03B1 0345;;;;N;;;1FBC;;1FBC
2104 1FB4;GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI;Ll;0;L;03AC 0345;;;;N;;;;;
2105 1FB6;GREEK SMALL LETTER ALPHA WITH PERISPOMENI;Ll;0;L;03B1 0342;;;;N;;;;;
2106 1FB7;GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI;Ll;0;L;1FB6 0345;;;;N;;;;;
2107 1FB8;GREEK CAPITAL LETTER ALPHA WITH VRACHY;Lu;0;L;0391 0306;;;;N;;;;1FB0;
2108 1FB9;GREEK CAPITAL LETTER ALPHA WITH MACRON;Lu;0;L;0391 0304;;;;N;;;;1FB1;
2109 1FBA;GREEK CAPITAL LETTER ALPHA WITH VARIA;Lu;0;L;0391 0300;;;;N;;;;1F70;
2110 1FBB;GREEK CAPITAL LETTER ALPHA WITH OXIA;Lu;0;L;0386;;;;N;;;;1F71;
2111 1FBC;GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI;Lt;0;L;0391 0345;;;;N;;;;1FB3;
2112 1FBD;GREEK KORONIS;Sk;0;ON;<compat> 0020 0313;;;;N;;;;;
2113 1FBE;GREEK PROSGEGRAMMENI;Ll;0;L;03B9;;;;N;;;0399;;0399
2114 1FBF;GREEK PSILI;Sk;0;ON;<compat> 0020 0313;;;;N;;;;;
2115 }
2116 #$BA:
2117 begin
2118 new_c2 := #$BD;
2119 new_c3 := #$B0;
2120 end;
2121 #$BB:
2122 begin
2123 new_c2 := #$BD;
2124 new_c3 := #$B1;
2125 end;
2126 #$BC: new_c3 := #$B3;
2127 end;
2128 end;
2129 end;
2130
2131 if (CounterDiff <> 0) then
2132 begin
2133 OutStr^ := new_c1;
2134 OutStr[1] := new_c2;
2135 OutStr[2] := new_c3;
2136 end
2137 else
2138 begin
2139 if c1 <> new_c1 then OutStr^ := new_c1;
2140 if c2 <> new_c2 then OutStr[1] := new_c2;
2141 if c3 <> new_c3 then OutStr[2] := new_c3;
2142 end;
2143
2144 inc(InStr, 3);
2145 inc(OutStr, 3);
2146 end;
2147 {
2148 More Characters with 3 bytes, so exotic stuff between:
2149 $2126..$2183 E2 84 A6..E2 86 83
2150 $24B6..$24CF Result:=u+26; E2 92 B6..E2 93 8F
2151 $2C00..$2C2E Result:=u+48; E2 B0 80..E2 B0 AE
2152 $2C60..$2CE2 E2 B1 A0..E2 B3 A2
2153 }
2154 #$E2:
2155 begin
2156 new_c1 := c1;
2157 c2 := InStr[1];
2158 c3 := InStr[2];
2159 new_c2 := c2;
2160 new_c3 := c3;
2161 // 2126;OHM SIGN;Lu;0;L;03A9;;;;N;OHM;;;03C9; E2 84 A6 => CF 89
2162 if (c2 = #$84) and (c3 = #$A6) then
2163 begin
2164 inc(InStr, 3);
2165 OutStr^ := #$CF;
2166 inc(OutStr);
2167 OutStr^ := #$89;
2168 inc(OutStr);
2169 inc(CounterDiff, 1);
2170 Continue;
2171 end
2172 {
2173 212A;KELVIN SIGN;Lu;0;L;004B;;;;N;DEGREES KELVIN;;;006B; E2 84 AA => 6B
2174 }
2175 else if (c2 = #$84) and (c3 = #$AA) then
2176 begin
2177 inc(InStr, 3);
2178 OutStr^ := #$6B;
2179 inc(OutStr);
2180 inc(CounterDiff, 2);
2181 Continue;
2182 end
2183 {
2184 212B;ANGSTROM SIGN;Lu;0;L;00C5;;;;N;ANGSTROM UNIT;;;00E5; E2 84 AB => C3 A5
2185 }
2186 else if (c2 = #$84) and (c3 = #$AB) then
2187 begin
2188 inc(InStr, 3);
2189 OutStr^ := #$C3;
2190 inc(OutStr);
2191 OutStr^ := #$A5;
2192 inc(OutStr);
2193 inc(CounterDiff, 1);
2194 Continue;
2195 end
2196 {
2197 2160;ROMAN NUMERAL ONE;Nl;0;L;<compat> 0049;;;1;N;;;;2170; E2 85 A0 => E2 85 B0
2198 2161;ROMAN NUMERAL TWO;Nl;0;L;<compat> 0049 0049;;;2;N;;;;2171;
2199 2162;ROMAN NUMERAL THREE;Nl;0;L;<compat> 0049 0049 0049;;;3;N;;;;2172;
2200 2163;ROMAN NUMERAL FOUR;Nl;0;L;<compat> 0049 0056;;;4;N;;;;2173;
2201 2164;ROMAN NUMERAL FIVE;Nl;0;L;<compat> 0056;;;5;N;;;;2174;
2202 2165;ROMAN NUMERAL SIX;Nl;0;L;<compat> 0056 0049;;;6;N;;;;2175;
2203 2166;ROMAN NUMERAL SEVEN;Nl;0;L;<compat> 0056 0049 0049;;;7;N;;;;2176;
2204 2167;ROMAN NUMERAL EIGHT;Nl;0;L;<compat> 0056 0049 0049 0049;;;8;N;;;;2177;
2205 2168;ROMAN NUMERAL NINE;Nl;0;L;<compat> 0049 0058;;;9;N;;;;2178;
2206 2169;ROMAN NUMERAL TEN;Nl;0;L;<compat> 0058;;;10;N;;;;2179;
2207 216A;ROMAN NUMERAL ELEVEN;Nl;0;L;<compat> 0058 0049;;;11;N;;;;217A;
2208 216B;ROMAN NUMERAL TWELVE;Nl;0;L;<compat> 0058 0049 0049;;;12;N;;;;217B;
2209 216C;ROMAN NUMERAL FIFTY;Nl;0;L;<compat> 004C;;;50;N;;;;217C;
2210 216D;ROMAN NUMERAL ONE HUNDRED;Nl;0;L;<compat> 0043;;;100;N;;;;217D;
2211 216E;ROMAN NUMERAL FIVE HUNDRED;Nl;0;L;<compat> 0044;;;500;N;;;;217E;
2212 216F;ROMAN NUMERAL ONE THOUSAND;Nl;0;L;<compat> 004D;;;1000;N;;;;217F;
2213 }
2214 else if (c2 = #$85) and (c3 in [#$A0..#$AF]) then new_c3 := chr(ord(c3) + $10)
2215 {
2216 2183;ROMAN NUMERAL REVERSED ONE HUNDRED;Lu;0;L;;;;;N;;;;2184; E2 86 83 => E2 86 84
2217 }
2218 else if (c2 = #$86) and (c3 = #$83) then new_c3 := chr(ord(c3) + 1)
2219 {
2220 $24B6..$24CF Result:=u+26; E2 92 B6..E2 93 8F
2221
2222 Ex: 24B6;CIRCLED LATIN CAPITAL LETTER A;So;0;L;<circle> 0041;;;;N;;;;24D0; E2 92 B6 => E2 93 90
2223 }
2224 else if (c2 = #$92) and (c3 in [#$B6..#$BF]) then
2225 begin
2226 new_c2 := #$93;
2227 new_c3 := chr(ord(c3) - $26);
2228 end
2229 // CIRCLED LATIN CAPITAL LETTER K $24C0 -> $24DA
2230 else if (c2 = #$93) and (c3 in [#$80..#$8F]) then new_c3 := chr(ord(c3) + $1A)
2231 {
2232 $2C00..$2C2E Result:=u+48; E2 B0 80..E2 B0 AE
2233
2234 2C00;GLAGOLITIC CAPITAL LETTER AZU;Lu;0;L;;;;;N;;;;2C30; E2 B0 80 => E2 B0 B0
2235
2236 2C10;GLAGOLITIC CAPITAL LETTER NASHI;Lu;0;L;;;;;N;;;;2C40; E2 B0 90 => E2 B1 80
2237 }
2238 else if (c2 = #$B0) and (c3 in [#$80..#$8F]) then new_c3 := chr(ord(c3) + $30)
2239 else if (c2 = #$B0) and (c3 in [#$90..#$AE]) then
2240 begin
2241 new_c2 := #$B1;
2242 new_c3 := chr(ord(c3) - $10);
2243 end
2244 {
2245 $2C60..$2CE2 E2 B1 A0..E2 B3 A2
2246
2247 2C60;LATIN CAPITAL LETTER L WITH DOUBLE BAR;Lu;0;L;;;;;N;;;;2C61; E2 B1 A0 => +1
2248 2C61;LATIN SMALL LETTER L WITH DOUBLE BAR;Ll;0;L;;;;;N;;;2C60;;2C60
2249 2C62;LATIN CAPITAL LETTER L WITH MIDDLE TILDE;Lu;0;L;;;;;N;;;;026B; => C9 AB
2250 2C63;LATIN CAPITAL LETTER P WITH STROKE;Lu;0;L;;;;;N;;;;1D7D; => E1 B5 BD
2251 2C64;LATIN CAPITAL LETTER R WITH TAIL;Lu;0;L;;;;;N;;;;027D; => C9 BD
2252 2C65;LATIN SMALL LETTER A WITH STROKE;Ll;0;L;;;;;N;;;023A;;023A
2253 2C66;LATIN SMALL LETTER T WITH DIAGONAL STROKE;Ll;0;L;;;;;N;;;023E;;023E
2254 2C67;LATIN CAPITAL LETTER H WITH DESCENDER;Lu;0;L;;;;;N;;;;2C68; => E2 B1 A8
2255 2C68;LATIN SMALL LETTER H WITH DESCENDER;Ll;0;L;;;;;N;;;2C67;;2C67
2256 2C69;LATIN CAPITAL LETTER K WITH DESCENDER;Lu;0;L;;;;;N;;;;2C6A; => E2 B1 AA
2257 2C6A;LATIN SMALL LETTER K WITH DESCENDER;Ll;0;L;;;;;N;;;2C69;;2C69
2258 2C6B;LATIN CAPITAL LETTER Z WITH DESCENDER;Lu;0;L;;;;;N;;;;2C6C; => E2 B1 AC
2259 2C6C;LATIN SMALL LETTER Z WITH DESCENDER;Ll;0;L;;;;;N;;;2C6B;;2C6B
2260 2C6D;LATIN CAPITAL LETTER ALPHA;Lu;0;L;;;;;N;;;;0251; => C9 91
2261 2C6E;LATIN CAPITAL LETTER M WITH HOOK;Lu;0;L;;;;;N;;;;0271; => C9 B1
2262 2C6F;LATIN CAPITAL LETTER TURNED A;Lu;0;L;;;;;N;;;;0250; => C9 90
2263
2264 2C70;LATIN CAPITAL LETTER TURNED ALPHA;Lu;0;L;;;;;N;;;;0252; => C9 92
2265 }
2266 else if (c2 = #$B1) then
2267 begin
2268 case c3 of
2269 #$A0: new_c3 := chr(ord(c3)+1);
2270 #$A2,#$A4,#$AD..#$AF,#$B0:
2271 begin
2272 inc(InStr, 3);
2273 OutStr^ := #$C9;
2274 inc(OutStr);
2275 case c3 of
2276 #$A2: OutStr^ := #$AB;
2277 #$A4: OutStr^ := #$BD;
2278 #$AD: OutStr^ := #$91;
2279 #$AE: OutStr^ := #$B1;
2280 #$AF: OutStr^ := #$90;
2281 #$B0: OutStr^ := #$92;
2282 end;
2283 inc(OutStr);
2284 inc(CounterDiff, 1);
2285 Continue;
2286 end;
2287 #$A3:
2288 begin
2289 new_c2 := #$B5;
2290 new_c3 := #$BD;
2291 end;
2292 #$A7,#$A9,#$AB: new_c3 := chr(ord(c3)+1);
2293 {
2294 2C71;LATIN SMALL LETTER V WITH RIGHT HOOK;Ll;0;L;;;;;N;;;;;
2295 2C72;LATIN CAPITAL LETTER W WITH HOOK;Lu;0;L;;;;;N;;;;2C73;
2296 2C73;LATIN SMALL LETTER W WITH HOOK;Ll;0;L;;;;;N;;;2C72;;2C72
2297 2C74;LATIN SMALL LETTER V WITH CURL;Ll;0;L;;;;;N;;;;;
2298 2C75;LATIN CAPITAL LETTER HALF H;Lu;0;L;;;;;N;;;;2C76;
2299 2C76;LATIN SMALL LETTER HALF H;Ll;0;L;;;;;N;;;2C75;;2C75
2300 2C77;LATIN SMALL LETTER TAILLESS PHI;Ll;0;L;;;;;N;;;;;
2301 2C78;LATIN SMALL LETTER E WITH NOTCH;Ll;0;L;;;;;N;;;;;
2302 2C79;LATIN SMALL LETTER TURNED R WITH TAIL;Ll;0;L;;;;;N;;;;;
2303 2C7A;LATIN SMALL LETTER O WITH LOW RING INSIDE;Ll;0;L;;;;;N;;;;;
2304 2C7B;LATIN LETTER SMALL CAPITAL TURNED E;Ll;0;L;;;;;N;;;;;
2305 2C7C;LATIN SUBSCRIPT SMALL LETTER J;Ll;0;L;<sub> 006A;;;;N;;;;;
2306 2C7D;MODIFIER LETTER CAPITAL V;Lm;0;L;<super> 0056;;;;N;;;;;
2307 2C7E;LATIN CAPITAL LETTER S WITH SWASH TAIL;Lu;0;L;;;;;N;;;;023F; => C8 BF
2308 2C7F;LATIN CAPITAL LETTER Z WITH SWASH TAIL;Lu;0;L;;;;;N;;;;0240; => C9 80
2309 }
2310 #$B2,#$B5: new_c3 := chr(ord(c3)+1);
2311 #$BE,#$BF:
2312 begin
2313 inc(InStr, 3);
2314 case c3 of
2315 #$BE: OutStr^ := #$C8;
2316 #$BF: OutStr^ := #$C9;
2317 end;
2318 OutStr^ := #$C8;
2319 inc(OutStr);
2320 case c3 of
2321 #$BE: OutStr^ := #$BF;
2322 #$BF: OutStr^ := #$80;
2323 end;
2324 inc(OutStr);
2325 inc(CounterDiff, 1);
2326 Continue;
2327 end;
2328 end;
2329 end
2330 {
2331 2C80;COPTIC CAPITAL LETTER ALFA;Lu;0;L;;;;;N;;;;2C81; E2 B2 80 => E2 B2 81
2332 ...
2333 2CBE;COPTIC CAPITAL LETTER OLD COPTIC OOU;Lu;0;L;;;;;N;;;;2CBF; E2 B2 BE => E2 B2 BF
2334 2CBF;COPTIC SMALL LETTER OLD COPTIC OOU;Ll;0;L;;;;;N;;;2CBE;;2CBE
2335 ...
2336 2CC0;COPTIC CAPITAL LETTER SAMPI;Lu;0;L;;;;;N;;;;2CC1; E2 B3 80 => E2 B2 81
2337 2CC1;COPTIC SMALL LETTER SAMPI;Ll;0;L;;;;;N;;;2CC0;;2CC0
2338 ...
2339 2CE2;COPTIC CAPITAL LETTER OLD NUBIAN WAU;Lu;0;L;;;;;N;;;;2CE3; E2 B3 A2 => E2 B3 A3
2340 2CE3;COPTIC SMALL LETTER OLD NUBIAN WAU;Ll;0;L;;;;;N;;;2CE2;;2CE2 <=
2341 }
2342 else if (c2 = #$B2) then
2343 begin
2344 if ord(c3) mod 2 = 0 then new_c3 := chr(ord(c3) + 1);
2345 end
2346 else if (c2 = #$B3) and (c3 in [#$80..#$A3]) then
2347 begin
2348 if ord(c3) mod 2 = 0 then new_c3 := chr(ord(c3) + 1);
2349 end;
2350
2351 if (CounterDiff <> 0) then
2352 begin
2353 OutStr^ := new_c1;
2354 OutStr[1] := new_c2;
2355 OutStr[2] := new_c3;
2356 end
2357 else
2358 begin
2359 if c1 <> new_c1 then OutStr^ := new_c1;
2360 if c2 <> new_c2 then OutStr[1] := new_c2;
2361 if c3 <> new_c3 then OutStr[2] := new_c3;
2362 end;
2363
2364 inc(InStr, 3);
2365 inc(OutStr, 3);
2366 end;
2367 {
2368 FF21;FULLWIDTH LATIN CAPITAL LETTER A;Lu;0;L;<wide> 0041;;;;N;;;;FF41; EF BC A1 => EF BD 81
2369 ...
2370 FF3A;FULLWIDTH LATIN CAPITAL LETTER Z;Lu;0;L;<wide> 005A;;;;N;;;;FF5A; EF BC BA => EF BD 9A
2371 }
2372 #$EF:
2373 begin
2374 c2 := InStr[1];
2375 c3 := InStr[2];
2376
2377 if (c2 = #$BC) and (c3 in [#$A1..#$BA]) then
2378 begin
2379 OutStr^ := c1;
2380 OutStr[1] := #$BD;
2381 OutStr[2] := chr(ord(c3) - $20);
2382 end;
2383
2384 if (CounterDiff <> 0) then
2385 begin
2386 OutStr^ := c1;
2387 OutStr[1] := c2;
2388 OutStr[2] := c3;
2389 end;
2390
2391 inc(InStr, 3);
2392 inc(OutStr, 3);
2393 end;
2394 else
2395 // Copy the character if the string was disaligned by previous changes
2396 if (CounterDiff <> 0) then OutStr^:= c1;
2397 inc(InStr);
2398 inc(OutStr);
2399 end; // Case InStr^
2400 end; // while
2401
2402 // Final correction of the buffer size
2403 SetLength(Result,OutStr - PChar(Result));
2404 end;
2405
UTF8LowerStringnull2406 function UTF8LowerString(const s: string): string;
2407 begin
2408 Result:=UTF8LowerCase(s);
2409 end;
2410
2411
2412 {
2413 AInStr - The input string
2414 ALanguage - The language. Use '' for maximum speed if one desires to ignore the language
2415 The language should be specified in the format from ISO 639-1,
2416 which uses 2 characters to represent each language.
2417 If the language has no code in ISO 639-1, then the 3-chars code
2418 from ISO 639-2 should be used.
2419 Example: "tr" - Turkish language locale
2420
2421 Data from here: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
2422
2423 The columns in the file UnicodeData.txt are explained here:
2424 http://www.ksu.ru/eng/departments/ktk/test/perl/lib/unicode/UCDFF301.html#Case Mappings
2425 }
UTF8UpperCasenull2426 function UTF8UpperCase(const AInStr: string; ALanguage: string=''): string;
2427 var
2428 i, InCounter, OutCounter: PtrInt;
2429 OutStr: PChar;
2430 CharLen: integer;
2431 CharProcessed: Boolean;
2432 NewCharLen: integer;
2433 NewChar, OldChar: Word;
2434 // Language identification
2435 IsTurkish: Boolean;
2436
2437 procedure CorrectOutStrSize(AOldCharSize, ANewCharSize: Integer);
2438 begin
2439 if not (ANewCharSize > AOldCharSize) then Exit; // no correction needed
2440 if (ANewCharSize > 20) or (AOldCharSize > 20) then Exit; // sanity check
2441 // Fix for bug 23428
2442 // If the string wasn't decreased by previous char changes,
2443 // and our current operation will make it bigger, then for safety
2444 // increase the buffer
2445 if (ANewCharSize > AOldCharSize) and (OutCounter >= InCounter-1) then
2446 begin
2447 SetLength(Result, Length(Result)+ANewCharSize-AOldCharSize);
2448 OutStr := PChar(Result);
2449 end;
2450 end;
2451
2452 begin
2453 // Start with the same string, and progressively modify
2454 Result:=AInStr;
2455 UniqueString(Result);
2456 OutStr := PChar(Result);
2457
2458 // Language identification
2459 IsTurkish := (ALanguage = 'tr') or (ALanguage = 'az'); // Turkish and Azeri have a special handling
2460
2461 InCounter:=1; // for AInStr
2462 OutCounter := 0; // for Result
2463 while InCounter<=length(AInStr) do
2464 begin
2465 { First ASCII chars }
2466 if (AInStr[InCounter] <= 'z') and (AInStr[InCounter] >= 'a') then
2467 begin
2468 // Special turkish handling
2469 // small dotted i to capital dotted i
2470 if IsTurkish and (AInStr[InCounter] = 'i') then
2471 begin
2472 SetLength(Result,Length(Result)+1);// Increase the buffer
2473 OutStr := PChar(Result);
2474 OutStr[OutCounter]:=#$C4;
2475 OutStr[OutCounter+1]:=#$B0;
2476 inc(InCounter);
2477 inc(OutCounter,2);
2478 end
2479 else
2480 begin
2481 OutStr[OutCounter]:=chr(ord(AInStr[InCounter])-32);
2482 inc(InCounter);
2483 inc(OutCounter);
2484 end;
2485 end
2486 { Now everything else }
2487 else
2488 begin
2489 CharLen := UTF8CodepointSize(@AInStr[InCounter]);
2490 CharProcessed := False;
2491 NewCharLen := CharLen;
2492
2493 if CharLen = 2 then
2494 begin
2495 OldChar := (Ord(AInStr[InCounter]) shl 8) or Ord(AInStr[InCounter+1]);
2496 NewChar := 0;
2497
2498 // Major processing
2499 case OldChar of
2500 // Latin Characters 0000–0FFF http://en.wikibooks.org/wiki/Unicode/Character_reference/0000-0FFF
2501 $C39F: NewChar := $5353; // ß => SS
2502 $C3A0..$C3B6,$C3B8..$C3BE: NewChar := OldChar - $20;
2503 $C3BF: NewChar := $C5B8; // ÿ
2504 $C481..$C4B0: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
2505 // 0130 = C4 B0
2506 // turkish small undotted i to capital undotted i
2507 $C4B1:
2508 begin
2509 OutStr[OutCounter]:='I';
2510 NewCharLen := 1;
2511 CharProcessed := True;
2512 end;
2513 $C4B2..$C4B7: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
2514 // $C4B8: ĸ without upper/lower
2515 $C4B9..$C4BF: if OldChar mod 2 = 0 then NewChar := OldChar - 1;
2516 $C580: NewChar := $C4BF; // border between bytes
2517 $C581..$C588: if OldChar mod 2 = 0 then NewChar := OldChar - 1;
2518 // $C589 ʼn => ?
2519 $C58A..$C5B7: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
2520 // $C5B8: // Ÿ already uppercase
2521 $C5B9..$C5BE: if OldChar mod 2 = 0 then NewChar := OldChar - 1;
2522 $C5BF: // 017F
2523 begin
2524 OutStr[OutCounter]:='S';
2525 NewCharLen := 1;
2526 CharProcessed := True;
2527 end;
2528 // 0180 = C6 80 -> A convoluted part
2529 $C680: NewChar := $C983;
2530 $C682..$C685: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
2531 $C688: NewChar := $C687;
2532 $C68C: NewChar := $C68B;
2533 // 0190 = C6 90 -> A convoluted part
2534 $C692: NewChar := $C691;
2535 $C695: NewChar := $C7B6;
2536 $C699: NewChar := $C698;
2537 $C69A: NewChar := $C8BD;
2538 $C69E: NewChar := $C8A0;
2539 // 01A0 = C6 A0 -> A convoluted part
2540 $C6A0..$C6A5: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
2541 $C6A8: NewChar := $C6A7;
2542 $C6AD: NewChar := $C6AC;
2543 // 01B0 = C6 B0
2544 $C6B0: NewChar := $C6AF;
2545 $C6B3..$C6B6: if OldChar mod 2 = 0 then NewChar := OldChar - 1;
2546 $C6B9: NewChar := $C6B8;
2547 $C6BD: NewChar := $C6BC;
2548 $C6BF: NewChar := $C7B7;
2549 // 01C0 = C7 80
2550 $C784..$C786: NewChar := $C784;
2551 $C787..$C789: NewChar := $C787;
2552 $C78A..$C78C: NewChar := $C78A;
2553 $C78E: NewChar := $C78D;
2554 // 01D0 = C7 90
2555 $C790: NewChar := $C78F;
2556 $C791..$C79C: if OldChar mod 2 = 0 then NewChar := OldChar - 1;
2557 $C79D: NewChar := $C68E;
2558 $C79F: NewChar := $C79E;
2559 // 01E0 = C7 A0
2560 $C7A0..$C7AF: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
2561 // 01F0 = C7 B0
2562 $C7B2..$C7B3: NewChar := $C7B1;
2563 $C7B5: NewChar := $C7B4;
2564 $C7B8..$C7BF: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
2565 // 0200 = C8 80
2566 // 0210 = C8 90
2567 $C880..$C89F: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
2568 // 0220 = C8 A0
2569 // 0230 = C8 B0
2570 $C8A2..$C8B3: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
2571 $C8BC: NewChar := $C8BB;
2572 $C8BF:
2573 begin
2574 CorrectOutStrSize(2, 3);
2575 OutStr[OutCounter] := #$E2;
2576 OutStr[OutCounter+1]:= #$B1;
2577 OutStr[OutCounter+2]:= #$BE;
2578 NewCharLen := 3;
2579 CharProcessed := True;
2580 end;
2581 // 0240 = C9 80
2582 $C980:
2583 begin
2584 CorrectOutStrSize(2, 3);
2585 OutStr[OutCounter] := #$E2;
2586 OutStr[OutCounter+1]:= #$B1;
2587 OutStr[OutCounter+2]:= #$BF;
2588 NewCharLen := 3;
2589 CharProcessed := True;
2590 end;
2591 $C982: NewChar := $C981;
2592 $C986..$C98F: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
2593 // 0250 = C9 90
2594 $C990:
2595 begin
2596 CorrectOutStrSize(2, 3);
2597 OutStr[OutCounter] := #$E2;
2598 OutStr[OutCounter+1]:= #$B1;
2599 OutStr[OutCounter+2]:= #$AF;
2600 NewCharLen := 3;
2601 CharProcessed := True;
2602 end;
2603 $C991:
2604 begin
2605 CorrectOutStrSize(2, 3);
2606 OutStr[OutCounter] := #$E2;
2607 OutStr[OutCounter+1]:= #$B1;
2608 OutStr[OutCounter+2]:= #$AD;
2609 NewCharLen := 3;
2610 CharProcessed := True;
2611 end;
2612 $C992:
2613 begin
2614 CorrectOutStrSize(2, 3);
2615 OutStr[OutCounter] := #$E2;
2616 OutStr[OutCounter+1]:= #$B1;
2617 OutStr[OutCounter+2]:= #$B0;
2618 NewCharLen := 3;
2619 CharProcessed := True;
2620 end;
2621 $C993: NewChar := $C681;
2622 $C994: NewChar := $C686;
2623 $C996: NewChar := $C689;
2624 $C997: NewChar := $C68A;
2625 $C999: NewChar := $C68F;
2626 $C99B: NewChar := $C690;
2627 // 0260 = C9 A0
2628 $C9A0: NewChar := $C693;
2629 $C9A3: NewChar := $C694;
2630 $C9A5:
2631 begin
2632 CorrectOutStrSize(2, 3);
2633 OutStr[OutCounter] := #$EA;
2634 OutStr[OutCounter+1]:= #$9E;
2635 OutStr[OutCounter+2]:= #$8D;
2636 NewCharLen := 3;
2637 CharProcessed := True;
2638 end;
2639 $C9A8: NewChar := $C697;
2640 $C9A9: NewChar := $C696;
2641 $C9AB:
2642 begin
2643 CorrectOutStrSize(2, 3);
2644 OutStr[OutCounter] := #$E2;
2645 OutStr[OutCounter+1]:= #$B1;
2646 OutStr[OutCounter+2]:= #$A2;
2647 NewCharLen := 3;
2648 CharProcessed := True;
2649 end;
2650 $C9AF: NewChar := $C69C;
2651 // 0270 = C9 B0
2652 $C9B1:
2653 begin
2654 CorrectOutStrSize(2, 3);
2655 OutStr[OutCounter] := #$E2;
2656 OutStr[OutCounter+1]:= #$B1;
2657 OutStr[OutCounter+2]:= #$AE;
2658 NewCharLen := 3;
2659 CharProcessed := True;
2660 end;
2661 $C9B2: NewChar := $C69D;
2662 $C9B5: NewChar := $C69F;
2663 $C9BD:
2664 begin
2665 CorrectOutStrSize(2, 3);
2666 OutStr[OutCounter] := #$E2;
2667 OutStr[OutCounter+1]:= #$B1;
2668 OutStr[OutCounter+2]:= #$A4;
2669 NewCharLen := 3;
2670 CharProcessed := True;
2671 end;
2672 // 0280 = CA 80
2673 $CA80: NewChar := $C6A6;
2674 $CA83: NewChar := $C6A9;
2675 $CA88: NewChar := $C6AE;
2676 $CA89: NewChar := $C984;
2677 $CA8A: NewChar := $C6B1;
2678 $CA8B: NewChar := $C6B2;
2679 $CA8C: NewChar := $C985;
2680 // 0290 = CA 90
2681 $CA92: NewChar := $C6B7;
2682 {
2683 03A0 = CE A0
2684
2685 03AC;GREEK SMALL LETTER ALPHA WITH TONOS;Ll;0;L;03B1 0301;;;;N;GREEK SMALL LETTER ALPHA TONOS;;0386;;0386
2686 03AD;GREEK SMALL LETTER EPSILON WITH TONOS;Ll;0;L;03B5 0301;;;;N;GREEK SMALL LETTER EPSILON TONOS;;0388;;0388
2687 03AE;GREEK SMALL LETTER ETA WITH TONOS;Ll;0;L;03B7 0301;;;;N;GREEK SMALL LETTER ETA TONOS;;0389;;0389
2688 03AF;GREEK SMALL LETTER IOTA WITH TONOS;Ll;0;L;03B9 0301;;;;N;GREEK SMALL LETTER IOTA TONOS;;038A;;038A
2689 }
2690 $CEAC: NewChar := $CE86;
2691 $CEAD: NewChar := $CE88;
2692 $CEAE: NewChar := $CE89;
2693 $CEAF: NewChar := $CE8A;
2694 {
2695 03B0 = CE B0
2696
2697 03B0;GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS;Ll;0;L;03CB 0301;;;;N;GREEK SMALL LETTER UPSILON DIAERESIS TONOS;;;;
2698 03B1;GREEK SMALL LETTER ALPHA;Ll;0;L;;;;;N;;;0391;;0391
2699 ...
2700 03BF;GREEK SMALL LETTER OMICRON;Ll;0;L;;;;;N;;;039F;;039F
2701 }
2702 $CEB1..$CEBF: NewChar := OldChar - $20; // Greek Characters
2703 {
2704 03C0 = CF 80
2705
2706 03C0;GREEK SMALL LETTER PI;Ll;0;L;;;;;N;;;03A0;;03A0 CF 80 => CE A0
2707 03C1;GREEK SMALL LETTER RHO;Ll;0;L;;;;;N;;;03A1;;03A1
2708 03C2;GREEK SMALL LETTER FINAL SIGMA;Ll;0;L;;;;;N;;;03A3;;03A3
2709 03C3;GREEK SMALL LETTER SIGMA;Ll;0;L;;;;;N;;;03A3;;03A3
2710 03C4;GREEK SMALL LETTER TAU;Ll;0;L;;;;;N;;;03A4;;03A4
2711 ....
2712 03CB;GREEK SMALL LETTER UPSILON WITH DIALYTIKA;Ll;0;L;03C5 0308;;;;N;GREEK SMALL LETTER UPSILON DIAERESIS;;03AB;;03AB
2713 03CC;GREEK SMALL LETTER OMICRON WITH TONOS;Ll;0;L;03BF 0301;;;;N;GREEK SMALL LETTER OMICRON TONOS;;038C;;038C
2714 03CD;GREEK SMALL LETTER UPSILON WITH TONOS;Ll;0;L;03C5 0301;;;;N;GREEK SMALL LETTER UPSILON TONOS;;038E;;038E
2715 03CE;GREEK SMALL LETTER OMEGA WITH TONOS;Ll;0;L;03C9 0301;;;;N;GREEK SMALL LETTER OMEGA TONOS;;038F;;038F
2716 03CF;GREEK CAPITAL KAI SYMBOL;Lu;0;L;;;;;N;;;;03D7;
2717 }
2718 $CF80,$CF81,$CF83..$CF8B: NewChar := OldChar - $E0; // Greek Characters
2719 $CF82: NewChar := $CEA3;
2720 $CF8C: NewChar := $CE8C;
2721 $CF8D: NewChar := $CE8E;
2722 $CF8E: NewChar := $CE8F;
2723 {
2724 03D0 = CF 90
2725
2726 03D0;GREEK BETA SYMBOL;Ll;0;L;<compat> 03B2;;;;N;GREEK SMALL LETTER CURLED BETA;;0392;;0392 CF 90 => CE 92
2727 03D1;GREEK THETA SYMBOL;Ll;0;L;<compat> 03B8;;;;N;GREEK SMALL LETTER SCRIPT THETA;;0398;;0398 => CE 98
2728 03D5;GREEK PHI SYMBOL;Ll;0;L;<compat> 03C6;;;;N;GREEK SMALL LETTER SCRIPT PHI;;03A6;;03A6 => CE A6
2729 03D6;GREEK PI SYMBOL;Ll;0;L;<compat> 03C0;;;;N;GREEK SMALL LETTER OMEGA PI;;03A0;;03A0 => CE A0
2730 03D7;GREEK KAI SYMBOL;Ll;0;L;;;;;N;;;03CF;;03CF => CF 8F
2731 03D9;GREEK SMALL LETTER ARCHAIC KOPPA;Ll;0;L;;;;;N;;;03D8;;03D8
2732 03DB;GREEK SMALL LETTER STIGMA;Ll;0;L;;;;;N;;;03DA;;03DA
2733 03DD;GREEK SMALL LETTER DIGAMMA;Ll;0;L;;;;;N;;;03DC;;03DC
2734 03DF;GREEK SMALL LETTER KOPPA;Ll;0;L;;;;;N;;;03DE;;03DE
2735 }
2736 $CF90: NewChar := $CE92;
2737 $CF91: NewChar := $CE98;
2738 $CF95: NewChar := $CEA6;
2739 $CF96: NewChar := $CEA0;
2740 $CF97: NewChar := $CF8F;
2741 $CF99..$CF9F: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
2742 // 03E0 = CF A0
2743 $CFA0..$CFAF: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
2744 {
2745 03F0 = CF B0
2746
2747 03F0;GREEK KAPPA SYMBOL;Ll;0;L;<compat> 03BA;;;;N;GREEK SMALL LETTER SCRIPT KAPPA;;039A;;039A => CE 9A
2748 03F1;GREEK RHO SYMBOL;Ll;0;L;<compat> 03C1;;;;N;GREEK SMALL LETTER TAILED RHO;;03A1;;03A1 => CE A1
2749 03F2;GREEK LUNATE SIGMA SYMBOL;Ll;0;L;<compat> 03C2;;;;N;GREEK SMALL LETTER LUNATE SIGMA;;03F9;;03F9
2750 03F5;GREEK LUNATE EPSILON SYMBOL;Ll;0;L;<compat> 03B5;;;;N;;;0395;;0395 => CE 95
2751 03F8;GREEK SMALL LETTER SHO;Ll;0;L;;;;;N;;;03F7;;03F7
2752 03FB;GREEK SMALL LETTER SAN;Ll;0;L;;;;;N;;;03FA;;03FA
2753 }
2754 $CFB0: NewChar := $CE9A;
2755 $CFB1: NewChar := $CEA1;
2756 $CFB2: NewChar := $CFB9;
2757 $CFB5: NewChar := $CE95;
2758 $CFB8: NewChar := $CFB7;
2759 $CFBB: NewChar := $CFBA;
2760 // 0400 = D0 80 ... 042F everything already uppercase
2761 // 0430 = D0 B0
2762 $D0B0..$D0BF: NewChar := OldChar - $20; // Cyrillic alphabet
2763 // 0440 = D1 80
2764 $D180..$D18F: NewChar := OldChar - $E0; // Cyrillic alphabet
2765 // 0450 = D1 90
2766 $D190..$D19F: NewChar := OldChar - $110; // Cyrillic alphabet
2767 end;
2768
2769 if NewChar <> 0 then
2770 begin
2771 OutStr[OutCounter] := Chr(Hi(NewChar));
2772 OutStr[OutCounter+1]:= Chr(Lo(NewChar));
2773 CharProcessed := True;
2774 end;
2775 end;
2776
2777 // Copy the character if the string was disaligned by previous changed
2778 // and no processing was done in this character
2779 if (InCounter <> OutCounter+1) and (not CharProcessed) then
2780 begin
2781 for i := 0 to CharLen-1 do
2782 OutStr[OutCounter+i] :=AInStr[InCounter+i];
2783 end;
2784
2785 inc(InCounter, CharLen);
2786 inc(OutCounter, NewCharLen);
2787 end;
2788 end; // while
2789
2790 // Final correction of the buffer size
2791 SetLength(Result,OutCounter);
2792 end;
2793
2794 function UTF8UpperString(const s: string): string;
2795 begin
2796 Result:=UTF8UpperCase(s);
2797 end;
2798
2799
2800 function FindInvalidUTF8Codepoint(p: PChar; Count: PtrInt; StopOnNonUTF8: Boolean): PtrInt;
2801 // return -1 if ok
2802 var
2803 CharLen: Integer;
2804 c: Char;
2805 begin
2806 if (p<>nil) then begin
2807 Result:=0;
2808 while Result<Count do begin
2809 c:=p^;
2810 if ord(c)<%10000000 then begin
2811 // regular single byte ASCII character (#0 is a character, this is Pascal ;)
2812 CharLen:=1;
2813 end else if ord(c)<=%11000001 then begin
2814 // single byte character, between valid UTF-8 encodings
2815 // %11000000 and %11000001 map 2 byte to #0..#128, which is invalid and used for XSS attacks
2816 if StopOnNonUTF8 or (ord(c)>=192) then
2817 exit;
2818 CharLen:=1;
2819 end else if ord(c)<=%11011111 then begin
2820 // could be 2 byte character (%110xxxxx %10xxxxxx)
2821 if (Result<Count-1)
2822 and ((ord(p[1]) and %11000000) = %10000000) then
2823 CharLen:=2
2824 else
2825 exit; // missing following bytes
2826 end
2827 else if ord(c)<=%11101111 then begin
2828 // could be 3 byte character (%1110xxxx %10xxxxxx %10xxxxxx)
2829 if (Result<Count-2)
2830 and ((ord(p[1]) and %11000000) = %10000000)
2831 and ((ord(p[2]) and %11000000) = %10000000) then begin
2832 if (ord(c)=%11100000) and (ord(p[1])<=%10011111) then
2833 exit; // XSS attack: 3 bytes are mapped to the 1 or 2 byte codes
2834 CharLen:=3;
2835 end else
2836 exit; // missing following bytes
2837 end
2838 else if ord(c)<=%11110111 then begin
2839 // could be 4 byte character (%11110xxx %10xxxxxx %10xxxxxx %10xxxxxx)
2840 if (Result<Count-3)
2841 and ((ord(p[1]) and %11000000) = %10000000)
2842 and ((ord(p[2]) and %11000000) = %10000000)
2843 and ((ord(p[3]) and %11000000) = %10000000) then begin
2844 if (ord(c)=%11110000) and (ord(p[1])<=%10001111) then
2845 exit; // XSS attack: 4 bytes are mapped to the 1-3 byte codes
2846 CharLen:=4;
2847 end else
2848 exit; // missing following bytes
2849 end
2850 else begin
2851 if StopOnNonUTF8 then
2852 exit;
2853 CharLen:=1;
2854 end;
2855 inc(Result,CharLen);
2856 inc(p,CharLen);
2857 if Result>Count then begin
2858 dec(Result,CharLen);
2859 exit; // missing following bytes
2860 end;
2861 end;
2862 end;
2863 // ok
2864 Result:=-1;
2865 end;
2866
2867 function FindInvalidUTF8Character(p: PChar; Count: PtrInt; StopOnNonUTF8: Boolean = true): PtrInt;
2868 begin
2869 Result := FindInvalidUTF8Codepoint(p, Count, StopOnNonUTF8);
2870 end;
2871
2872 {
2873 Translates escape characters inside an UTF8 encoded string into
2874 human readable format.
2875 Mainly used for logging purposes.
2876 Parameters:
2877 S : Input string. Must be UTF8 encoded.
2878 EscapeMode: controls the human readable format for escape characters.
2879 }
2880 function Utf8EscapeControlChars(S: String; EscapeMode: TEscapeMode = emPascal): String;
2881 const
2882 //lookuptables are about 1.8 to 1.3 times faster than a function using IntToStr or IntToHex
2883 PascalEscapeStrings: Array[#0..#31] of string = (
2884 '#00', '#01', '#02', '#03', '#04', '#05', '#06', '#07',
2885 '#08', '#09', '#10', '#11', '#12', '#13', '#14', '#15',
2886 '#16', '#17', '#18', '#19', '#20', '#21', '#22', '#23',
2887 '#24', '#25', '#26', '#27', '#28', '#29', '#30', '#31');
2888 CEscapeStrings: Array[#0..#31] of string = (
2889 '\0' , '\0x01', '\0x02', '\0x03', '\0x04', '\0x05', '\0x06', '\0x07',
2890 '\0x08', '\t' , '\r' , '\0x0B', '\0x0C', '\n' , '\0x0E', '\0x0F',
2891 '\0x10', '\0x11', '\0x12', '\0x13', '\0x14', '\0x15', '\0x16', '\0x17',
2892 '\0x18', '\0x19', '\0x1A', '\0x1B', '\0x1C', '\0x1D', '\0x1E', '\0x1F');
2893 HexEscapeCStrings: Array[#0..#31] of string = (
2894 '\0x00', '\0x01', '\0x02', '\0x03', '\0x04', '\0x05', '\0x06', '\0x07',
2895 '\0x08', '\0x09', '\0x0A', '\0x0B', '\0x0C', '\0x0D', '\0x0E', '\0x0F',
2896 '\0x10', '\0x11', '\0x12', '\0x13', '\0x14', '\0x15', '\0x16', '\0x17',
2897 '\0x18', '\0x19', '\0x1A', '\0x1B', '\0x1C', '\0x1D', '\0x1E', '\0x1F');
2898 HexEscapePascalStrings: Array[#0..#31] of string = (
2899 '#$00', '#$01', '#$02', '#$03', '#$04', '#$05', '#$06', '#$07',
2900 '#$08', '#$09', '#$0A', '#$0B', '#$0C', '#$0D', '#$0E', '#$0F',
2901 '#$10', '#$11', '#$12', '#$13', '#$14', '#$15', '#$16', '#$17',
2902 '#$18', '#$19', '#$1A', '#$1B', '#$1C', '#$1D', '#$1E', '#$1F');
2903 AsciiControlStrings: Array[#0..#31] of string = (
2904 '[NUL]', '[SOH]', '[STX]', '[ETX]', '[EOT]', '[ENQ]', '[ACK]', '[BEL]',
2905 '[BS]' , '[HT]' , '[LF]' , '[VT]' , '[FF]' , '[CR]' , '[SO]' , '[SI]' ,
2906 '[DLE]', '[DC1]', '[DC2]', '[DC3]', '[DC4]', '[NAK]', '[SYN]', '[ETB]',
2907 '[CAN]', '[EM]' , '[SUB]', '[ESC]', '[FS]' , '[GS]' , '[RS]' , '[US]');
2908 var
2909 Ch: Char;
2910 i: Integer;
2911 begin
2912 if FindInvalidUTF8Codepoint(PChar(S), Length(S)) <> -1 then
2913 begin
2914 UTF8FixBroken(S);
2915 end;
2916 Result := '';
2917 //a byte < 127 cannot be part of a multi-byte codepoint, so this is safe
2918 for i := 1 to Length(S) do
2919 begin
2920 Ch := S[i];
2921 if (Ch < #32) then
2922 begin
2923 case EscapeMode of
2924 emPascal: Result := Result + PascalEscapeStrings[Ch];
2925 emHexPascal: Result := Result + HexEscapePascalStrings[Ch];
2926 emHexC: Result := Result + HexEscapeCStrings[Ch];
2927 emC: Result := Result + CEscapeStrings[Ch];
2928 emAsciiControlNames: Result := Result + AsciiControlStrings[Ch];
2929 end;//case
2930 end
2931 else
2932 Result := Result + Ch;
2933 end;
2934 end;
2935
2936 function UTF8StringOfChar(AUtf8Char: String; N: Integer): String;
2937 var
2938 UCharLen, i: Integer;
2939 C1, C2, C3: Char;
2940 PC: PChar;
2941 begin
2942 Result := '';
2943 if (N <= 0) or (Utf8Length(AUtf8Char) <> 1) then Exit;
2944 UCharLen := Length(AUtf8Char);
2945 Case UCharLen of
2946 1: Result := StringOfChar(AUtf8Char[1], N);
2947 2:
2948 begin
2949 SetLength(Result, 2 * N);
2950 System.FillWord(Result[1], N, PWord(Pointer(AUtf8Char))^);
2951 end;
2952 3:
2953 begin
2954 SetLength(Result, 3 * N);
2955 C1 := AUtf8Char[1];
2956 C2 := AUtf8Char[2];
2957 C3 := AUtf8Char[3];
2958 PC := PChar(Result);
2959 for i:=1 to N do
2960 begin
2961 PC[0] := C1;
2962 PC[1] := C2;
2963 PC[2] := C3;
2964 inc(PC,3);
2965 end;
2966 end;
2967 4:
2968 begin
2969 SetLength(Result, 4 * N);
2970 System.FillDWord(Result[1], N, PDWord(Pointer(AUtf8Char))^);
2971 end;
2972 else
2973 begin
2974 //In November 2003 UTF-8 was restricted by RFC 3629 to four bytes to match
2975 //the constraints of the UTF-16 character encoding.
2976 //http://en.wikipedia.org/wiki/UTF-8
2977 Result := StringOfChar('?', N);
2978 end;
2979 end;
2980 end;
2981
2982 function UTF8AddChar(AUtf8Char: String; const S: String; N: Integer): String;
2983 var
2984 L : Integer;
2985 begin
2986 Result := S;
2987 if Utf8Length(AUtf8Char) <> 1 then Exit;
2988 L := Utf8Length(Result);
2989 if L < N then
2990 Result := Utf8StringOfChar(AUtf8Char, N-l) + Result;
2991 end;
2992
2993 function UTF8AddCharR(AUtf8Char: String; const S: String; N: Integer): String;
2994 var
2995 L : Integer;
2996 begin
2997 Result := S;
2998 if Utf8Length(AUtf8Char) <> 1 then Exit;
2999 L := Utf8Length(Result);
3000 if L < N then
3001 Result := Result + Utf8StringOfChar(AUtf8Char, N-l);
3002 end;
3003
3004 function UTF8PadLeft(const S: String; const N: Integer; const AUtf8Char: String = #32): String;
3005 begin
3006 Result := Utf8AddChar(AUtf8Char, S, N);
3007 end;
3008
3009 function UTF8PadRight(const S: String; const N: Integer; const AUtf8Char: String = #32): String;
3010 begin
3011 Result := Utf8AddCharR(AUtf8Char, S, N);
3012 end;
3013
3014 function UTF8PadCenter(const S: String; const N: Integer; const AUtf8Char: String = #32): String;
3015 var
3016 ULen: PtrInt;
3017 begin
3018 ULen := Utf8Length(S);
3019 if ULen < N then
3020 begin
3021 Result := Utf8StringOfChar(AUtf8Char,(N div 2) - (ULen div 2)) + S;
3022 Result := Result + Utf8StringOfChar(AUtf8Char, N - Utf8Length(Result));
3023 end
3024 else
3025 Result := S;
3026 end;
3027
3028 function UTF8LeftStr(const AText: String; const ACount: Integer): String;
3029 begin
3030 Result := Utf8Copy(AText,1,ACount);
3031 end;
3032
3033 function UTF8RightStr(const AText: String; const ACount: Integer): String;
3034 var
3035 j,l:integer;
3036 begin
3037 l := Utf8Length(AText);
3038 j := ACount;
3039 if (j > l) then j := l;
3040 Result := Utf8Copy(AText,l-j+1,j);
3041 end;
3042
3043 function UTF8QuotedStr(const S, Quote: string): string;
3044 // replace all Quote in S with double Quote and enclose the result in Quote.
3045 var
3046 QuoteC: Char;
3047 p, QuoteP, CopyPos: PChar;
3048 QuoteLen: SizeInt;
3049 begin
3050 Result:=Quote;
3051 p:=PChar(S);
3052 CopyPos:=p;
3053 QuoteC:=Quote[1];
3054 QuoteP:=PChar(Quote);
3055 QuoteLen:=length(Quote);
3056 repeat
3057 if (p^=#0) and (p-PChar(S)=length(S)) then
3058 break;
3059 if (p^=QuoteC) and CompareMem(p,QuoteP,QuoteLen) then begin
3060 inc(p,QuoteLen);
3061 Result+=copy(S,CopyPos-PChar(S)+1,p-CopyPos)+Quote;
3062 CopyPos:=p;
3063 end else
3064 inc(p);
3065 until false;
3066 Result+=copy(S,CopyPos-PChar(S)+1,p-CopyPos)+Quote;
3067 end;
3068
3069 function UTF8StartsText(const ASubText, AText: string): Boolean;
3070 var
3071 TextLen, SubTextLen: PtrInt;
3072 begin
3073 Result := False;
3074 if (ASubText <> '') then
3075 begin
3076 TextLen := Utf8Length(AText);
3077 SubTextLen := Utf8Length(ASubText);
3078 if (TextLen >= SubTextLen) then
3079 Result := (Utf8CompareText(Utf8Copy(AText,1,SubTextLen),ASubText) = 0);
3080 end;
3081 end;
3082
3083 function UTF8EndsText(const ASubText, AText: string): Boolean;
3084 var
3085 TextLen, SubTextLen: PtrInt;
3086 begin
3087 Result := False;
3088 if (ASubText <> '') then
3089 begin
3090 TextLen := Utf8Length(AText);
3091 SubTextLen := Utf8Length(ASubText);
3092 if (TextLen >= SubTextLen) then
3093 Result := (Utf8CompareText(Utf8Copy(AText,TextLen-SubTextLen+1,SubTextLen),ASubText) = 0);
3094 end;
3095 end;
3096
3097 function UTF8ReverseString(p: PChar; const ByteCount: LongInt): string;
3098 var
3099 CharLen, rBytePos: LongInt;
3100 begin
3101 SetLength(Result, ByteCount);
3102 rBytePos := ByteCount + 1;
3103 while (rBytePos > 1) do
3104 begin
3105 CharLen:=UTF8CodepointSize(p);
3106 Dec(rBytePos, CharLen);
3107 System.Move(p^, Result[rBytePos], CharLen);
3108 Inc(p, CharLen);
3109 end;
3110 end;
3111
3112 function UTF8ReverseString(const AText: string): string; inline;
3113 begin
3114 Result := UTF8ReverseString(PChar(AText), length(AText));
3115 end;
3116
3117 function UTF8RPos(const Substr, Source: string): PtrInt;
3118 var
3119 pRev: PtrInt;
3120 begin
3121 pRev := RPos(Substr, Source); // Scan from the end.
3122 Result := UTF8Length(PChar(Source), pRev); // Length of the leading part.
3123 end;
3124
3125 function UTF8WrapText(S, BreakStr: string; BreakChars: TSysCharSet; MaxCol: integer): string;
3126 var
3127 P : PChar;
3128 RightSpace : integer = 0;
3129 N : integer = 0;
3130 Len : integer = 0;
3131 i, j : integer;
3132 CharLen, ResultLen, RP : integer;
3133 begin
3134 Result := '';
3135 if (S = '') or (MaxCol = 0) or (BreakStr = '') or (BreakChars = []) then Exit;
3136 P := PChar(S);
3137 while P^ <> #0 do
3138 begin
3139 CharLen := UTF8CodepointSize(P);
3140 i := 1;
3141 j := 0;
3142 ResultLen := Length(Result);
3143 SetLength(Result, ResultLen + CharLen);
3144 while i <= CharLen do
3145 begin
3146 Result[ResultLen + i] := (P + J)^;
3147 Inc(i);
3148 Inc(j);
3149 end;
3150 Inc(N);
3151 if P^ = BreakStr[Length(BreakStr)] then
3152 N := 0;
3153 if N > MaxCol then
3154 begin
3155 Len := Length(Result);
3156 RP := Len;
3157 while not (Result[RP] in BreakChars) do
3158 Dec(RP);
3159 RightSpace := Len - RP;
3160 if (RightSpace > 0) and (RightSpace < MaxCol) then
3161 begin
3162 Dec(P, RightSpace);
3163 SetLength(Result, Len - RightSpace);
3164 end;
3165 Result := Result + BreakStr;
3166 N := 0;
3167 end;
3168 Inc(P, CharLen);
3169 end;
3170 end;
3171
3172 function UTF8WrapText(S: string; MaxCol: integer): string;
3173 begin
3174 Result := UTF8WrapText(S, LineEnding, [' ', '-', #9], MaxCol);
3175 end;
3176
3177 function UTF8Trim(const s: string; Flags: TUTF8TrimFlags): string;
3178 var
3179 p: PChar;
3180 u: Cardinal;
3181 StartP: PtrUInt;
3182 l: Integer;
3183 KeepAllNonASCII: boolean;
3184 begin
3185 Result:=s;
3186 if Result='' then exit;
3187 KeepAllNonASCII:=[u8tKeepControlCodes,u8tKeepNoBreakSpaces]*Flags=[u8tKeepControlCodes,u8tKeepNoBreakSpaces];
3188 if not (u8tKeepStart in Flags) then begin
3189 // trim start
3190 p:=PChar(Result);
3191 repeat
3192 l:=1;
3193 case p^ of
3194 #0:
3195 if p-PChar(Result)=length(Result) then
3196 begin
3197 // everything was trimmed
3198 exit('')
3199 end else if u8tKeepControlCodes in Flags then
3200 break;
3201 ' ': ;
3202 #10,#13:
3203 if u8tKeepLineBreaks in Flags then
3204 break;
3205 #9:
3206 if u8tKeepTabs in Flags then
3207 break;
3208 #1..#8,#11,#12,#14..#31,#127:
3209 if u8tKeepControlCodes in Flags then
3210 break;
3211 #128..#255:
3212 begin
3213 if KeepAllNonASCII then break;
3214 u:=UTF8CodepointToUnicode(p,l);
3215 if (l<=1) then break; // invalid character
3216 case u of
3217 128..159, // C1 set of control codes
3218 8206, 8207: // left-to-right, right-to-left mark
3219 if u8tKeepControlCodes in Flags then break;
3220 160, // no break space
3221 $2007, // figure space
3222 $2026, // narrow no-break space
3223 $FEFF: // zero with no-break space
3224 if u8tKeepNoBreakSpaces in Flags then break;
3225 else
3226 break;
3227 end;
3228 end;
3229 else
3230 break;
3231 end;
3232 inc(p,l);
3233 until false;
3234 if p>PChar(Result) then begin
3235 Result:=copy(Result,p-PChar(Result)+1,length(Result));
3236 if Result='' then exit;
3237 end;
3238 end;
3239
3240 if not (u8tKeepEnd in Flags) then begin
3241 // trim end
3242 p:=@Result[length(Result)];
3243 repeat
3244 case p^ of
3245 #0:
3246 if u8tKeepControlCodes in Flags then
3247 break;
3248 ' ': ;
3249 #10,#13:
3250 if u8tKeepLineBreaks in Flags then
3251 break;
3252 #9:
3253 if u8tKeepTabs in Flags then
3254 break;
3255 #1..#8,#11,#12,#14..#31,#127:
3256 if u8tKeepControlCodes in Flags then
3257 break;
3258 #128..#255:
3259 begin
3260 if KeepAllNonASCII then break;
3261 StartP:=UTF8FindNearestCharStart(PChar(Result),length(Result),p-PChar(Result));
3262 u:=UTF8CodepointToUnicode(PChar(Result)+StartP,l);
3263 if (l<=1) then break; // invalid character
3264 case u of
3265 128..159, // C1 set of control codes
3266 8206, 8207: // left-to-right, right-to-left mark
3267 if u8tKeepControlCodes in Flags then break;
3268 160, // no break space
3269 $2007, // figure space
3270 $2026, // narrow no-break space
3271 $FEFF: // zero with no-break space
3272 if u8tKeepNoBreakSpaces in Flags then break;
3273 else
3274 break;
3275 end;
3276 p:=PChar(Result)+StartP;
3277 end;
3278 else
3279 break;
3280 end;
3281 dec(p);
3282 until p<PChar(Result);
3283 // p is on last good byte
3284 SetLength(Result,p+1-PChar(Result));
3285 end;
3286 end;
3287
3288 {------------------------------------------------------------------------------
3289 Name: UTF8CompareStr
3290 Params: S1, S2 - UTF8 encoded strings
3291 Compares UTF8 encoded strings
3292 Returns
3293 0: if S1 = S2
3294 -1: if S1 < S2 ("alphabetically")
3295 +1: if S1 > S2
3296 -2: if S1 < S2, comparison ended at a different byte in an invalid UTF8 codepoint in either S1 or S2 (byte at S1 > byte at S2)
3297 +2: if S1 > S2, comparison ended at a different byte in an invalid UTF8 codepoint in either S1 or S2
3298
3299 Compare two UTF8 encoded strings, case sensitive.
3300
3301 Internally it uses WideCompareStr on the first Utf8 codepoint that differs between S1 and S2
3302 and therefor has proper colation on platforms where the WidestringManager supports this
3303 (Windows, *nix with cwstring unit)
3304 ------------------------------------------------------------------------------}
3305 function UTF8CompareStr(const S1, S2: string): PtrInt;
3306 begin
3307 Result := UTF8CompareStr(PChar(Pointer(S1)),length(S1),
3308 PChar(Pointer(S2)),length(S2));
3309 end;
3310
3311 function UTF8CompareStrP(S1, S2: PChar): PtrInt;
3312 begin
3313 Result:=UTF8CompareStr(S1,StrLen(S1),S2,StrLen(S2));
3314 end;
3315
3316
3317 function UTF8CompareStr(S1: PChar; Count1: SizeInt; S2: PChar; Count2: SizeInt): PtrInt;
3318 var
3319 Count: SizeInt;
3320 i, CL1, CL2: Integer;
3321 B1, B2: Byte;
3322 W1, W2: WideString;
3323 Org1, Org2: PChar;
3324 begin
3325 Result := 0;
3326 Org1 := S1;
3327 Org2 := S2;
3328 if (Count1 > Count2) then
3329 Count := Count2
3330 else
3331 Count := Count1;
3332
3333 i := 0;
3334 if (Count > 0) then
3335 begin
3336 //unfortunately we cannot use CompareByte here, so we have to iterate ourselves
3337 while (i < Count) do
3338 begin
3339 B1 := byte(S1^);
3340 B2 := byte(S2^);
3341 if (B1 <> B2) then
3342 begin
3343 //writeln('UCS: B1=',IntToHex(B1,2),', B2=',IntToHex(B2,2));
3344 Break;
3345 end;
3346 Inc(S1); Inc(S2); Inc(I);
3347 end;
3348 end;
3349 if (i < Count) then
3350 begin
3351 //Fallback result
3352 Result := B1 - B2;
3353 if (Result < 0) then
3354 Result := -2
3355 else
3356 Result := 2;
3357 //writeln('UCS: FallBack Result = ',Result);
3358 //Try t find start of valid UTF8 codepoints
3359 if (not Utf8TryFindCodepointStart(Org1, S1, CL1)) or
3360 not Utf8TryFindCodepointStart(Org2, S2, CL2) then
3361 Exit;
3362
3363 //writeln('UCS: CL1=',CL1,', CL2=',CL2);
3364 //writeln('S1 = "',S1,'"');
3365 //writeln('S2 = "',S2,'"');
3366 W1 := Utf8ToUtf16(S1, CL1);
3367 W2 := Utf8ToUtf16(S2, CL2);
3368 //writeln('UCS: W1 = ',Word(W1[1]),' W2 = ',Word(W2[1]));
3369 Result := WideCompareStr(W1, W2);
3370 end
3371 else
3372 //Strings are the same up and until size of smallest one
3373 Result := Count1 - Count2;
3374 if (Result > 1) then
3375 Result := 1
3376 else if (Result < -1) then
3377 Result := -1;
3378 end;
3379
3380 {------------------------------------------------------------------------------
3381 Name: UTF8CompareText
3382 Params: S1, S2 - UTF8 encoded strings
3383 Returns: < 0 if S1 < S2, 0 if S1 = S2, > 0 if S1 > S2.
3384 Compare two UTF8 encoded strings, case insensitive.
3385 Note: Use this function instead of AnsiCompareText.
3386 This function guarantees proper collation on all supported platforms.
3387 Internally it uses WideCompareText.
3388 ------------------------------------------------------------------------------}
3389 function UTF8CompareText(const S1, S2: String): PtrInt;
3390 begin
3391 Result := WideCompareText(Utf8ToUtf16(S1),Utf8ToUtf16(S2));
3392 end;
3393
3394 function UTF8CompareStrCollated(const S1, S2: string): PtrInt; {$IFnDEF ACP_RTL}inline;{$endif}
3395 begin
3396 {$IFDEF ACP_RTL}
3397 //Only with this define AnsiCompareStr does not point to Utf8CompareStr
3398 Result := AnsiCompareStr(UTF8ToSys(S1), UTF8ToSys(S2));
3399 {$ELSE}
3400 Result := Utf8CompareStr(S1,S2);
3401 {$ENDIF}
3402 end;
3403
3404 function CompareStrListUTF8LowerCase(List: TStringList; Index1, Index2: Integer
3405 ): Integer;
3406 begin
3407 Result:=UTF8CompareText(List[Index1],List[Index2]);
3408 end;
3409
3410 {------------------------------------------------------------------------------
3411 Name: ConvertUTF8ToUTF16
3412 Params: Dest - Pointer to destination string
3413 DestWideCharCount - Wide char count allocated in destination string
3414 Src - Pointer to source string
3415 SrcCharCount - Char count allocated in source string
3416 Options - Conversion options, if none is set, both
3417 invalid and unfinished source chars are skipped
3418
3419 toInvalidCharError - Stop on invalid source char and report
3420 error
3421 toInvalidCharToSymbol - Replace invalid source chars with '?'
3422 toUnfinishedCharError - Stop on unfinished source char and
3423 report error
3424 toUnfinishedCharToSymbol - Replace unfinished source char with '?'
3425
3426 ActualWideCharCount - Actual wide char count converted from source
3427 string to destination string
3428 Returns:
3429 trNoError - The string was successfully converted without
3430 any error
3431 trNullSrc - Pointer to source string is nil
3432 trNullDest - Pointer to destination string is nil
3433 trDestExhausted - Destination buffer size is not big enough to hold
3434 converted string
3435 trInvalidChar - Invalid source char has occured
3436 trUnfinishedChar - Unfinished source char has occured
3437
3438 Converts the specified UTF-8 encoded string to UTF-16 encoded (system endian)
3439 ------------------------------------------------------------------------------}
3440 function ConvertUTF8ToUTF16(Dest: PWideChar; DestWideCharCount: SizeUInt;
3441 Src: PChar; SrcCharCount: SizeUInt; Options: TConvertOptions;
3442 out ActualWideCharCount: SizeUInt): TConvertResult;
3443 var
3444 DestI, SrcI: SizeUInt;
3445 B1, B2, B3, B4: Byte;
3446 W: Word;
3447 C: Cardinal;
3448
3449 function UnfinishedCharError: Boolean;
3450 begin
3451 if toUnfinishedCharToSymbol in Options then
3452 begin
3453 Dest[DestI] := System.WideChar('?');
3454 Inc(DestI);
3455 Result := False;
3456 end
3457 else
3458 if toUnfinishedCharError in Options then
3459 begin
3460 ConvertUTF8ToUTF16 := trUnfinishedChar;
3461 Result := True;
3462 end
3463 else Result := False;
3464 end;
3465
3466 function InvalidCharError(Count: SizeUInt): Boolean; inline;
3467 begin
3468 if not (toInvalidCharError in Options) then
3469 begin
3470 if toInvalidCharToSymbol in Options then
3471 begin
3472 Dest[DestI] := System.WideChar('?');
3473 Inc(DestI);
3474 end;
3475
3476 Dec(SrcI, Count);
3477
3478 // skip trailing UTF-8 char bytes
3479 while (Count > 0) do
3480 begin
3481 if (Byte(Src[SrcI]) and %11000000) <> %10000000 then Break;
3482 Inc(SrcI);
3483 Dec(Count);
3484 end;
3485
3486 Result := False;
3487 end
3488 else
3489 if toInvalidCharError in Options then
3490 begin
3491 ConvertUTF8ToUTF16 := trUnfinishedChar;
3492 Result := True;
3493 end;
3494 end;
3495
3496 begin
3497 ActualWideCharCount := 0;
3498
3499 if not Assigned(Src) then
3500 begin
3501 Result := trNullSrc;
3502 Exit;
3503 end;
3504
3505 if not Assigned(Dest) then
3506 begin
3507 Result := trNullDest;
3508 Exit;
3509 end;
3510 SrcI := 0;
3511 DestI := 0;
3512
3513 while (DestI < DestWideCharCount) and (SrcI < SrcCharCount) do
3514 begin
3515 B1 := Byte(Src[SrcI]);
3516 Inc(SrcI);
3517
3518 if B1 < 128 then // single byte UTF-8 char
3519 begin
3520 Dest[DestI] := System.WideChar(B1);
3521 Inc(DestI);
3522 end
3523 else
3524 begin
3525 if SrcI >= SrcCharCount then
3526 if UnfinishedCharError then Exit(trInvalidChar)
3527 else Break;
3528
3529 B2 := Byte(Src[SrcI]);
3530 Inc(SrcI);
3531
3532 if (B1 and %11100000) = %11000000 then // double byte UTF-8 char
3533 begin
3534 if (B2 and %11000000) = %10000000 then
3535 begin
3536 Dest[DestI] := System.WideChar(((B1 and %00011111) shl 6) or (B2 and %00111111));
3537 Inc(DestI);
3538 end
3539 else // invalid character, assume single byte UTF-8 char
3540 if InvalidCharError(1) then Exit(trInvalidChar);
3541 end
3542 else
3543 begin
3544 if SrcI >= SrcCharCount then
3545 if UnfinishedCharError then Exit(trInvalidChar)
3546 else Break;
3547
3548 B3 := Byte(Src[SrcI]);
3549 Inc(SrcI);
3550
3551 if (B1 and %11110000) = %11100000 then // triple byte UTF-8 char
3552 begin
3553 if ((B2 and %11000000) = %10000000) and ((B3 and %11000000) = %10000000) then
3554 begin
3555 W := ((B1 and %00011111) shl 12) or ((B2 and %00111111) shl 6) or (B3 and %00111111);
3556 if (W < $D800) or (W > $DFFF) then // to single wide char UTF-16 char
3557 begin
3558 Dest[DestI] := System.WideChar(W);
3559 Inc(DestI);
3560 end
3561 else // invalid UTF-16 character, assume double byte UTF-8 char
3562 if InvalidCharError(2) then Exit(trInvalidChar);
3563 end
3564 else // invalid character, assume double byte UTF-8 char
3565 if InvalidCharError(2) then Exit(trInvalidChar);
3566 end
3567 else
3568 begin
3569 if SrcI >= SrcCharCount then
3570 if UnfinishedCharError then Exit(trInvalidChar)
3571 else Break;
3572
3573 B4 := Byte(Src[SrcI]);
3574 Inc(SrcI);
3575
3576 if ((B1 and %11111000) = %11110000) and ((B2 and %11000000) = %10000000)
3577 and ((B3 and %11000000) = %10000000) and ((B4 and %11000000) = %10000000) then
3578 begin // 4 byte UTF-8 char
3579 C := ((B1 and %00011111) shl 18) or ((B2 and %00111111) shl 12)
3580 or ((B3 and %00111111) shl 6) or (B4 and %00111111);
3581 // to double wide char UTF-16 char
3582 Dest[DestI] := System.WideChar($D800 or ((C - $10000) shr 10));
3583 Inc(DestI);
3584 if DestI >= DestWideCharCount then Break;
3585 Dest[DestI] := System.WideChar($DC00 or ((C - $10000) and %0000001111111111));
3586 Inc(DestI);
3587 end
3588 else // invalid character, assume triple byte UTF-8 char
3589 if InvalidCharError(3) then Exit(trInvalidChar);
3590 end;
3591 end;
3592 end;
3593 end;
3594
3595 if DestI >= DestWideCharCount then
3596 begin
3597 DestI := DestWideCharCount - 1;
3598 Result := trDestExhausted;
3599 end
3600 else
3601 Result := trNoError;
3602
3603 Dest[DestI] := #0;
3604 ActualWideCharCount := DestI + 1;
3605 end;
3606
3607 {------------------------------------------------------------------------------
3608 Name: ConvertUTF16ToUTF8
3609 Params: Dest - Pointer to destination string
3610 DestCharCount - Char count allocated in destination string
3611 Src - Pointer to source string
3612 SrcWideCharCount - Wide char count allocated in source string
3613 Options - Conversion options, if none is set, both
3614 invalid and unfinished source chars are skipped.
3615 See ConvertUTF8ToUTF16 for details.
3616
3617 ActualCharCount - Actual char count converted from source
3618 string to destination string
3619 Returns: See ConvertUTF8ToUTF16
3620
3621 Converts the specified UTF-16 encoded string (system endian) to UTF-8 encoded
3622 ------------------------------------------------------------------------------}
3623 function ConvertUTF16ToUTF8(Dest: PChar; DestCharCount: SizeUInt;
3624 Src: PWideChar; SrcWideCharCount: SizeUInt; Options: TConvertOptions;
3625 out ActualCharCount: SizeUInt): TConvertResult;
3626 var
3627 DestI, SrcI: SizeUInt;
3628 W1, W2: Word;
3629 C: Cardinal;
3630
3631 function UnfinishedCharError: Boolean;
3632 begin
3633 if toUnfinishedCharToSymbol in Options then
3634 begin
3635 Dest[DestI] := Char('?');
3636 Inc(DestI);
3637 Result := False;
3638 end
3639 else
3640 if toUnfinishedCharError in Options then
3641 begin
3642 ConvertUTF16ToUTF8 := trUnfinishedChar;
3643 Result := True;
3644 end
3645 else Result := False;
3646 end;
3647
3648 function InvalidCharError(Count: SizeUInt): Boolean; inline;
3649 begin
3650 if not (toInvalidCharError in Options) then
3651 begin
3652 if toInvalidCharToSymbol in Options then
3653 begin
3654 Dest[DestI] := Char('?');
3655 Inc(DestI);
3656 end;
3657
3658 Dec(SrcI, Count);
3659 // skip trailing UTF-16 wide char
3660 if (Word(Src[SrcI]) and $FC00) = $DC00 then Inc(SrcI);
3661
3662 Result := False;
3663 end
3664 else
3665 if toInvalidCharError in Options then
3666 begin
3667 ConvertUTF16ToUTF8 := trUnfinishedChar;
3668 Result := True;
3669 end;
3670 end;
3671
3672 begin
3673 ActualCharCount := 0;
3674
3675 if not Assigned(Src) then
3676 begin
3677 Result := trNullSrc;
3678 Exit;
3679 end;
3680
3681 if not Assigned(Dest) then
3682 begin
3683 Result := trNullDest;
3684 Exit;
3685 end;
3686 SrcI := 0;
3687 DestI := 0;
3688
3689 while (DestI < DestCharCount) and (SrcI < SrcWideCharCount) do
3690 begin
3691 W1 := Word(Src[SrcI]);
3692 Inc(SrcI);
3693
3694 if (W1 < $D800) or (W1 > $DFFF) then // single wide char UTF-16 char
3695 begin
3696 if W1 < $0080 then // to single byte UTF-8 char
3697 begin
3698 Dest[DestI] := Char(W1);
3699 Inc(DestI);
3700 end
3701 else
3702 if W1 < $0800 then // to double byte UTF-8 char
3703 begin
3704 Dest[DestI] := Char(%11000000 or ((W1 and %11111000000) shr 6));
3705 Inc(DestI);
3706 if DestI >= DestCharCount then Break;
3707 Dest[DestI] := Char(%10000000 or (W1 and %111111));
3708 Inc(DestI);
3709 end
3710 else
3711 begin // to triple byte UTF-8 char
3712 Dest[DestI] := Char(%11100000 or ((W1 and %1111000000000000) shr 12));
3713 Inc(DestI);
3714 if DestI >= DestCharCount then Break;
3715 Dest[DestI] := Char(%10000000 or ((W1 and %111111000000) shr 6));
3716 Inc(DestI);
3717 if DestI >= DestCharCount then Break;
3718 Dest[DestI] := Char(%10000000 or (W1 and %111111));
3719 Inc(DestI);
3720 end;
3721 end
3722 else
3723 begin
3724 if SrcI >= SrcWideCharCount then
3725 if UnfinishedCharError then Exit(trInvalidChar)
3726 else Break;
3727
3728 W2 := Word(Src[SrcI]);
3729 Inc(SrcI);
3730
3731 if (W1 and $F800) = $D800 then // double wide char UTF-16 char
3732 begin
3733 if (W2 and $FC00) = $DC00 then
3734 begin
3735 C := (W1 - $D800) shl 10 + (W2 - $DC00) + $10000;
3736
3737 // to 4 byte UTF-8 char
3738 Dest[DestI] := Char(%11110000 or (C shr 18));
3739 Inc(DestI);
3740 if DestI >= DestCharCount then Break;
3741 Dest[DestI] := Char(%10000000 or ((C and $3F000) shr 12));
3742 Inc(DestI);
3743 if DestI >= DestCharCount then Break;
3744 Dest[DestI] := Char(%10000000 or ((C and %111111000000) shr 6));
3745 Inc(DestI);
3746 if DestI >= DestCharCount then Break;
3747 Dest[DestI] := Char(%10000000 or (C and %111111));
3748 Inc(DestI);
3749 end
3750 else // invalid character, assume single wide char UTF-16 char
3751 if InvalidCharError(1) then Exit(trInvalidChar);
3752 end
3753 else // invalid character, assume single wide char UTF-16 char
3754 if InvalidCharError(1) then Exit(trInvalidChar);
3755 end;
3756 end;
3757
3758 if DestI >= DestCharCount then
3759 begin
3760 DestI := DestCharCount - 1;
3761 Result := trDestExhausted;
3762 end
3763 else
3764 Result := trNoError;
3765
3766 Dest[DestI] := #0;
3767 ActualCharCount := DestI + 1;
3768 end;
3769
3770 {------------------------------------------------------------------------------
3771 Name: UTF8ToUTF16
3772 Params: S - Source UTF-8 string
3773 Returns: UTF-16 encoded string
3774
3775 Converts the specified UTF-8 encoded string to UTF-16 encoded (system endian)
3776 Avoid copying the result string since on windows a widestring requires a full
3777 copy
3778 ------------------------------------------------------------------------------}
3779 function UTF8ToUTF16(const S: AnsiString): UnicodeString;
3780 begin
3781 Result:=UTF8ToUTF16(PChar(S),length(S));
3782 end;
3783
3784 function UTF8ToUTF16(const P: PChar; ByteCnt: SizeUInt): UnicodeString;
3785 var
3786 L: SizeUInt;
3787 begin
3788 if ByteCnt=0 then
3789 exit('');
3790 SetLength(Result, ByteCnt);
3791 // wide chars of UTF-16 <= bytes of UTF-8 string
3792 if ConvertUTF8ToUTF16(PWideChar(Result), Length(Result) + 1, P, ByteCnt,
3793 [toInvalidCharToSymbol], L) = trNoError
3794 then SetLength(Result, L - 1)
3795 else Result := '';
3796 end;
3797
3798 {------------------------------------------------------------------------------
3799 Name: UTF16ToUTF8
3800 Params: S - Source UTF-16 string (system endian)
3801 Returns: UTF-8 encoded string
3802
3803 Converts the specified UTF-16 encoded string (system endian) to UTF-8 encoded
3804 ------------------------------------------------------------------------------}
3805 function UTF16ToUTF8(const S: UnicodeString): AnsiString;
3806 begin
3807 Result := UTF16ToUTF8(PWideChar(S),length(S));
3808 end;
3809
3810 function UTF16ToUTF8(const P: PWideChar; WideCnt: SizeUInt): AnsiString;
3811 var
3812 L: SizeUInt;
3813 begin
3814 if WideCnt=0 then
3815 exit('');
3816
3817 SetLength(Result, WideCnt * 3);
3818 // bytes of UTF-8 <= 3 * wide chars of UTF-16 string
3819 // e.g. %11100000 10100000 10000000 (UTF-8) is $0800 (UTF-16)
3820 if ConvertUTF16ToUTF8(PChar(Result), Length(Result) + 1, P, WideCnt,
3821 [toInvalidCharToSymbol], L) = trNoError then
3822 begin
3823 SetLength(Result, L - 1);
3824 end else
3825 Result := '';
3826 end;
3827
3828 procedure LazGetLanguageIDs(var Lang, FallbackLang: String);
3829
3830 {$IFDEF DARWIN}
3831 function GetLanguage: boolean;
3832 var
3833 Ref: CFStringRef;
3834 LangArray: CFMutableArrayRef;
3835 StrSize: CFIndex;
3836 StrRange: CFRange;
3837 Locals: CFArrayRef;
3838 Bundle: CFBundleRef;
3839 begin
3840 Result := false;
3841 Bundle:=CFBundleGetMainBundle;
3842 if Bundle=nil then exit;
3843 Locals:=CFBundleCopyBundleLocalizations(Bundle);
3844 if Locals=nil then exit;
3845 LangArray := CFBundleCopyLocalizationsForPreferences(Locals, nil);
3846 try
3847 if CFArrayGetCount(LangArray) > 0 then
3848 begin
3849 Ref := CFArrayGetValueAtIndex(LangArray, 0);
3850 StrRange.location := 0;
3851 StrRange.length := CFStringGetLength(Ref);
3852
3853 StrSize:=0;
3854 CFStringGetBytes(Ref, StrRange, kCFStringEncodingUTF8,
3855 Ord('?'), False, nil, 0, StrSize);
3856 SetLength(Lang, StrSize);
3857
3858 if StrSize > 0 then
3859 begin
3860 CFStringGetBytes(Ref, StrRange, kCFStringEncodingUTF8,
3861 Ord('?'), False, @Lang[1], StrSize, StrSize);
3862 Result:=true;
3863 FallbackLang := Copy(Lang, 1, 2);
3864 end;
3865 end;
3866 finally
3867 CFRelease(LangArray);
3868 CFRelease(Locals);
3869 end;
3870 end;
3871 {$ENDIF}
3872 begin
3873 {$IFDEF DARWIN}
3874 if not GetLanguage then
3875 GetLanguageIDs(Lang, FallbackLang);
3876 {$ELSE}
3877 GetLanguageIDs(Lang, FallbackLang);
3878 {$ENDIF}
3879 end;
3880
3881 {
3882 This routine will strip country information from the language ID
3883 making it more simple
3884
3885 Ideally the resulting ID from here should conform to ISO 639-1
3886 or ISO 639-2, if the language has no code in ISO 639-1
3887 }
3888 procedure LazGetShortLanguageID(var Lang: String);
3889 var
3890 FallbackLang: String;
3891 begin
3892 FallbackLang:='';
3893 LazGetLanguageIDs(Lang, FallbackLang);
3894
3895 // Simply making sure its length is at most 2 should be enough for most languages
3896 if Length(Lang) > 2 then Lang := Lang[1] + Lang[2];
3897 end;
3898
3899 procedure ReplaceSubstring(var s: string; StartPos, Count: SizeInt;
3900 const Insertion: string);
3901 var
3902 MaxCount: SizeInt;
3903 InsertionLen: SizeInt;
3904 SLen: SizeInt;
3905 RestLen: SizeInt;
3906 p: PByte;
3907 begin
3908 SLen:=length(s);
3909 if StartPos>SLen then begin
3910 s:=s+Insertion;
3911 exit;
3912 end;
3913 if StartPos<1 then StartPos:=1;
3914 if Count<0 then Count:=0;
3915 MaxCount:=SLen-StartPos+1;
3916 if Count>MaxCount then
3917 Count:=MaxCount;
3918 InsertionLen:=length(Insertion);
3919 if (Count=0) and (InsertionLen=0) then
3920 exit; // nothing to do
3921 if (Count=InsertionLen) then begin
3922 if CompareMem(PByte(s)+StartPos-1,Pointer(Insertion),Count) then
3923 // already the same content
3924 exit;
3925 UniqueString(s);
3926 end else begin
3927 RestLen:=SLen-StartPos-Count+1;
3928 if InsertionLen<Count then begin
3929 // shorten
3930 if RestLen>0 then begin
3931 UniqueString(s);
3932 p:=PByte(s)+StartPos-1;
3933 System.Move((p+Count)^,(p+InsertionLen)^,RestLen);
3934 end;
3935 Setlength(s,SLen-Count+InsertionLen);
3936 end else begin
3937 // longen
3938 Setlength(s,SLen-Count+InsertionLen);
3939 if RestLen>0 then begin
3940 p:=PByte(s)+StartPos-1;
3941 System.Move((p+Count)^,(p+InsertionLen)^,RestLen);
3942 end;
3943 end;
3944 end;
3945 if InsertionLen>0 then
3946 System.Move(PByte(Insertion)^,(PByte(s)+StartPos-1)^,InsertionLen);
3947 end;
3948
3949 procedure InitFPUpchars;
3950 var
3951 c: Char;
3952 begin
3953 for c:=Low(char) to High(char) do begin
3954 FPUpChars[c]:=upcase(c);
3955 end;
3956 end;
3957
3958 initialization
3959 InitFPUpchars;
3960 InitLazUtf8;
3961 finalization
3962 FinalizeLazUTF8;
3963
3964 end.
3965
3966