1 {
2 *****************************************************************************
3 This file is part of LazUtils.
4
5 See the file COPYING.modifiedLGPL.txt, included in this distribution,
6 for details about the license.
7 *****************************************************************************
8
9 A widestring manager written in Pascal
10 and optimized for DefaultSystemCodePage CP_UTF8.
11 }
12 unit PasWString;
13
14 {$mode objfpc}
15 {$inline on}
16 {$i lazutils_defines.inc}
17
18 //{$define PASWSTRING_VERBOSE}
19 //{.$define PASWSTRING_SUPPORT_NONUTF8_ANSISTRING} disabled by default because
20 // non utf-8 ansistring is rare in UNIXes and lconvencoding makes the executable big
21
22 // sanity checks for defines
23 //{$IF FPC_FULLVERSION >= 30000}
24 {$IFnDEF NO_CP_RTL}
25 {$IFDEF UTF8_RTL}
26 {$IFDEF PASWSTRING_SUPPORT_NONUTF8_ANSISTRING}
27 {$error UTF8 or not UTF8}
28 {$ENDIF}
29 {$ENDIF}
30 {$DEFINE DisablePasWString}
31 {$ENDIF}
32
33 interface
34
35 uses
36 SysUtils, LazUTF8
37 {$ifdef PASWSTRING_SUPPORT_NONUTF8_ANSISTRING}, lconvencoding{$endif}
38 ;
39
40 {$IFnDEF DisablePasWString}
41 procedure SetPasWidestringManager;
42 {$ENDIF}
43
44 implementation
45
46 {$IFnDEF DisablePasWString}
47 procedure fpc_rangeerror; [external name 'FPC_RANGEERROR'];
48
IsASCIInull49 function IsASCII(const s: string): boolean; inline;
50 var
51 i: Integer;
52 begin
53 for i:=1 to length(s) do if ord(s[i])>127 then exit(false);
54 Result:=true;
55 end;
56
57 // len comes in widechars, not bytes
58 procedure Wide2AnsiMove(source:pwidechar;var dest:ansistring;len:SizeInt);
59 begin
60 {$ifdef PASWSTRING_VERBOSE}WriteLn('Wide2AnsiMove START');{$endif}
61 dest := UTF16ToUTF8(Source,len);
62 {$ifdef PASWSTRING_SUPPORT_NONUTF8_ANSISTRING}
63 // And correct to the real Ansi encoding
64 dest := ConvertEncoding(dest, EncodingUTF8, GetDefaultTextEncoding());
65 {$endif}
66 end;
67
68 procedure Ansi2WideMove(source:pchar;var dest:widestring;len:SizeInt);
69 {$ifdef PASWSTRING_SUPPORT_NONUTF8_ANSISTRING}
70 var
71 ansistr: ansistring;
72 {$endif}
73 begin
74 {$ifdef PASWSTRING_VERBOSE}WriteLn('Ansi2WideMove START');{$endif}
75
76 {$ifdef PASWSTRING_SUPPORT_NONUTF8_ANSISTRING}
77 // Copy the originating string taking into account the specified length
78 SetLength(ansistr, len);
79 System.Move(source^, ansistr[1], len);
80 // Convert to UTF-8
81 ansistr := ConvertEncoding(ansistr, GetDefaultTextEncoding(), EncodingUTF8);
82 // Now convert it, using UTF-8 -> UTF-16
83 dest := UTF8ToUTF16(ansistr);
84 {$else}
85 dest := UTF8ToUTF16(source,len);
86 {$endif}
87 end;
88
LowerWideStringnull89 function LowerWideString(const s : WideString) : WideString;
90 var
91 str: string;
92 begin
93 {$ifdef PASWSTRING_VERBOSE}WriteLn('LowerWideString START');{$endif}
94 str := UTF16ToUTF8(PWideChar(s),length(s));
95 str := UTF8LowerCase(str);
96 Result := UTF8ToUTF16(str);
97 end;
98
UpperWideStringnull99 function UpperWideString(const s : WideString) : WideString;
100 var
101 str: string;
102 begin
103 {$ifdef PASWSTRING_VERBOSE}WriteLn('UpperWideString START');{$endif}
104 str := UTF16ToUTF8(PWideChar(s),length(s));
105 str := UTF8UpperCase(str);
106 Result := UTF8ToUTF16(str);
107 end;
108
109 procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline;
110 var
111 l: SizeUInt;
112 begin
113 {$ifdef PASWSTRING_VERBOSE}WriteLn('EnsureAnsiLen START');{$endif}
114 l:=length(s);
115 if (len>l) then
116 if (l < 128) then
117 setlength(s,l+8)
118 else
119 setlength(s,l+l shr 8);
120 end;
121
122
123 procedure ConcatCharToAnsiStr(const c: char; var S: AnsiString; var index: SizeInt);
124 begin
125 {$ifdef PASWSTRING_VERBOSE}WriteLn('ConcatCharToAnsiStr START');{$endif}
126 EnsureAnsiLen(s,index);
127 pchar(@s[index])^:=c;
128 inc(index);
129 end;
130
LowerAnsiStringnull131 function LowerAnsiString(const s : AnsiString) : AnsiString;
132 var
133 Str: string;
134 begin
135 {$ifdef PASWSTRING_VERBOSE}WriteLn('LowerAnsiString START');{$endif}
136 Str := SysToUTF8(s);
137 Str := UTF8LowerCase(Str);
138 Result := UTF8ToSys(Str);
139 end;
140
UpperAnsiStringnull141 function UpperAnsiString(const s : AnsiString) : AnsiString;
142 var
143 Str: string;
144 begin
145 {$ifdef PASWSTRING_VERBOSE}WriteLn('UpperAnsiString START');{$endif}
146 Str := SysToUTF8(s);
147 Str := UTF8UpperCase(Str);
148 Result := UTF8ToSys(Str);
149 end;
150
151 // Just do a simple byte comparison
152 // A more complex analysis would require normalization
WideCompareStrnull153 function WideCompareStr(const s1, s2 : WideString) : PtrInt;
154 var
155 count, count1, count2: integer;
156 begin
157 {$ifdef PASWSTRING_VERBOSE}WriteLn('WideCompareStr START');{$endif}
158 result := 0;
159 Count1 := Length(S1);
160 Count2 := Length(S2);
161 if Count1>Count2 then
162 Count:=Count2
163 else
164 Count:=Count1;
165 result := SysUtils.CompareMemRange(Pointer(S1),Pointer(S2), Count*2);
166 if result=0 then
167 result:=Count1-Count2;
168 end;
169
WideCompareTextnull170 function WideCompareText(const s1, s2 : WideString): PtrInt;
171 var
172 a, b: String;
173 begin
174 {$ifdef PASWSTRING_VERBOSE}WriteLn('WideCompareText START');{$endif}
175 a := UTF16ToUTF8(PWideChar(s1),length(s1));
176 a := UTF8LowerCase(a);
177 b := UTF16ToUTF8(PWideChar(s2),length(s2));
178 b := UTF8LowerCase(b);
179 result := UTF8CompareStr(a,b);
180 end;
181
CharLengthPCharnull182 function CharLengthPChar(const Str: PChar): PtrInt;
183 // return the number of codepoints (including invalid codepoints)
184 var
185 p: PChar;
186 l: Integer;
187 begin
188 {$ifdef PASWSTRING_VERBOSE}WriteLn('CharLengthPChar START');{$endif}
189 p:=Str;
190 if p=nil then exit(0);
191 while p^<>#0 do begin
192 l:=UTF8CodepointSize(p);
193 inc(Result);
194 inc(p,l);
195 end;
196 end;
197
CodePointLengthPCharnull198 function CodePointLengthPChar(const p: PChar; MaxLookAhead: PtrInt): Ptrint;
199 { return value:
200 -1 if incomplete or invalid code point
201 0 if NULL character,
202 > 0 if that's the length in bytes of the code point }
203 begin
204 {$ifdef PASWSTRING_VERBOSE}WriteLn('CodePointLengthPChar START');{$endif}
205 if (p=nil) then exit(0);
206 if (MaxLookAhead<0) then exit(-1);
207 if ord(p^)<%10000000 then begin
208 // regular single byte character
209 if p^=#0 then
210 exit(0)
211 else
212 exit(1);
213 end;
214 if ord(p^)<%11000000 then begin
215 // invalid single byte character
216 exit(-1);
217 end;
218 if (MaxLookAhead=0) then exit(-1);
219 if ((ord(p^) and %11100000) = %11000000) then begin
220 // should be 2 byte character
221 if (ord(p[1]) and %11000000) = %10000000 then
222 exit(2)
223 else
224 exit(-1);
225 end;
226 if (MaxLookAhead=1) then exit(-1);
227 if ((ord(p^) and %11110000) = %11100000) then begin
228 // should be 3 byte character
229 if ((ord(p[1]) and %11000000) = %10000000)
230 and ((ord(p[2]) and %11000000) = %10000000) then
231 exit(3)
232 else
233 exit(-1);
234 end;
235 if (MaxLookAhead=2) then exit(-1);
236 if ((ord(p^) and %11111000) = %11110000) then begin
237 // should be 4 byte character
238 if ((ord(p[1]) and %11000000) = %10000000)
239 and ((ord(p[2]) and %11000000) = %10000000)
240 and ((ord(p[3]) and %11000000) = %10000000) then
241 exit(4)
242 else
243 exit(-1);
244 end;
245 exit(-1);
246 end;
247
AnsiCompareStrnull248 function AnsiCompareStr(const s1, s2: ansistring): PtrInt;
249 begin
250 {$ifdef PASWSTRING_VERBOSE}WriteLn('AnsiCompareStr START');{$endif}
251 Result := SysUtils.CompareStr(s1, s2);
252 end;
253
254 // Similar to AnsiCompareStr, but with PChar
StrCompAnsinull255 function StrCompAnsi(s1,s2 : PChar): PtrInt;
256 var
257 Count1: SizeInt;
258 Count2: SizeInt;
259 Count: SizeInt;
260 begin
261 {$ifdef PASWSTRING_VERBOSE}WriteLn('StrCompAnsi START');{$endif}
262 result := 0;
263 Count1:=StrLen(s1);
264 Count2:=StrLen(s2);
265 if Count1>Count2 then
266 Count:=Count2
267 else
268 Count:=Count1;
269 result := CompareMemRange(s1, s2, Count);
270 if result=0 then
271 result:=Count1-Count2;
272 end;
273
274
AnsiCompareTextnull275 function AnsiCompareText(const S1, S2: ansistring): PtrInt;
276 var
277 str1, str2: string;
278 begin
279 {$ifdef PASWSTRING_VERBOSE}WriteLn('AnsiCompareText START');{$endif}
280 str1 := SysToUTF8(S1);
281 str2 := SysToUTF8(S2);
282 Result := UTF8CompareText(str1, str2);
283 end;
284
285
AnsiStrICompnull286 function AnsiStrIComp(S1, S2: PChar): PtrInt;
287 begin
288 {$ifdef PASWSTRING_VERBOSE}WriteLn('AnsiStrIComp START');{$endif}
289 Result := AnsiCompareText(StrPas(s1),StrPas(s2));
290 end;
291
292
AnsiStrLCompnull293 function AnsiStrLComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
294 var
295 a, b: pchar;
296 begin
297 {$ifdef PASWSTRING_VERBOSE}WriteLn('AnsiStrLComp START');{$endif}
298 Result := 0;
299 if (maxlen=0) then
300 exit(0);
301 if (s1[maxlen]<>#0) then
302 begin
303 getmem(a,maxlen+1);
304 move(s1^,a^,maxlen);
305 a[maxlen]:=#0;
306 end
307 else
308 a:=s1;
309 if (s2[maxlen]<>#0) then
310 begin
311 getmem(b,maxlen+1);
312 move(s2^,b^,maxlen);
313 b[maxlen]:=#0;
314 end
315 else
316 b:=s2;
317 result:=StrCompAnsi(a,b);
318 if (a<>s1) then
319 freemem(a);
320 if (b<>s2) then
321 freemem(b);
322 end;
323
324
AnsiStrLICompnull325 function AnsiStrLIComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
326 var
327 a, b: ansistring;
328 begin
329 {$ifdef PASWSTRING_VERBOSE}WriteLn('AnsiStrLIComp START');{$endif}
330 if (maxlen=0) then
331 exit(0);
332 setlength(a,maxlen);
333 move(s1^,a[1],maxlen);
334 setlength(b,maxlen);
335 move(s2^,b[1],maxlen);
336 result:=AnsiCompareText(a,b);
337 end;
338
339
340 procedure ansi2pchar(const s: ansistring; const orgp: pchar; out p: pchar);
341 var
342 newlen: sizeint;
343 begin
344 {$ifdef PASWSTRING_VERBOSE}WriteLn('ansi2pchar START');{$endif}
345 newlen:=length(s);
346 if newlen>strlen(orgp) then
347 fpc_rangeerror;
348 p:=orgp;
349 if (newlen>0) then
350 move(s[1],p[0],newlen);
351 p[newlen]:=#0;
352 end;
353
354
AnsiStrLowernull355 function AnsiStrLower(Str: PChar): PChar;
356 var
357 temp: ansistring;
358 begin
359 {$ifdef PASWSTRING_VERBOSE}WriteLn('AnsiStrLower START');{$endif}
360 temp:=loweransistring(str);
361 ansi2pchar(temp,str,result);
362 end;
363
364
AnsiStrUppernull365 function AnsiStrUpper(Str: PChar): PChar;
366 var
367 temp: ansistring;
368 begin
369 {$ifdef PASWSTRING_VERBOSE}WriteLn('AnsiStrUpper START');{$endif}
370 temp:=upperansistring(str);
371 ansi2pchar(temp,str,result);
372 end;
373
374
375 procedure InitThread;
376 begin
377 end;
378
379
380 procedure FiniThread;
381 begin
382 end;
383
384 { Unicode }
385
386 procedure Unicode2AnsiMove(source:pwidechar;var dest:ansistring;len:SizeInt);
387 begin
388 {$ifdef PASWSTRING_VERBOSE}WriteLn('Unicode2AnsiMove START');{$endif}
389 dest := UTF16ToUTF8(source,len);
390 {$ifdef PASWSTRING_SUPPORT_NONUTF8_ANSISTRING}
391 // And correct to the real Ansi encoding
392 dest := ConvertEncoding(dest, EncodingUTF8, GetDefaultTextEncoding());
393 {$endif}
394 end;
395
396 procedure Ansi2UnicodeMove(source:pchar;var dest:UnicodeString;len:SizeInt);
397 {$IFDEF PASWSTRING_SUPPORT_NONUTF8_ANSISTRING}
398 var
399 ansistr: ansistring;
400 {$ENDIF}
401 begin
402 {$ifdef PASWSTRING_VERBOSE}WriteLn('Ansi2UnicodeMove START');{$endif}
403 {$IFDEF PASWSTRING_SUPPORT_NONUTF8_ANSISTRING}
404 if NeedRTLAnsi then begin
405 // Copy the originating string taking into account the specified length
406 SetLength(ansistr, len);
407 System.Move(source^, ansistr[1], len);
408 // Convert to UTF-8
409 ansistr := ConvertEncoding(ansistr, GetDefaultTextEncoding(), EncodingUTF8);
410 // Now convert it, using UTF-8 -> UTF-16
411 dest := UTF8ToUTF16(ansistr);
412 end else begin
413 dest := UTF8ToUTF16(source,len);
414 end;
415 {$ELSE}
416 dest := UTF8ToUTF16(source,len);
417 {$ENDIF}
418 end;
419
UpperUnicodeStringnull420 function UpperUnicodeString(const s : UnicodeString) : UnicodeString;
421 var
422 str: string;
423 begin
424 {$ifdef PASWSTRING_VERBOSE}WriteLn('UpperUnicodeString START');{$endif}
425 str := UTF16ToUTF8(s);
426 str := UTF8UpperCase(str);
427 Result := UTF8ToUTF16(str);
428 end;
429
LowerUnicodeStringnull430 function LowerUnicodeString(const s : UnicodeString) : UnicodeString;
431 var
432 str: string;
433 begin
434 {$ifdef PASWSTRING_VERBOSE}WriteLn('LowerUnicodeString START');{$endif}
435 str := UTF16ToUTF8(s);
436 str := UTF8LowerCase(str);
437 Result := UTF8ToUTF16(str);
438 end;
439
440 // Just do a simple byte comparison
441 // A more complex analysis would require normalization
PasUnicodeCompareStrnull442 function PasUnicodeCompareStr(const s1, s2 : unicodestring) : PtrInt;
443 var
444 count, count1, count2: integer;
445 begin
446 {$ifdef PASWSTRING_VERBOSE}WriteLn('PasUnicodeCompareStr START');{$endif}
447 result := 0;
448 Count1 := Length(S1);
449 Count2 := Length(S2);
450 if Count1>Count2 then
451 Count:=Count2
452 else
453 Count:=Count1;
454 result := SysUtils.CompareMemRange(Pointer(S1),Pointer(S2), Count*2);
455 if result=0 then
456 result:=Count1-Count2;
457 end;
458
PasUnicodeCompareTextnull459 function PasUnicodeCompareText(const s1, s2 : unicodestring): PtrInt;
460 var
461 a, b: string;
462 begin
463 {$ifdef PASWSTRING_VERBOSE}WriteLn('PasUnicodeCompareText START');{$endif}
464 a := UTF16ToUTF8(s1);
465 a := UTF8LowerCase(a);
466 b := UTF16ToUTF8(s2);
467 b := UTF8LowerCase(b);
468 result := UTF8CompareText(a,b);
469 end;
470
471 Procedure SetPasWideStringManager;
472 Var
473 PasWideStringManager : TUnicodeStringManager;
474 begin
475 PasWideStringManager:=widestringmanager;
476 PasWideStringManager.Wide2AnsiMoveProc:=@Wide2AnsiMove;
477 PasWideStringManager.Ansi2WideMoveProc:=@Ansi2WideMove;
478
479 // UpperUTF8 : procedure(p:PUTF8String);
480 PasWideStringManager.UpperWideStringProc:=@UpperWideString;
481 // UpperUCS4 : procedure(p:PUCS4Char);
482 // LowerUTF8 : procedure(p:PUTF8String);
483 PasWideStringManager.LowerWideStringProc:=@LowerWideString;
484 // LowerUCS4 : procedure(p:PUCS4Char);
485
486 {
487 CompUTF8 : function(p1,p2:PUTF8String) : shortint;
488 CompUCS2 : function(p1,p2:PUCS2Char) : shortint;
489 CompUCS4 : function(p1,p2:PUC42Char) : shortint;
490 }
491 PasWideStringManager.CompareWideStringProc:=@WideCompareStr;
492 PasWideStringManager.CompareTextWideStringProc:=@WideCompareText;
493
494 { return value: number of code points in the string. Whenever an invalid
495 code point is encountered, all characters part of this invalid code point
496 are considered to form one "character" and the next character is
497 considered to be the start of a new (possibly also invalid) code point }
498 PasWideStringManager.CharLengthPCharProc:=@CharLengthPChar;
499 PasWideStringManager.CodePointLengthProc:=@CodePointLengthPChar;
500
501 { Ansi }
502 PasWideStringManager.UpperAnsiStringProc:=@UpperAnsiString;
503 PasWideStringManager.LowerAnsiStringProc:=@LowerAnsiString;
504 PasWideStringManager.CompareStrAnsiStringProc:=@AnsiCompareStr;
505 PasWideStringManager.CompareTextAnsiStringProc:=@AnsiCompareText;
506 PasWideStringManager.StrCompAnsiStringProc:=@StrCompAnsi;
507 PasWideStringManager.StrICompAnsiStringProc:=@AnsiStrIComp;
508 PasWideStringManager.StrLCompAnsiStringProc:=@AnsiStrLComp;
509 PasWideStringManager.StrLICompAnsiStringProc:=@AnsiStrLIComp;
510 PasWideStringManager.StrLowerAnsiStringProc:=@AnsiStrLower;
511 PasWideStringManager.StrUpperAnsiStringProc:=@AnsiStrUpper;
512 PasWideStringManager.ThreadInitProc:=@InitThread;
513 PasWideStringManager.ThreadFiniProc:=@FiniThread;
514
515 { Unicode }
516 PasWideStringManager.Unicode2AnsiMoveProc:=@Unicode2AnsiMove;
517 PasWideStringManager.Ansi2UnicodeMoveProc:=@Ansi2UnicodeMove;
518 PasWideStringManager.UpperUnicodeStringProc:=@UpperUnicodeString;
519 PasWideStringManager.LowerUnicodeStringProc:=@LowerUnicodeString;
520 PasWideStringManager.CompareUnicodeStringProc:=@PasUnicodeCompareStr;
521 PasWideStringManager.CompareTextUnicodeStringProc:=@PasUnicodeCompareText;
522
523 SetUnicodeStringManager(PasWideStringManager);
524 end;
525
526
527 initialization
528 SetPasWideStringManager;
529 {$ENDIF}
530 end.
531