1 {
2  *****************************************************************************
3   This file is part of LazUtils.
4 
5   See the file COPYING.modifiedLGPL.txt, included in this distribution,
6   for details about the license.
7  *****************************************************************************
8 
9   A widestring manager written in Pascal
10   and optimized for DefaultSystemCodePage CP_UTF8.
11 }
12 unit PasWString;
13 
14 {$mode objfpc}
15 {$inline on}
16 {$i lazutils_defines.inc}
17 
18 //{$define PASWSTRING_VERBOSE}
19 //{.$define PASWSTRING_SUPPORT_NONUTF8_ANSISTRING} disabled by default because
20 // non utf-8 ansistring is rare in UNIXes and lconvencoding makes the executable big
21 
22 // sanity checks for defines
23 //{$IF FPC_FULLVERSION >= 30000}
24 {$IFnDEF NO_CP_RTL}
25   {$IFDEF UTF8_RTL}
26     {$IFDEF PASWSTRING_SUPPORT_NONUTF8_ANSISTRING}
27       {$error UTF8 or not UTF8}
28     {$ENDIF}
29   {$ENDIF}
30   {$DEFINE DisablePasWString}
31 {$ENDIF}
32 
33 interface
34 
35 uses
36   SysUtils, LazUTF8
37   {$ifdef PASWSTRING_SUPPORT_NONUTF8_ANSISTRING}, lconvencoding{$endif}
38   ;
39 
40 {$IFnDEF DisablePasWString}
41 procedure SetPasWidestringManager;
42 {$ENDIF}
43 
44 implementation
45 
46 {$IFnDEF DisablePasWString}
47 procedure fpc_rangeerror; [external name 'FPC_RANGEERROR'];
48 
IsASCIInull49 function IsASCII(const s: string): boolean; inline;
50 var
51   i: Integer;
52 begin
53   for i:=1 to length(s) do if ord(s[i])>127 then exit(false);
54   Result:=true;
55 end;
56 
57 // len comes in widechars, not bytes
58 procedure Wide2AnsiMove(source:pwidechar;var dest:ansistring;len:SizeInt);
59 begin
60   {$ifdef PASWSTRING_VERBOSE}WriteLn('Wide2AnsiMove START');{$endif}
61   dest := UTF16ToUTF8(Source,len);
62   {$ifdef PASWSTRING_SUPPORT_NONUTF8_ANSISTRING}
63   // And correct to the real Ansi encoding
64   dest := ConvertEncoding(dest, EncodingUTF8, GetDefaultTextEncoding());
65   {$endif}
66 end;
67 
68 procedure Ansi2WideMove(source:pchar;var dest:widestring;len:SizeInt);
69 {$ifdef PASWSTRING_SUPPORT_NONUTF8_ANSISTRING}
70 var
71   ansistr: ansistring;
72 {$endif}
73 begin
74   {$ifdef PASWSTRING_VERBOSE}WriteLn('Ansi2WideMove START');{$endif}
75 
76   {$ifdef PASWSTRING_SUPPORT_NONUTF8_ANSISTRING}
77   // Copy the originating string taking into account the specified length
78   SetLength(ansistr, len);
79   System.Move(source^, ansistr[1], len);
80   // Convert to UTF-8
81   ansistr := ConvertEncoding(ansistr, GetDefaultTextEncoding(), EncodingUTF8);
82   // Now convert it, using UTF-8 -> UTF-16
83   dest := UTF8ToUTF16(ansistr);
84   {$else}
85   dest := UTF8ToUTF16(source,len);
86   {$endif}
87 end;
88 
LowerWideStringnull89 function LowerWideString(const s : WideString) : WideString;
90 var
91   str: string;
92 begin
93   {$ifdef PASWSTRING_VERBOSE}WriteLn('LowerWideString START');{$endif}
94   str := UTF16ToUTF8(PWideChar(s),length(s));
95   str := UTF8LowerCase(str);
96   Result := UTF8ToUTF16(str);
97 end;
98 
UpperWideStringnull99 function UpperWideString(const s : WideString) : WideString;
100 var
101   str: string;
102 begin
103   {$ifdef PASWSTRING_VERBOSE}WriteLn('UpperWideString START');{$endif}
104   str := UTF16ToUTF8(PWideChar(s),length(s));
105   str := UTF8UpperCase(str);
106   Result := UTF8ToUTF16(str);
107 end;
108 
109 procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline;
110 var
111   l: SizeUInt;
112 begin
113   {$ifdef PASWSTRING_VERBOSE}WriteLn('EnsureAnsiLen START');{$endif}
114   l:=length(s);
115   if (len>l) then
116     if (l < 128) then
117       setlength(s,l+8)
118     else
119       setlength(s,l+l shr 8);
120 end;
121 
122 
123 procedure ConcatCharToAnsiStr(const c: char; var S: AnsiString; var index: SizeInt);
124 begin
125   {$ifdef PASWSTRING_VERBOSE}WriteLn('ConcatCharToAnsiStr START');{$endif}
126   EnsureAnsiLen(s,index);
127   pchar(@s[index])^:=c;
128   inc(index);
129 end;
130 
LowerAnsiStringnull131 function LowerAnsiString(const s : AnsiString) : AnsiString;
132 var
133   Str: string;
134 begin
135   {$ifdef PASWSTRING_VERBOSE}WriteLn('LowerAnsiString START');{$endif}
136   Str := SysToUTF8(s);
137   Str := UTF8LowerCase(Str);
138   Result := UTF8ToSys(Str);
139 end;
140 
UpperAnsiStringnull141 function UpperAnsiString(const s : AnsiString) : AnsiString;
142 var
143   Str: string;
144 begin
145   {$ifdef PASWSTRING_VERBOSE}WriteLn('UpperAnsiString START');{$endif}
146   Str := SysToUTF8(s);
147   Str := UTF8UpperCase(Str);
148   Result := UTF8ToSys(Str);
149 end;
150 
151 // Just do a simple byte comparison
152 // A more complex analysis would require normalization
WideCompareStrnull153 function WideCompareStr(const s1, s2 : WideString) : PtrInt;
154 var
155   count, count1, count2: integer;
156 begin
157   {$ifdef PASWSTRING_VERBOSE}WriteLn('WideCompareStr START');{$endif}
158   result := 0;
159   Count1 := Length(S1);
160   Count2 := Length(S2);
161   if Count1>Count2 then
162     Count:=Count2
163   else
164     Count:=Count1;
165   result := SysUtils.CompareMemRange(Pointer(S1),Pointer(S2), Count*2);
166   if result=0 then
167     result:=Count1-Count2;
168 end;
169 
WideCompareTextnull170 function WideCompareText(const s1, s2 : WideString): PtrInt;
171 var
172   a, b: String;
173 begin
174   {$ifdef PASWSTRING_VERBOSE}WriteLn('WideCompareText START');{$endif}
175   a := UTF16ToUTF8(PWideChar(s1),length(s1));
176   a := UTF8LowerCase(a);
177   b := UTF16ToUTF8(PWideChar(s2),length(s2));
178   b := UTF8LowerCase(b);
179   result := UTF8CompareStr(a,b);
180 end;
181 
CharLengthPCharnull182 function CharLengthPChar(const Str: PChar): PtrInt;
183 // return the number of codepoints (including invalid codepoints)
184 var
185   p: PChar;
186   l: Integer;
187 begin
188   {$ifdef PASWSTRING_VERBOSE}WriteLn('CharLengthPChar START');{$endif}
189   p:=Str;
190   if p=nil then exit(0);
191   while p^<>#0 do begin
192     l:=UTF8CodepointSize(p);
193     inc(Result);
194     inc(p,l);
195   end;
196 end;
197 
CodePointLengthPCharnull198 function CodePointLengthPChar(const p: PChar; MaxLookAhead: PtrInt): Ptrint;
199 { return value:
200   -1 if incomplete or invalid code point
201   0 if NULL character,
202   > 0 if that's the length in bytes of the code point }
203 begin
204   {$ifdef PASWSTRING_VERBOSE}WriteLn('CodePointLengthPChar START');{$endif}
205   if (p=nil) then exit(0);
206   if (MaxLookAhead<0) then exit(-1);
207   if ord(p^)<%10000000 then begin
208     // regular single byte character
209     if p^=#0 then
210       exit(0)
211     else
212       exit(1);
213   end;
214   if ord(p^)<%11000000 then begin
215     // invalid single byte character
216     exit(-1);
217   end;
218   if (MaxLookAhead=0) then exit(-1);
219   if ((ord(p^) and %11100000) = %11000000) then begin
220     // should be 2 byte character
221     if (ord(p[1]) and %11000000) = %10000000 then
222       exit(2)
223     else
224       exit(-1);
225   end;
226   if (MaxLookAhead=1) then exit(-1);
227   if ((ord(p^) and %11110000) = %11100000) then begin
228     // should be 3 byte character
229     if ((ord(p[1]) and %11000000) = %10000000)
230     and ((ord(p[2]) and %11000000) = %10000000) then
231       exit(3)
232     else
233       exit(-1);
234   end;
235   if (MaxLookAhead=2) then exit(-1);
236   if ((ord(p^) and %11111000) = %11110000) then begin
237     // should be 4 byte character
238     if ((ord(p[1]) and %11000000) = %10000000)
239     and ((ord(p[2]) and %11000000) = %10000000)
240     and ((ord(p[3]) and %11000000) = %10000000) then
241       exit(4)
242     else
243       exit(-1);
244   end;
245   exit(-1);
246 end;
247 
AnsiCompareStrnull248 function AnsiCompareStr(const s1, s2: ansistring): PtrInt;
249 begin
250   {$ifdef PASWSTRING_VERBOSE}WriteLn('AnsiCompareStr START');{$endif}
251   Result := SysUtils.CompareStr(s1, s2);
252 end;
253 
254 // Similar to AnsiCompareStr, but with PChar
StrCompAnsinull255 function StrCompAnsi(s1,s2 : PChar): PtrInt;
256 var
257   Count1: SizeInt;
258   Count2: SizeInt;
259   Count: SizeInt;
260 begin
261   {$ifdef PASWSTRING_VERBOSE}WriteLn('StrCompAnsi START');{$endif}
262   result := 0;
263   Count1:=StrLen(s1);
264   Count2:=StrLen(s2);
265   if Count1>Count2 then
266     Count:=Count2
267   else
268     Count:=Count1;
269   result := CompareMemRange(s1, s2, Count);
270   if result=0 then
271     result:=Count1-Count2;
272 end;
273 
274 
AnsiCompareTextnull275 function AnsiCompareText(const S1, S2: ansistring): PtrInt;
276 var
277   str1, str2: string;
278 begin
279   {$ifdef PASWSTRING_VERBOSE}WriteLn('AnsiCompareText START');{$endif}
280   str1 := SysToUTF8(S1);
281   str2 := SysToUTF8(S2);
282   Result := UTF8CompareText(str1, str2);
283 end;
284 
285 
AnsiStrICompnull286 function AnsiStrIComp(S1, S2: PChar): PtrInt;
287 begin
288   {$ifdef PASWSTRING_VERBOSE}WriteLn('AnsiStrIComp START');{$endif}
289   Result := AnsiCompareText(StrPas(s1),StrPas(s2));
290 end;
291 
292 
AnsiStrLCompnull293 function AnsiStrLComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
294   var
295     a, b: pchar;
296 begin
297   {$ifdef PASWSTRING_VERBOSE}WriteLn('AnsiStrLComp START');{$endif}
298   Result := 0;
299   if (maxlen=0) then
300     exit(0);
301   if (s1[maxlen]<>#0) then
302     begin
303       getmem(a,maxlen+1);
304       move(s1^,a^,maxlen);
305       a[maxlen]:=#0;
306     end
307   else
308     a:=s1;
309   if (s2[maxlen]<>#0) then
310     begin
311       getmem(b,maxlen+1);
312       move(s2^,b^,maxlen);
313       b[maxlen]:=#0;
314     end
315   else
316     b:=s2;
317   result:=StrCompAnsi(a,b);
318   if (a<>s1) then
319     freemem(a);
320   if (b<>s2) then
321     freemem(b);
322 end;
323 
324 
AnsiStrLICompnull325 function AnsiStrLIComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
326   var
327     a, b: ansistring;
328 begin
329   {$ifdef PASWSTRING_VERBOSE}WriteLn('AnsiStrLIComp START');{$endif}
330   if (maxlen=0) then
331     exit(0);
332   setlength(a,maxlen);
333   move(s1^,a[1],maxlen);
334   setlength(b,maxlen);
335   move(s2^,b[1],maxlen);
336   result:=AnsiCompareText(a,b);
337 end;
338 
339 
340 procedure ansi2pchar(const s: ansistring; const orgp: pchar; out p: pchar);
341 var
342   newlen: sizeint;
343 begin
344   {$ifdef PASWSTRING_VERBOSE}WriteLn('ansi2pchar START');{$endif}
345   newlen:=length(s);
346   if newlen>strlen(orgp) then
347     fpc_rangeerror;
348   p:=orgp;
349   if (newlen>0) then
350     move(s[1],p[0],newlen);
351   p[newlen]:=#0;
352 end;
353 
354 
AnsiStrLowernull355 function AnsiStrLower(Str: PChar): PChar;
356 var
357   temp: ansistring;
358 begin
359   {$ifdef PASWSTRING_VERBOSE}WriteLn('AnsiStrLower START');{$endif}
360   temp:=loweransistring(str);
361   ansi2pchar(temp,str,result);
362 end;
363 
364 
AnsiStrUppernull365 function AnsiStrUpper(Str: PChar): PChar;
366 var
367   temp: ansistring;
368 begin
369   {$ifdef PASWSTRING_VERBOSE}WriteLn('AnsiStrUpper START');{$endif}
370   temp:=upperansistring(str);
371   ansi2pchar(temp,str,result);
372 end;
373 
374 
375 procedure InitThread;
376 begin
377 end;
378 
379 
380 procedure FiniThread;
381 begin
382 end;
383 
384 { Unicode }
385 
386 procedure Unicode2AnsiMove(source:pwidechar;var dest:ansistring;len:SizeInt);
387 begin
388   {$ifdef PASWSTRING_VERBOSE}WriteLn('Unicode2AnsiMove START');{$endif}
389   dest := UTF16ToUTF8(source,len);
390   {$ifdef PASWSTRING_SUPPORT_NONUTF8_ANSISTRING}
391   // And correct to the real Ansi encoding
392   dest := ConvertEncoding(dest, EncodingUTF8, GetDefaultTextEncoding());
393   {$endif}
394 end;
395 
396 procedure Ansi2UnicodeMove(source:pchar;var dest:UnicodeString;len:SizeInt);
397 {$IFDEF PASWSTRING_SUPPORT_NONUTF8_ANSISTRING}
398 var
399   ansistr: ansistring;
400 {$ENDIF}
401 begin
402   {$ifdef PASWSTRING_VERBOSE}WriteLn('Ansi2UnicodeMove START');{$endif}
403   {$IFDEF PASWSTRING_SUPPORT_NONUTF8_ANSISTRING}
404   if NeedRTLAnsi then begin
405     // Copy the originating string taking into account the specified length
406     SetLength(ansistr, len);
407     System.Move(source^, ansistr[1], len);
408     // Convert to UTF-8
409     ansistr := ConvertEncoding(ansistr, GetDefaultTextEncoding(), EncodingUTF8);
410     // Now convert it, using UTF-8 -> UTF-16
411     dest := UTF8ToUTF16(ansistr);
412   end else begin
413     dest := UTF8ToUTF16(source,len);
414   end;
415   {$ELSE}
416   dest := UTF8ToUTF16(source,len);
417   {$ENDIF}
418 end;
419 
UpperUnicodeStringnull420 function UpperUnicodeString(const s : UnicodeString) : UnicodeString;
421 var
422   str: string;
423 begin
424   {$ifdef PASWSTRING_VERBOSE}WriteLn('UpperUnicodeString START');{$endif}
425   str := UTF16ToUTF8(s);
426   str := UTF8UpperCase(str);
427   Result := UTF8ToUTF16(str);
428 end;
429 
LowerUnicodeStringnull430 function LowerUnicodeString(const s : UnicodeString) : UnicodeString;
431 var
432   str: string;
433 begin
434   {$ifdef PASWSTRING_VERBOSE}WriteLn('LowerUnicodeString START');{$endif}
435   str := UTF16ToUTF8(s);
436   str := UTF8LowerCase(str);
437   Result := UTF8ToUTF16(str);
438 end;
439 
440 // Just do a simple byte comparison
441 // A more complex analysis would require normalization
PasUnicodeCompareStrnull442 function PasUnicodeCompareStr(const s1, s2 : unicodestring) : PtrInt;
443 var
444   count, count1, count2: integer;
445 begin
446   {$ifdef PASWSTRING_VERBOSE}WriteLn('PasUnicodeCompareStr START');{$endif}
447   result := 0;
448   Count1 := Length(S1);
449   Count2 := Length(S2);
450   if Count1>Count2 then
451     Count:=Count2
452   else
453     Count:=Count1;
454   result := SysUtils.CompareMemRange(Pointer(S1),Pointer(S2), Count*2);
455   if result=0 then
456     result:=Count1-Count2;
457 end;
458 
PasUnicodeCompareTextnull459 function PasUnicodeCompareText(const s1, s2 : unicodestring): PtrInt;
460 var
461   a, b: string;
462 begin
463   {$ifdef PASWSTRING_VERBOSE}WriteLn('PasUnicodeCompareText START');{$endif}
464   a := UTF16ToUTF8(s1);
465   a := UTF8LowerCase(a);
466   b := UTF16ToUTF8(s2);
467   b := UTF8LowerCase(b);
468   result := UTF8CompareText(a,b);
469 end;
470 
471 Procedure SetPasWideStringManager;
472 Var
473   PasWideStringManager : TUnicodeStringManager;
474 begin
475   PasWideStringManager:=widestringmanager;
476   PasWideStringManager.Wide2AnsiMoveProc:=@Wide2AnsiMove;
477   PasWideStringManager.Ansi2WideMoveProc:=@Ansi2WideMove;
478 
479   //    UpperUTF8 : procedure(p:PUTF8String);
480   PasWideStringManager.UpperWideStringProc:=@UpperWideString;
481   //    UpperUCS4 : procedure(p:PUCS4Char);
482   //    LowerUTF8 : procedure(p:PUTF8String);
483   PasWideStringManager.LowerWideStringProc:=@LowerWideString;
484   //    LowerUCS4 : procedure(p:PUCS4Char);
485 
486   {
487     CompUTF8 : function(p1,p2:PUTF8String) : shortint;
488     CompUCS2 : function(p1,p2:PUCS2Char) : shortint;
489     CompUCS4 : function(p1,p2:PUC42Char) : shortint;
490   }
491   PasWideStringManager.CompareWideStringProc:=@WideCompareStr;
492   PasWideStringManager.CompareTextWideStringProc:=@WideCompareText;
493 
494   { return value: number of code points in the string. Whenever an invalid
495     code point is encountered, all characters part of this invalid code point
496     are considered to form one "character" and the next character is
497     considered to be the start of a new (possibly also invalid) code point }
498   PasWideStringManager.CharLengthPCharProc:=@CharLengthPChar;
499   PasWideStringManager.CodePointLengthProc:=@CodePointLengthPChar;
500 
501   { Ansi }
502   PasWideStringManager.UpperAnsiStringProc:=@UpperAnsiString;
503   PasWideStringManager.LowerAnsiStringProc:=@LowerAnsiString;
504   PasWideStringManager.CompareStrAnsiStringProc:=@AnsiCompareStr;
505   PasWideStringManager.CompareTextAnsiStringProc:=@AnsiCompareText;
506   PasWideStringManager.StrCompAnsiStringProc:=@StrCompAnsi;
507   PasWideStringManager.StrICompAnsiStringProc:=@AnsiStrIComp;
508   PasWideStringManager.StrLCompAnsiStringProc:=@AnsiStrLComp;
509   PasWideStringManager.StrLICompAnsiStringProc:=@AnsiStrLIComp;
510   PasWideStringManager.StrLowerAnsiStringProc:=@AnsiStrLower;
511   PasWideStringManager.StrUpperAnsiStringProc:=@AnsiStrUpper;
512   PasWideStringManager.ThreadInitProc:=@InitThread;
513   PasWideStringManager.ThreadFiniProc:=@FiniThread;
514 
515   { Unicode }
516   PasWideStringManager.Unicode2AnsiMoveProc:=@Unicode2AnsiMove;
517   PasWideStringManager.Ansi2UnicodeMoveProc:=@Ansi2UnicodeMove;
518   PasWideStringManager.UpperUnicodeStringProc:=@UpperUnicodeString;
519   PasWideStringManager.LowerUnicodeStringProc:=@LowerUnicodeString;
520   PasWideStringManager.CompareUnicodeStringProc:=@PasUnicodeCompareStr;
521   PasWideStringManager.CompareTextUnicodeStringProc:=@PasUnicodeCompareText;
522 
523   SetUnicodeStringManager(PasWideStringManager);
524 end;
525 
526 
527 initialization
528   SetPasWideStringManager;
529 {$ENDIF}
530 end.
531