1{ 2 This file is part of the Free Pascal run time library. 3 Copyright (c) 2005 by Florian Klaempfl, 4 member of the Free Pascal development team. 5 6 libc based wide string support 7 8 See the file COPYING.FPC, included in this distribution, 9 for details about the copyright. 10 11 This program is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 14 **********************************************************************} 15 16{$mode objfpc} 17{$inline on} 18{$implicitexceptions off} 19 20unit cwstring; 21 22interface 23 24procedure SetCWidestringManager; 25 26implementation 27 28{$linklib c} 29 30// Linux (and maybe glibc platforms in general), have iconv in glibc. 31{$if defined(linux) or defined(solaris)} 32 {$define iconv_is_in_libc} 33{$endif} 34 35{$ifdef netbsd} 36 {$ifndef DISABLE_ICONV_LIBC} 37 {$define iconv_is_in_libc} 38 {$endif} 39{$endif} 40 41{$ifndef iconv_is_in_libc} 42 {$if defined(haiku)} 43 {$linklib textencoding} 44 {$else} 45 {$linklib iconv} 46 {$endif} 47 {$define useiconv} 48{$endif not iconv_is_in_libc} 49 50{$i rtldefs.inc} 51 52Uses 53 BaseUnix, 54 ctypes, 55 unix, 56 unixtype, 57 initc, 58 dynlibs, 59 unixcp; 60 61Const 62{$ifndef useiconv} 63 libiconvname='c'; // is in libc under Linux. 64 libprefix='lib'; 65{$else} 66 {$ifdef haiku} 67 libiconvname='textencoding'; // is in libtextencoding under Haiku 68 libprefix='lib'; 69 {$else} 70 {$if defined(darwin) or defined(aix)} 71 libiconvname='libiconv'; 72 libprefix=''; 73 {$else} 74 libiconvname='iconv'; 75 libprefix='lib'; 76 {$endif} 77 {$endif} 78{$endif} 79 80{ helper functions from libc } 81function towlower(__wc:wint_t):wint_t;cdecl;external clib name 'towlower'; 82function towupper(__wc:wint_t):wint_t;cdecl;external clib name 'towupper'; 83 84function wcscoll (__s1:pwchar_t; __s2:pwchar_t):cint;cdecl;external clib name 'wcscoll'; 85function strcoll (__s1:pchar; __s2:pchar):cint;cdecl;external clib name 'strcoll'; 86{$ifdef netbsd} 87 { NetBSD has a new setlocale function defined in /usr/include/locale.h 88 that should be used } 89function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name '__setlocale_mb_len_max_32'; 90{$else} 91function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name 'setlocale'; 92{$endif} 93{$if not(defined(beos) and not defined(haiku))} 94function mbrtowc(pwc: pwchar_t; const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrtowc'; 95function wcrtomb(s: pchar; wc: wchar_t; ps: pmbstate_t): size_t; cdecl; external clib name 'wcrtomb'; 96function mbrlen(const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrlen'; 97{$else beos} 98function mbtowc(pwc: pwchar_t; const s: pchar; n: size_t): size_t; cdecl; external clib name 'mbtowc'; 99function wctomb(s: pchar; wc: wchar_t): size_t; cdecl; external clib name 'wctomb'; 100function mblen(const s: pchar; n: size_t): size_t; cdecl; external clib name 'mblen'; 101{$endif beos} 102 103 104const 105{$if defined(linux)} 106 __LC_CTYPE = 0; 107 LC_ALL = 6; 108 _NL_CTYPE_CLASS = (__LC_CTYPE shl 16); 109 _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14; 110 CODESET = _NL_CTYPE_CODESET_NAME; 111{$elseif defined(darwin)} 112 CODESET = 0; 113 LC_ALL = 0; 114{$elseif defined(FreeBSD)} // actually FreeBSD5. internationalisation is afaik not default on 4. 115 __LC_CTYPE = 0; 116 LC_ALL = 0; 117 _NL_CTYPE_CLASS = (__LC_CTYPE shl 16); 118 _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14; 119 CODESET = 0; // _NL_CTYPE_CODESET_NAME; 120{$elseif defined(solaris)} 121 {$define ACCEPT_646} 122 CODESET=49; 123 LC_ALL = 6; 124{$elseif defined(beos)} 125 {$ifdef haiku} 126 CODESET= 0; // Checked for Haiku 127 LC_ALL = 0; // Checked for Haiku 128 {$else} 129 {$warning check correct value for BeOS} 130 CODESET=49; 131 LC_ALL = 6; // Checked for BeOS 132 {$endif} 133 ESysEILSEQ = EILSEQ; 134{$elseif defined(OpenBSD)} 135 {$define ACCEPT_646} 136 CODESET = 51; 137 LC_ALL = 0; 138{$elseif defined(NetBSD)} 139 {$define ACCEPT_646} 140 CODESET = 51; 141 LC_ALL = 0; 142{$elseif defined(aix)} 143 CODESET = 49; 144 LC_ALL = -1; 145{$elseif defined(dragonfly)} 146 CODESET = 0; 147 LC_ALL = 0; 148 __LC_CTYPE = 0; 149 _NL_CTYPE_CLASS = (__LC_CTYPE shl 16); 150 _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14; 151{$else not aix} 152{$error lookup the value of CODESET in /usr/include/langinfo.h, and the value of LC_ALL in /usr/include/locale.h for your OS } 153// and while doing it, check if iconv is in libc, and if the symbols are prefixed with iconv_ or libiconv_ 154{$endif} 155 156{ unicode encoding name } 157{$ifdef FPC_LITTLE_ENDIAN} 158 unicode_encoding2 = 'UTF-16LE'; 159 unicode_encoding4 = 'UCS-4LE'; 160{$else FPC_LITTLE_ENDIAN} 161{$ifdef AIX} 162 unicode_encoding2 = 'UTF-16'; 163{$else AIX} 164 unicode_encoding2 = 'UTF-16BE'; 165 unicode_encoding4 = 'UCS-4BE'; 166{$endif AIX} 167{$endif FPC_LITTLE_ENDIAN} 168 169{ en_US.UTF-8 needs maximally 6 chars, UCS-4/UTF-32 needs 4 } 170{ -> 10 should be enough? Should actually use MB_CUR_MAX, but } 171{ that's a libc macro mapped to internal functions/variables } 172{ and thus not a stable external API on systems where libc } 173{ breaks backwards compatibility every now and then } 174 MB_CUR_MAX = 10; 175 176{ Requests for iconvctl } 177 ICONV_TRIVIALP = 0; // int *argument 178 ICONV_GET_TRANSLITERATE = 1; // int *argument 179 ICONV_SET_TRANSLITERATE = 2; // const int *argument 180 ICONV_GET_DISCARD_ILSEQ = 3; // int *argument 181 ICONV_SET_DISCARD_ILSEQ = 4; // const int *argument 182 ICONV_SET_HOOKS = 5; // const struct iconv_hooks *argument 183 ICONV_SET_FALLBACKS = 6; // const struct iconv_fallbacks *argument 184 185type 186 piconv_t = ^iconv_t; 187 iconv_t = pointer; 188 nl_item = cint; 189 190{$ifdef haiku} 191 function nl_langinfo(__item:nl_item):pchar;cdecl;external 'root' name 'nl_langinfo'; 192{$else} 193 {$ifndef beos} 194 function nl_langinfo(__item:nl_item):pchar;cdecl;external libiconvname name 'nl_langinfo'; 195 {$endif} 196{$endif} 197 198{$if (not defined(bsd) and not defined(beos)) or defined(iconv_is_in_libc) or (defined(darwin) and not defined(cpupowerpc32))} 199function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'iconv_open'; 200function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'iconv'; 201function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'iconv_close'; 202const 203 iconvctlname='iconvctl'; 204{$else} 205function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'libiconv_open'; 206function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'libiconv'; 207function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'libiconv_close'; 208const 209 iconvctlname='libiconvctl'; 210{$endif} 211var 212 iconvctl:function(__cd:iconv_t; __request:cint; __argument:pointer):cint;cdecl; 213 214procedure fpc_rangeerror; [external name 'FPC_RANGEERROR']; 215 216 217threadvar 218 iconv_ansi2wide, 219 iconv_wide2ansi : iconv_t; 220 { since we cache the iconv_t converters, we have to do the same 221 for the DefaultSystemCodePage variable since if it changes, we 222 have to re-initialize the converters too. We can't do that via 223 a callback in the widestring manager because DefaultSystemCodePage 224 is not a threadvar and we can't automatically change this in all 225 threads } 226 current_DefaultSystemCodePage: TSystemCodePage; 227 228 229procedure InitThread; 230var 231 transliterate: cint; 232 iconvindex: longint; 233{$if not(defined(darwin) and (defined(cpuarm) or defined(cpuaarch64))) and not defined(iphonesim)} 234 iconvname: rawbytestring; 235{$endif} 236begin 237 current_DefaultSystemCodePage:=DefaultSystemCodePage; 238{$if not(defined(darwin) and (defined(cpuarm) or defined(cpuaarch64))) and not defined(iphonesim)} 239 iconvindex:=GetCodepageData(DefaultSystemCodePage); 240 if iconvindex<>-1 then 241 iconvname:=UnixCpMap[iconvindex].name 242 else 243 { default to UTF-8 on Unix platforms } 244 iconvname:='UTF-8'; 245 iconv_wide2ansi:=iconv_open(pchar(iconvname),unicode_encoding2); 246 iconv_ansi2wide:=iconv_open(unicode_encoding2,pchar(iconvname)); 247{$else} 248 { Unix locale settings are ignored on iPhoneOS/iPhoneSimulator } 249 iconv_wide2ansi:=iconv_open('UTF-8',unicode_encoding2); 250 iconv_ansi2wide:=iconv_open(unicode_encoding2,'UTF-8'); 251{$endif} 252 if assigned(iconvctl) and 253 (iconv_wide2ansi<>iconv_t(-1)) then 254 begin 255 transliterate:=1; 256 iconvctl(iconv_wide2ansi,ICONV_SET_TRANSLITERATE,@transliterate); 257 end; 258end; 259 260 261procedure FiniThread; 262begin 263 if (iconv_wide2ansi <> iconv_t(-1)) then 264 iconv_close(iconv_wide2ansi); 265 if (iconv_ansi2wide <> iconv_t(-1)) then 266 iconv_close(iconv_ansi2wide); 267end; 268 269 270{$if defined(beos) and not defined(haiku)} 271function nl_langinfo(__item:nl_item):pchar; 272begin 273 {$warning TODO BeOS nl_langinfo or more uptodate port of iconv...} 274 // Now implement the minimum required to correctly initialize WideString support 275 case __item of 276 CODESET : Result := 'UTF-8'; // BeOS use UTF-8 277 else 278 begin 279 Assert(False, 'nl_langinfo was called with an unknown nl_item value'); 280 Result := ''; 281 end; 282 end; 283end; 284{$endif} 285 286 287function open_iconv_for_cps(cp: TSystemCodePage; const otherencoding: pchar; cp_is_from: boolean): iconv_t; 288 var 289 iconvindex: longint; 290 begin 291 { TODO: add caching (then we also don't need separate code for 292 the default system page and other ones) 293 294 -- typecasting an ansistring function result to pchar is 295 unsafe normally, but these are constant strings -> no 296 problem } 297 open_iconv_for_cps:=iconv_t(-1); 298 iconvindex:=GetCodepageData(cp); 299 if iconvindex=-1 then 300 exit; 301 repeat 302 if cp_is_from then 303 open_iconv_for_cps:=iconv_open(otherencoding,pchar(UnixCpMap[iconvindex].name)) 304 else 305 open_iconv_for_cps:=iconv_open(pchar(UnixCpMap[iconvindex].name),otherencoding); 306 inc(iconvindex); 307 until (open_iconv_for_cps<>iconv_t(-1)) or 308 (iconvindex>high(UnixCpMap)) or 309 (UnixCpMap[iconvindex].cp<>cp); 310 end; 311 312 313{$ifdef aix} 314{$i cwstraix.inc} 315{$endif aix} 316 317procedure Wide2AnsiMove(source:pwidechar; var dest:RawByteString; cp:TSystemCodePage; len:SizeInt); 318 var 319 outlength, 320 outoffset, 321 srclen, 322 outleft : size_t; 323 use_iconv: iconv_t; 324 srcpos : pwidechar; 325 destpos: pchar; 326 mynil : pchar; 327 my0 : size_t; 328 err : longint; 329 transliterate: cint; 330 free_iconv: boolean; 331{$ifdef aix} 332 intermediate: rawbytestring; 333{$endif aix} 334 begin 335{$ifdef aix} 336 { AIX libiconv does not support converting cp866 to anything else except 337 for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 } 338 if cp=866 then 339 begin 340 Wide2AnsiMove(source,intermediate,28595,len); 341 if handle_aix_intermediate(pchar(intermediate),28595,cp,dest,len) then 342 exit; 343 end; 344{$endif aix} 345 if (cp=DefaultSystemCodePage) then 346 begin 347 { update iconv converter in case the DefaultSystemCodePage has been 348 changed } 349 if current_DefaultSystemCodePage<>DefaultSystemCodePage then 350 begin 351 FiniThread; 352 InitThread; 353 end; 354 use_iconv:=iconv_wide2ansi; 355 free_iconv:=false; 356 end 357 else 358 begin 359 use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,false); 360 if (use_iconv<>iconv_t(-1)) and 361 assigned(iconvctl) then 362 begin 363 transliterate:=1; 364 iconvctl(use_iconv,ICONV_SET_TRANSLITERATE,@transliterate); 365 end; 366 free_iconv:=true; 367 end; 368 { unsupported encoding -> default move } 369 if use_iconv=iconv_t(-1) then 370 begin 371 DefaultUnicode2AnsiMove(source,dest,DefaultSystemCodePage,len); 372 exit; 373 end; 374 mynil:=nil; 375 my0:=0; 376 { rought estimation } 377 setlength(dest,len*3); 378 outlength:=len*3; 379 srclen:=len*2; 380 srcpos:=source; 381 destpos:=pchar(dest); 382 outleft:=outlength; 383 while iconv(use_iconv,ppchar(@srcpos),@srclen,@destpos,@outleft)=size_t(-1) do 384 begin 385 err:=fpgetCerrno; 386 case err of 387 { last character is incomplete sequence } 388 ESysEINVAL, 389 { incomplete sequence in the middle } 390 ESysEILSEQ: 391 begin 392 { skip and set to '?' } 393 inc(srcpos); 394 dec(srclen,2); 395 destpos^:='?'; 396 inc(destpos); 397 dec(outleft); 398 { reset } 399 iconv(use_iconv,@mynil,@my0,@mynil,@my0); 400 if err=ESysEINVAL then 401 break; 402 end; 403 ESysE2BIG: 404 begin 405 outoffset:=destpos-pchar(dest); 406 { extend } 407 setlength(dest,outlength+len*3); 408 inc(outleft,len*3); 409 inc(outlength,len*3); 410 { string could have been moved } 411 destpos:=pchar(dest)+outoffset; 412 end; 413 else 414 runerror(231); 415 end; 416 end; 417 // truncate string 418 setlength(dest,length(dest)-outleft); 419 SetCodePage(dest,cp,false); 420 if free_iconv then 421 iconv_close(use_iconv); 422 end; 423 424 425procedure Ansi2WideMove(source:pchar; cp:TSystemCodePage; var dest:widestring; len:SizeInt); 426 var 427 outlength, 428 outoffset, 429 outleft : size_t; 430 use_iconv: iconv_t; 431 srcpos, 432 destpos: pchar; 433 mynil : pchar; 434 my0 : size_t; 435 err: cint; 436 iconvindex: longint; 437 free_iconv: boolean; 438{$ifdef aix} 439 intermediate: rawbytestring; 440{$endif aix} 441 begin 442{$ifdef aix} 443 { AIX libiconv does not support converting cp866 to anything else except 444 for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 } 445 if cp=866 then 446 begin 447 if handle_aix_intermediate(source,cp,cp,intermediate,len) then 448 source:=pchar(intermediate); 449 end; 450{$endif aix} 451 if (cp=DefaultSystemCodePage) then 452 begin 453 { update iconv converter in case the DefaultSystemCodePage has been 454 changed } 455 if current_DefaultSystemCodePage<>DefaultSystemCodePage then 456 begin 457 FiniThread; 458 InitThread; 459 end; 460 use_iconv:=iconv_ansi2wide; 461 free_iconv:=false; 462 end 463 else 464 begin 465 { TODO: add caching (then we also don't need separate code for 466 the default system page and other ones) 467 468 -- typecasting an ansistring function result to pchar is 469 unsafe normally, but these are constant strings -> no 470 problem } 471 use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,true); 472 free_iconv:=true; 473 end; 474 { unsupported encoding -> default move } 475 if use_iconv=iconv_t(-1) then 476 begin 477 DefaultAnsi2UnicodeMove(source,DefaultSystemCodePage,dest,len); 478 exit; 479 end; 480 mynil:=nil; 481 my0:=0; 482 // extra space 483 outlength:=len+1; 484 setlength(dest,outlength); 485 srcpos:=source; 486 destpos:=pchar(dest); 487 outleft:=outlength*2; 488 while iconv(use_iconv,@srcpos,psize(@len),@destpos,@outleft)=size_t(-1) do 489 begin 490 err:=fpgetCerrno; 491 case err of 492 ESysEINVAL, 493 ESysEILSEQ: 494 begin 495 { skip and set to '?' } 496 inc(srcpos); 497 dec(len); 498 pwidechar(destpos)^:='?'; 499 inc(destpos,2); 500 dec(outleft,2); 501 { reset } 502 iconv(use_iconv,@mynil,@my0,@mynil,@my0); 503 if err=ESysEINVAL then 504 break; 505 end; 506 ESysE2BIG: 507 begin 508 outoffset:=destpos-pchar(dest); 509 { extend } 510 setlength(dest,outlength+len); 511 inc(outleft,len*2); 512 inc(outlength,len); 513 { string could have been moved } 514 destpos:=pchar(dest)+outoffset; 515 end; 516 else 517 runerror(231); 518 end; 519 end; 520 // truncate string 521 setlength(dest,length(dest)-outleft div 2); 522 if free_iconv then 523 iconv_close(use_iconv); 524 end; 525 526 527function LowerWideString(const s : WideString) : WideString; 528 var 529 i : SizeInt; 530 begin 531 SetLength(result,length(s)); 532 for i:=0 to length(s)-1 do 533 pwidechar(result)[i]:=WideChar(towlower(wint_t(s[i+1]))); 534 end; 535 536 537function UpperWideString(const s : WideString) : WideString; 538 var 539 i : SizeInt; 540 begin 541 SetLength(result,length(s)); 542 for i:=0 to length(s)-1 do 543 pwidechar(result)[i]:=WideChar(towupper(wint_t(s[i+1]))); 544 end; 545 546 547procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline; 548begin 549 if (len>length(s)) then 550 if (length(s) < 10*256) then 551 setlength(s,length(s)+10) 552 else 553 setlength(s,length(s)+length(s) shr 8); 554end; 555 556 557procedure ConcatCharToAnsiStr(const c: char; var S: AnsiString; var index: SizeInt); 558begin 559 EnsureAnsiLen(s,index); 560 pchar(@s[index])^:=c; 561 inc(index); 562end; 563 564 565{ concatenates an utf-32 char to a widestring. S *must* be unique when entering. } 566{$if not(defined(beos) and not defined(haiku))} 567procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt; var mbstate: mbstate_t); 568{$else not beos} 569procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt); 570{$endif beos} 571var 572 p : pchar; 573 mblen : size_t; 574begin 575 { we know that s is unique -> avoid uniquestring calls} 576 p:=@s[index]; 577 if (nc<=127) then 578 ConcatCharToAnsiStr(char(nc),s,index) 579 else 580 begin 581 EnsureAnsiLen(s,index+MB_CUR_MAX); 582{$if not(defined(beos) and not defined(haiku))} 583 mblen:=wcrtomb(p,wchar_t(nc),@mbstate); 584{$else not beos} 585 mblen:=wctomb(p,wchar_t(nc)); 586{$endif not beos} 587 if (mblen<>size_t(-1)) then 588 inc(index,mblen) 589 else 590 begin 591 { invalid wide char } 592 p^:='?'; 593 inc(index); 594 end; 595 end; 596end; 597 598 599function LowerAnsiString(const s : AnsiString) : AnsiString; 600 var 601 i, slen, 602 resindex : SizeInt; 603 mblen : size_t; 604{$if not(defined(beos) and not defined(haiku))} 605 ombstate, 606 nmbstate : mbstate_t; 607{$endif beos} 608 wc : wchar_t; 609 begin 610{$if not(defined(beos) and not defined(haiku))} 611 fillchar(ombstate,sizeof(ombstate),0); 612 fillchar(nmbstate,sizeof(nmbstate),0); 613{$endif beos} 614 slen:=length(s); 615 SetLength(result,slen+10); 616 i:=1; 617 resindex:=1; 618 while (i<=slen) do 619 begin 620 if (s[i]<=#127) then 621 begin 622 wc:=wchar_t(s[i]); 623 mblen:= 1; 624 end 625 else 626{$if not(defined(beos) and not defined(haiku))} 627 mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate); 628{$else not beos} 629 mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1); 630{$endif not beos} 631 case mblen of 632 size_t(-2): 633 begin 634 { partial invalid character, copy literally } 635 while (i<=slen) do 636 begin 637 ConcatCharToAnsiStr(s[i],result,resindex); 638 inc(i); 639 end; 640 end; 641 size_t(-1), 0: 642 begin 643 { invalid or null character } 644 ConcatCharToAnsiStr(s[i],result,resindex); 645 inc(i); 646 end; 647 else 648 begin 649 { a valid sequence } 650 { even if mblen = 1, the lowercase version may have a } 651 { different length } 652 { We can't do anything special if wchar_t is 16 bit... } 653{$if not(defined(beos) and not defined(haiku))} 654 ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex,nmbstate); 655{$else not beos} 656 ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex); 657{$endif not beos} 658 inc(i,mblen); 659 end; 660 end; 661 end; 662 SetLength(result,resindex-1); 663 end; 664 665 666function UpperAnsiString(const s : AnsiString) : AnsiString; 667 var 668 i, slen, 669 resindex : SizeInt; 670 mblen : size_t; 671{$if not(defined(beos) and not defined(haiku))} 672 ombstate, 673 nmbstate : mbstate_t; 674{$endif beos} 675 wc : wchar_t; 676 begin 677{$if not(defined(beos) and not defined(haiku))} 678 fillchar(ombstate,sizeof(ombstate),0); 679 fillchar(nmbstate,sizeof(nmbstate),0); 680{$endif beos} 681 slen:=length(s); 682 SetLength(result,slen+10); 683 i:=1; 684 resindex:=1; 685 while (i<=slen) do 686 begin 687 if (s[i]<=#127) then 688 begin 689 wc:=wchar_t(s[i]); 690 mblen:= 1; 691 end 692 else 693{$if not(defined(beos) and not defined(haiku))} 694 mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate); 695{$else not beos} 696 mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1); 697{$endif beos} 698 case mblen of 699 size_t(-2): 700 begin 701 { partial invalid character, copy literally } 702 while (i<=slen) do 703 begin 704 ConcatCharToAnsiStr(s[i],result,resindex); 705 inc(i); 706 end; 707 end; 708 size_t(-1), 0: 709 begin 710 { invalid or null character } 711 ConcatCharToAnsiStr(s[i],result,resindex); 712 inc(i); 713 end; 714 else 715 begin 716 { a valid sequence } 717 { even if mblen = 1, the uppercase version may have a } 718 { different length } 719 { We can't do anything special if wchar_t is 16 bit... } 720{$if not(defined(beos) and not defined(haiku))} 721 ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex,nmbstate); 722{$else not beos} 723 ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex); 724{$endif not beos} 725 inc(i,mblen); 726 end; 727 end; 728 end; 729 SetLength(result,resindex-1); 730 end; 731 732function WideStringToUCS4StringNoNulls(const s : WideString) : UCS4String; 733 var 734 i, slen, 735 destindex : SizeInt; 736 uch : UCS4Char; 737 begin 738 slen:=length(s); 739 setlength(result,slen+1); 740 i:=1; 741 destindex:=0; 742 while (i<=slen) do 743 begin 744 uch:=UCS4Char(s[i]); 745 if (uch=0) then 746 result[destindex]:=32 747 else if (uch<=$d7ff) or (uch>=$e000) then 748 result[destindex]:=uch 749 else if (uch<=$dbff) and 750 (i<slen) and 751 (s[i+1]>=#$dc00) and 752 (s[i+1]<=#$dfff) then 753 begin 754 result[destindex]:=(UCS4Char(uch-$d7c0) shl 10)+(UCS4Char(s[i+1]) xor $dc00); 755 inc(i); 756 end 757 else { invalid surrogate pair } 758 result[destindex]:=uch; 759 inc(i); 760 inc(destindex); 761 end; 762 result[destindex]:=UCS4Char(0); 763 { Trimming length in this particular case is just a waste of time, 764 because result will be interpreted as null-terminated and discarded 765 almost immediately } 766 end; 767 768 769function CompareWideString(const s1, s2 : WideString; Options : TCompareOptions) : PtrInt; 770{$if not(defined (aix) and defined(cpupowerpc32))} 771 var 772 hs1,hs2 : UCS4String; 773 us1,us2 : WideString; 774 775 begin 776 { wcscoll interprets null chars as end-of-string -> filter out } 777 if coIgnoreCase in Options then 778 begin 779 us1:=UpperWideString(s1); 780 us2:=UpperWideString(s2); 781 end 782 else 783 begin 784 us1:=s1; 785 us2:=s2; 786 end; 787 hs1:=WideStringToUCS4StringNoNulls(us1); 788 hs2:=WideStringToUCS4StringNoNulls(us2); 789 result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2)); 790 end; 791{$else} 792 { AIX/PPC32 has a 16 bit wchar_t } 793 var 794 i, len: longint; 795 us1,us2 : WideString; 796 hs1, hs2: array of widechar; 797 begin 798 if coIgnoreCase in Options then 799 begin 800 us1:=UpperWideString(s1); 801 us2:=UpperWideString(s2); 802 end 803 else 804 begin 805 us1:=s1; 806 us2:=s2; 807 end; 808 len:=length(us1); 809 setlength(hs1,len+1); 810 for i:=1 to len do 811 if us1[i]<>#0 then 812 hs1[i-1]:=us1[i] 813 else 814 hs1[i-1]:=#32; 815 hs1[len]:=#0; 816 817 len:=length(us2); 818 setlength(hs2,len+1); 819 for i:=1 to len do 820 if us2[i]<>#0 then 821 hs2[i-1]:=us2[i] 822 else 823 hs2[i-1]:=#32; 824 hs2[len]:=#0; 825 result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2)); 826 end; 827{$endif} 828 829 830 831{ return value: number of code points in the string. Whenever an invalid 832 code point is encountered, all characters part of this invalid code point 833 are considered to form one "character" and the next character is 834 considered to be the start of a new (possibly also invalid) code point } 835function CharLengthPChar(const Str: PChar): PtrInt; 836 var 837 nextlen: ptrint; 838 s: pchar; 839{$if not(defined(beos) and not defined(haiku))} 840 mbstate: mbstate_t; 841{$endif not beos} 842 begin 843 result:=0; 844 s:=str; 845{$if not(defined(beos) and not defined(haiku))} 846 fillchar(mbstate,sizeof(mbstate),0); 847{$endif not beos} 848 repeat 849{$if defined(beos) and not defined(haiku)} 850 nextlen:=ptrint(mblen(s,MB_CUR_MAX)); 851{$else beos} 852 nextlen:=ptrint(mbrlen(s,MB_CUR_MAX,@mbstate)); 853{$endif beos} 854 { skip invalid/incomplete sequences } 855 if (nextlen<0) then 856 nextlen:=1; 857 inc(result,1); 858 inc(s,nextlen); 859 until (nextlen=0); 860 end; 861 862 863function CodePointLength(const Str: PChar; maxlookahead: ptrint): PtrInt; 864{$if not(defined(beos) and not defined(haiku))} 865 var 866 mbstate: mbstate_t; 867{$endif not beos} 868 begin 869{$if defined(beos) and not defined(haiku)} 870 result:=ptrint(mblen(str,maxlookahead)); 871{$else beos} 872 fillchar(mbstate,sizeof(mbstate),0); 873 result:=ptrint(mbrlen(str,maxlookahead,@mbstate)); 874 { mbrlen can also return -2 for "incomplete but potially valid character 875 and data has been processed" } 876 if result<0 then 877 result:=-1; 878{$endif beos} 879 end; 880 881 882function StrCompAnsiIntern(s1,s2 : PChar; len1, len2: PtrInt; canmodifys1, canmodifys2: boolean): PtrInt; 883 var 884 a,b: pchar; 885 i: PtrInt; 886 begin 887 if not(canmodifys1) then 888 getmem(a,len1+1) 889 else 890 a:=s1; 891 for i:=0 to len1-1 do 892 if s1[i]<>#0 then 893 a[i]:=s1[i] 894 else 895 a[i]:=#32; 896 a[len1]:=#0; 897 898 if not(canmodifys2) then 899 getmem(b,len2+1) 900 else 901 b:=s2; 902 for i:=0 to len2-1 do 903 if s2[i]<>#0 then 904 b[i]:=s2[i] 905 else 906 b[i]:=#32; 907 b[len2]:=#0; 908 result:=strcoll(a,b); 909 if not(canmodifys1) then 910 freemem(a); 911 if not(canmodifys2) then 912 freemem(b); 913 end; 914 915 916function CompareStrAnsiString(const s1, s2: ansistring): PtrInt; 917 begin 918 result:=StrCompAnsiIntern(pchar(s1),pchar(s2),length(s1),length(s2),false,false); 919 end; 920 921 922function StrCompAnsi(s1,s2 : PChar): PtrInt; 923 begin 924 result:=strcoll(s1,s2); 925 end; 926 927 928function AnsiCompareText(const S1, S2: ansistring): PtrInt; 929 var 930 a, b: AnsiString; 931 begin 932 a:=UpperAnsistring(s1); 933 b:=UpperAnsistring(s2); 934 result:=StrCompAnsiIntern(pchar(a),pchar(b),length(a),length(b),true,true); 935 end; 936 937 938function AnsiStrIComp(S1, S2: PChar): PtrInt; 939 begin 940 result:=AnsiCompareText(ansistring(s1),ansistring(s2)); 941 end; 942 943 944function AnsiStrLComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt; 945 var 946 a, b: pchar; 947begin 948 if (maxlen=0) then 949 exit(0); 950 if (s1[maxlen]<>#0) then 951 begin 952 getmem(a,maxlen+1); 953 move(s1^,a^,maxlen); 954 a[maxlen]:=#0; 955 end 956 else 957 a:=s1; 958 if (s2[maxlen]<>#0) then 959 begin 960 getmem(b,maxlen+1); 961 move(s2^,b^,maxlen); 962 b[maxlen]:=#0; 963 end 964 else 965 b:=s2; 966 result:=StrCompAnsiIntern(a,b,maxlen,maxlen,a<>s1,b<>s2); 967 if (a<>s1) then 968 freemem(a); 969 if (b<>s2) then 970 freemem(b); 971end; 972 973 974function AnsiStrLIComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt; 975 var 976 a, b: ansistring; 977begin 978 if (maxlen=0) then 979 exit(0); 980 setlength(a,maxlen); 981 move(s1^,a[1],maxlen); 982 setlength(b,maxlen); 983 move(s2^,b[1],maxlen); 984 result:=AnsiCompareText(a,b); 985end; 986 987 988procedure ansi2pchar(const s: ansistring; const orgp: pchar; out p: pchar); 989var 990 newlen: sizeint; 991begin 992 newlen:=length(s); 993 if newlen>strlen(orgp) then 994 fpc_rangeerror; 995 p:=orgp; 996 if (newlen>0) then 997 move(s[1],p[0],newlen); 998 p[newlen]:=#0; 999end; 1000 1001 1002function AnsiStrLower(Str: PChar): PChar; 1003var 1004 temp: ansistring; 1005begin 1006 temp:=loweransistring(str); 1007 ansi2pchar(temp,str,result); 1008end; 1009 1010 1011function AnsiStrUpper(Str: PChar): PChar; 1012var 1013 temp: ansistring; 1014begin 1015 temp:=upperansistring(str); 1016 ansi2pchar(temp,str,result); 1017end; 1018 1019 1020function envvarset(const varname: pchar): boolean; 1021var 1022 varval: pchar; 1023begin 1024 varval:=fpgetenv(varname); 1025 result:= 1026 assigned(varval) and 1027 (varval[0]<>#0); 1028end; 1029 1030 1031function GetStandardCodePage(const stdcp: TStandardCodePageEnum): TSystemCodePage; 1032var 1033 langinfo: pchar; 1034begin 1035{$ifdef FPCRTL_FILESYSTEM_UTF8} 1036 if stdcp=scpFileSystemSingleByte then 1037 begin 1038 result:=CP_UTF8; 1039 exit; 1040 end; 1041{$endif} 1042 { if none of the relevant LC_* environment variables are set, fall back to 1043 UTF-8 (this happens under some versions of OS X for GUI applications, which 1044 otherwise get CP_ASCII) } 1045 if envvarset('LC_ALL') or 1046 envvarset('LC_CTYPE') or 1047 envvarset('LANG') then 1048 begin 1049 langinfo:=nl_langinfo(CODESET); 1050 { there's a bug in the Mac OS X 10.5 libc (based on FreeBSD's) 1051 that causes it to return an empty string of UTF-8 locales 1052 -> patch up (and in general, UTF-8 is a good default on 1053 Unix platforms) } 1054 if not assigned(langinfo) or 1055 (langinfo^=#0) then 1056 langinfo:='UTF-8'; 1057 Result:=GetCodepageByName(ansistring(langinfo)); 1058 end 1059 else 1060 Result:=unixcp.GetSystemCodepage; 1061end; 1062 1063{$ifdef FPC_HAS_CPSTRING} 1064 1065procedure SetStdIOCodePage(var T: Text); inline; 1066begin 1067 case TextRec(T).Mode of 1068 fmInput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleInput); 1069 fmOutput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleOutput); 1070 end; 1071end; 1072 1073procedure SetStdIOCodePages; inline; 1074begin 1075 SetStdIOCodePage(Input); 1076 SetStdIOCodePage(Output); 1077 SetStdIOCodePage(ErrOutput); 1078 SetStdIOCodePage(StdOut); 1079 SetStdIOCodePage(StdErr); 1080end; 1081{$endif FPC_HAS_CPSTRING} 1082 1083var 1084 OrgWideStringManager: TUnicodeStringManager; 1085 1086Procedure SetCWideStringManager; 1087Var 1088 CWideStringManager : TUnicodeStringManager; 1089begin 1090 GetUnicodeStringManager(OrgWideStringManager); 1091 CWideStringManager:=OrgWideStringManager; 1092 With CWideStringManager do 1093 begin 1094 Wide2AnsiMoveProc:=@Wide2AnsiMove; 1095 Ansi2WideMoveProc:=@Ansi2WideMove; 1096 1097 UpperWideStringProc:=@UpperWideString; 1098 LowerWideStringProc:=@LowerWideString; 1099 1100 CompareWideStringProc:=@CompareWideString; 1101// CompareTextWideStringProc:=@CompareTextWideString; 1102 1103 CharLengthPCharProc:=@CharLengthPChar; 1104 CodePointLengthProc:=@CodePointLength; 1105 1106 UpperAnsiStringProc:=@UpperAnsiString; 1107 LowerAnsiStringProc:=@LowerAnsiString; 1108 CompareStrAnsiStringProc:=@CompareStrAnsiString; 1109 CompareTextAnsiStringProc:=@AnsiCompareText; 1110 StrCompAnsiStringProc:=@StrCompAnsi; 1111 StrICompAnsiStringProc:=@AnsiStrIComp; 1112 StrLCompAnsiStringProc:=@AnsiStrLComp; 1113 StrLICompAnsiStringProc:=@AnsiStrLIComp; 1114 StrLowerAnsiStringProc:=@AnsiStrLower; 1115 StrUpperAnsiStringProc:=@AnsiStrUpper; 1116 ThreadInitProc:=@InitThread; 1117 ThreadFiniProc:=@FiniThread; 1118 { Unicode } 1119 Unicode2AnsiMoveProc:=@Wide2AnsiMove; 1120 Ansi2UnicodeMoveProc:=@Ansi2WideMove; 1121 UpperUnicodeStringProc:=@UpperWideString; 1122 LowerUnicodeStringProc:=@LowerWideString; 1123 CompareUnicodeStringProc:=@CompareWideString; 1124 { CodePage } 1125 GetStandardCodePageProc:=@GetStandardCodePage; 1126 end; 1127 SetUnicodeStringManager(CWideStringManager); 1128end; 1129 1130var 1131 iconvlib:TLibHandle; 1132 1133initialization 1134 SetCWideStringManager; 1135 1136 { you have to call setlocale(LC_ALL,'') to initialise the langinfo stuff } 1137 { with the information from the environment variables according to POSIX } 1138 { (some OSes do this automatically, but e.g. Darwin and Solaris don't) } 1139 setlocale(LC_ALL,''); 1140 1141 { load iconvctl function } 1142 iconvlib:=LoadLibrary(libprefix+libiconvname+'.'+SharedSuffix); 1143 if iconvlib<>0 then 1144 pointer(iconvctl):=GetProcAddress(iconvlib,iconvctlname); 1145 1146 { set the DefaultSystemCodePage } 1147 DefaultSystemCodePage:=GetStandardCodePage(scpAnsi); 1148 DefaultFileSystemCodePage:=GetStandardCodePage(scpFileSystemSingleByte); 1149 DefaultRTLFileSystemCodePage:=DefaultFileSystemCodePage; 1150 1151 {$ifdef FPC_HAS_CPSTRING} 1152 SetStdIOCodePages; 1153 {$endif FPC_HAS_CPSTRING} 1154 1155 { init conversion tables for main program } 1156 InitThread; 1157finalization 1158 { fini conversion tables for main program } 1159 FiniThread; 1160 { unload iconv library } 1161 if iconvlib<>0 then 1162 FreeLibrary(iconvlib); 1163 { restore previous (probably default) widestring manager so that subsequent calls 1164 into the widestring manager won't trigger the finalized functionality } 1165 SetWideStringManager(OrgWideStringManager); 1166end. 1167