1{
2    This file is part of the Free Pascal run time library.
3    Copyright (c) 2005 by Florian Klaempfl,
4    member of the Free Pascal development team.
5
6    libc based wide string support
7
8    See the file COPYING.FPC, included in this distribution,
9    for details about the copyright.
10
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
14 **********************************************************************}
15
16{$mode objfpc}
17{$inline on}
18{$implicitexceptions off}
19
20unit cwstring;
21
22interface
23
24procedure SetCWidestringManager;
25
26implementation
27
28{$linklib c}
29
30// Linux (and maybe glibc platforms in general), have iconv in glibc.
31{$if defined(linux) or defined(solaris)}
32  {$define iconv_is_in_libc}
33{$endif}
34
35{$ifdef netbsd}
36  {$ifndef DISABLE_ICONV_LIBC}
37    {$define iconv_is_in_libc}
38  {$endif}
39{$endif}
40
41{$ifndef iconv_is_in_libc}
42 {$if defined(haiku)}
43   {$linklib textencoding}
44 {$else}
45   {$linklib iconv}
46 {$endif}
47 {$define useiconv}
48{$endif not iconv_is_in_libc}
49
50{$i rtldefs.inc}
51
52Uses
53  BaseUnix,
54  ctypes,
55  unix,
56  unixtype,
57  initc,
58  dynlibs,
59  unixcp;
60
61Const
62{$ifndef useiconv}
63    libiconvname='c';  // is in libc under Linux.
64    libprefix='lib';
65{$else}
66  {$ifdef haiku}
67    libiconvname='textencoding';  // is in libtextencoding under Haiku
68    libprefix='lib';
69  {$else}
70    {$if defined(darwin) or defined(aix)}
71      libiconvname='libiconv';
72      libprefix='';
73    {$else}
74      libiconvname='iconv';
75      libprefix='lib';
76    {$endif}
77  {$endif}
78{$endif}
79
80{ helper functions from libc }
81function towlower(__wc:wint_t):wint_t;cdecl;external clib name 'towlower';
82function towupper(__wc:wint_t):wint_t;cdecl;external clib name 'towupper';
83
84function wcscoll (__s1:pwchar_t; __s2:pwchar_t):cint;cdecl;external clib name 'wcscoll';
85function strcoll (__s1:pchar; __s2:pchar):cint;cdecl;external clib name 'strcoll';
86{$ifdef netbsd}
87  { NetBSD has a new setlocale function defined in /usr/include/locale.h
88    that should be used }
89function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name '__setlocale_mb_len_max_32';
90{$else}
91function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name 'setlocale';
92{$endif}
93{$if not(defined(beos) and not defined(haiku))}
94function mbrtowc(pwc: pwchar_t; const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrtowc';
95function wcrtomb(s: pchar; wc: wchar_t; ps: pmbstate_t): size_t; cdecl; external clib name 'wcrtomb';
96function mbrlen(const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrlen';
97{$else beos}
98function mbtowc(pwc: pwchar_t; const s: pchar; n: size_t): size_t; cdecl; external clib name 'mbtowc';
99function wctomb(s: pchar; wc: wchar_t): size_t; cdecl; external clib name 'wctomb';
100function mblen(const s: pchar; n: size_t): size_t; cdecl; external clib name 'mblen';
101{$endif beos}
102
103
104const
105{$if defined(linux)}
106  __LC_CTYPE = 0;
107  LC_ALL = 6;
108  _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
109  _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
110  CODESET = _NL_CTYPE_CODESET_NAME;
111{$elseif defined(darwin)}
112  CODESET = 0;
113  LC_ALL = 0;
114{$elseif defined(FreeBSD)} // actually FreeBSD5. internationalisation is afaik not default on 4.
115  __LC_CTYPE = 0;
116  LC_ALL = 0;
117  _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
118  _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
119  CODESET = 0; // _NL_CTYPE_CODESET_NAME;
120{$elseif defined(solaris)}
121  {$define ACCEPT_646}
122  CODESET=49;
123  LC_ALL = 6;
124{$elseif defined(beos)}
125  {$ifdef haiku}
126  CODESET= 0; // Checked for Haiku
127  LC_ALL = 0; // Checked for Haiku
128  {$else}
129  {$warning check correct value for BeOS}
130  CODESET=49;
131  LC_ALL = 6; // Checked for BeOS
132  {$endif}
133  ESysEILSEQ = EILSEQ;
134{$elseif defined(OpenBSD)}
135  {$define ACCEPT_646}
136  CODESET = 51;
137  LC_ALL = 0;
138{$elseif defined(NetBSD)}
139  {$define ACCEPT_646}
140  CODESET = 51;
141  LC_ALL = 0;
142{$elseif defined(aix)}
143  CODESET = 49;
144  LC_ALL = -1;
145{$elseif defined(dragonfly)}
146  CODESET = 0;
147  LC_ALL = 0;
148  __LC_CTYPE = 0;
149  _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
150  _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
151{$else not aix}
152{$error lookup the value of CODESET in /usr/include/langinfo.h, and the value of LC_ALL in /usr/include/locale.h for your OS }
153// and while doing it, check if iconv is in libc, and if the symbols are prefixed with iconv_ or libiconv_
154{$endif}
155
156{ unicode encoding name }
157{$ifdef FPC_LITTLE_ENDIAN}
158  unicode_encoding2 = 'UTF-16LE';
159  unicode_encoding4 = 'UCS-4LE';
160{$else  FPC_LITTLE_ENDIAN}
161{$ifdef AIX}
162  unicode_encoding2 = 'UTF-16';
163{$else AIX}
164  unicode_encoding2 = 'UTF-16BE';
165  unicode_encoding4 = 'UCS-4BE';
166{$endif AIX}
167{$endif  FPC_LITTLE_ENDIAN}
168
169{ en_US.UTF-8 needs maximally 6 chars, UCS-4/UTF-32 needs 4   }
170{ -> 10 should be enough? Should actually use MB_CUR_MAX, but }
171{ that's a libc macro mapped to internal functions/variables  }
172{ and thus not a stable external API on systems where libc    }
173{ breaks backwards compatibility every now and then           }
174  MB_CUR_MAX = 10;
175
176{ Requests for iconvctl }
177  ICONV_TRIVIALP          = 0; // int *argument
178  ICONV_GET_TRANSLITERATE = 1; // int *argument
179  ICONV_SET_TRANSLITERATE = 2; // const int *argument
180  ICONV_GET_DISCARD_ILSEQ = 3; // int *argument
181  ICONV_SET_DISCARD_ILSEQ = 4; // const int *argument
182  ICONV_SET_HOOKS         = 5; // const struct iconv_hooks *argument
183  ICONV_SET_FALLBACKS     = 6; // const struct iconv_fallbacks *argument
184
185type
186  piconv_t = ^iconv_t;
187  iconv_t = pointer;
188  nl_item = cint;
189
190{$ifdef haiku}
191  function nl_langinfo(__item:nl_item):pchar;cdecl;external 'root' name 'nl_langinfo';
192{$else}
193  {$ifndef beos}
194  function nl_langinfo(__item:nl_item):pchar;cdecl;external libiconvname name 'nl_langinfo';
195  {$endif}
196{$endif}
197
198{$if (not defined(bsd) and not defined(beos)) or defined(iconv_is_in_libc) or (defined(darwin) and not defined(cpupowerpc32))}
199function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'iconv_open';
200function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'iconv';
201function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'iconv_close';
202const
203  iconvctlname='iconvctl';
204{$else}
205function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'libiconv_open';
206function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'libiconv';
207function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'libiconv_close';
208const
209  iconvctlname='libiconvctl';
210{$endif}
211var
212  iconvctl:function(__cd:iconv_t; __request:cint; __argument:pointer):cint;cdecl;
213
214procedure fpc_rangeerror; [external name 'FPC_RANGEERROR'];
215
216
217threadvar
218  iconv_ansi2wide,
219  iconv_wide2ansi : iconv_t;
220  { since we cache the iconv_t converters, we have to do the same
221    for the DefaultSystemCodePage variable since if it changes, we
222    have to re-initialize the converters too. We can't do that via
223    a callback in the widestring manager because DefaultSystemCodePage
224    is not a threadvar and we can't automatically change this in all
225    threads }
226  current_DefaultSystemCodePage: TSystemCodePage;
227
228
229procedure InitThread;
230var
231  transliterate: cint;
232  iconvindex: longint;
233{$if not(defined(darwin) and (defined(cpuarm) or defined(cpuaarch64))) and not defined(iphonesim)}
234  iconvname: rawbytestring;
235{$endif}
236begin
237  current_DefaultSystemCodePage:=DefaultSystemCodePage;
238{$if not(defined(darwin) and (defined(cpuarm) or defined(cpuaarch64))) and not defined(iphonesim)}
239  iconvindex:=GetCodepageData(DefaultSystemCodePage);
240  if iconvindex<>-1 then
241    iconvname:=UnixCpMap[iconvindex].name
242  else
243    { default to UTF-8 on Unix platforms }
244    iconvname:='UTF-8';
245  iconv_wide2ansi:=iconv_open(pchar(iconvname),unicode_encoding2);
246  iconv_ansi2wide:=iconv_open(unicode_encoding2,pchar(iconvname));
247{$else}
248  { Unix locale settings are ignored on iPhoneOS/iPhoneSimulator }
249  iconv_wide2ansi:=iconv_open('UTF-8',unicode_encoding2);
250  iconv_ansi2wide:=iconv_open(unicode_encoding2,'UTF-8');
251{$endif}
252  if assigned(iconvctl) and
253     (iconv_wide2ansi<>iconv_t(-1)) then
254  begin
255    transliterate:=1;
256    iconvctl(iconv_wide2ansi,ICONV_SET_TRANSLITERATE,@transliterate);
257  end;
258end;
259
260
261procedure FiniThread;
262begin
263  if (iconv_wide2ansi <> iconv_t(-1)) then
264    iconv_close(iconv_wide2ansi);
265  if (iconv_ansi2wide <> iconv_t(-1)) then
266    iconv_close(iconv_ansi2wide);
267end;
268
269
270{$if defined(beos) and not defined(haiku)}
271function nl_langinfo(__item:nl_item):pchar;
272begin
273  {$warning TODO BeOS nl_langinfo or more uptodate port of iconv...}
274  // Now implement the minimum required to correctly initialize WideString support
275  case __item of
276    CODESET : Result := 'UTF-8'; // BeOS use UTF-8
277    else
278    begin
279      Assert(False, 'nl_langinfo was called with an unknown nl_item value');
280      Result := '';
281    end;
282  end;
283end;
284{$endif}
285
286
287function open_iconv_for_cps(cp: TSystemCodePage; const otherencoding: pchar; cp_is_from: boolean): iconv_t;
288  var
289    iconvindex: longint;
290  begin
291    { TODO: add caching (then we also don't need separate code for
292      the default system page and other ones)
293
294      -- typecasting an ansistring function result to pchar is
295        unsafe normally, but these are constant strings -> no
296        problem }
297    open_iconv_for_cps:=iconv_t(-1);
298    iconvindex:=GetCodepageData(cp);
299    if iconvindex=-1 then
300      exit;
301    repeat
302      if cp_is_from then
303        open_iconv_for_cps:=iconv_open(otherencoding,pchar(UnixCpMap[iconvindex].name))
304      else
305        open_iconv_for_cps:=iconv_open(pchar(UnixCpMap[iconvindex].name),otherencoding);
306      inc(iconvindex);
307    until (open_iconv_for_cps<>iconv_t(-1)) or
308          (iconvindex>high(UnixCpMap)) or
309          (UnixCpMap[iconvindex].cp<>cp);
310  end;
311
312
313{$ifdef aix}
314{$i cwstraix.inc}
315{$endif aix}
316
317procedure Wide2AnsiMove(source:pwidechar; var dest:RawByteString; cp:TSystemCodePage; len:SizeInt);
318  var
319    outlength,
320    outoffset,
321    srclen,
322    outleft : size_t;
323    use_iconv: iconv_t;
324    srcpos : pwidechar;
325    destpos: pchar;
326    mynil : pchar;
327    my0 : size_t;
328    err : longint;
329    transliterate: cint;
330    free_iconv: boolean;
331{$ifdef aix}
332    intermediate: rawbytestring;
333{$endif aix}
334  begin
335{$ifdef aix}
336    { AIX libiconv does not support converting cp866 to anything else except
337      for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
338    if cp=866 then
339      begin
340        Wide2AnsiMove(source,intermediate,28595,len);
341        if handle_aix_intermediate(pchar(intermediate),28595,cp,dest,len) then
342          exit;
343      end;
344{$endif aix}
345    if (cp=DefaultSystemCodePage) then
346      begin
347        { update iconv converter in case the DefaultSystemCodePage has been
348          changed }
349        if current_DefaultSystemCodePage<>DefaultSystemCodePage then
350          begin
351            FiniThread;
352            InitThread;
353          end;
354        use_iconv:=iconv_wide2ansi;
355        free_iconv:=false;
356      end
357    else
358      begin
359        use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,false);
360        if (use_iconv<>iconv_t(-1)) and
361           assigned(iconvctl) then
362        begin
363          transliterate:=1;
364          iconvctl(use_iconv,ICONV_SET_TRANSLITERATE,@transliterate);
365        end;
366        free_iconv:=true;
367      end;
368    { unsupported encoding -> default move }
369    if use_iconv=iconv_t(-1) then
370      begin
371        DefaultUnicode2AnsiMove(source,dest,DefaultSystemCodePage,len);
372        exit;
373      end;
374    mynil:=nil;
375    my0:=0;
376    { rought estimation }
377    setlength(dest,len*3);
378    outlength:=len*3;
379    srclen:=len*2;
380    srcpos:=source;
381    destpos:=pchar(dest);
382    outleft:=outlength;
383    while iconv(use_iconv,ppchar(@srcpos),@srclen,@destpos,@outleft)=size_t(-1) do
384      begin
385        err:=fpgetCerrno;
386        case err of
387          { last character is incomplete sequence }
388          ESysEINVAL,
389          { incomplete sequence in the middle }
390          ESysEILSEQ:
391            begin
392              { skip and set to '?' }
393              inc(srcpos);
394              dec(srclen,2);
395              destpos^:='?';
396              inc(destpos);
397              dec(outleft);
398              { reset }
399              iconv(use_iconv,@mynil,@my0,@mynil,@my0);
400              if err=ESysEINVAL then
401                break;
402            end;
403          ESysE2BIG:
404            begin
405              outoffset:=destpos-pchar(dest);
406              { extend }
407              setlength(dest,outlength+len*3);
408              inc(outleft,len*3);
409              inc(outlength,len*3);
410              { string could have been moved }
411              destpos:=pchar(dest)+outoffset;
412            end;
413          else
414            runerror(231);
415        end;
416      end;
417    // truncate string
418    setlength(dest,length(dest)-outleft);
419    SetCodePage(dest,cp,false);
420    if free_iconv then
421      iconv_close(use_iconv);
422  end;
423
424
425procedure Ansi2WideMove(source:pchar; cp:TSystemCodePage; var dest:widestring; len:SizeInt);
426  var
427    outlength,
428    outoffset,
429    outleft : size_t;
430    use_iconv: iconv_t;
431    srcpos,
432    destpos: pchar;
433    mynil : pchar;
434    my0 : size_t;
435    err: cint;
436    iconvindex: longint;
437    free_iconv: boolean;
438{$ifdef aix}
439    intermediate: rawbytestring;
440{$endif aix}
441  begin
442{$ifdef aix}
443    { AIX libiconv does not support converting cp866 to anything else except
444      for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
445    if cp=866 then
446      begin
447        if handle_aix_intermediate(source,cp,cp,intermediate,len) then
448          source:=pchar(intermediate);
449      end;
450{$endif aix}
451    if (cp=DefaultSystemCodePage) then
452      begin
453        { update iconv converter in case the DefaultSystemCodePage has been
454          changed }
455        if current_DefaultSystemCodePage<>DefaultSystemCodePage then
456          begin
457            FiniThread;
458            InitThread;
459          end;
460        use_iconv:=iconv_ansi2wide;
461        free_iconv:=false;
462      end
463    else
464      begin
465        { TODO: add caching (then we also don't need separate code for
466          the default system page and other ones)
467
468          -- typecasting an ansistring function result to pchar is
469            unsafe normally, but these are constant strings -> no
470            problem }
471        use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,true);
472        free_iconv:=true;
473      end;
474    { unsupported encoding -> default move }
475    if use_iconv=iconv_t(-1) then
476      begin
477        DefaultAnsi2UnicodeMove(source,DefaultSystemCodePage,dest,len);
478        exit;
479      end;
480    mynil:=nil;
481    my0:=0;
482    // extra space
483    outlength:=len+1;
484    setlength(dest,outlength);
485    srcpos:=source;
486    destpos:=pchar(dest);
487    outleft:=outlength*2;
488    while iconv(use_iconv,@srcpos,psize(@len),@destpos,@outleft)=size_t(-1) do
489      begin
490        err:=fpgetCerrno;
491        case err of
492         ESysEINVAL,
493         ESysEILSEQ:
494            begin
495              { skip and set to '?' }
496              inc(srcpos);
497              dec(len);
498              pwidechar(destpos)^:='?';
499              inc(destpos,2);
500              dec(outleft,2);
501              { reset }
502              iconv(use_iconv,@mynil,@my0,@mynil,@my0);
503              if err=ESysEINVAL then
504                break;
505            end;
506          ESysE2BIG:
507            begin
508              outoffset:=destpos-pchar(dest);
509              { extend }
510              setlength(dest,outlength+len);
511              inc(outleft,len*2);
512              inc(outlength,len);
513              { string could have been moved }
514              destpos:=pchar(dest)+outoffset;
515            end;
516          else
517            runerror(231);
518        end;
519      end;
520    // truncate string
521    setlength(dest,length(dest)-outleft div 2);
522    if free_iconv then
523      iconv_close(use_iconv);
524  end;
525
526
527function LowerWideString(const s : WideString) : WideString;
528  var
529    i : SizeInt;
530  begin
531    SetLength(result,length(s));
532    for i:=0 to length(s)-1 do
533      pwidechar(result)[i]:=WideChar(towlower(wint_t(s[i+1])));
534  end;
535
536
537function UpperWideString(const s : WideString) : WideString;
538  var
539    i : SizeInt;
540  begin
541    SetLength(result,length(s));
542    for i:=0 to length(s)-1 do
543      pwidechar(result)[i]:=WideChar(towupper(wint_t(s[i+1])));
544  end;
545
546
547procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline;
548begin
549  if (len>length(s)) then
550    if (length(s) < 10*256) then
551      setlength(s,length(s)+10)
552    else
553      setlength(s,length(s)+length(s) shr 8);
554end;
555
556
557procedure ConcatCharToAnsiStr(const c: char; var S: AnsiString; var index: SizeInt);
558begin
559  EnsureAnsiLen(s,index);
560  pchar(@s[index])^:=c;
561  inc(index);
562end;
563
564
565{ concatenates an utf-32 char to a widestring. S *must* be unique when entering. }
566{$if not(defined(beos) and not defined(haiku))}
567procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt; var mbstate: mbstate_t);
568{$else not beos}
569procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt);
570{$endif beos}
571var
572  p     : pchar;
573  mblen : size_t;
574begin
575  { we know that s is unique -> avoid uniquestring calls}
576  p:=@s[index];
577  if (nc<=127) then
578    ConcatCharToAnsiStr(char(nc),s,index)
579  else
580    begin
581      EnsureAnsiLen(s,index+MB_CUR_MAX);
582{$if not(defined(beos) and not defined(haiku))}
583      mblen:=wcrtomb(p,wchar_t(nc),@mbstate);
584{$else not beos}
585      mblen:=wctomb(p,wchar_t(nc));
586{$endif not beos}
587      if (mblen<>size_t(-1)) then
588        inc(index,mblen)
589      else
590        begin
591          { invalid wide char }
592          p^:='?';
593          inc(index);
594        end;
595    end;
596end;
597
598
599function LowerAnsiString(const s : AnsiString) : AnsiString;
600  var
601    i, slen,
602    resindex : SizeInt;
603    mblen    : size_t;
604{$if not(defined(beos) and not defined(haiku))}
605    ombstate,
606    nmbstate : mbstate_t;
607{$endif beos}
608    wc       : wchar_t;
609  begin
610{$if not(defined(beos) and not defined(haiku))}
611    fillchar(ombstate,sizeof(ombstate),0);
612    fillchar(nmbstate,sizeof(nmbstate),0);
613{$endif beos}
614    slen:=length(s);
615    SetLength(result,slen+10);
616    i:=1;
617    resindex:=1;
618    while (i<=slen) do
619      begin
620        if (s[i]<=#127) then
621          begin
622            wc:=wchar_t(s[i]);
623            mblen:= 1;
624          end
625        else
626{$if not(defined(beos) and not defined(haiku))}
627          mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
628{$else not beos}
629          mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
630{$endif not beos}
631        case mblen of
632          size_t(-2):
633            begin
634              { partial invalid character, copy literally }
635              while (i<=slen) do
636                begin
637                  ConcatCharToAnsiStr(s[i],result,resindex);
638                  inc(i);
639                end;
640            end;
641          size_t(-1), 0:
642            begin
643              { invalid or null character }
644              ConcatCharToAnsiStr(s[i],result,resindex);
645              inc(i);
646            end;
647          else
648            begin
649              { a valid sequence }
650              { even if mblen = 1, the lowercase version may have a }
651              { different length                                     }
652              { We can't do anything special if wchar_t is 16 bit... }
653{$if not(defined(beos) and not defined(haiku))}
654              ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex,nmbstate);
655{$else not beos}
656              ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex);
657{$endif not beos}
658              inc(i,mblen);
659            end;
660          end;
661      end;
662    SetLength(result,resindex-1);
663  end;
664
665
666function UpperAnsiString(const s : AnsiString) : AnsiString;
667  var
668    i, slen,
669    resindex : SizeInt;
670    mblen    : size_t;
671{$if not(defined(beos) and not defined(haiku))}
672    ombstate,
673    nmbstate : mbstate_t;
674{$endif beos}
675    wc       : wchar_t;
676  begin
677{$if not(defined(beos) and not defined(haiku))}
678    fillchar(ombstate,sizeof(ombstate),0);
679    fillchar(nmbstate,sizeof(nmbstate),0);
680{$endif beos}
681    slen:=length(s);
682    SetLength(result,slen+10);
683    i:=1;
684    resindex:=1;
685    while (i<=slen) do
686      begin
687        if (s[i]<=#127) then
688          begin
689            wc:=wchar_t(s[i]);
690            mblen:= 1;
691          end
692        else
693{$if not(defined(beos) and not defined(haiku))}
694          mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
695{$else not beos}
696          mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
697{$endif beos}
698        case mblen of
699          size_t(-2):
700            begin
701              { partial invalid character, copy literally }
702              while (i<=slen) do
703                begin
704                  ConcatCharToAnsiStr(s[i],result,resindex);
705                  inc(i);
706                end;
707            end;
708          size_t(-1), 0:
709            begin
710              { invalid or null character }
711              ConcatCharToAnsiStr(s[i],result,resindex);
712              inc(i);
713            end;
714          else
715            begin
716              { a valid sequence }
717              { even if mblen = 1, the uppercase version may have a }
718              { different length                                     }
719              { We can't do anything special if wchar_t is 16 bit... }
720{$if not(defined(beos) and not defined(haiku))}
721              ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex,nmbstate);
722{$else not beos}
723              ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex);
724{$endif not beos}
725              inc(i,mblen);
726            end;
727          end;
728      end;
729    SetLength(result,resindex-1);
730  end;
731
732function WideStringToUCS4StringNoNulls(const s : WideString) : UCS4String;
733  var
734    i, slen,
735    destindex : SizeInt;
736    uch       : UCS4Char;
737  begin
738    slen:=length(s);
739    setlength(result,slen+1);
740    i:=1;
741    destindex:=0;
742    while (i<=slen) do
743      begin
744        uch:=UCS4Char(s[i]);
745        if (uch=0) then
746          result[destindex]:=32
747        else if (uch<=$d7ff) or (uch>=$e000) then
748          result[destindex]:=uch
749        else if (uch<=$dbff) and
750          (i<slen) and
751          (s[i+1]>=#$dc00) and
752          (s[i+1]<=#$dfff) then
753          begin
754            result[destindex]:=(UCS4Char(uch-$d7c0) shl 10)+(UCS4Char(s[i+1]) xor $dc00);
755            inc(i);
756          end
757        else { invalid surrogate pair }
758          result[destindex]:=uch;
759        inc(i);
760        inc(destindex);
761      end;
762    result[destindex]:=UCS4Char(0);
763    { Trimming length in this particular case is just a waste of time,
764      because result will be interpreted as null-terminated and discarded
765      almost immediately }
766  end;
767
768
769function CompareWideString(const s1, s2 : WideString; Options : TCompareOptions) : PtrInt;
770{$if not(defined (aix) and defined(cpupowerpc32))}
771  var
772    hs1,hs2 : UCS4String;
773    us1,us2 : WideString;
774
775  begin
776    { wcscoll interprets null chars as end-of-string -> filter out }
777    if coIgnoreCase in Options then
778      begin
779      us1:=UpperWideString(s1);
780      us2:=UpperWideString(s2);
781      end
782    else
783      begin
784      us1:=s1;
785      us2:=s2;
786      end;
787    hs1:=WideStringToUCS4StringNoNulls(us1);
788    hs2:=WideStringToUCS4StringNoNulls(us2);
789    result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
790  end;
791{$else}
792  { AIX/PPC32 has a 16 bit wchar_t }
793  var
794    i, len: longint;
795    us1,us2 : WideString;
796    hs1, hs2: array of widechar;
797  begin
798    if coIgnoreCase in Options then
799      begin
800      us1:=UpperWideString(s1);
801      us2:=UpperWideString(s2);
802      end
803    else
804      begin
805      us1:=s1;
806      us2:=s2;
807      end;
808    len:=length(us1);
809    setlength(hs1,len+1);
810    for i:=1 to len do
811      if us1[i]<>#0 then
812        hs1[i-1]:=us1[i]
813      else
814        hs1[i-1]:=#32;
815    hs1[len]:=#0;
816
817    len:=length(us2);
818    setlength(hs2,len+1);
819    for i:=1 to len do
820      if us2[i]<>#0 then
821        hs2[i-1]:=us2[i]
822      else
823        hs2[i-1]:=#32;
824    hs2[len]:=#0;
825    result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
826  end;
827{$endif}
828
829
830
831{ return value: number of code points in the string. Whenever an invalid
832  code point is encountered, all characters part of this invalid code point
833  are considered to form one "character" and the next character is
834  considered to be the start of a new (possibly also invalid) code point }
835function CharLengthPChar(const Str: PChar): PtrInt;
836  var
837    nextlen: ptrint;
838    s: pchar;
839{$if not(defined(beos) and not defined(haiku))}
840    mbstate: mbstate_t;
841{$endif not beos}
842  begin
843    result:=0;
844    s:=str;
845{$if not(defined(beos) and not defined(haiku))}
846    fillchar(mbstate,sizeof(mbstate),0);
847{$endif not beos}
848    repeat
849{$if defined(beos) and not defined(haiku)}
850      nextlen:=ptrint(mblen(s,MB_CUR_MAX));
851{$else beos}
852      nextlen:=ptrint(mbrlen(s,MB_CUR_MAX,@mbstate));
853{$endif beos}
854      { skip invalid/incomplete sequences }
855      if (nextlen<0) then
856        nextlen:=1;
857      inc(result,1);
858      inc(s,nextlen);
859    until (nextlen=0);
860  end;
861
862
863function CodePointLength(const Str: PChar; maxlookahead: ptrint): PtrInt;
864{$if not(defined(beos) and not defined(haiku))}
865  var
866    mbstate: mbstate_t;
867{$endif not beos}
868  begin
869{$if defined(beos) and not defined(haiku)}
870    result:=ptrint(mblen(str,maxlookahead));
871{$else beos}
872    fillchar(mbstate,sizeof(mbstate),0);
873    result:=ptrint(mbrlen(str,maxlookahead,@mbstate));
874    { mbrlen can also return -2 for "incomplete but potially valid character
875      and data has been processed" }
876    if result<0 then
877      result:=-1;
878{$endif beos}
879  end;
880
881
882function StrCompAnsiIntern(s1,s2 : PChar; len1, len2: PtrInt; canmodifys1, canmodifys2: boolean): PtrInt;
883  var
884    a,b: pchar;
885    i: PtrInt;
886  begin
887    if not(canmodifys1) then
888      getmem(a,len1+1)
889    else
890      a:=s1;
891    for i:=0 to len1-1 do
892      if s1[i]<>#0 then
893        a[i]:=s1[i]
894      else
895        a[i]:=#32;
896    a[len1]:=#0;
897
898    if not(canmodifys2) then
899      getmem(b,len2+1)
900    else
901      b:=s2;
902    for i:=0 to len2-1 do
903      if s2[i]<>#0 then
904        b[i]:=s2[i]
905      else
906        b[i]:=#32;
907    b[len2]:=#0;
908    result:=strcoll(a,b);
909    if not(canmodifys1) then
910      freemem(a);
911    if not(canmodifys2) then
912      freemem(b);
913  end;
914
915
916function CompareStrAnsiString(const s1, s2: ansistring): PtrInt;
917  begin
918    result:=StrCompAnsiIntern(pchar(s1),pchar(s2),length(s1),length(s2),false,false);
919  end;
920
921
922function StrCompAnsi(s1,s2 : PChar): PtrInt;
923  begin
924    result:=strcoll(s1,s2);
925  end;
926
927
928function AnsiCompareText(const S1, S2: ansistring): PtrInt;
929  var
930    a, b: AnsiString;
931  begin
932    a:=UpperAnsistring(s1);
933    b:=UpperAnsistring(s2);
934    result:=StrCompAnsiIntern(pchar(a),pchar(b),length(a),length(b),true,true);
935  end;
936
937
938function AnsiStrIComp(S1, S2: PChar): PtrInt;
939  begin
940    result:=AnsiCompareText(ansistring(s1),ansistring(s2));
941  end;
942
943
944function AnsiStrLComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
945  var
946    a, b: pchar;
947begin
948  if (maxlen=0) then
949    exit(0);
950  if (s1[maxlen]<>#0) then
951    begin
952      getmem(a,maxlen+1);
953      move(s1^,a^,maxlen);
954      a[maxlen]:=#0;
955    end
956  else
957    a:=s1;
958  if (s2[maxlen]<>#0) then
959    begin
960      getmem(b,maxlen+1);
961      move(s2^,b^,maxlen);
962      b[maxlen]:=#0;
963    end
964  else
965    b:=s2;
966  result:=StrCompAnsiIntern(a,b,maxlen,maxlen,a<>s1,b<>s2);
967  if (a<>s1) then
968    freemem(a);
969  if (b<>s2) then
970    freemem(b);
971end;
972
973
974function AnsiStrLIComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
975  var
976    a, b: ansistring;
977begin
978  if (maxlen=0) then
979    exit(0);
980  setlength(a,maxlen);
981  move(s1^,a[1],maxlen);
982  setlength(b,maxlen);
983  move(s2^,b[1],maxlen);
984  result:=AnsiCompareText(a,b);
985end;
986
987
988procedure ansi2pchar(const s: ansistring; const orgp: pchar; out p: pchar);
989var
990  newlen: sizeint;
991begin
992  newlen:=length(s);
993  if newlen>strlen(orgp) then
994    fpc_rangeerror;
995  p:=orgp;
996  if (newlen>0) then
997    move(s[1],p[0],newlen);
998  p[newlen]:=#0;
999end;
1000
1001
1002function AnsiStrLower(Str: PChar): PChar;
1003var
1004  temp: ansistring;
1005begin
1006  temp:=loweransistring(str);
1007  ansi2pchar(temp,str,result);
1008end;
1009
1010
1011function AnsiStrUpper(Str: PChar): PChar;
1012var
1013  temp: ansistring;
1014begin
1015  temp:=upperansistring(str);
1016  ansi2pchar(temp,str,result);
1017end;
1018
1019
1020function envvarset(const varname: pchar): boolean;
1021var
1022  varval: pchar;
1023begin
1024  varval:=fpgetenv(varname);
1025  result:=
1026    assigned(varval) and
1027    (varval[0]<>#0);
1028end;
1029
1030
1031function GetStandardCodePage(const stdcp: TStandardCodePageEnum): TSystemCodePage;
1032var
1033  langinfo: pchar;
1034begin
1035{$ifdef FPCRTL_FILESYSTEM_UTF8}
1036  if stdcp=scpFileSystemSingleByte then
1037    begin
1038      result:=CP_UTF8;
1039      exit;
1040    end;
1041{$endif}
1042  { if none of the relevant LC_* environment variables are set, fall back to
1043    UTF-8 (this happens under some versions of OS X for GUI applications, which
1044    otherwise get CP_ASCII) }
1045  if envvarset('LC_ALL') or
1046     envvarset('LC_CTYPE') or
1047     envvarset('LANG') then
1048    begin
1049      langinfo:=nl_langinfo(CODESET);
1050      { there's a bug in the Mac OS X 10.5 libc (based on FreeBSD's)
1051        that causes it to return an empty string of UTF-8 locales
1052        -> patch up (and in general, UTF-8 is a good default on
1053        Unix platforms) }
1054      if not assigned(langinfo) or
1055         (langinfo^=#0) then
1056        langinfo:='UTF-8';
1057      Result:=GetCodepageByName(ansistring(langinfo));
1058    end
1059  else
1060    Result:=unixcp.GetSystemCodepage;
1061end;
1062
1063{$ifdef FPC_HAS_CPSTRING}
1064
1065procedure SetStdIOCodePage(var T: Text); inline;
1066begin
1067  case TextRec(T).Mode of
1068    fmInput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleInput);
1069    fmOutput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleOutput);
1070  end;
1071end;
1072
1073procedure SetStdIOCodePages; inline;
1074begin
1075  SetStdIOCodePage(Input);
1076  SetStdIOCodePage(Output);
1077  SetStdIOCodePage(ErrOutput);
1078  SetStdIOCodePage(StdOut);
1079  SetStdIOCodePage(StdErr);
1080end;
1081{$endif FPC_HAS_CPSTRING}
1082
1083var
1084  OrgWideStringManager: TUnicodeStringManager;
1085
1086Procedure SetCWideStringManager;
1087Var
1088  CWideStringManager : TUnicodeStringManager;
1089begin
1090  GetUnicodeStringManager(OrgWideStringManager);
1091  CWideStringManager:=OrgWideStringManager;
1092  With CWideStringManager do
1093    begin
1094      Wide2AnsiMoveProc:=@Wide2AnsiMove;
1095      Ansi2WideMoveProc:=@Ansi2WideMove;
1096
1097      UpperWideStringProc:=@UpperWideString;
1098      LowerWideStringProc:=@LowerWideString;
1099
1100      CompareWideStringProc:=@CompareWideString;
1101//      CompareTextWideStringProc:=@CompareTextWideString;
1102
1103      CharLengthPCharProc:=@CharLengthPChar;
1104      CodePointLengthProc:=@CodePointLength;
1105
1106      UpperAnsiStringProc:=@UpperAnsiString;
1107      LowerAnsiStringProc:=@LowerAnsiString;
1108      CompareStrAnsiStringProc:=@CompareStrAnsiString;
1109      CompareTextAnsiStringProc:=@AnsiCompareText;
1110      StrCompAnsiStringProc:=@StrCompAnsi;
1111      StrICompAnsiStringProc:=@AnsiStrIComp;
1112      StrLCompAnsiStringProc:=@AnsiStrLComp;
1113      StrLICompAnsiStringProc:=@AnsiStrLIComp;
1114      StrLowerAnsiStringProc:=@AnsiStrLower;
1115      StrUpperAnsiStringProc:=@AnsiStrUpper;
1116      ThreadInitProc:=@InitThread;
1117      ThreadFiniProc:=@FiniThread;
1118      { Unicode }
1119      Unicode2AnsiMoveProc:=@Wide2AnsiMove;
1120      Ansi2UnicodeMoveProc:=@Ansi2WideMove;
1121      UpperUnicodeStringProc:=@UpperWideString;
1122      LowerUnicodeStringProc:=@LowerWideString;
1123      CompareUnicodeStringProc:=@CompareWideString;
1124      { CodePage }
1125      GetStandardCodePageProc:=@GetStandardCodePage;
1126    end;
1127  SetUnicodeStringManager(CWideStringManager);
1128end;
1129
1130var
1131  iconvlib:TLibHandle;
1132
1133initialization
1134  SetCWideStringManager;
1135
1136  { you have to call setlocale(LC_ALL,'') to initialise the langinfo stuff  }
1137  { with the information from the environment variables according to POSIX  }
1138  { (some OSes do this automatically, but e.g. Darwin and Solaris don't)    }
1139  setlocale(LC_ALL,'');
1140
1141  { load iconvctl function }
1142  iconvlib:=LoadLibrary(libprefix+libiconvname+'.'+SharedSuffix);
1143  if iconvlib<>0 then
1144    pointer(iconvctl):=GetProcAddress(iconvlib,iconvctlname);
1145
1146  { set the DefaultSystemCodePage }
1147  DefaultSystemCodePage:=GetStandardCodePage(scpAnsi);
1148  DefaultFileSystemCodePage:=GetStandardCodePage(scpFileSystemSingleByte);
1149  DefaultRTLFileSystemCodePage:=DefaultFileSystemCodePage;
1150
1151  {$ifdef FPC_HAS_CPSTRING}
1152  SetStdIOCodePages;
1153  {$endif FPC_HAS_CPSTRING}
1154
1155  { init conversion tables for main program }
1156  InitThread;
1157finalization
1158  { fini conversion tables for main program }
1159  FiniThread;
1160  { unload iconv library }
1161  if iconvlib<>0 then
1162    FreeLibrary(iconvlib);
1163  { restore previous (probably default) widestring manager so that subsequent calls
1164    into the widestring manager won't trigger the finalized functionality }
1165  SetWideStringManager(OrgWideStringManager);
1166end.
1167