1 #include "rar.hpp"
2 #define MBFUNCTIONS
3 
4 #if defined(_UNIX) && defined(MBFUNCTIONS)
5 
// Forward declarations of the mapping helpers defined later in this file,
// so WideToChar and CharToWide can call them.
static bool WideToCharMap(const wchar *Src,char *Dest,size_t DestSize,bool &Success);
static void CharToWideMap(const char *Src,wchar *Dest,size_t DestSize,bool &Success);

// In Unix we map high ASCII characters which cannot be converted to Unicode
// to 0xE000 - 0xE0FF private use Unicode area.
static const uint MapAreaStart=0xE000;

// Mapped string marker. Initially we used 0xFFFF for this purpose,
// but it causes MSVC2008 swprintf to fail (it treats 0xFFFF as error marker).
// While we could work around it, it is safer to use another character.
static const uint MappedStringMark=0xFFFE;
17 
18 #endif
19 
// Convert the zero terminated wide string 'Src' to the multibyte string
// in 'Dest'. 'DestSize' is the size of 'Dest' in bytes including the space
// for the terminating zero. Returns false if 'DestSize' is 0 or if
// the conversion function reported an error. If 'DestSize' is not 0,
// 'Dest' is always zero terminated, even when false is returned.
bool WideToChar(const wchar *Src,char *Dest,size_t DestSize)
{
  // We cannot store even the terminating zero to the empty buffer,
  // so we must not touch 'Dest' at all.
  if (DestSize==0)
    return false;

  bool RetCode=true;
  *Dest=0; // Set 'Dest' to zero just in case the conversion will fail.

#ifdef _WIN_ALL
  if (WideCharToMultiByte(CP_ACP,0,Src,-1,Dest,(int)DestSize,NULL,NULL)==0)
    RetCode=false;

// wcstombs is broken in Android NDK r9.
#elif defined(_APPLE) || defined(_ANDROID)
  // WideToUtf does not return the error code, so RetCode stays true here.
  WideToUtf(Src,Dest,DestSize);

#elif defined(MBFUNCTIONS)
  // Try to restore characters mapped to private use area first.
  // WideToCharMap returns false if the string does not have the mapped
  // string mark, so we need to perform the usual conversion.
  if (!WideToCharMap(Src,Dest,DestSize,RetCode))
  {
    mbstate_t ps; // Use thread safe external state based functions.
    memset (&ps, 0, sizeof(ps));
    const wchar *SrcParam=Src; // wcsrtombs can change the pointer.
    size_t ResultingSize=wcsrtombs(Dest,&SrcParam,DestSize,&ps);
    if (ResultingSize==(size_t)-1)
      RetCode=false;
    // Nothing is converted though the source string is not empty.
    if (ResultingSize==0 && *Src!=0)
      RetCode=false;
  }
#else
  // No conversion functions are selected, so simply truncate every
  // wide character to char.
  for (size_t I=0;I<DestSize;I++)
  {
    Dest[I]=(char)Src[I];
    if (Src[I]==0)
      break;
  }
#endif
  // Conversion can fill the entire buffer without storing the trailing zero.
  Dest[DestSize-1]=0;

  // We tried to return the empty string if conversion is failed,
  // but it does not work well. WideCharToMultiByte returns 'failed' code
  // and partially converted string even if we wanted to convert only a part
  // of string and passed DestSize smaller than required for fully converted
  // string. Such call is the valid behavior in RAR code and we do not expect
  // the empty string in this case.

  return RetCode;
}
65 
66 
// Convert the zero terminated multibyte string 'Src' to the wide string
// in 'Dest'. 'DestSize' is the size of 'Dest' in wide characters including
// the space for the terminating zero. Returns false if 'DestSize' is 0
// or if the conversion failed. If 'DestSize' is not 0, 'Dest' is always
// zero terminated, even when false is returned.
bool CharToWide(const char *Src,wchar *Dest,size_t DestSize)
{
  // We cannot store even the terminating zero to the empty buffer,
  // so we must not touch 'Dest' at all.
  if (DestSize==0)
    return false;

  bool RetCode=true;
  *Dest=0; // Set 'Dest' to zero just in case the conversion will fail.

#ifdef _WIN_ALL
  if (MultiByteToWideChar(CP_ACP,0,Src,-1,Dest,(int)DestSize)==0)
    RetCode=false;

// mbstowcs is broken in Android NDK r9.
#elif defined(_APPLE) || defined(_ANDROID)
  // The result of UtfToWide is ignored, so RetCode stays true here.
  UtfToWide(Src,Dest,DestSize);

#elif defined(MBFUNCTIONS)
  mbstate_t ps; // Use thread safe external state based functions.
  memset (&ps, 0, sizeof(ps));
  const char *SrcParam=Src; // mbsrtowcs can change the pointer.
  size_t ResultingSize=mbsrtowcs(Dest,&SrcParam,DestSize,&ps);
  if (ResultingSize==(size_t)-1)
    RetCode=false;
  // Nothing is converted though the source string is not empty.
  if (ResultingSize==0 && *Src!=0)
    RetCode=false;

  // If the usual conversion failed, map inconvertible characters to private
  // use Unicode area. We need the space at least for one character and
  // the terminating zero to do it.
  if (RetCode==false && DestSize>1)
    CharToWideMap(Src,Dest,DestSize,RetCode);
#else
  // No conversion functions are selected, so simply copy every char
  // to the wide character.
  for (size_t I=0;I<DestSize;I++)
  {
    Dest[I]=(wchar_t)Src[I];
    if (Src[I]==0)
      break;
  }
#endif
  // Conversion can fill the entire buffer without storing the trailing zero.
  Dest[DestSize-1]=0;

  // We tried to return the empty string if conversion is failed,
  // but it does not work well. MultiByteToWideChar returns 'failed' code
  // even if we wanted to convert only a part of string and passed DestSize
  // smaller than required for fully converted string. Such call is the valid
  // behavior in RAR code and we do not expect the empty string in this case.

  return RetCode;
}
111 
112 
113 #if defined(_UNIX) && defined(MBFUNCTIONS) && !defined(_ANDROID)
114 // Convert and restore mapped inconvertible Unicode characters.
115 // We use it for extended ASCII names in Unix.
WideToCharMap(const wchar * Src,char * Dest,size_t DestSize,bool & Success)116 bool WideToCharMap(const wchar *Src,char *Dest,size_t DestSize,bool &Success)
117 {
118   // String with inconvertible characters mapped to private use Unicode area
119   // must have the mark code somewhere.
120   if (wcschr(Src,(wchar)MappedStringMark)==NULL)
121     return false;
122 
123   Success=true;
124   uint SrcPos=0,DestPos=0;
125   while (DestPos<DestSize-MB_CUR_MAX)
126   {
127     if (Src[SrcPos]==0)
128     {
129       Dest[DestPos]=0;
130       break;
131     }
132     if (uint(Src[SrcPos])==MappedStringMark)
133     {
134       SrcPos++;
135       continue;
136     }
137     // For security reasons do not retore low ASCII codes, so mapping cannot
138     // be used to hide control codes like path separators.
139     if (uint(Src[SrcPos])>=MapAreaStart+0x80 && uint(Src[SrcPos])<MapAreaStart+0x100)
140       Dest[DestPos++]=char(uint(Src[SrcPos++])-MapAreaStart);
141     else
142     {
143       mbstate_t ps;
144       memset(&ps,0,sizeof(ps));
145       if (wcrtomb(Dest+DestPos,Src[SrcPos],&ps)==-1)
146         Success=false;
147       SrcPos++;
148       memset(&ps,0,sizeof(ps));
149       int Length=mbrlen(Dest+DestPos,MB_CUR_MAX,&ps);
150       DestPos+=Max(Length,1);
151     }
152   }
153   return true;
154 }
155 #endif
156 
157 
158 #if defined(_UNIX) && defined(MBFUNCTIONS) && !defined(_ANDROID)
159 // Convert and map inconvertible Unicode characters.
160 // We use it for extended ASCII names in Unix.
CharToWideMap(const char * Src,wchar * Dest,size_t DestSize,bool & Success)161 void CharToWideMap(const char *Src,wchar *Dest,size_t DestSize,bool &Success)
162 {
163   // Map inconvertible characters to private use Unicode area 0xE000.
164   // Mark such string by placing special non-character code before
165   // first inconvertible character.
166   Success=false;
167   bool MarkAdded=false;
168   uint SrcPos=0,DestPos=0;
169   while (DestPos<DestSize)
170   {
171     if (Src[SrcPos]==0)
172     {
173       Dest[DestPos]=0;
174       Success=true;
175       break;
176     }
177     mbstate_t ps;
178     memset(&ps,0,sizeof(ps));
179     if (mbrtowc(Dest+DestPos,Src+SrcPos,MB_CUR_MAX,&ps)==-1)
180     {
181       // For security reasons we do not want to map low ASCII characters,
182       // so we do not have additional .. and path separator codes.
183       if (byte(Src[SrcPos])>=0x80)
184       {
185         if (!MarkAdded)
186         {
187           Dest[DestPos++]=MappedStringMark;
188           MarkAdded=true;
189           if (DestPos>=DestSize)
190             break;
191         }
192         Dest[DestPos++]=byte(Src[SrcPos++])+MapAreaStart;
193       }
194       else
195         break;
196     }
197     else
198     {
199       memset(&ps,0,sizeof(ps));
200       int Length=mbrlen(Src+SrcPos,MB_CUR_MAX,&ps);
201       SrcPos+=Max(Length,1);
202       DestPos++;
203     }
204   }
205 }
206 #endif
207 
208 
209 // SrcSize is in wide characters, not in bytes.
WideToRaw(const wchar * Src,byte * Dest,size_t SrcSize)210 byte* WideToRaw(const wchar *Src,byte *Dest,size_t SrcSize)
211 {
212   for (size_t I=0;I<SrcSize;I++,Src++)
213   {
214     Dest[I*2]=(byte)*Src;
215     Dest[I*2+1]=(byte)(*Src>>8);
216     if (*Src==0)
217       break;
218   }
219   return Dest;
220 }
221 
222 
RawToWide(const byte * Src,wchar * Dest,size_t DestSize)223 wchar* RawToWide(const byte *Src,wchar *Dest,size_t DestSize)
224 {
225   for (size_t I=0;I<DestSize;I++)
226     if ((Dest[I]=Src[I*2]+(Src[I*2+1]<<8))==0)
227       break;
228   return Dest;
229 }
230 
231 
WideToUtf(const wchar * Src,char * Dest,size_t DestSize)232 void WideToUtf(const wchar *Src,char *Dest,size_t DestSize)
233 {
234   long dsize=(long)DestSize;
235   dsize--;
236   while (*Src!=0 && --dsize>=0)
237   {
238     uint c=*(Src++);
239     if (c<0x80)
240       *(Dest++)=c;
241     else
242       if (c<0x800 && --dsize>=0)
243       {
244         *(Dest++)=(0xc0|(c>>6));
245         *(Dest++)=(0x80|(c&0x3f));
246       }
247       else
248       {
249         if (c>=0xd800 && c<=0xdbff && *Src>=0xdc00 && *Src<=0xdfff) // Surrogate pair.
250         {
251           c=((c-0xd800)<<10)+(*Src-0xdc00)+0x10000;
252           Src++;
253         }
254         if (c<0x10000 && (dsize-=2)>=0)
255         {
256           *(Dest++)=(0xe0|(c>>12));
257           *(Dest++)=(0x80|((c>>6)&0x3f));
258           *(Dest++)=(0x80|(c&0x3f));
259         }
260         else
261           if (c < 0x200000 && (dsize-=3)>=0)
262           {
263             *(Dest++)=(0xf0|(c>>18));
264             *(Dest++)=(0x80|((c>>12)&0x3f));
265             *(Dest++)=(0x80|((c>>6)&0x3f));
266             *(Dest++)=(0x80|(c&0x3f));
267           }
268       }
269   }
270   *Dest=0;
271 }
272 
273 
// Calculate the number of bytes counted for UTF-8 representation
// of the zero terminated wide string 'Src', including the terminating zero.
// Codes 0x200000 and above are not counted.
size_t WideToUtfSize(const wchar *Src)
{
  size_t Total=1; // Start from 1 to include the terminating zero.
  while (*Src!=0)
  {
    if (*Src<0x80)
      Total++;
    else if (*Src<0x800)
      Total+=2;
    else if (*Src<0x10000)
    {
      const bool HighSurrogate=Src[0]>=0xd800 && Src[0]<=0xdbff;
      if (HighSurrogate && Src[1]>=0xdc00 && Src[1]<=0xdfff)
      {
        Total+=4; // 4 output bytes for Unicode surrogate pair.
        Src++;    // Skip the low surrogate.
      }
      else
        Total+=3;
    }
    else if (*Src<0x200000)
      Total+=4;
    Src++;
  }
  return Total;
}
299 
300 
301 // Dest can be NULL if we only need to check validity of Src.
UtfToWide(const char * Src,wchar * Dest,size_t DestSize)302 bool UtfToWide(const char *Src,wchar *Dest,size_t DestSize)
303 {
304   bool Success=true;
305   long dsize=(long)DestSize;
306   dsize--;
307   while (*Src!=0)
308   {
309     uint c=byte(*(Src++)),d;
310     if (c<0x80)
311       d=c;
312     else
313       if ((c>>5)==6)
314       {
315         if ((*Src&0xc0)!=0x80)
316         {
317           Success=false;
318           break;
319         }
320         d=((c&0x1f)<<6)|(*Src&0x3f);
321         Src++;
322       }
323       else
324         if ((c>>4)==14)
325         {
326           if ((Src[0]&0xc0)!=0x80 || (Src[1]&0xc0)!=0x80)
327           {
328             Success=false;
329             break;
330           }
331           d=((c&0xf)<<12)|((Src[0]&0x3f)<<6)|(Src[1]&0x3f);
332           Src+=2;
333         }
334         else
335           if ((c>>3)==30)
336           {
337             if ((Src[0]&0xc0)!=0x80 || (Src[1]&0xc0)!=0x80 || (Src[2]&0xc0)!=0x80)
338             {
339               Success=false;
340               break;
341             }
342             d=((c&7)<<18)|((Src[0]&0x3f)<<12)|((Src[1]&0x3f)<<6)|(Src[2]&0x3f);
343             Src+=3;
344           }
345           else
346           {
347             Success=false;
348             break;
349           }
350     if (Dest!=NULL && --dsize<0)
351       break;
352     if (d>0xffff)
353     {
354       if (Dest!=NULL && --dsize<0)
355         break;
356       if (d>0x10ffff) // UTF-8 must end at 0x10ffff according to RFC 3629.
357       {
358         Success=false;
359         continue;
360       }
361       if (Dest!=NULL)
362         if (sizeof(*Dest)==2) // Use the surrogate pair for 2 byte Unicode.
363         {
364           *(Dest++)=((d-0x10000)>>10)+0xd800;
365           *(Dest++)=(d&0x3ff)+0xdc00;
366         }
367         else
368           *(Dest++)=d;
369     }
370     else
371       if (Dest!=NULL)
372         *(Dest++)=d;
373   }
374   if (Dest!=NULL)
375     *Dest=0;
376   return Success;
377 }
378 
379 
wcsicomp(const wchar * s1,const wchar * s2)380 int wcsicomp(const wchar *s1,const wchar *s2)
381 {
382 #ifdef _WIN_ALL
383   return CompareStringW(LOCALE_USER_DEFAULT,NORM_IGNORECASE|SORT_STRINGSORT,s1,-1,s2,-1)-2;
384 #else
385   while (true)
386   {
387     wchar u1 = towupper(*s1);
388     wchar u2 = towupper(*s2);
389     if (u1 != u2)
390       return u1 < u2 ? -1 : 1;
391     if (*s1==0)
392       break;
393     s1++;
394     s2++;
395   }
396   return 0;
397 #endif
398 }
399 
400 
wcsnicomp(const wchar * s1,const wchar * s2,size_t n)401 int wcsnicomp(const wchar *s1,const wchar *s2,size_t n)
402 {
403 #ifdef _WIN_ALL
404   // If we specify 'n' exceeding the actual string length, CompareString goes
405   // beyond the trailing zero and compares garbage. So we need to limit 'n'
406   // to real string length.
407   size_t l1=Min(wcslen(s1)+1,n);
408   size_t l2=Min(wcslen(s2)+1,n);
409   return CompareStringW(LOCALE_USER_DEFAULT,NORM_IGNORECASE|SORT_STRINGSORT,s1,(int)l1,s2,(int)l2)-2;
410 #else
411   if (n==0)
412     return 0;
413   while (true)
414   {
415     wchar u1 = towupper(*s1);
416     wchar u2 = towupper(*s2);
417     if (u1 != u2)
418       return u1 < u2 ? -1 : 1;
419     if (*s1==0 || --n==0)
420       break;
421     s1++;
422     s2++;
423   }
424   return 0;
425 #endif
426 }
427 
428 
wcscasestr(const wchar_t * str,const wchar_t * search)429 const wchar_t* wcscasestr(const wchar_t *str, const wchar_t *search)
430 {
431   for (size_t i=0;str[i]!=0;i++)
432     for (size_t j=0;;j++)
433     {
434       if (search[j]==0)
435         return str+i;
436       if (tolowerw(str[i+j])!=tolowerw(search[j]))
437         break;
438     }
439   return NULL;
440 }
441 
442 
443 #ifndef SFX_MODULE
wcslower(wchar * s)444 wchar* wcslower(wchar *s)
445 {
446 #ifdef _WIN_ALL
447   CharLower(s);
448 #else
449   for (wchar *c=s;*c!=0;c++)
450     *c=towlower(*c);
451 #endif
452   return s;
453 }
454 #endif
455 
456 
457 #ifndef SFX_MODULE
wcsupper(wchar * s)458 wchar* wcsupper(wchar *s)
459 {
460 #ifdef _WIN_ALL
461   CharUpper(s);
462 #else
463   for (wchar *c=s;*c!=0;c++)
464     *c=towupper(*c);
465 #endif
466   return s;
467 }
468 #endif
469 
470 
471 
472 
// Convert a single wide character code to upper case.
// Returns the converted code or the original code if it has no upper case
// equivalent.
int toupperw(int ch)
{
#ifdef _WIN_ALL
  // CharUpper is more reliable than towupper in Windows, which seems to be
  // C locale dependent even in Unicode version. For example, towupper failed
  // to convert lowercase Russian characters.

  // CharUpper processes its parameter as a single character only if
  // high-order word of parameter is zero. Otherwise it treats it as
  // the pointer to string and attempts to modify the memory at this address.
  // So we return codes not fitting to 16 bits unchanged.
  if (ch<0 || ch>0xffff)
    return ch;
  // Cast through INT_PTR to avoid truncating the pointer in 64 bit mode.
  return (int)(INT_PTR)CharUpper((wchar *)(INT_PTR)ch);
#else
  return towupper(ch);
#endif
}
484 
485 
// Convert a single wide character code to lower case.
// Returns the converted code or the original code if it has no lower case
// equivalent.
int tolowerw(int ch)
{
#ifdef _WIN_ALL
  // CharLower is more reliable than towlower in Windows, which seems to be
  // C locale dependent even in Unicode version.

  // CharLower processes its parameter as a single character only if
  // high-order word of parameter is zero. Otherwise it treats it as
  // the pointer to string and attempts to modify the memory at this address.
  // So we return codes not fitting to 16 bits unchanged.
  if (ch<0 || ch>0xffff)
    return ch;
  // Cast through INT_PTR to avoid truncating the pointer in 64 bit mode.
  return (int)(INT_PTR)CharLower((wchar *)(INT_PTR)ch);
#else
  return towlower(ch);
#endif
}
496 
497 
atoiw(const wchar * s)498 int atoiw(const wchar *s)
499 {
500   return (int)atoilw(s);
501 }
502 
503 
atoilw(const wchar * s)504 int64 atoilw(const wchar *s)
505 {
506   int sign=1;
507   if (*s=='-')
508   {
509     s++;
510     sign=-1;
511   }
512   int64 n=0;
513   while (*s>='0' && *s<='9')
514   {
515     n=n*10+(*s-'0');
516     s++;
517   }
518   return sign*n;
519 }
520 
521 
522 #ifdef DBCS_SUPPORTED
523 SupportDBCS gdbcs;
524 
SupportDBCS()525 SupportDBCS::SupportDBCS()
526 {
527   Init();
528 }
529 
530 
Init()531 void SupportDBCS::Init()
532 {
533   CPINFO CPInfo;
534   GetCPInfo(CP_ACP,&CPInfo);
535   DBCSMode=CPInfo.MaxCharSize > 1;
536   for (uint I=0;I<ASIZE(IsLeadByte);I++)
537     IsLeadByte[I]=IsDBCSLeadByte(I)!=0;
538 }
539 
540 
charnext(const char * s)541 char* SupportDBCS::charnext(const char *s)
542 {
543   // Zero cannot be the trail byte. So if next byte after the lead byte
544   // is 0, the string is corrupt and we'll better return the pointer to 0,
545   // to break string processing loops.
546   return (char *)(IsLeadByte[(byte)*s] && s[1]!=0 ? s+2:s+1);
547 }
548 
549 
strlend(const char * s)550 size_t SupportDBCS::strlend(const char *s)
551 {
552   size_t Length=0;
553   while (*s!=0)
554   {
555     if (IsLeadByte[(byte)*s])
556       s+=2;
557     else
558       s++;
559     Length++;
560   }
561   return(Length);
562 }
563 
564 
strchrd(const char * s,int c)565 char* SupportDBCS::strchrd(const char *s, int c)
566 {
567   while (*s!=0)
568     if (IsLeadByte[(byte)*s])
569       s+=2;
570     else
571       if (*s==c)
572         return((char *)s);
573       else
574         s++;
575   return(NULL);
576 }
577 
578 
copychrd(char * dest,const char * src)579 void SupportDBCS::copychrd(char *dest,const char *src)
580 {
581   dest[0]=src[0];
582   if (IsLeadByte[(byte)src[0]])
583     dest[1]=src[1];
584 }
585 
586 
strrchrd(const char * s,int c)587 char* SupportDBCS::strrchrd(const char *s, int c)
588 {
589   const char *found=NULL;
590   while (*s!=0)
591     if (IsLeadByte[(byte)*s])
592       s+=2;
593     else
594     {
595       if (*s==c)
596         found=s;
597       s++;
598     }
599   return((char *)found);
600 }
601 #endif
602