1 #include "rar.hpp"
2 #define MBFUNCTIONS
3 
#if !defined(_WIN_ALL) && !defined(_APPLE) && defined(_UNIX) && defined(MBFUNCTIONS)

// Forward declarations of the mapping helpers, which are defined later
// in this file under the same preprocessor condition.
static bool WideToCharMap(const wchar *Src,char *Dest,size_t DestSize,bool &Success);
static void CharToWideMap(const char *Src,wchar *Dest,size_t DestSize,bool &Success);

// In Unix we map high ASCII characters which cannot be converted to Unicode
// to 0xE000 - 0xE0FF private use Unicode area.
static const uint MapAreaStart=0xE000;

// Mapped string marker. Initially we used 0xFFFF for this purpose,
// but it causes MSVC2008 swprintf to fail (it treats 0xFFFF as error marker).
// While we could workaround it, it is safer to use another character.
static const uint MappedStringMark=0xFFFE;

#endif
19 
WideToChar(const wchar * Src,char * Dest,size_t DestSize)20 bool WideToChar(const wchar *Src,char *Dest,size_t DestSize)
21 {
22   bool RetCode=true;
23   *Dest=0; // Set 'Dest' to zero just in case the conversion will fail.
24 
25 #ifdef _WIN_ALL
26   if (WideCharToMultiByte(CP_ACP,0,Src,-1,Dest,(int)DestSize,NULL,NULL)==0)
27     RetCode=false;
28 
29 // wcstombs is broken in Android NDK r9.
30 #elif defined(_APPLE)
31   WideToUtf(Src,Dest,DestSize);
32 
33 #elif defined(_UNIX) && defined(MBFUNCTIONS)
34   if (!WideToCharMap(Src,Dest,DestSize,RetCode))
35   {
36     mbstate_t ps; // Use thread safe external state based functions.
37     memset (&ps, 0, sizeof(ps));
38     const wchar *SrcParam=Src; // wcsrtombs can change the pointer.
39 
40     // Some implementations of wcsrtombs can cause memory analyzing tools
41     // like valgrind to report uninitialized data access. It happens because
42     // internally these implementations call SSE4 based wcslen function,
43     // which reads 16 bytes at once including those beyond of trailing 0.
44     size_t ResultingSize=wcsrtombs(Dest,&SrcParam,DestSize,&ps);
45 
46     if (ResultingSize==(size_t)-1 && errno==EILSEQ)
47     {
48       // Aborted on inconvertible character not zero terminating the result.
49       // EILSEQ helps to distinguish it from small output buffer abort.
50       // We want to convert as much as we can, so we clean the output buffer
51       // and repeat conversion.
52       memset (&ps, 0, sizeof(ps));
53       SrcParam=Src; // wcsrtombs can change the pointer.
54       memset(Dest,0,DestSize);
55       ResultingSize=wcsrtombs(Dest,&SrcParam,DestSize,&ps);
56     }
57 
58     if (ResultingSize==(size_t)-1)
59       RetCode=false;
60     if (ResultingSize==0 && *Src!=0)
61       RetCode=false;
62   }
63 #else
64   for (int I=0;I<DestSize;I++)
65   {
66     Dest[I]=(char)Src[I];
67     if (Src[I]==0)
68       break;
69   }
70 #endif
71   if (DestSize>0)
72     Dest[DestSize-1]=0;
73 
74   // We tried to return the empty string if conversion is failed,
75   // but it does not work well. WideCharToMultiByte returns 'failed' code
76   // and partially converted string even if we wanted to convert only a part
77   // of string and passed DestSize smaller than required for fully converted
78   // string. Such call is the valid behavior in RAR code and we do not expect
79   // the empty string in this case.
80 
81   return RetCode;
82 }
83 
84 
// Convert the zero terminated multibyte string 'Src' to a wide string
// in 'Dest'. 'DestSize' is the size of 'Dest' in wide characters including
// the room for trailing zero. Returns false if the conversion failed or
// if 'DestSize' is 0. In case of conversion failure 'Dest' can still contain
// partially converted data. If 'DestSize' is not 0, 'Dest' is always
// zero terminated on return.
bool CharToWide(const char *Src,wchar *Dest,size_t DestSize)
{
  // An empty buffer cannot hold even the trailing zero, so we must not
  // write to 'Dest' at all.
  if (DestSize==0)
    return false;

  bool RetCode=true;
  *Dest=0; // Set 'Dest' to zero just in case the conversion will fail.

#ifdef _WIN_ALL
  if (MultiByteToWideChar(CP_ACP,0,Src,-1,Dest,(int)DestSize)==0)
    RetCode=false;

// mbstowcs is broken in Android NDK r9.
#elif defined(_APPLE)
  UtfToWide(Src,Dest,DestSize);

#elif defined(_UNIX) && defined(MBFUNCTIONS)
  mbstate_t ps;
  memset (&ps, 0, sizeof(ps));
  const char *SrcParam=Src; // mbsrtowcs can change the pointer.
  size_t ResultingSize=mbsrtowcs(Dest,&SrcParam,DestSize,&ps);
  if (ResultingSize==(size_t)-1)
    RetCode=false;
  if (ResultingSize==0 && *Src!=0)
    RetCode=false;

  // If the regular conversion failed, repeat it mapping inconvertible
  // characters to private use Unicode area. The mapped string needs
  // a room for at least one character and trailing zero.
  if (RetCode==false && DestSize>1)
    CharToWideMap(Src,Dest,DestSize,RetCode);
#else
  // No conversion functions are available, so just widen every byte.
  for (size_t I=0;I<DestSize;I++)
  {
    Dest[I]=(wchar_t)Src[I];
    if (Src[I]==0)
      break;
  }
#endif
  // 'DestSize' is not 0 here, so the last item always exists.
  Dest[DestSize-1]=0;

  // We tried to return the empty string if conversion is failed,
  // but it does not work well. MultiByteToWideChar returns 'failed' code
  // even if we wanted to convert only a part of string and passed DestSize
  // smaller than required for fully converted string. Such call is the valid
  // behavior in RAR code and we do not expect the empty string in this case.

  return RetCode;
}
129 
130 
#if !defined(_WIN_ALL) && !defined(_APPLE) && defined(_UNIX) && defined(MBFUNCTIONS)
// Convert and restore mapped inconvertible Unicode characters.
// We use it for extended ASCII names in Unix.
// Returns false without touching 'Dest' and 'Success' if 'Src' does not
// contain the mapped string mark or if 'DestSize' is 0, so the caller can
// use the regular conversion instead. Otherwise returns true and stores
// the zero terminated result to 'Dest'. 'Success' is set to false if at least
// one character could not be converted and was replaced by '_'.
bool WideToCharMap(const wchar *Src,char *Dest,size_t DestSize,bool &Success)
{
  // String with inconvertible characters mapped to private use Unicode area
  // must have the mark code somewhere.
  if (wcschr(Src,(wchar)MappedStringMark)==NULL)
    return false;

  // An empty buffer cannot hold even the trailing zero. Also 'DestSize-1'
  // below would wrap around for the unsigned zero.
  if (DestSize==0)
    return false;

  // Seems to be that wcrtomb in some memory analyzing libraries
  // can produce uninitilized output while reporting success on garbage input.
  // So we clean the destination to calm analyzers.
  memset(Dest,0,DestSize);

  Success=true;
  uint SrcPos=0,DestPos=0;

  // Every step can store up to MB_CUR_MAX bytes and we also need a room
  // for trailing zero. We add MB_CUR_MAX to 'DestPos' instead of subtracting
  // it from 'DestSize', because the unsigned subtraction wraps around
  // for buffers smaller than MB_CUR_MAX, which would disable the limit
  // and allow writing beyond the end of 'Dest'.
  while (Src[SrcPos]!=0 && DestPos+MB_CUR_MAX<DestSize)
  {
    // The mark itself is not a part of the resulting string.
    if (uint(Src[SrcPos])==MappedStringMark)
    {
      SrcPos++;
      continue;
    }
    // For security reasons do not restore low ASCII codes, so mapping cannot
    // be used to hide control codes like path separators.
    if (uint(Src[SrcPos])>=MapAreaStart+0x80 && uint(Src[SrcPos])<MapAreaStart+0x100)
      Dest[DestPos++]=char(uint(Src[SrcPos++])-MapAreaStart);
    else
    {
      mbstate_t ps;
      memset(&ps,0,sizeof(ps));
      if (wcrtomb(Dest+DestPos,Src[SrcPos],&ps)==(size_t)-1)
      {
        Dest[DestPos]='_';
        Success=false;
      }
      SrcPos++;
      // Measure the just stored character to advance the output position.
      // Advance by at least one byte even if mbrlen reports an error or 0.
      memset(&ps,0,sizeof(ps));
      int Length=mbrlen(Dest+DestPos,MB_CUR_MAX,&ps);
      DestPos+=Max(Length,1);
    }
  }
  Dest[Min(DestPos,DestSize-1)]=0;
  return true;
}
#endif
178 
179 
#if !defined(_WIN_ALL) && !defined(_APPLE) && defined(_UNIX) && defined(MBFUNCTIONS)
// Convert the multibyte string 'Src' to the wide string 'Dest', mapping
// bytes which cannot be converted to 0xE000 private use Unicode area.
// We use it for extended ASCII names in Unix.
// 'Success' is set to true only if the end of 'Src' has been reached.
void CharToWideMap(const char *Src,wchar *Dest,size_t DestSize,bool &Success)
{
  // The string with mapped characters is marked by the special
  // non-character code placed before the first inconvertible character.
  Success=false;
  bool HaveMark=false;
  uint In=0,Out=0;
  for (;Out<DestSize;)
  {
    if (Src[In]==0)
    {
      // Entire source string is processed.
      Success=true;
      break;
    }

    mbstate_t State;
    memset(&State,0,sizeof(State));
    size_t Converted=mbrtowc(Dest+Out,Src+In,MB_CUR_MAX,&State);
    bool Failed=Converted==(size_t)-1 || Converted==(size_t)-2;

    if (!Failed)
    {
      // Skip all bytes of the converted character, but at least one byte.
      memset(&State,0,sizeof(State));
      int CharLength=mbrlen(Src+In,MB_CUR_MAX,&State);
      In+=Max(CharLength,1);
      Out++;
      continue;
    }

    // For security reasons we do not want to map low ASCII characters,
    // so we do not have additional .. and path separator codes.
    if (byte(Src[In])<0x80)
      break;

    if (!HaveMark)
    {
      Dest[Out++]=MappedStringMark;
      HaveMark=true;
      if (Out>=DestSize)
        break;
    }
    Dest[Out++]=byte(Src[In++])+MapAreaStart;
  }
  Dest[Min(Out,DestSize-1)]=0;
}
#endif
230 
231 
232 // SrcSize is in wide characters, not in bytes.
WideToRaw(const wchar * Src,byte * Dest,size_t SrcSize)233 byte* WideToRaw(const wchar *Src,byte *Dest,size_t SrcSize)
234 {
235   for (size_t I=0;I<SrcSize;I++,Src++)
236   {
237     Dest[I*2]=(byte)*Src;
238     Dest[I*2+1]=(byte)(*Src>>8);
239     if (*Src==0)
240       break;
241   }
242   return Dest;
243 }
244 
245 
RawToWide(const byte * Src,wchar * Dest,size_t DestSize)246 wchar* RawToWide(const byte *Src,wchar *Dest,size_t DestSize)
247 {
248   for (size_t I=0;I<DestSize;I++)
249     if ((Dest[I]=Src[I*2]+(Src[I*2+1]<<8))==0)
250       break;
251   return Dest;
252 }
253 
254 
WideToUtf(const wchar * Src,char * Dest,size_t DestSize)255 void WideToUtf(const wchar *Src,char *Dest,size_t DestSize)
256 {
257   long dsize=(long)DestSize;
258   dsize--;
259   while (*Src!=0 && --dsize>=0)
260   {
261     uint c=*(Src++);
262     if (c<0x80)
263       *(Dest++)=c;
264     else
265       if (c<0x800 && --dsize>=0)
266       {
267         *(Dest++)=(0xc0|(c>>6));
268         *(Dest++)=(0x80|(c&0x3f));
269       }
270       else
271       {
272         if (c>=0xd800 && c<=0xdbff && *Src>=0xdc00 && *Src<=0xdfff) // Surrogate pair.
273         {
274           c=((c-0xd800)<<10)+(*Src-0xdc00)+0x10000;
275           Src++;
276         }
277         if (c<0x10000 && (dsize-=2)>=0)
278         {
279           *(Dest++)=(0xe0|(c>>12));
280           *(Dest++)=(0x80|((c>>6)&0x3f));
281           *(Dest++)=(0x80|(c&0x3f));
282         }
283         else
284           if (c < 0x200000 && (dsize-=3)>=0)
285           {
286             *(Dest++)=(0xf0|(c>>18));
287             *(Dest++)=(0x80|((c>>12)&0x3f));
288             *(Dest++)=(0x80|((c>>6)&0x3f));
289             *(Dest++)=(0x80|(c&0x3f));
290           }
291       }
292   }
293   *Dest=0;
294 }
295 
296 
WideToUtfSize(const wchar * Src)297 size_t WideToUtfSize(const wchar *Src)
298 {
299   size_t Size=0;
300   for (;*Src!=0;Src++)
301     if (*Src<0x80)
302       Size++;
303     else
304       if (*Src<0x800)
305         Size+=2;
306       else
307         if ((uint)*Src<0x10000) //(uint) to avoid Clang/win "always true" warning for 16-bit wchar_t.
308         {
309           if (Src[0]>=0xd800 && Src[0]<=0xdbff && Src[1]>=0xdc00 && Src[1]<=0xdfff)
310           {
311             Size+=4; // 4 output bytes for Unicode surrogate pair.
312             Src++;
313           }
314           else
315             Size+=3;
316         }
317         else
318           if ((uint)*Src<0x200000) //(uint) to avoid Clang/win "always true" warning for 16-bit wchar_t.
319             Size+=4;
320   return Size+1; // Include terminating zero.
321 }
322 
323 
UtfToWide(const char * Src,wchar * Dest,size_t DestSize)324 bool UtfToWide(const char *Src,wchar *Dest,size_t DestSize)
325 {
326   bool Success=true;
327   long dsize=(long)DestSize;
328   dsize--;
329   while (*Src!=0)
330   {
331     uint c=byte(*(Src++)),d;
332     if (c<0x80)
333       d=c;
334     else
335       if ((c>>5)==6)
336       {
337         if ((*Src&0xc0)!=0x80)
338         {
339           Success=false;
340           break;
341         }
342         d=((c&0x1f)<<6)|(*Src&0x3f);
343         Src++;
344       }
345       else
346         if ((c>>4)==14)
347         {
348           if ((Src[0]&0xc0)!=0x80 || (Src[1]&0xc0)!=0x80)
349           {
350             Success=false;
351             break;
352           }
353           d=((c&0xf)<<12)|((Src[0]&0x3f)<<6)|(Src[1]&0x3f);
354           Src+=2;
355         }
356         else
357           if ((c>>3)==30)
358           {
359             if ((Src[0]&0xc0)!=0x80 || (Src[1]&0xc0)!=0x80 || (Src[2]&0xc0)!=0x80)
360             {
361               Success=false;
362               break;
363             }
364             d=((c&7)<<18)|((Src[0]&0x3f)<<12)|((Src[1]&0x3f)<<6)|(Src[2]&0x3f);
365             Src+=3;
366           }
367           else
368           {
369             Success=false;
370             break;
371           }
372     if (--dsize<0)
373       break;
374     if (d>0xffff)
375     {
376       if (--dsize<0)
377         break;
378       if (d>0x10ffff) // UTF-8 must end at 0x10ffff according to RFC 3629.
379       {
380         Success=false;
381         continue;
382       }
383       if (sizeof(*Dest)==2) // Use the surrogate pair.
384       {
385         *(Dest++)=((d-0x10000)>>10)+0xd800;
386         *(Dest++)=(d&0x3ff)+0xdc00;
387       }
388       else
389         *(Dest++)=d;
390     }
391     else
392       *(Dest++)=d;
393   }
394   *Dest=0;
395   return Success;
396 }
397 
398 
399 // For zero terminated strings.
IsTextUtf8(const byte * Src)400 bool IsTextUtf8(const byte *Src)
401 {
402   return IsTextUtf8(Src,strlen((const char *)Src));
403 }
404 
405 
406 // Source data can be both with and without UTF-8 BOM.
IsTextUtf8(const byte * Src,size_t SrcSize)407 bool IsTextUtf8(const byte *Src,size_t SrcSize)
408 {
409   while (SrcSize-- > 0)
410   {
411     byte C=*(Src++);
412     int HighOne=0; // Number of leftmost '1' bits.
413     for (byte Mask=0x80;Mask!=0 && (C & Mask)!=0;Mask>>=1)
414       HighOne++;
415     if (HighOne==1 || HighOne>6)
416       return false;
417     while (--HighOne > 0)
418       if (SrcSize-- <= 0 || (*(Src++) & 0xc0)!=0x80)
419         return false;
420   }
421   return true;
422 }
423 
424 
wcsicomp(const wchar * s1,const wchar * s2)425 int wcsicomp(const wchar *s1,const wchar *s2)
426 {
427 #ifdef _WIN_ALL
428   return CompareStringW(LOCALE_USER_DEFAULT,NORM_IGNORECASE|SORT_STRINGSORT,s1,-1,s2,-1)-2;
429 #else
430   while (true)
431   {
432     wchar u1 = towupper(*s1);
433     wchar u2 = towupper(*s2);
434     if (u1 != u2)
435       return u1 < u2 ? -1 : 1;
436     if (*s1==0)
437       break;
438     s1++;
439     s2++;
440   }
441   return 0;
442 #endif
443 }
444 
445 
wcsnicomp(const wchar * s1,const wchar * s2,size_t n)446 int wcsnicomp(const wchar *s1,const wchar *s2,size_t n)
447 {
448 #ifdef _WIN_ALL
449   // If we specify 'n' exceeding the actual string length, CompareString goes
450   // beyond the trailing zero and compares garbage. So we need to limit 'n'
451   // to real string length.
452   size_t l1=Min(wcslen(s1)+1,n);
453   size_t l2=Min(wcslen(s2)+1,n);
454   return CompareStringW(LOCALE_USER_DEFAULT,NORM_IGNORECASE|SORT_STRINGSORT,s1,(int)l1,s2,(int)l2)-2;
455 #else
456   if (n==0)
457     return 0;
458   while (true)
459   {
460     wchar u1 = towupper(*s1);
461     wchar u2 = towupper(*s2);
462     if (u1 != u2)
463       return u1 < u2 ? -1 : 1;
464     if (*s1==0 || --n==0)
465       break;
466     s1++;
467     s2++;
468   }
469   return 0;
470 #endif
471 }
472 
473 
wcscasestr(const wchar_t * str,const wchar_t * search)474 const wchar_t* wcscasestr(const wchar_t *str, const wchar_t *search)
475 {
476   for (size_t i=0;str[i]!=0;i++)
477     for (size_t j=0;;j++)
478     {
479       if (search[j]==0)
480         return str+i;
481       if (tolowerw(str[i+j])!=tolowerw(search[j]))
482         break;
483     }
484   return NULL;
485 }
486 
487 
488 #ifndef SFX_MODULE
wcslower(wchar * s)489 wchar* wcslower(wchar *s)
490 {
491 #ifdef _WIN_ALL
492   CharLower(s);
493 #else
494   for (wchar *c=s;*c!=0;c++)
495     *c=towlower(*c);
496 #endif
497   return s;
498 }
499 #endif
500 
501 
502 #ifndef SFX_MODULE
wcsupper(wchar * s)503 wchar* wcsupper(wchar *s)
504 {
505 #ifdef _WIN_ALL
506   CharUpper(s);
507 #else
508   for (wchar *c=s;*c!=0;c++)
509     *c=towupper(*c);
510 #endif
511   return s;
512 }
513 #endif
514 
515 
516 
517 
// Return the upper case version of the wide character code 'ch'.
int toupperw(int ch)
{
#if defined(_WIN_ALL)
  // CharUpper is more reliable than towupper in Windows, which seems to be
  // C locale dependent even in Unicode version. For example, towupper failed
  // to convert lowercase Russian characters.
  return (int)(INT_PTR)CharUpper((wchar *)(INT_PTR)ch);
#else
  const wint_t Upper=towupper(ch);
  return Upper;
#endif
}
529 
530 
// Return the lower case version of the wide character code 'ch'.
int tolowerw(int ch)
{
#if defined(_WIN_ALL)
  // CharLower is more reliable than towlower in Windows, where C runtime
  // case conversion seems to be C locale dependent even in Unicode version.
  return (int)(INT_PTR)CharLower((wchar *)(INT_PTR)ch);
#else
  const wint_t Lower=towlower(ch);
  return Lower;
#endif
}
541 
542 
atoiw(const wchar * s)543 int atoiw(const wchar *s)
544 {
545   return (int)atoilw(s);
546 }
547 
548 
atoilw(const wchar * s)549 int64 atoilw(const wchar *s)
550 {
551   bool sign=false;
552   if (*s=='-') // We do use signed integers here, for example, in GUI SFX.
553   {
554     s++;
555     sign=true;
556   }
557   // Use unsigned type here, since long string can overflow the variable
558   // and signed integer overflow is undefined behavior in C++.
559   uint64 n=0;
560   while (*s>='0' && *s<='9')
561   {
562     n=n*10+(*s-'0');
563     s++;
564   }
565   // Check int64(n)>=0 to avoid the signed overflow with undefined behavior
566   // when negating 0x8000000000000000.
567   return sign && int64(n)>=0 ? -int64(n) : int64(n);
568 }
569 
570 
571 #ifdef DBCS_SUPPORTED
572 
SupportDBCS()573 SupportDBCS::SupportDBCS()
574 {
575   Init();
576 }
577 
578 
Init()579 void SupportDBCS::Init()
580 {
581   CPINFO CPInfo;
582   GetCPInfo(CP_ACP,&CPInfo);
583   DBCSMode=CPInfo.MaxCharSize > 1;
584   for (uint I=0;I<ASIZE(IsLeadByte);I++)
585     IsLeadByte[I]=IsDBCSLeadByte(I)!=0;
586 }
587 
588 // static
GetInstance()589 SupportDBCS& SupportDBCS::GetInstance() {
590   static SupportDBCS supportDBCS;
591   return supportDBCS;
592 }
593 
charnext(const char * s)594 char* SupportDBCS::charnext(const char *s)
595 {
596   // Zero cannot be the trail byte. So if next byte after the lead byte
597   // is 0, the string is corrupt and we'll better return the pointer to 0,
598   // to break string processing loops.
599   return (char *)(IsLeadByte[(byte)*s] && s[1]!=0 ? s+2:s+1);
600 }
601 
602 
strlend(const char * s)603 size_t SupportDBCS::strlend(const char *s)
604 {
605   size_t Length=0;
606   while (*s!=0)
607   {
608     if (IsLeadByte[(byte)*s])
609       s+=2;
610     else
611       s++;
612     Length++;
613   }
614   return(Length);
615 }
616 
617 
strchrd(const char * s,int c)618 char* SupportDBCS::strchrd(const char *s, int c)
619 {
620   while (*s!=0)
621     if (IsLeadByte[(byte)*s])
622       s+=2;
623     else
624       if (*s==c)
625         return((char *)s);
626       else
627         s++;
628   return(NULL);
629 }
630 
631 
copychrd(char * dest,const char * src)632 void SupportDBCS::copychrd(char *dest,const char *src)
633 {
634   dest[0]=src[0];
635   if (IsLeadByte[(byte)src[0]])
636     dest[1]=src[1];
637 }
638 
639 
strrchrd(const char * s,int c)640 char* SupportDBCS::strrchrd(const char *s, int c)
641 {
642   const char *found=NULL;
643   while (*s!=0)
644     if (IsLeadByte[(byte)*s])
645       s+=2;
646     else
647     {
648       if (*s==c)
649         found=s;
650       s++;
651     }
652   return((char *)found);
653 }
654 #endif
655