1 #include "rar.hpp"
2 #define MBFUNCTIONS
3
#if !defined(_WIN_ALL) && !defined(_APPLE) && defined(_UNIX) && defined(MBFUNCTIONS)

// Start of 0xE000 - 0xE0FF private use Unicode area. In Unix we map high
// ASCII characters, which cannot be converted to Unicode, to this area.
static const uint MapAreaStart = 0xE000;

// Marker of string containing mapped characters. 0xFFFF was used for this
// purpose initially, but MSVC2008 swprintf treats 0xFFFF as error marker
// and fails. It could be worked around, but another character is safer.
static const uint MappedStringMark = 0xFFFE;

// Conversion helpers aware of mapped characters. They are defined below.
static void CharToWideMap(const char *Src,wchar *Dest,size_t DestSize,bool &Success);
static bool WideToCharMap(const wchar *Src,char *Dest,size_t DestSize,bool &Success);

#endif
19
// Convert the zero terminated wide string 'Src' to the multibyte string
// in 'Dest'. 'DestSize' is the size of 'Dest' in bytes including the space
// for trailing zero. Returns false if the conversion failed. 'Dest' is zero
// terminated if DestSize>0 and can contain a partially converted string
// in case of failure.
bool WideToChar(const wchar *Src,char *Dest,size_t DestSize)
{
  bool RetCode=true;

  // Set 'Dest' to zero just in case the conversion will fail. We must not
  // write even a single byte if the destination buffer is empty.
  if (DestSize>0)
    *Dest=0;

#ifdef _WIN_ALL
  if (WideCharToMultiByte(CP_ACP,0,Src,-1,Dest,(int)DestSize,NULL,NULL)==0)
    RetCode=false;

// wcstombs is broken in Android NDK r9.
#elif defined(_APPLE)
  WideToUtf(Src,Dest,DestSize);

#elif defined(_UNIX) && defined(MBFUNCTIONS)
  if (!WideToCharMap(Src,Dest,DestSize,RetCode))
  {
    mbstate_t ps; // Use thread safe external state based functions.
    memset (&ps, 0, sizeof(ps));
    const wchar *SrcParam=Src; // wcsrtombs can change the pointer.

    // Some implementations of wcsrtombs can cause memory analyzing tools
    // like valgrind to report uninitialized data access. It happens because
    // internally these implementations call SSE4 based wcslen function,
    // which reads 16 bytes at once including those beyond of trailing 0.
    size_t ResultingSize=wcsrtombs(Dest,&SrcParam,DestSize,&ps);

    if (ResultingSize==(size_t)-1 && errno==EILSEQ)
    {
      // Aborted on inconvertible character not zero terminating the result.
      // EILSEQ helps to distinguish it from small output buffer abort.
      // We want to convert as much as we can, so we clean the output buffer
      // and repeat conversion.
      memset (&ps, 0, sizeof(ps));
      SrcParam=Src; // wcsrtombs can change the pointer.
      memset(Dest,0,DestSize);
      ResultingSize=wcsrtombs(Dest,&SrcParam,DestSize,&ps);
    }

    if (ResultingSize==(size_t)-1)
      RetCode=false;
    if (ResultingSize==0 && *Src!=0)
      RetCode=false;
  }
#else
  // Plain truncating copy. The index is size_t to avoid the signed/unsigned
  // comparison with 'DestSize'.
  for (size_t I=0;I<DestSize;I++)
  {
    Dest[I]=(char)Src[I];
    if (Src[I]==0)
      break;
  }
#endif
  if (DestSize>0)
    Dest[DestSize-1]=0;

  // We tried to return the empty string if conversion is failed,
  // but it does not work well. WideCharToMultiByte returns 'failed' code
  // and partially converted string even if we wanted to convert only a part
  // of string and passed DestSize smaller than required for fully converted
  // string. Such call is the valid behavior in RAR code and we do not expect
  // the empty string in this case.

  return RetCode;
}
83
84
// Convert the zero terminated multibyte string 'Src' to the wide string
// in 'Dest'. 'DestSize' is the size of 'Dest' in wide characters including
// the space for trailing zero. Returns false if the conversion failed.
// 'Dest' is zero terminated if DestSize>0.
bool CharToWide(const char *Src,wchar *Dest,size_t DestSize)
{
  bool RetCode=true;

  // Set 'Dest' to zero just in case the conversion will fail. We must not
  // write even a single character if the destination buffer is empty.
  if (DestSize>0)
    *Dest=0;

#ifdef _WIN_ALL
  if (MultiByteToWideChar(CP_ACP,0,Src,-1,Dest,(int)DestSize)==0)
    RetCode=false;

// mbstowcs is broken in Android NDK r9.
#elif defined(_APPLE)
  UtfToWide(Src,Dest,DestSize);

#elif defined(_UNIX) && defined(MBFUNCTIONS)
  mbstate_t ps;
  memset (&ps, 0, sizeof(ps));
  const char *SrcParam=Src; // mbsrtowcs can change the pointer.
  size_t ResultingSize=mbsrtowcs(Dest,&SrcParam,DestSize,&ps);
  if (ResultingSize==(size_t)-1)
    RetCode=false;
  if (ResultingSize==0 && *Src!=0)
    RetCode=false;

  // Standard conversion failed, so try to map inconvertible characters
  // to private use Unicode area. We need the space at least for a single
  // character and trailing zero here.
  if (RetCode==false && DestSize>1)
    CharToWideMap(Src,Dest,DestSize,RetCode);
#else
  // Plain widening copy. The index is size_t to avoid the signed/unsigned
  // comparison with 'DestSize'.
  for (size_t I=0;I<DestSize;I++)
  {
    Dest[I]=(wchar_t)Src[I];
    if (Src[I]==0)
      break;
  }
#endif
  if (DestSize>0)
    Dest[DestSize-1]=0;

  // We tried to return the empty string if conversion is failed,
  // but it does not work well. MultiByteToWideChar returns 'failed' code
  // even if we wanted to convert only a part of string and passed DestSize
  // smaller than required for fully converted string. Such call is the valid
  // behavior in RAR code and we do not expect the empty string in this case.

  return RetCode;
}
129
130
131 #if !defined(_WIN_ALL) && !defined(_APPLE) && defined(_UNIX) && defined(MBFUNCTIONS)
132 // Convert and restore mapped inconvertible Unicode characters.
133 // We use it for extended ASCII names in Unix.
WideToCharMap(const wchar * Src,char * Dest,size_t DestSize,bool & Success)134 bool WideToCharMap(const wchar *Src,char *Dest,size_t DestSize,bool &Success)
135 {
136 // String with inconvertible characters mapped to private use Unicode area
137 // must have the mark code somewhere.
138 if (wcschr(Src,(wchar)MappedStringMark)==NULL)
139 return false;
140
141 // Seems to be that wcrtomb in some memory analyzing libraries
142 // can produce uninitilized output while reporting success on garbage input.
143 // So we clean the destination to calm analyzers.
144 memset(Dest,0,DestSize);
145
146 Success=true;
147 uint SrcPos=0,DestPos=0;
148 while (Src[SrcPos]!=0 && DestPos<DestSize-MB_CUR_MAX)
149 {
150 if (uint(Src[SrcPos])==MappedStringMark)
151 {
152 SrcPos++;
153 continue;
154 }
155 // For security reasons do not restore low ASCII codes, so mapping cannot
156 // be used to hide control codes like path separators.
157 if (uint(Src[SrcPos])>=MapAreaStart+0x80 && uint(Src[SrcPos])<MapAreaStart+0x100)
158 Dest[DestPos++]=char(uint(Src[SrcPos++])-MapAreaStart);
159 else
160 {
161 mbstate_t ps;
162 memset(&ps,0,sizeof(ps));
163 if (wcrtomb(Dest+DestPos,Src[SrcPos],&ps)==(size_t)-1)
164 {
165 Dest[DestPos]='_';
166 Success=false;
167 }
168 SrcPos++;
169 memset(&ps,0,sizeof(ps));
170 int Length=mbrlen(Dest+DestPos,MB_CUR_MAX,&ps);
171 DestPos+=Max(Length,1);
172 }
173 }
174 Dest[Min(DestPos,DestSize-1)]=0;
175 return true;
176 }
177 #endif
178
179
180 #if !defined(_WIN_ALL) && !defined(_APPLE) && defined(_UNIX) && defined(MBFUNCTIONS)
181 // Convert and map inconvertible Unicode characters.
182 // We use it for extended ASCII names in Unix.
CharToWideMap(const char * Src,wchar * Dest,size_t DestSize,bool & Success)183 void CharToWideMap(const char *Src,wchar *Dest,size_t DestSize,bool &Success)
184 {
185 // Map inconvertible characters to private use Unicode area 0xE000.
186 // Mark such string by placing special non-character code before
187 // first inconvertible character.
188 Success=false;
189 bool MarkAdded=false;
190 uint SrcPos=0,DestPos=0;
191 while (DestPos<DestSize)
192 {
193 if (Src[SrcPos]==0)
194 {
195 Success=true;
196 break;
197 }
198 mbstate_t ps;
199 memset(&ps,0,sizeof(ps));
200 size_t res=mbrtowc(Dest+DestPos,Src+SrcPos,MB_CUR_MAX,&ps);
201 if (res==(size_t)-1 || res==(size_t)-2)
202 {
203 // For security reasons we do not want to map low ASCII characters,
204 // so we do not have additional .. and path separator codes.
205 if (byte(Src[SrcPos])>=0x80)
206 {
207 if (!MarkAdded)
208 {
209 Dest[DestPos++]=MappedStringMark;
210 MarkAdded=true;
211 if (DestPos>=DestSize)
212 break;
213 }
214 Dest[DestPos++]=byte(Src[SrcPos++])+MapAreaStart;
215 }
216 else
217 break;
218 }
219 else
220 {
221 memset(&ps,0,sizeof(ps));
222 int Length=mbrlen(Src+SrcPos,MB_CUR_MAX,&ps);
223 SrcPos+=Max(Length,1);
224 DestPos++;
225 }
226 }
227 Dest[Min(DestPos,DestSize-1)]=0;
228 }
229 #endif
230
231
232 // SrcSize is in wide characters, not in bytes.
WideToRaw(const wchar * Src,byte * Dest,size_t SrcSize)233 byte* WideToRaw(const wchar *Src,byte *Dest,size_t SrcSize)
234 {
235 for (size_t I=0;I<SrcSize;I++,Src++)
236 {
237 Dest[I*2]=(byte)*Src;
238 Dest[I*2+1]=(byte)(*Src>>8);
239 if (*Src==0)
240 break;
241 }
242 return Dest;
243 }
244
245
RawToWide(const byte * Src,wchar * Dest,size_t DestSize)246 wchar* RawToWide(const byte *Src,wchar *Dest,size_t DestSize)
247 {
248 for (size_t I=0;I<DestSize;I++)
249 if ((Dest[I]=Src[I*2]+(Src[I*2+1]<<8))==0)
250 break;
251 return Dest;
252 }
253
254
WideToUtf(const wchar * Src,char * Dest,size_t DestSize)255 void WideToUtf(const wchar *Src,char *Dest,size_t DestSize)
256 {
257 long dsize=(long)DestSize;
258 dsize--;
259 while (*Src!=0 && --dsize>=0)
260 {
261 uint c=*(Src++);
262 if (c<0x80)
263 *(Dest++)=c;
264 else
265 if (c<0x800 && --dsize>=0)
266 {
267 *(Dest++)=(0xc0|(c>>6));
268 *(Dest++)=(0x80|(c&0x3f));
269 }
270 else
271 {
272 if (c>=0xd800 && c<=0xdbff && *Src>=0xdc00 && *Src<=0xdfff) // Surrogate pair.
273 {
274 c=((c-0xd800)<<10)+(*Src-0xdc00)+0x10000;
275 Src++;
276 }
277 if (c<0x10000 && (dsize-=2)>=0)
278 {
279 *(Dest++)=(0xe0|(c>>12));
280 *(Dest++)=(0x80|((c>>6)&0x3f));
281 *(Dest++)=(0x80|(c&0x3f));
282 }
283 else
284 if (c < 0x200000 && (dsize-=3)>=0)
285 {
286 *(Dest++)=(0xf0|(c>>18));
287 *(Dest++)=(0x80|((c>>12)&0x3f));
288 *(Dest++)=(0x80|((c>>6)&0x3f));
289 *(Dest++)=(0x80|(c&0x3f));
290 }
291 }
292 }
293 *Dest=0;
294 }
295
296
WideToUtfSize(const wchar * Src)297 size_t WideToUtfSize(const wchar *Src)
298 {
299 size_t Size=0;
300 for (;*Src!=0;Src++)
301 if (*Src<0x80)
302 Size++;
303 else
304 if (*Src<0x800)
305 Size+=2;
306 else
307 if ((uint)*Src<0x10000) //(uint) to avoid Clang/win "always true" warning for 16-bit wchar_t.
308 {
309 if (Src[0]>=0xd800 && Src[0]<=0xdbff && Src[1]>=0xdc00 && Src[1]<=0xdfff)
310 {
311 Size+=4; // 4 output bytes for Unicode surrogate pair.
312 Src++;
313 }
314 else
315 Size+=3;
316 }
317 else
318 if ((uint)*Src<0x200000) //(uint) to avoid Clang/win "always true" warning for 16-bit wchar_t.
319 Size+=4;
320 return Size+1; // Include terminating zero.
321 }
322
323
UtfToWide(const char * Src,wchar * Dest,size_t DestSize)324 bool UtfToWide(const char *Src,wchar *Dest,size_t DestSize)
325 {
326 bool Success=true;
327 long dsize=(long)DestSize;
328 dsize--;
329 while (*Src!=0)
330 {
331 uint c=byte(*(Src++)),d;
332 if (c<0x80)
333 d=c;
334 else
335 if ((c>>5)==6)
336 {
337 if ((*Src&0xc0)!=0x80)
338 {
339 Success=false;
340 break;
341 }
342 d=((c&0x1f)<<6)|(*Src&0x3f);
343 Src++;
344 }
345 else
346 if ((c>>4)==14)
347 {
348 if ((Src[0]&0xc0)!=0x80 || (Src[1]&0xc0)!=0x80)
349 {
350 Success=false;
351 break;
352 }
353 d=((c&0xf)<<12)|((Src[0]&0x3f)<<6)|(Src[1]&0x3f);
354 Src+=2;
355 }
356 else
357 if ((c>>3)==30)
358 {
359 if ((Src[0]&0xc0)!=0x80 || (Src[1]&0xc0)!=0x80 || (Src[2]&0xc0)!=0x80)
360 {
361 Success=false;
362 break;
363 }
364 d=((c&7)<<18)|((Src[0]&0x3f)<<12)|((Src[1]&0x3f)<<6)|(Src[2]&0x3f);
365 Src+=3;
366 }
367 else
368 {
369 Success=false;
370 break;
371 }
372 if (--dsize<0)
373 break;
374 if (d>0xffff)
375 {
376 if (--dsize<0)
377 break;
378 if (d>0x10ffff) // UTF-8 must end at 0x10ffff according to RFC 3629.
379 {
380 Success=false;
381 continue;
382 }
383 if (sizeof(*Dest)==2) // Use the surrogate pair.
384 {
385 *(Dest++)=((d-0x10000)>>10)+0xd800;
386 *(Dest++)=(d&0x3ff)+0xdc00;
387 }
388 else
389 *(Dest++)=d;
390 }
391 else
392 *(Dest++)=d;
393 }
394 *Dest=0;
395 return Success;
396 }
397
398
399 // For zero terminated strings.
IsTextUtf8(const byte * Src)400 bool IsTextUtf8(const byte *Src)
401 {
402 return IsTextUtf8(Src,strlen((const char *)Src));
403 }
404
405
406 // Source data can be both with and without UTF-8 BOM.
IsTextUtf8(const byte * Src,size_t SrcSize)407 bool IsTextUtf8(const byte *Src,size_t SrcSize)
408 {
409 while (SrcSize-- > 0)
410 {
411 byte C=*(Src++);
412 int HighOne=0; // Number of leftmost '1' bits.
413 for (byte Mask=0x80;Mask!=0 && (C & Mask)!=0;Mask>>=1)
414 HighOne++;
415 if (HighOne==1 || HighOne>6)
416 return false;
417 while (--HighOne > 0)
418 if (SrcSize-- <= 0 || (*(Src++) & 0xc0)!=0x80)
419 return false;
420 }
421 return true;
422 }
423
424
wcsicomp(const wchar * s1,const wchar * s2)425 int wcsicomp(const wchar *s1,const wchar *s2)
426 {
427 #ifdef _WIN_ALL
428 return CompareStringW(LOCALE_USER_DEFAULT,NORM_IGNORECASE|SORT_STRINGSORT,s1,-1,s2,-1)-2;
429 #else
430 while (true)
431 {
432 wchar u1 = towupper(*s1);
433 wchar u2 = towupper(*s2);
434 if (u1 != u2)
435 return u1 < u2 ? -1 : 1;
436 if (*s1==0)
437 break;
438 s1++;
439 s2++;
440 }
441 return 0;
442 #endif
443 }
444
445
wcsnicomp(const wchar * s1,const wchar * s2,size_t n)446 int wcsnicomp(const wchar *s1,const wchar *s2,size_t n)
447 {
448 #ifdef _WIN_ALL
449 // If we specify 'n' exceeding the actual string length, CompareString goes
450 // beyond the trailing zero and compares garbage. So we need to limit 'n'
451 // to real string length.
452 size_t l1=Min(wcslen(s1)+1,n);
453 size_t l2=Min(wcslen(s2)+1,n);
454 return CompareStringW(LOCALE_USER_DEFAULT,NORM_IGNORECASE|SORT_STRINGSORT,s1,(int)l1,s2,(int)l2)-2;
455 #else
456 if (n==0)
457 return 0;
458 while (true)
459 {
460 wchar u1 = towupper(*s1);
461 wchar u2 = towupper(*s2);
462 if (u1 != u2)
463 return u1 < u2 ? -1 : 1;
464 if (*s1==0 || --n==0)
465 break;
466 s1++;
467 s2++;
468 }
469 return 0;
470 #endif
471 }
472
473
wcscasestr(const wchar_t * str,const wchar_t * search)474 const wchar_t* wcscasestr(const wchar_t *str, const wchar_t *search)
475 {
476 for (size_t i=0;str[i]!=0;i++)
477 for (size_t j=0;;j++)
478 {
479 if (search[j]==0)
480 return str+i;
481 if (tolowerw(str[i+j])!=tolowerw(search[j]))
482 break;
483 }
484 return NULL;
485 }
486
487
488 #ifndef SFX_MODULE
wcslower(wchar * s)489 wchar* wcslower(wchar *s)
490 {
491 #ifdef _WIN_ALL
492 CharLower(s);
493 #else
494 for (wchar *c=s;*c!=0;c++)
495 *c=towlower(*c);
496 #endif
497 return s;
498 }
499 #endif
500
501
502 #ifndef SFX_MODULE
wcsupper(wchar * s)503 wchar* wcsupper(wchar *s)
504 {
505 #ifdef _WIN_ALL
506 CharUpper(s);
507 #else
508 for (wchar *c=s;*c!=0;c++)
509 *c=towupper(*c);
510 #endif
511 return s;
512 }
513 #endif
514
515
516
517
// Return the upper case version of wide character code 'ch'.
int toupperw(int ch)
{
#if defined(_WIN_ALL)
  // CharUpper is more reliable than towupper in Windows, which seems to be
  // C locale dependent even in Unicode version. For example, towupper failed
  // to convert lowercase Russian characters.
  // CharUpper converts a single character if it is passed in the pointer
  // with zero high order word and returns it in the same way.
  wchar *Upper=CharUpper((wchar *)(INT_PTR)ch);
  return (int)(INT_PTR)Upper;
#else
  const int Upper=towupper(ch);
  return Upper;
#endif
}
529
530
// Return the lower case version of wide character code 'ch'.
int tolowerw(int ch)
{
#if defined(_WIN_ALL)
  // CharLower is more reliable than towlower in Windows.
  // See comment for towupper above.
  wchar *Lower=CharLower((wchar *)(INT_PTR)ch);
  return (int)(INT_PTR)Lower;
#else
  const int Lower=towlower(ch);
  return Lower;
#endif
}
541
542
atoiw(const wchar * s)543 int atoiw(const wchar *s)
544 {
545 return (int)atoilw(s);
546 }
547
548
atoilw(const wchar * s)549 int64 atoilw(const wchar *s)
550 {
551 bool sign=false;
552 if (*s=='-') // We do use signed integers here, for example, in GUI SFX.
553 {
554 s++;
555 sign=true;
556 }
557 // Use unsigned type here, since long string can overflow the variable
558 // and signed integer overflow is undefined behavior in C++.
559 uint64 n=0;
560 while (*s>='0' && *s<='9')
561 {
562 n=n*10+(*s-'0');
563 s++;
564 }
565 // Check int64(n)>=0 to avoid the signed overflow with undefined behavior
566 // when negating 0x8000000000000000.
567 return sign && int64(n)>=0 ? -int64(n) : int64(n);
568 }
569
570
571 #ifdef DBCS_SUPPORTED
572
SupportDBCS()573 SupportDBCS::SupportDBCS()
574 {
575 Init();
576 }
577
578
Init()579 void SupportDBCS::Init()
580 {
581 CPINFO CPInfo;
582 GetCPInfo(CP_ACP,&CPInfo);
583 DBCSMode=CPInfo.MaxCharSize > 1;
584 for (uint I=0;I<ASIZE(IsLeadByte);I++)
585 IsLeadByte[I]=IsDBCSLeadByte(I)!=0;
586 }
587
588 // static
GetInstance()589 SupportDBCS& SupportDBCS::GetInstance() {
590 static SupportDBCS supportDBCS;
591 return supportDBCS;
592 }
593
charnext(const char * s)594 char* SupportDBCS::charnext(const char *s)
595 {
596 // Zero cannot be the trail byte. So if next byte after the lead byte
597 // is 0, the string is corrupt and we'll better return the pointer to 0,
598 // to break string processing loops.
599 return (char *)(IsLeadByte[(byte)*s] && s[1]!=0 ? s+2:s+1);
600 }
601
602
strlend(const char * s)603 size_t SupportDBCS::strlend(const char *s)
604 {
605 size_t Length=0;
606 while (*s!=0)
607 {
608 if (IsLeadByte[(byte)*s])
609 s+=2;
610 else
611 s++;
612 Length++;
613 }
614 return(Length);
615 }
616
617
strchrd(const char * s,int c)618 char* SupportDBCS::strchrd(const char *s, int c)
619 {
620 while (*s!=0)
621 if (IsLeadByte[(byte)*s])
622 s+=2;
623 else
624 if (*s==c)
625 return((char *)s);
626 else
627 s++;
628 return(NULL);
629 }
630
631
copychrd(char * dest,const char * src)632 void SupportDBCS::copychrd(char *dest,const char *src)
633 {
634 dest[0]=src[0];
635 if (IsLeadByte[(byte)src[0]])
636 dest[1]=src[1];
637 }
638
639
strrchrd(const char * s,int c)640 char* SupportDBCS::strrchrd(const char *s, int c)
641 {
642 const char *found=NULL;
643 while (*s!=0)
644 if (IsLeadByte[(byte)*s])
645 s+=2;
646 else
647 {
648 if (*s==c)
649 found=s;
650 s++;
651 }
652 return((char *)found);
653 }
654 #endif
655