1 /*
2  *  Copyright (C) 2005-2018 Team Kodi
3  *  This file is part of Kodi - https://kodi.tv
4  *
5  *  SPDX-License-Identifier: GPL-2.0-or-later
6  *  See LICENSES/README.md for more information.
7  */
8 
9 #include "CharsetConverter.h"
10 
11 #include "LangInfo.h"
12 #include "guilib/LocalizeStrings.h"
13 #include "log.h"
14 #include "settings/Settings.h"
15 #include "settings/lib/Setting.h"
16 #include "settings/lib/SettingDefinitions.h"
17 #include "utils/StringUtils.h"
18 #include "utils/Utf8Utils.h"
19 
20 #include <algorithm>
21 
22 #include <fribidi.h>
23 #include <iconv.h>
24 
// Byte-order suffix that is appended to the UTF-16/UTF-32 charset names
#if defined(WORDS_BIGENDIAN)
  #define ENDIAN_SUFFIX "BE"
#else
  #define ENDIAN_SUFFIX "LE"
#endif

// These two names are identical for every target
#define UTF16_CHARSET "UTF-16" ENDIAN_SUFFIX
#define UTF32_CHARSET "UTF-32" ENDIAN_SUFFIX

// Charset name used when UTF-8 is the source of a conversion
#if defined(TARGET_DARWIN)
  #define UTF8_SOURCE "UTF-8-MAC"
#else
  #define UTF8_SOURCE "UTF-8"
#endif

// Charset name used for wchar_t, plus the WCHAR_IS_* marker for the target
#if defined(TARGET_DARWIN)
  #define WCHAR_IS_UCS_4 1
  #define WCHAR_CHARSET UTF32_CHARSET
#elif defined(TARGET_WINDOWS)
  #define WCHAR_IS_UTF16 1
  #define WCHAR_CHARSET UTF16_CHARSET
#elif defined(TARGET_FREEBSD) || defined(TARGET_DRAGONFLY) || defined(TARGET_ANDROID)
  #define WCHAR_IS_UCS_4 1
  #define WCHAR_CHARSET UTF32_CHARSET
#else
  #define WCHAR_CHARSET "WCHAR_T"
  #if __STDC_ISO_10646__ && defined(SIZEOF_WCHAR_T)
    #if SIZEOF_WCHAR_T == 4
      #define WCHAR_IS_UCS_4 1
    #elif SIZEOF_WCHAR_T == 2
      #define WCHAR_IS_UCS_2 1
    #endif
  #endif
#endif

// Value that marks a converter handle as "not opened"
#define NO_ICONV ((iconv_t)-1)
72 
// Symbolic charset selectors; NotSpecialCharset is the only value that converts to false
enum SpecialCharset
{
  NotSpecialCharset = 0,
  SystemCharset = 1,
  UserCharset = 2, // locale.charset
  SubtitleCharset = 3, // subtitles.charset
};
80 
81 class CConverterType : public CCriticalSection
82 {
83 public:
84   CConverterType(const std::string&  sourceCharset,        const std::string&  targetCharset,        unsigned int targetSingleCharMaxLen = 1);
85   CConverterType(enum SpecialCharset sourceSpecialCharset, const std::string&  targetCharset,        unsigned int targetSingleCharMaxLen = 1);
86   CConverterType(const std::string&  sourceCharset,        enum SpecialCharset targetSpecialCharset, unsigned int targetSingleCharMaxLen = 1);
87   CConverterType(enum SpecialCharset sourceSpecialCharset, enum SpecialCharset targetSpecialCharset, unsigned int targetSingleCharMaxLen = 1);
88   CConverterType(const CConverterType& other);
89   ~CConverterType();
90 
91   iconv_t GetConverter(CSingleLock& converterLock);
92 
93   void Reset(void);
94   void ReinitTo(const std::string& sourceCharset, const std::string& targetCharset, unsigned int targetSingleCharMaxLen = 1);
GetSourceCharset(void) const95   std::string GetSourceCharset(void) const  { return m_sourceCharset; }
GetTargetCharset(void) const96   std::string GetTargetCharset(void) const  { return m_targetCharset; }
GetTargetSingleCharMaxLen(void) const97   unsigned int GetTargetSingleCharMaxLen(void) const  { return m_targetSingleCharMaxLen; }
98 
99 private:
100   static std::string ResolveSpecialCharset(enum SpecialCharset charset);
101 
102   enum SpecialCharset m_sourceSpecialCharset;
103   std::string         m_sourceCharset;
104   enum SpecialCharset m_targetSpecialCharset;
105   std::string         m_targetCharset;
106   iconv_t             m_iconv;
107   unsigned int        m_targetSingleCharMaxLen;
108 };
109 
CConverterType(const std::string & sourceCharset,const std::string & targetCharset,unsigned int targetSingleCharMaxLen)110 CConverterType::CConverterType(const std::string& sourceCharset, const std::string& targetCharset, unsigned int targetSingleCharMaxLen /*= 1*/) : CCriticalSection(),
111   m_sourceSpecialCharset(NotSpecialCharset),
112   m_sourceCharset(sourceCharset),
113   m_targetSpecialCharset(NotSpecialCharset),
114   m_targetCharset(targetCharset),
115   m_iconv(NO_ICONV),
116   m_targetSingleCharMaxLen(targetSingleCharMaxLen)
117 {
118 }
119 
CConverterType(enum SpecialCharset sourceSpecialCharset,const std::string & targetCharset,unsigned int targetSingleCharMaxLen)120 CConverterType::CConverterType(enum SpecialCharset sourceSpecialCharset, const std::string& targetCharset, unsigned int targetSingleCharMaxLen /*= 1*/) : CCriticalSection(),
121   m_sourceSpecialCharset(sourceSpecialCharset),
122   m_sourceCharset(),
123   m_targetSpecialCharset(NotSpecialCharset),
124   m_targetCharset(targetCharset),
125   m_iconv(NO_ICONV),
126   m_targetSingleCharMaxLen(targetSingleCharMaxLen)
127 {
128 }
129 
CConverterType(const std::string & sourceCharset,enum SpecialCharset targetSpecialCharset,unsigned int targetSingleCharMaxLen)130 CConverterType::CConverterType(const std::string& sourceCharset, enum SpecialCharset targetSpecialCharset, unsigned int targetSingleCharMaxLen /*= 1*/) : CCriticalSection(),
131   m_sourceSpecialCharset(NotSpecialCharset),
132   m_sourceCharset(sourceCharset),
133   m_targetSpecialCharset(targetSpecialCharset),
134   m_targetCharset(),
135   m_iconv(NO_ICONV),
136   m_targetSingleCharMaxLen(targetSingleCharMaxLen)
137 {
138 }
139 
CConverterType(enum SpecialCharset sourceSpecialCharset,enum SpecialCharset targetSpecialCharset,unsigned int targetSingleCharMaxLen)140 CConverterType::CConverterType(enum SpecialCharset sourceSpecialCharset, enum SpecialCharset targetSpecialCharset, unsigned int targetSingleCharMaxLen /*= 1*/) : CCriticalSection(),
141   m_sourceSpecialCharset(sourceSpecialCharset),
142   m_sourceCharset(),
143   m_targetSpecialCharset(targetSpecialCharset),
144   m_targetCharset(),
145   m_iconv(NO_ICONV),
146   m_targetSingleCharMaxLen(targetSingleCharMaxLen)
147 {
148 }
149 
CConverterType(const CConverterType & other)150 CConverterType::CConverterType(const CConverterType& other) : CCriticalSection(),
151   m_sourceSpecialCharset(other.m_sourceSpecialCharset),
152   m_sourceCharset(other.m_sourceCharset),
153   m_targetSpecialCharset(other.m_targetSpecialCharset),
154   m_targetCharset(other.m_targetCharset),
155   m_iconv(NO_ICONV),
156   m_targetSingleCharMaxLen(other.m_targetSingleCharMaxLen)
157 {
158 }
159 
~CConverterType()160 CConverterType::~CConverterType()
161 {
162   CSingleLock lock(*this);
163   if (m_iconv != NO_ICONV)
164     iconv_close(m_iconv);
165   lock.Leave(); // ensure unlocking before final destruction
166 }
167 
GetConverter(CSingleLock & converterLock)168 iconv_t CConverterType::GetConverter(CSingleLock& converterLock)
169 {
170   // ensure that this unique instance is locked externally
171   if (&converterLock.get_underlying() != this)
172     return NO_ICONV;
173 
174   if (m_iconv == NO_ICONV)
175   {
176     if (m_sourceSpecialCharset)
177       m_sourceCharset = ResolveSpecialCharset(m_sourceSpecialCharset);
178     if (m_targetSpecialCharset)
179       m_targetCharset = ResolveSpecialCharset(m_targetSpecialCharset);
180 
181     m_iconv = iconv_open(m_targetCharset.c_str(), m_sourceCharset.c_str());
182 
183     if (m_iconv == NO_ICONV)
184       CLog::Log(LOGERROR, "%s: iconv_open() for \"%s\" -> \"%s\" failed, errno = %d (%s)",
185                 __FUNCTION__, m_sourceCharset.c_str(), m_targetCharset.c_str(), errno, strerror(errno));
186   }
187 
188   return m_iconv;
189 }
190 
Reset(void)191 void CConverterType::Reset(void)
192 {
193   CSingleLock lock(*this);
194   if (m_iconv != NO_ICONV)
195   {
196     iconv_close(m_iconv);
197     m_iconv = NO_ICONV;
198   }
199 
200   if (m_sourceSpecialCharset)
201     m_sourceCharset.clear();
202   if (m_targetSpecialCharset)
203     m_targetCharset.clear();
204 
205 }
206 
ReinitTo(const std::string & sourceCharset,const std::string & targetCharset,unsigned int targetSingleCharMaxLen)207 void CConverterType::ReinitTo(const std::string& sourceCharset, const std::string& targetCharset, unsigned int targetSingleCharMaxLen /*= 1*/)
208 {
209   CSingleLock lock(*this);
210   if (sourceCharset != m_sourceCharset || targetCharset != m_targetCharset)
211   {
212     if (m_iconv != NO_ICONV)
213     {
214       iconv_close(m_iconv);
215       m_iconv = NO_ICONV;
216     }
217 
218     m_sourceSpecialCharset = NotSpecialCharset;
219     m_sourceCharset = sourceCharset;
220     m_targetSpecialCharset = NotSpecialCharset;
221     m_targetCharset = targetCharset;
222     m_targetSingleCharMaxLen = targetSingleCharMaxLen;
223   }
224 }
225 
ResolveSpecialCharset(enum SpecialCharset charset)226 std::string CConverterType::ResolveSpecialCharset(enum SpecialCharset charset)
227 {
228   switch (charset)
229   {
230   case SystemCharset:
231     return "";
232   case UserCharset:
233     return g_langInfo.GetGuiCharSet();
234   case SubtitleCharset:
235     return g_langInfo.GetSubtitleCharSet();
236   case NotSpecialCharset:
237   default:
238     return "UTF-8"; /* dummy value */
239   }
240 }
241 
/* Identifies one of the predefined converters. Every value from Utf8ToUtf32 up to (but not
   including) NumberOfStdConversionTypes is used as an index into m_stdConversion. */
enum StdConversionType /* Keep it in sync with CCharsetConverter::CInnerConverter::m_stdConversion */
{
  NoConversion = -1, /* not an index: stdConvert() rejects negative values */
  Utf8ToUtf32 = 0,
  Utf32ToUtf8,
  Utf32ToW,
  WToUtf32,
  SubtitleCharsetToUtf8,
  Utf8ToUserCharset,
  UserCharsetToUtf8,
  Utf32ToUserCharset,
  WtoUtf8,
  Utf16LEtoW,
  Utf16BEtoUtf8,
  Utf16LEtoUtf8,
  Utf8toW,
  Utf8ToSystem,
  SystemToUtf8,
  Ucs2CharsetToUtf8,
  NumberOfStdConversionTypes /* Dummy sentinel entry */
};
263 
264 /* We don't want to pollute header file with many additional includes and definitions, so put
265    here all staff that require usage of types defined in this file or in additional headers */
266 class CCharsetConverter::CInnerConverter
267 {
268 public:
269   static bool logicalToVisualBiDi(const std::u32string& stringSrc,
270                                   std::u32string& stringDst,
271                                   FriBidiCharType base = FRIBIDI_TYPE_LTR,
272                                   const bool failOnBadString = false,
273                                   int* visualToLogicalMap = nullptr);
274 
275   template<class INPUT,class OUTPUT>
276   static bool stdConvert(StdConversionType convertType, const INPUT& strSource, OUTPUT& strDest, bool failOnInvalidChar = false);
277   template<class INPUT,class OUTPUT>
278   static bool customConvert(const std::string& sourceCharset, const std::string& targetCharset, const INPUT& strSource, OUTPUT& strDest, bool failOnInvalidChar = false);
279 
280   template<class INPUT,class OUTPUT>
281   static bool convert(iconv_t type, int multiplier, const INPUT& strSource, OUTPUT& strDest, bool failOnInvalidChar = false);
282 
283   static CConverterType m_stdConversion[NumberOfStdConversionTypes];
284   static CCriticalSection m_critSectionFriBiDi;
285 };
286 
/* single symbol sizes in chars: the smallest (1) and the largest (4) number of chars
   assumed for one symbol in UTF-8 */
const int CCharsetConverter::m_Utf8CharMinSize = 1;
const int CCharsetConverter::m_Utf8CharMaxSize = 4;
290 
291 CConverterType CCharsetConverter::CInnerConverter::m_stdConversion[NumberOfStdConversionTypes] = /* keep it in sync with enum StdConversionType */
292 {
293   /* Utf8ToUtf32 */         CConverterType(UTF8_SOURCE,     UTF32_CHARSET),
294   /* Utf32ToUtf8 */         CConverterType(UTF32_CHARSET,   "UTF-8", CCharsetConverter::m_Utf8CharMaxSize),
295   /* Utf32ToW */            CConverterType(UTF32_CHARSET,   WCHAR_CHARSET),
296   /* WToUtf32 */            CConverterType(WCHAR_CHARSET,   UTF32_CHARSET),
297   /* SubtitleCharsetToUtf8*/CConverterType(SubtitleCharset, "UTF-8", CCharsetConverter::m_Utf8CharMaxSize),
298   /* Utf8ToUserCharset */   CConverterType(UTF8_SOURCE,     UserCharset),
299   /* UserCharsetToUtf8 */   CConverterType(UserCharset,     "UTF-8", CCharsetConverter::m_Utf8CharMaxSize),
300   /* Utf32ToUserCharset */  CConverterType(UTF32_CHARSET,   UserCharset),
301   /* WtoUtf8 */             CConverterType(WCHAR_CHARSET,   "UTF-8", CCharsetConverter::m_Utf8CharMaxSize),
302   /* Utf16LEtoW */          CConverterType("UTF-16LE",      WCHAR_CHARSET),
303   /* Utf16BEtoUtf8 */       CConverterType("UTF-16BE",      "UTF-8", CCharsetConverter::m_Utf8CharMaxSize),
304   /* Utf16LEtoUtf8 */       CConverterType("UTF-16LE",      "UTF-8", CCharsetConverter::m_Utf8CharMaxSize),
305   /* Utf8toW */             CConverterType(UTF8_SOURCE,     WCHAR_CHARSET),
306   /* Utf8ToSystem */        CConverterType(UTF8_SOURCE,     SystemCharset),
307   /* SystemToUtf8 */        CConverterType(SystemCharset,   UTF8_SOURCE),
308   /* Ucs2CharsetToUtf8 */   CConverterType("UCS-2LE",       "UTF-8", CCharsetConverter::m_Utf8CharMaxSize)
309 };
310 
// Critical section that logicalToVisualBiDi() holds while it calls into libfribidi
CCriticalSection CCharsetConverter::CInnerConverter::m_critSectionFriBiDi;
312 
313 template<class INPUT,class OUTPUT>
stdConvert(StdConversionType convertType,const INPUT & strSource,OUTPUT & strDest,bool failOnInvalidChar)314 bool CCharsetConverter::CInnerConverter::stdConvert(StdConversionType convertType, const INPUT& strSource, OUTPUT& strDest, bool failOnInvalidChar /*= false*/)
315 {
316   strDest.clear();
317   if (strSource.empty())
318     return true;
319 
320   if (convertType < 0 || convertType >= NumberOfStdConversionTypes)
321     return false;
322 
323   CConverterType& convType = m_stdConversion[convertType];
324   CSingleLock converterLock(convType);
325 
326   return convert(convType.GetConverter(converterLock), convType.GetTargetSingleCharMaxLen(), strSource, strDest, failOnInvalidChar);
327 }
328 
329 template<class INPUT,class OUTPUT>
customConvert(const std::string & sourceCharset,const std::string & targetCharset,const INPUT & strSource,OUTPUT & strDest,bool failOnInvalidChar)330 bool CCharsetConverter::CInnerConverter::customConvert(const std::string& sourceCharset, const std::string& targetCharset, const INPUT& strSource, OUTPUT& strDest, bool failOnInvalidChar /*= false*/)
331 {
332   strDest.clear();
333   if (strSource.empty())
334     return true;
335 
336   iconv_t conv = iconv_open(targetCharset.c_str(), sourceCharset.c_str());
337   if (conv == NO_ICONV)
338   {
339     CLog::Log(LOGERROR, "%s: iconv_open() for \"%s\" -> \"%s\" failed, errno = %d (%s)",
340               __FUNCTION__, sourceCharset.c_str(), targetCharset.c_str(), errno, strerror(errno));
341     return false;
342   }
343   const int dstMultp = (targetCharset.compare(0, 5, "UTF-8") == 0) ? CCharsetConverter::m_Utf8CharMaxSize : 1;
344   const bool result = convert(conv, dstMultp, strSource, strDest, failOnInvalidChar);
345   iconv_close(conv);
346 
347   return result;
348 }
349 
/* Depending on platform and version, iconv() declares its inbuf parameter either as char** or as
   const char**. This wrapper converts implicitly to both, so the same call compiles everywhere. */
struct charPtrPtrAdapter
{
  const char** pointer;

  explicit charPtrPtrAdapter(const char** ptr) : pointer(ptr)
  {
  }

  // for iconv() declarations that take a non-const inbuf
  operator char**()
  {
    return const_cast<char**>(pointer);
  }

  // for iconv() declarations that take a const inbuf
  operator const char**()
  {
    return pointer;
  }
};
362 
363 template<class INPUT,class OUTPUT>
convert(iconv_t type,int multiplier,const INPUT & strSource,OUTPUT & strDest,bool failOnInvalidChar)364 bool CCharsetConverter::CInnerConverter::convert(iconv_t type, int multiplier, const INPUT& strSource, OUTPUT& strDest, bool failOnInvalidChar /*= false*/)
365 {
366   if (type == NO_ICONV)
367     return false;
368 
369   //input buffer for iconv() is the buffer from strSource
370   size_t      inBufSize  = (strSource.length() + 1) * sizeof(typename INPUT::value_type);
371   const char* inBuf      = (const char*)strSource.c_str();
372 
373   //allocate output buffer for iconv()
374   size_t      outBufSize = (strSource.length() + 1) * sizeof(typename OUTPUT::value_type) * multiplier;
375   char*       outBuf     = (char*)malloc(outBufSize);
376   if (outBuf == NULL)
377   {
378     CLog::Log(LOGFATAL, "%s: malloc failed", __FUNCTION__);
379     return false;
380   }
381 
382   size_t      inBytesAvail  = inBufSize;  //how many bytes iconv() can read
383   size_t      outBytesAvail = outBufSize; //how many bytes iconv() can write
384   const char* inBufStart    = inBuf;      //where in our input buffer iconv() should start reading
385   char*       outBufStart   = outBuf;     //where in out output buffer iconv() should start writing
386 
387   size_t returnV;
388   while(true)
389   {
390     //iconv() will update inBufStart, inBytesAvail, outBufStart and outBytesAvail
391     returnV = iconv(type, charPtrPtrAdapter(&inBufStart), &inBytesAvail, &outBufStart, &outBytesAvail);
392 
393     if (returnV == (size_t)-1)
394     {
395       if (errno == E2BIG) //output buffer is not big enough
396       {
397         //save where iconv() ended converting, realloc might make outBufStart invalid
398         size_t bytesConverted = outBufSize - outBytesAvail;
399 
400         //make buffer twice as big
401         outBufSize   *= 2;
402         char* newBuf  = (char*)realloc(outBuf, outBufSize);
403         if (!newBuf)
404         {
405           CLog::Log(LOGFATAL, "%s realloc failed with errno=%d(%s)", __FUNCTION__, errno,
406                     strerror(errno));
407           break;
408         }
409         outBuf = newBuf;
410 
411         //update the buffer pointer and counter
412         outBufStart   = outBuf + bytesConverted;
413         outBytesAvail = outBufSize - bytesConverted;
414 
415         //continue in the loop and convert the rest
416         continue;
417       }
418       else if (errno == EILSEQ) //An invalid multibyte sequence has been encountered in the input
419       {
420         if (failOnInvalidChar)
421           break;
422 
423         //skip invalid byte
424         inBufStart++;
425         inBytesAvail--;
426         //continue in the loop and convert the rest
427         continue;
428       }
429       else if (errno == EINVAL) /* Invalid sequence at the end of input buffer */
430       {
431         if (!failOnInvalidChar)
432           returnV = 0; /* reset error status to use converted part */
433 
434         break;
435       }
436       else //iconv() had some other error
437       {
438         CLog::Log(LOGERROR, "%s: iconv() failed, errno=%d (%s)",
439                   __FUNCTION__, errno, strerror(errno));
440       }
441     }
442     break;
443   }
444 
445   //complete the conversion (reset buffers), otherwise the current data will prefix the data on the next call
446   if (iconv(type, NULL, NULL, &outBufStart, &outBytesAvail) == (size_t)-1)
447     CLog::Log(LOGERROR, "%s failed cleanup errno=%d(%s)", __FUNCTION__, errno, strerror(errno));
448 
449   if (returnV == (size_t)-1)
450   {
451     free(outBuf);
452     return false;
453   }
454   //we're done
455 
456   const typename OUTPUT::size_type sizeInChars = (typename OUTPUT::size_type) (outBufSize - outBytesAvail) / sizeof(typename OUTPUT::value_type);
457   typename OUTPUT::const_pointer strPtr = (typename OUTPUT::const_pointer) outBuf;
458   /* Make sure that all buffer is assigned and string is stopped at end of buffer */
459   if (strPtr[sizeInChars-1] == 0 && strSource[strSource.length()-1] != 0)
460     strDest.assign(strPtr, sizeInChars-1);
461   else
462     strDest.assign(strPtr, sizeInChars);
463 
464   free(outBuf);
465 
466   return true;
467 }
468 
logicalToVisualBiDi(const std::u32string & stringSrc,std::u32string & stringDst,FriBidiCharType base,const bool failOnBadString,int * visualToLogicalMap)469 bool CCharsetConverter::CInnerConverter::logicalToVisualBiDi(
470     const std::u32string& stringSrc,
471     std::u32string& stringDst,
472     FriBidiCharType base /*= FRIBIDI_TYPE_LTR*/,
473     const bool failOnBadString /*= false*/,
474     int* visualToLogicalMap /*= nullptr*/)
475 {
476   stringDst.clear();
477 
478   const size_t srcLen = stringSrc.length();
479   if (srcLen == 0)
480     return true;
481 
482   stringDst.reserve(srcLen);
483   size_t lineStart = 0;
484 
485   // libfribidi is not threadsafe, so make sure we make it so
486   CSingleLock lock(m_critSectionFriBiDi);
487   do
488   {
489     size_t lineEnd = stringSrc.find('\n', lineStart);
490     if (lineEnd >= srcLen) // equal to 'lineEnd == std::string::npos'
491       lineEnd = srcLen;
492     else
493       lineEnd++; // include '\n'
494 
495     const size_t lineLen = lineEnd - lineStart;
496 
497     FriBidiChar* visual = (FriBidiChar*) malloc((lineLen + 1) * sizeof(FriBidiChar));
498     if (visual == NULL)
499     {
500       free(visual);
501       CLog::Log(LOGFATAL, "%s: can't allocate memory", __FUNCTION__);
502       return false;
503     }
504 
505     bool bidiFailed = false;
506     FriBidiCharType baseCopy = base; // preserve same value for all lines, required because fribidi_log2vis will modify parameter value
507     if (fribidi_log2vis(reinterpret_cast<const FriBidiChar*>(stringSrc.c_str() + lineStart),
508                         lineLen, &baseCopy, visual, nullptr,
509                         !visualToLogicalMap ? nullptr : visualToLogicalMap + lineStart, nullptr))
510     {
511       // Removes bidirectional marks
512       const int newLen = fribidi_remove_bidi_marks(
513           visual, lineLen, nullptr, !visualToLogicalMap ? nullptr : visualToLogicalMap + lineStart,
514           nullptr);
515       if (newLen > 0)
516         stringDst.append((const char32_t*)visual, (size_t)newLen);
517       else if (newLen < 0)
518         bidiFailed = failOnBadString;
519     }
520     else
521       bidiFailed = failOnBadString;
522 
523     free(visual);
524 
525     if (bidiFailed)
526       return false;
527 
528     lineStart = lineEnd;
529   } while (lineStart < srcLen);
530 
531   return !stringDst.empty();
532 }
533 
// Charsets offered for selection: iconv charset name and the caption shown for it.
// The list is terminated by an entry whose members are both null.
static struct SCharsetMapping
{
  const char* charset;
  const char* caption;
} g_charsets[] = {
  { "ISO-8859-1", "Western Europe (ISO)" },
  { "ISO-8859-2", "Central Europe (ISO)" },
  { "ISO-8859-3", "South Europe (ISO)" },
  { "ISO-8859-4", "Baltic (ISO)" },
  { "ISO-8859-5", "Cyrillic (ISO)" },
  { "ISO-8859-6", "Arabic (ISO)" },
  { "ISO-8859-7", "Greek (ISO)" },
  { "ISO-8859-8", "Hebrew (ISO)" },
  { "ISO-8859-9", "Turkish (ISO)" },
  { "CP1250", "Central Europe (Windows)" },
  { "CP1251", "Cyrillic (Windows)" },
  { "CP1252", "Western Europe (Windows)" },
  { "CP1253", "Greek (Windows)" },
  { "CP1254", "Turkish (Windows)" },
  { "CP1255", "Hebrew (Windows)" },
  { "CP1256", "Arabic (Windows)" },
  { "CP1257", "Baltic (Windows)" },
  { "CP1258", "Vietnamese (Windows)" },
  { "CP874", "Thai (Windows)" },
  { "BIG5", "Chinese Traditional (Big5)" },
  { "GBK", "Chinese Simplified (GBK)" },
  { "SHIFT_JIS", "Japanese (Shift-JIS)" },
  { "CP949", "Korean" },
  { "BIG5-HKSCS", "Hong Kong (Big5-HKSCS)" },
  { nullptr, nullptr }
};
565 
// Compiler-generated default constructor
CCharsetConverter::CCharsetConverter() = default;
567 
OnSettingChanged(const std::shared_ptr<const CSetting> & setting)568 void CCharsetConverter::OnSettingChanged(const std::shared_ptr<const CSetting>& setting)
569 {
570   if (setting == NULL)
571     return;
572 
573   const std::string& settingId = setting->GetId();
574   if (settingId == CSettings::SETTING_LOCALE_CHARSET)
575     resetUserCharset();
576   else if (settingId == CSettings::SETTING_SUBTITLES_CHARSET)
577     resetSubtitleCharset();
578 }
579 
// No-op: the body is empty
void CCharsetConverter::clear()
{
}
583 
getCharsetLabels()584 std::vector<std::string> CCharsetConverter::getCharsetLabels()
585 {
586   std::vector<std::string> lab;
587   for(SCharsetMapping* c = g_charsets; c->charset; c++)
588     lab.emplace_back(c->caption);
589 
590   return lab;
591 }
592 
getCharsetLabelByName(const std::string & charsetName)593 std::string CCharsetConverter::getCharsetLabelByName(const std::string& charsetName)
594 {
595   for(SCharsetMapping* c = g_charsets; c->charset; c++)
596   {
597     if (StringUtils::EqualsNoCase(charsetName,c->charset))
598       return c->caption;
599   }
600 
601   return "";
602 }
603 
getCharsetNameByLabel(const std::string & charsetLabel)604 std::string CCharsetConverter::getCharsetNameByLabel(const std::string& charsetLabel)
605 {
606   for(SCharsetMapping* c = g_charsets; c->charset; c++)
607   {
608     if (StringUtils::EqualsNoCase(charsetLabel, c->caption))
609       return c->charset;
610   }
611 
612   return "";
613 }
614 
reset(void)615 void CCharsetConverter::reset(void)
616 {
617   for (CConverterType& conversion : CInnerConverter::m_stdConversion)
618     conversion.Reset();
619 }
620 
resetSystemCharset(void)621 void CCharsetConverter::resetSystemCharset(void)
622 {
623   CInnerConverter::m_stdConversion[Utf8ToSystem].Reset();
624   CInnerConverter::m_stdConversion[SystemToUtf8].Reset();
625 }
626 
resetUserCharset(void)627 void CCharsetConverter::resetUserCharset(void)
628 {
629   CInnerConverter::m_stdConversion[UserCharsetToUtf8].Reset();
630   CInnerConverter::m_stdConversion[UserCharsetToUtf8].Reset();
631   CInnerConverter::m_stdConversion[Utf32ToUserCharset].Reset();
632   resetSubtitleCharset();
633 }
634 
resetSubtitleCharset(void)635 void CCharsetConverter::resetSubtitleCharset(void)
636 {
637   CInnerConverter::m_stdConversion[SubtitleCharsetToUtf8].Reset();
638 }
639 
// Drops the converters that depend on the user charset and the subtitle charset
void CCharsetConverter::reinitCharsetsFromSettings(void)
{
  resetUserCharset(); // this will also reinit Subtitle charsets
}
644 
utf8ToUtf32(const std::string & utf8StringSrc,std::u32string & utf32StringDst,bool failOnBadChar)645 bool CCharsetConverter::utf8ToUtf32(const std::string& utf8StringSrc, std::u32string& utf32StringDst, bool failOnBadChar /*= true*/)
646 {
647   return CInnerConverter::stdConvert(Utf8ToUtf32, utf8StringSrc, utf32StringDst, failOnBadChar);
648 }
649 
utf8ToUtf32(const std::string & utf8StringSrc,bool failOnBadChar)650 std::u32string CCharsetConverter::utf8ToUtf32(const std::string& utf8StringSrc, bool failOnBadChar /*= true*/)
651 {
652   std::u32string converted;
653   utf8ToUtf32(utf8StringSrc, converted, failOnBadChar);
654   return converted;
655 }
656 
utf8ToUtf32Visual(const std::string & utf8StringSrc,std::u32string & utf32StringDst,bool bVisualBiDiFlip,bool forceLTRReadingOrder,bool failOnBadChar)657 bool CCharsetConverter::utf8ToUtf32Visual(const std::string& utf8StringSrc, std::u32string& utf32StringDst, bool bVisualBiDiFlip /*= false*/, bool forceLTRReadingOrder /*= false*/, bool failOnBadChar /*= false*/)
658 {
659   if (bVisualBiDiFlip)
660   {
661     std::u32string converted;
662     if (!CInnerConverter::stdConvert(Utf8ToUtf32, utf8StringSrc, converted, failOnBadChar))
663       return false;
664 
665     return CInnerConverter::logicalToVisualBiDi(converted, utf32StringDst, forceLTRReadingOrder ? FRIBIDI_TYPE_LTR : FRIBIDI_TYPE_PDF, failOnBadChar);
666   }
667   return CInnerConverter::stdConvert(Utf8ToUtf32, utf8StringSrc, utf32StringDst, failOnBadChar);
668 }
669 
utf32ToUtf8(const std::u32string & utf32StringSrc,std::string & utf8StringDst,bool failOnBadChar)670 bool CCharsetConverter::utf32ToUtf8(const std::u32string& utf32StringSrc, std::string& utf8StringDst, bool failOnBadChar /*= true*/)
671 {
672   return CInnerConverter::stdConvert(Utf32ToUtf8, utf32StringSrc, utf8StringDst, failOnBadChar);
673 }
674 
utf32ToUtf8(const std::u32string & utf32StringSrc,bool failOnBadChar)675 std::string CCharsetConverter::utf32ToUtf8(const std::u32string& utf32StringSrc, bool failOnBadChar /*= false*/)
676 {
677   std::string converted;
678   utf32ToUtf8(utf32StringSrc, converted, failOnBadChar);
679   return converted;
680 }
681 
utf32ToW(const std::u32string & utf32StringSrc,std::wstring & wStringDst,bool failOnBadChar)682 bool CCharsetConverter::utf32ToW(const std::u32string& utf32StringSrc, std::wstring& wStringDst, bool failOnBadChar /*= true*/)
683 {
684 #ifdef WCHAR_IS_UCS_4
685   wStringDst.assign((const wchar_t*)utf32StringSrc.c_str(), utf32StringSrc.length());
686   return true;
687 #else // !WCHAR_IS_UCS_4
688   return CInnerConverter::stdConvert(Utf32ToW, utf32StringSrc, wStringDst, failOnBadChar);
689 #endif // !WCHAR_IS_UCS_4
690 }
691 
utf32logicalToVisualBiDi(const std::u32string & logicalStringSrc,std::u32string & visualStringDst,bool forceLTRReadingOrder,bool failOnBadString,int * visualToLogicalMap)692 bool CCharsetConverter::utf32logicalToVisualBiDi(const std::u32string& logicalStringSrc,
693                                                  std::u32string& visualStringDst,
694                                                  bool forceLTRReadingOrder /*= false*/,
695                                                  bool failOnBadString /*= false*/,
696                                                  int* visualToLogicalMap /*= nullptr*/)
697 {
698   return CInnerConverter::logicalToVisualBiDi(
699       logicalStringSrc, visualStringDst, forceLTRReadingOrder ? FRIBIDI_TYPE_LTR : FRIBIDI_TYPE_PDF,
700       failOnBadString, visualToLogicalMap);
701 }
702 
wToUtf32(const std::wstring & wStringSrc,std::u32string & utf32StringDst,bool failOnBadChar)703 bool CCharsetConverter::wToUtf32(const std::wstring& wStringSrc, std::u32string& utf32StringDst, bool failOnBadChar /*= true*/)
704 {
705 #ifdef WCHAR_IS_UCS_4
706   /* UCS-4 is almost equal to UTF-32, but UTF-32 has strict limits on possible values, while UCS-4 is usually unchecked.
707    * With this "conversion" we ensure that output will be valid UTF-32 string. */
708 #endif
709   return CInnerConverter::stdConvert(WToUtf32, wStringSrc, utf32StringDst, failOnBadChar);
710 }
711 
712 // The bVisualBiDiFlip forces a flip of characters for hebrew/arabic languages, only set to false if the flipping
713 // of the string is already made or the string is not displayed in the GUI
utf8ToW(const std::string & utf8StringSrc,std::wstring & wStringDst,bool bVisualBiDiFlip,bool forceLTRReadingOrder,bool failOnBadChar)714 bool CCharsetConverter::utf8ToW(const std::string& utf8StringSrc, std::wstring& wStringDst, bool bVisualBiDiFlip /*= true*/,
715                                 bool forceLTRReadingOrder /*= false*/, bool failOnBadChar /*= false*/)
716 {
717   // Try to flip hebrew/arabic characters, if any
718   if (bVisualBiDiFlip)
719   {
720     wStringDst.clear();
721     std::u32string utf32str;
722     if (!CInnerConverter::stdConvert(Utf8ToUtf32, utf8StringSrc, utf32str, failOnBadChar))
723       return false;
724 
725     std::u32string utf32flipped;
726     const bool bidiResult = CInnerConverter::logicalToVisualBiDi(utf32str, utf32flipped, forceLTRReadingOrder ? FRIBIDI_TYPE_LTR : FRIBIDI_TYPE_PDF, failOnBadChar);
727 
728     return CInnerConverter::stdConvert(Utf32ToW, utf32flipped, wStringDst, failOnBadChar) && bidiResult;
729   }
730 
731   return CInnerConverter::stdConvert(Utf8toW, utf8StringSrc, wStringDst, failOnBadChar);
732 }
733 
subtitleCharsetToUtf8(const std::string & stringSrc,std::string & utf8StringDst)734 bool CCharsetConverter::subtitleCharsetToUtf8(const std::string& stringSrc, std::string& utf8StringDst)
735 {
736   return CInnerConverter::stdConvert(SubtitleCharsetToUtf8, stringSrc, utf8StringDst, false);
737 }
738 
fromW(const std::wstring & wStringSrc,std::string & stringDst,const std::string & enc)739 bool CCharsetConverter::fromW(const std::wstring& wStringSrc,
740                               std::string& stringDst, const std::string& enc)
741 {
742   return CInnerConverter::customConvert(WCHAR_CHARSET, enc, wStringSrc, stringDst);
743 }
744 
toW(const std::string & stringSrc,std::wstring & wStringDst,const std::string & enc)745 bool CCharsetConverter::toW(const std::string& stringSrc,
746                             std::wstring& wStringDst, const std::string& enc)
747 {
748   return CInnerConverter::customConvert(enc, WCHAR_CHARSET, stringSrc, wStringDst);
749 }
750 
utf8ToStringCharset(const std::string & utf8StringSrc,std::string & stringDst)751 bool CCharsetConverter::utf8ToStringCharset(const std::string& utf8StringSrc, std::string& stringDst)
752 {
753   return CInnerConverter::stdConvert(Utf8ToUserCharset, utf8StringSrc, stringDst);
754 }
755 
utf8ToStringCharset(std::string & stringSrcDst)756 bool CCharsetConverter::utf8ToStringCharset(std::string& stringSrcDst)
757 {
758   std::string strSrc(stringSrcDst);
759   return utf8ToStringCharset(strSrc, stringSrcDst);
760 }
761 
ToUtf8(const std::string & strSourceCharset,const std::string & stringSrc,std::string & utf8StringDst,bool failOnBadChar)762 bool CCharsetConverter::ToUtf8(const std::string& strSourceCharset, const std::string& stringSrc, std::string& utf8StringDst, bool failOnBadChar /*= false*/)
763 {
764   if (strSourceCharset == "UTF-8")
765   { // simple case - no conversion necessary
766     utf8StringDst = stringSrc;
767     return true;
768   }
769 
770   return CInnerConverter::customConvert(strSourceCharset, "UTF-8", stringSrc, utf8StringDst, failOnBadChar);
771 }
772 
utf8To(const std::string & strDestCharset,const std::string & utf8StringSrc,std::string & stringDst)773 bool CCharsetConverter::utf8To(const std::string& strDestCharset, const std::string& utf8StringSrc, std::string& stringDst)
774 {
775   if (strDestCharset == "UTF-8")
776   { // simple case - no conversion necessary
777     stringDst = utf8StringSrc;
778     return true;
779   }
780 
781   return CInnerConverter::customConvert(UTF8_SOURCE, strDestCharset, utf8StringSrc, stringDst);
782 }
783 
utf8To(const std::string & strDestCharset,const std::string & utf8StringSrc,std::u16string & utf16StringDst)784 bool CCharsetConverter::utf8To(const std::string& strDestCharset, const std::string& utf8StringSrc, std::u16string& utf16StringDst)
785 {
786   return CInnerConverter::customConvert(UTF8_SOURCE, strDestCharset, utf8StringSrc, utf16StringDst);
787 }
788 
utf8To(const std::string & strDestCharset,const std::string & utf8StringSrc,std::u32string & utf32StringDst)789 bool CCharsetConverter::utf8To(const std::string& strDestCharset, const std::string& utf8StringSrc, std::u32string& utf32StringDst)
790 {
791   return CInnerConverter::customConvert(UTF8_SOURCE, strDestCharset, utf8StringSrc, utf32StringDst);
792 }
793 
unknownToUTF8(std::string & stringSrcDst)794 bool CCharsetConverter::unknownToUTF8(std::string& stringSrcDst)
795 {
796   std::string source(stringSrcDst);
797   return unknownToUTF8(source, stringSrcDst);
798 }
799 
unknownToUTF8(const std::string & stringSrc,std::string & utf8StringDst,bool failOnBadChar)800 bool CCharsetConverter::unknownToUTF8(const std::string& stringSrc, std::string& utf8StringDst, bool failOnBadChar /*= false*/)
801 {
802   // checks whether it's utf8 already, and if not converts using the sourceCharset if given, else the string charset
803   if (CUtf8Utils::isValidUtf8(stringSrc))
804   {
805     utf8StringDst = stringSrc;
806     return true;
807   }
808   return CInnerConverter::stdConvert(UserCharsetToUtf8, stringSrc, utf8StringDst, failOnBadChar);
809 }
810 
wToUTF8(const std::wstring & wStringSrc,std::string & utf8StringDst,bool failOnBadChar)811 bool CCharsetConverter::wToUTF8(const std::wstring& wStringSrc, std::string& utf8StringDst, bool failOnBadChar /*= false*/)
812 {
813   return CInnerConverter::stdConvert(WtoUtf8, wStringSrc, utf8StringDst, failOnBadChar);
814 }
815 
utf16BEtoUTF8(const std::u16string & utf16StringSrc,std::string & utf8StringDst)816 bool CCharsetConverter::utf16BEtoUTF8(const std::u16string& utf16StringSrc, std::string& utf8StringDst)
817 {
818   return CInnerConverter::stdConvert(Utf16BEtoUtf8, utf16StringSrc, utf8StringDst);
819 }
820 
utf16LEtoUTF8(const std::u16string & utf16StringSrc,std::string & utf8StringDst)821 bool CCharsetConverter::utf16LEtoUTF8(const std::u16string& utf16StringSrc,
822                                       std::string& utf8StringDst)
823 {
824   return CInnerConverter::stdConvert(Utf16LEtoUtf8, utf16StringSrc, utf8StringDst);
825 }
826 
ucs2ToUTF8(const std::u16string & ucs2StringSrc,std::string & utf8StringDst)827 bool CCharsetConverter::ucs2ToUTF8(const std::u16string& ucs2StringSrc, std::string& utf8StringDst)
828 {
829   return CInnerConverter::stdConvert(Ucs2CharsetToUtf8, ucs2StringSrc,utf8StringDst);
830 }
831 
utf16LEtoW(const std::u16string & utf16String,std::wstring & wString)832 bool CCharsetConverter::utf16LEtoW(const std::u16string& utf16String, std::wstring& wString)
833 {
834   return CInnerConverter::stdConvert(Utf16LEtoW, utf16String, wString);
835 }
836 
utf32ToStringCharset(const std::u32string & utf32StringSrc,std::string & stringDst)837 bool CCharsetConverter::utf32ToStringCharset(const std::u32string& utf32StringSrc, std::string& stringDst)
838 {
839   return CInnerConverter::stdConvert(Utf32ToUserCharset, utf32StringSrc, stringDst);
840 }
841 
utf8ToSystem(std::string & stringSrcDst,bool failOnBadChar)842 bool CCharsetConverter::utf8ToSystem(std::string& stringSrcDst, bool failOnBadChar /*= false*/)
843 {
844   std::string strSrc(stringSrcDst);
845   return CInnerConverter::stdConvert(Utf8ToSystem, strSrc, stringSrcDst, failOnBadChar);
846 }
847 
systemToUtf8(const std::string & sysStringSrc,std::string & utf8StringDst,bool failOnBadChar)848 bool CCharsetConverter::systemToUtf8(const std::string& sysStringSrc, std::string& utf8StringDst, bool failOnBadChar /*= false*/)
849 {
850   return CInnerConverter::stdConvert(SystemToUtf8, sysStringSrc, utf8StringDst, failOnBadChar);
851 }
852 
utf8logicalToVisualBiDi(const std::string & utf8StringSrc,std::string & utf8StringDst,bool failOnBadString)853 bool CCharsetConverter::utf8logicalToVisualBiDi(const std::string& utf8StringSrc, std::string& utf8StringDst, bool failOnBadString /*= false*/)
854 {
855   utf8StringDst.clear();
856   std::u32string utf32flipped;
857   if (!utf8ToUtf32Visual(utf8StringSrc, utf32flipped, true, true, failOnBadString))
858     return false;
859 
860   return CInnerConverter::stdConvert(Utf32ToUtf8, utf32flipped, utf8StringDst, failOnBadString);
861 }
862 
SettingOptionsCharsetsFiller(const SettingConstPtr & setting,std::vector<StringSettingOption> & list,std::string & current,void * data)863 void CCharsetConverter::SettingOptionsCharsetsFiller(const SettingConstPtr& setting,
864                                                      std::vector<StringSettingOption>& list,
865                                                      std::string& current,
866                                                      void* data)
867 {
868   std::vector<std::string> vecCharsets = g_charsetConverter.getCharsetLabels();
869   sort(vecCharsets.begin(), vecCharsets.end(), sortstringbyname());
870 
871   list.emplace_back(g_localizeStrings.Get(13278), "DEFAULT"); // "Default"
872   for (int i = 0; i < (int) vecCharsets.size(); ++i)
873     list.emplace_back(vecCharsets[i], g_charsetConverter.getCharsetNameByLabel(vecCharsets[i]));
874 }
875