1 /*
2 * Copyright (C) 2005-2018 Team Kodi
3 * This file is part of Kodi - https://kodi.tv
4 *
5 * SPDX-License-Identifier: GPL-2.0-or-later
6 * See LICENSES/README.md for more information.
7 */
8
9 #include "CharsetConverter.h"
10
11 #include "LangInfo.h"
12 #include "guilib/LocalizeStrings.h"
13 #include "log.h"
14 #include "settings/Settings.h"
15 #include "settings/lib/Setting.h"
16 #include "settings/lib/SettingDefinitions.h"
17 #include "utils/StringUtils.h"
18 #include "utils/Utf8Utils.h"
19
20 #include <algorithm>
21
22 #include <fribidi.h>
23 #include <iconv.h>
24
// Byte-order suffix appended to the UTF-16/UTF-32 charset names defined below
#ifdef WORDS_BIGENDIAN
#define ENDIAN_SUFFIX "BE"
#else
#define ENDIAN_SUFFIX "LE"
#endif

// Charset names handed to iconv, selected per target platform:
//   UTF16_CHARSET / UTF32_CHARSET - UTF-16 / UTF-32 with the byte-order suffix from above
//   UTF8_SOURCE                   - UTF-8 charset name ("UTF-8-MAC" on Darwin, "UTF-8" elsewhere)
//   WCHAR_CHARSET                 - charset name used for wchar_t strings
// WCHAR_IS_UCS_4 / WCHAR_IS_UTF16 / WCHAR_IS_UCS_2 flag the wchar_t representation
// (only WCHAR_IS_UCS_4 is tested within this part of the file).
#if defined(TARGET_DARWIN)
#define WCHAR_IS_UCS_4 1
#define UTF16_CHARSET "UTF-16" ENDIAN_SUFFIX
#define UTF32_CHARSET "UTF-32" ENDIAN_SUFFIX
#define UTF8_SOURCE "UTF-8-MAC"
#define WCHAR_CHARSET UTF32_CHARSET
#elif defined(TARGET_WINDOWS)
#define WCHAR_IS_UTF16 1
#define UTF16_CHARSET "UTF-16" ENDIAN_SUFFIX
#define UTF32_CHARSET "UTF-32" ENDIAN_SUFFIX
#define UTF8_SOURCE "UTF-8"
#define WCHAR_CHARSET UTF16_CHARSET
#elif (defined(TARGET_FREEBSD)||defined(TARGET_DRAGONFLY))
#define WCHAR_IS_UCS_4 1
#define UTF16_CHARSET "UTF-16" ENDIAN_SUFFIX
#define UTF32_CHARSET "UTF-32" ENDIAN_SUFFIX
#define UTF8_SOURCE "UTF-8"
#define WCHAR_CHARSET UTF32_CHARSET
#elif defined(TARGET_ANDROID)
#define WCHAR_IS_UCS_4 1
#define UTF16_CHARSET "UTF-16" ENDIAN_SUFFIX
#define UTF32_CHARSET "UTF-32" ENDIAN_SUFFIX
#define UTF8_SOURCE "UTF-8"
#define WCHAR_CHARSET UTF32_CHARSET
#else
// Generic platform: let iconv resolve "WCHAR_T" itself and derive the wchar_t
// flags from SIZEOF_WCHAR_T when __STDC_ISO_10646__ is set.
#define UTF16_CHARSET "UTF-16" ENDIAN_SUFFIX
#define UTF32_CHARSET "UTF-32" ENDIAN_SUFFIX
#define UTF8_SOURCE "UTF-8"
#define WCHAR_CHARSET "WCHAR_T"
#if __STDC_ISO_10646__
#ifdef SIZEOF_WCHAR_T
#if SIZEOF_WCHAR_T == 4
#define WCHAR_IS_UCS_4 1
#elif SIZEOF_WCHAR_T == 2
#define WCHAR_IS_UCS_2 1
#endif
#endif
#endif
#endif

// Marker for "no usable iconv descriptor": compared against the result of iconv_open()
// and stored in CConverterType::m_iconv while no descriptor is open.
#define NO_ICONV ((iconv_t)-1)
72
/* Charsets whose name is not fixed at construction time but looked up when the
   converter is opened (see CConverterType::ResolveSpecialCharset). */
enum SpecialCharset
{
  NotSpecialCharset = 0, // charset is given explicitly by name
  SystemCharset, // resolved to an empty charset name
  UserCharset /* locale.charset */, // resolved via g_langInfo.GetGuiCharSet()
  SubtitleCharset /* subtitles.charset */, // resolved via g_langInfo.GetSubtitleCharSet()
};
80
/* Describes one conversion direction (source charset -> target charset) together with
   its iconv descriptor. The class derives from CCriticalSection, so every instance
   doubles as the lock that protects its own descriptor. */
class CConverterType : public CCriticalSection
{
public:
  /* Each side of the conversion is given either by charset name or as a SpecialCharset
     that is resolved when the descriptor is opened. targetSingleCharMaxLen is stored and
     handed back by GetTargetSingleCharMaxLen(). */
  CConverterType(const std::string& sourceCharset, const std::string& targetCharset, unsigned int targetSingleCharMaxLen = 1);
  CConverterType(enum SpecialCharset sourceSpecialCharset, const std::string& targetCharset, unsigned int targetSingleCharMaxLen = 1);
  CConverterType(const std::string& sourceCharset, enum SpecialCharset targetSpecialCharset, unsigned int targetSingleCharMaxLen = 1);
  CConverterType(enum SpecialCharset sourceSpecialCharset, enum SpecialCharset targetSpecialCharset, unsigned int targetSingleCharMaxLen = 1);
  /* Copies the charset description only; the copy starts without an iconv descriptor. */
  CConverterType(const CConverterType& other);
  /* Closes the iconv descriptor, if one is open. */
  ~CConverterType();

  /* Returns the iconv descriptor, opening it on first use. converterLock must be a lock
     on this very instance, otherwise NO_ICONV is returned. */
  iconv_t GetConverter(CSingleLock& converterLock);

  /* Closes the descriptor and forgets charset names that came from a SpecialCharset. */
  void Reset(void);
  /* Switches the instance to a new pair of explicitly named charsets. */
  void ReinitTo(const std::string& sourceCharset, const std::string& targetCharset, unsigned int targetSingleCharMaxLen = 1);
  std::string GetSourceCharset(void) const { return m_sourceCharset; }
  std::string GetTargetCharset(void) const { return m_targetCharset; }
  unsigned int GetTargetSingleCharMaxLen(void) const { return m_targetSingleCharMaxLen; }

private:
  /* Maps a SpecialCharset to the charset name to pass to iconv_open(). */
  static std::string ResolveSpecialCharset(enum SpecialCharset charset);

  enum SpecialCharset m_sourceSpecialCharset; // NotSpecialCharset when the source is given by name
  std::string m_sourceCharset; // source charset name
  enum SpecialCharset m_targetSpecialCharset; // NotSpecialCharset when the target is given by name
  std::string m_targetCharset; // target charset name
  iconv_t m_iconv; // open descriptor or NO_ICONV
  unsigned int m_targetSingleCharMaxLen; // used by callers as output buffer size multiplier
};
109
CConverterType(const std::string & sourceCharset,const std::string & targetCharset,unsigned int targetSingleCharMaxLen)110 CConverterType::CConverterType(const std::string& sourceCharset, const std::string& targetCharset, unsigned int targetSingleCharMaxLen /*= 1*/) : CCriticalSection(),
111 m_sourceSpecialCharset(NotSpecialCharset),
112 m_sourceCharset(sourceCharset),
113 m_targetSpecialCharset(NotSpecialCharset),
114 m_targetCharset(targetCharset),
115 m_iconv(NO_ICONV),
116 m_targetSingleCharMaxLen(targetSingleCharMaxLen)
117 {
118 }
119
CConverterType(enum SpecialCharset sourceSpecialCharset,const std::string & targetCharset,unsigned int targetSingleCharMaxLen)120 CConverterType::CConverterType(enum SpecialCharset sourceSpecialCharset, const std::string& targetCharset, unsigned int targetSingleCharMaxLen /*= 1*/) : CCriticalSection(),
121 m_sourceSpecialCharset(sourceSpecialCharset),
122 m_sourceCharset(),
123 m_targetSpecialCharset(NotSpecialCharset),
124 m_targetCharset(targetCharset),
125 m_iconv(NO_ICONV),
126 m_targetSingleCharMaxLen(targetSingleCharMaxLen)
127 {
128 }
129
CConverterType(const std::string & sourceCharset,enum SpecialCharset targetSpecialCharset,unsigned int targetSingleCharMaxLen)130 CConverterType::CConverterType(const std::string& sourceCharset, enum SpecialCharset targetSpecialCharset, unsigned int targetSingleCharMaxLen /*= 1*/) : CCriticalSection(),
131 m_sourceSpecialCharset(NotSpecialCharset),
132 m_sourceCharset(sourceCharset),
133 m_targetSpecialCharset(targetSpecialCharset),
134 m_targetCharset(),
135 m_iconv(NO_ICONV),
136 m_targetSingleCharMaxLen(targetSingleCharMaxLen)
137 {
138 }
139
CConverterType(enum SpecialCharset sourceSpecialCharset,enum SpecialCharset targetSpecialCharset,unsigned int targetSingleCharMaxLen)140 CConverterType::CConverterType(enum SpecialCharset sourceSpecialCharset, enum SpecialCharset targetSpecialCharset, unsigned int targetSingleCharMaxLen /*= 1*/) : CCriticalSection(),
141 m_sourceSpecialCharset(sourceSpecialCharset),
142 m_sourceCharset(),
143 m_targetSpecialCharset(targetSpecialCharset),
144 m_targetCharset(),
145 m_iconv(NO_ICONV),
146 m_targetSingleCharMaxLen(targetSingleCharMaxLen)
147 {
148 }
149
CConverterType(const CConverterType & other)150 CConverterType::CConverterType(const CConverterType& other) : CCriticalSection(),
151 m_sourceSpecialCharset(other.m_sourceSpecialCharset),
152 m_sourceCharset(other.m_sourceCharset),
153 m_targetSpecialCharset(other.m_targetSpecialCharset),
154 m_targetCharset(other.m_targetCharset),
155 m_iconv(NO_ICONV),
156 m_targetSingleCharMaxLen(other.m_targetSingleCharMaxLen)
157 {
158 }
159
~CConverterType()160 CConverterType::~CConverterType()
161 {
162 CSingleLock lock(*this);
163 if (m_iconv != NO_ICONV)
164 iconv_close(m_iconv);
165 lock.Leave(); // ensure unlocking before final destruction
166 }
167
GetConverter(CSingleLock & converterLock)168 iconv_t CConverterType::GetConverter(CSingleLock& converterLock)
169 {
170 // ensure that this unique instance is locked externally
171 if (&converterLock.get_underlying() != this)
172 return NO_ICONV;
173
174 if (m_iconv == NO_ICONV)
175 {
176 if (m_sourceSpecialCharset)
177 m_sourceCharset = ResolveSpecialCharset(m_sourceSpecialCharset);
178 if (m_targetSpecialCharset)
179 m_targetCharset = ResolveSpecialCharset(m_targetSpecialCharset);
180
181 m_iconv = iconv_open(m_targetCharset.c_str(), m_sourceCharset.c_str());
182
183 if (m_iconv == NO_ICONV)
184 CLog::Log(LOGERROR, "%s: iconv_open() for \"%s\" -> \"%s\" failed, errno = %d (%s)",
185 __FUNCTION__, m_sourceCharset.c_str(), m_targetCharset.c_str(), errno, strerror(errno));
186 }
187
188 return m_iconv;
189 }
190
Reset(void)191 void CConverterType::Reset(void)
192 {
193 CSingleLock lock(*this);
194 if (m_iconv != NO_ICONV)
195 {
196 iconv_close(m_iconv);
197 m_iconv = NO_ICONV;
198 }
199
200 if (m_sourceSpecialCharset)
201 m_sourceCharset.clear();
202 if (m_targetSpecialCharset)
203 m_targetCharset.clear();
204
205 }
206
ReinitTo(const std::string & sourceCharset,const std::string & targetCharset,unsigned int targetSingleCharMaxLen)207 void CConverterType::ReinitTo(const std::string& sourceCharset, const std::string& targetCharset, unsigned int targetSingleCharMaxLen /*= 1*/)
208 {
209 CSingleLock lock(*this);
210 if (sourceCharset != m_sourceCharset || targetCharset != m_targetCharset)
211 {
212 if (m_iconv != NO_ICONV)
213 {
214 iconv_close(m_iconv);
215 m_iconv = NO_ICONV;
216 }
217
218 m_sourceSpecialCharset = NotSpecialCharset;
219 m_sourceCharset = sourceCharset;
220 m_targetSpecialCharset = NotSpecialCharset;
221 m_targetCharset = targetCharset;
222 m_targetSingleCharMaxLen = targetSingleCharMaxLen;
223 }
224 }
225
ResolveSpecialCharset(enum SpecialCharset charset)226 std::string CConverterType::ResolveSpecialCharset(enum SpecialCharset charset)
227 {
228 switch (charset)
229 {
230 case SystemCharset:
231 return "";
232 case UserCharset:
233 return g_langInfo.GetGuiCharSet();
234 case SubtitleCharset:
235 return g_langInfo.GetSubtitleCharSet();
236 case NotSpecialCharset:
237 default:
238 return "UTF-8"; /* dummy value */
239 }
240 }
241
/* Index of each predefined conversion inside CCharsetConverter::CInnerConverter::m_stdConversion.
   The enumerator values are used directly as array indices, so the order here must match
   the order of the array initialiser. */
enum StdConversionType /* Keep it in sync with CCharsetConverter::CInnerConverter::m_stdConversion */
{
  NoConversion = -1, // not an array index
  Utf8ToUtf32 = 0,
  Utf32ToUtf8,
  Utf32ToW,
  WToUtf32,
  SubtitleCharsetToUtf8,
  Utf8ToUserCharset,
  UserCharsetToUtf8,
  Utf32ToUserCharset,
  WtoUtf8,
  Utf16LEtoW,
  Utf16BEtoUtf8,
  Utf16LEtoUtf8,
  Utf8toW,
  Utf8ToSystem,
  SystemToUtf8,
  Ucs2CharsetToUtf8,
  NumberOfStdConversionTypes /* Dummy sentinel entry */ // also the size of m_stdConversion
};
263
/* We don't want to pollute the header file with many additional includes and definitions, so put
   here all the stuff that requires types defined in this file or in additional headers */
/* Implementation helpers of CCharsetConverter; all members are static. */
class CCharsetConverter::CInnerConverter
{
public:
  /* Reorder a UTF-32 string from logical to visual order, line by line, using FriBidi. */
  static bool logicalToVisualBiDi(const std::u32string& stringSrc,
                                  std::u32string& stringDst,
                                  FriBidiCharType base = FRIBIDI_TYPE_LTR,
                                  const bool failOnBadString = false,
                                  int* visualToLogicalMap = nullptr);

  /* Convert using one of the predefined converters from m_stdConversion. */
  template<class INPUT,class OUTPUT>
  static bool stdConvert(StdConversionType convertType, const INPUT& strSource, OUTPUT& strDest, bool failOnInvalidChar = false);
  /* Convert using a temporary iconv descriptor opened for the given charset names. */
  template<class INPUT,class OUTPUT>
  static bool customConvert(const std::string& sourceCharset, const std::string& targetCharset, const INPUT& strSource, OUTPUT& strDest, bool failOnInvalidChar = false);

  /* Run iconv() with an already opened descriptor; multiplier scales the initial output buffer. */
  template<class INPUT,class OUTPUT>
  static bool convert(iconv_t type, int multiplier, const INPUT& strSource, OUTPUT& strDest, bool failOnInvalidChar = false);

  static CConverterType m_stdConversion[NumberOfStdConversionTypes]; // indexed by StdConversionType
  static CCriticalSection m_critSectionFriBiDi; // serialises the FriBidi calls in logicalToVisualBiDi()
};
286
/* single symbol sizes in chars; m_Utf8CharMaxSize is also used below as the output
   buffer multiplier for conversions whose target is UTF-8 */
const int CCharsetConverter::m_Utf8CharMinSize = 1;
const int CCharsetConverter::m_Utf8CharMaxSize = 4;
290
291 CConverterType CCharsetConverter::CInnerConverter::m_stdConversion[NumberOfStdConversionTypes] = /* keep it in sync with enum StdConversionType */
292 {
293 /* Utf8ToUtf32 */ CConverterType(UTF8_SOURCE, UTF32_CHARSET),
294 /* Utf32ToUtf8 */ CConverterType(UTF32_CHARSET, "UTF-8", CCharsetConverter::m_Utf8CharMaxSize),
295 /* Utf32ToW */ CConverterType(UTF32_CHARSET, WCHAR_CHARSET),
296 /* WToUtf32 */ CConverterType(WCHAR_CHARSET, UTF32_CHARSET),
297 /* SubtitleCharsetToUtf8*/CConverterType(SubtitleCharset, "UTF-8", CCharsetConverter::m_Utf8CharMaxSize),
298 /* Utf8ToUserCharset */ CConverterType(UTF8_SOURCE, UserCharset),
299 /* UserCharsetToUtf8 */ CConverterType(UserCharset, "UTF-8", CCharsetConverter::m_Utf8CharMaxSize),
300 /* Utf32ToUserCharset */ CConverterType(UTF32_CHARSET, UserCharset),
301 /* WtoUtf8 */ CConverterType(WCHAR_CHARSET, "UTF-8", CCharsetConverter::m_Utf8CharMaxSize),
302 /* Utf16LEtoW */ CConverterType("UTF-16LE", WCHAR_CHARSET),
303 /* Utf16BEtoUtf8 */ CConverterType("UTF-16BE", "UTF-8", CCharsetConverter::m_Utf8CharMaxSize),
304 /* Utf16LEtoUtf8 */ CConverterType("UTF-16LE", "UTF-8", CCharsetConverter::m_Utf8CharMaxSize),
305 /* Utf8toW */ CConverterType(UTF8_SOURCE, WCHAR_CHARSET),
306 /* Utf8ToSystem */ CConverterType(UTF8_SOURCE, SystemCharset),
307 /* SystemToUtf8 */ CConverterType(SystemCharset, UTF8_SOURCE),
308 /* Ucs2CharsetToUtf8 */ CConverterType("UCS-2LE", "UTF-8", CCharsetConverter::m_Utf8CharMaxSize)
309 };
310
/* lock taken by logicalToVisualBiDi() around its FriBidi calls */
CCriticalSection CCharsetConverter::CInnerConverter::m_critSectionFriBiDi;
312
313 template<class INPUT,class OUTPUT>
stdConvert(StdConversionType convertType,const INPUT & strSource,OUTPUT & strDest,bool failOnInvalidChar)314 bool CCharsetConverter::CInnerConverter::stdConvert(StdConversionType convertType, const INPUT& strSource, OUTPUT& strDest, bool failOnInvalidChar /*= false*/)
315 {
316 strDest.clear();
317 if (strSource.empty())
318 return true;
319
320 if (convertType < 0 || convertType >= NumberOfStdConversionTypes)
321 return false;
322
323 CConverterType& convType = m_stdConversion[convertType];
324 CSingleLock converterLock(convType);
325
326 return convert(convType.GetConverter(converterLock), convType.GetTargetSingleCharMaxLen(), strSource, strDest, failOnInvalidChar);
327 }
328
329 template<class INPUT,class OUTPUT>
customConvert(const std::string & sourceCharset,const std::string & targetCharset,const INPUT & strSource,OUTPUT & strDest,bool failOnInvalidChar)330 bool CCharsetConverter::CInnerConverter::customConvert(const std::string& sourceCharset, const std::string& targetCharset, const INPUT& strSource, OUTPUT& strDest, bool failOnInvalidChar /*= false*/)
331 {
332 strDest.clear();
333 if (strSource.empty())
334 return true;
335
336 iconv_t conv = iconv_open(targetCharset.c_str(), sourceCharset.c_str());
337 if (conv == NO_ICONV)
338 {
339 CLog::Log(LOGERROR, "%s: iconv_open() for \"%s\" -> \"%s\" failed, errno = %d (%s)",
340 __FUNCTION__, sourceCharset.c_str(), targetCharset.c_str(), errno, strerror(errno));
341 return false;
342 }
343 const int dstMultp = (targetCharset.compare(0, 5, "UTF-8") == 0) ? CCharsetConverter::m_Utf8CharMaxSize : 1;
344 const bool result = convert(conv, dstMultp, strSource, strDest, failOnInvalidChar);
345 iconv_close(conv);
346
347 return result;
348 }
349
/* Depending on platform and version, iconv() declares its input buffer parameter either
   as char** or as const char**. This adapter wraps a const char** and converts implicitly
   to whichever of the two pointer types the local iconv() prototype asks for. */
struct charPtrPtrAdapter
{
  const char** pointer;

  explicit charPtrPtrAdapter(const char** p)
    : pointer{p}
  {
  }

  // for iconv() prototypes taking char**
  operator char**()
  {
    char** const withoutConst = const_cast<char**>(pointer);
    return withoutConst;
  }

  // for iconv() prototypes taking const char**
  operator const char**()
  {
    return pointer;
  }
};
362
363 template<class INPUT,class OUTPUT>
convert(iconv_t type,int multiplier,const INPUT & strSource,OUTPUT & strDest,bool failOnInvalidChar)364 bool CCharsetConverter::CInnerConverter::convert(iconv_t type, int multiplier, const INPUT& strSource, OUTPUT& strDest, bool failOnInvalidChar /*= false*/)
365 {
366 if (type == NO_ICONV)
367 return false;
368
369 //input buffer for iconv() is the buffer from strSource
370 size_t inBufSize = (strSource.length() + 1) * sizeof(typename INPUT::value_type);
371 const char* inBuf = (const char*)strSource.c_str();
372
373 //allocate output buffer for iconv()
374 size_t outBufSize = (strSource.length() + 1) * sizeof(typename OUTPUT::value_type) * multiplier;
375 char* outBuf = (char*)malloc(outBufSize);
376 if (outBuf == NULL)
377 {
378 CLog::Log(LOGFATAL, "%s: malloc failed", __FUNCTION__);
379 return false;
380 }
381
382 size_t inBytesAvail = inBufSize; //how many bytes iconv() can read
383 size_t outBytesAvail = outBufSize; //how many bytes iconv() can write
384 const char* inBufStart = inBuf; //where in our input buffer iconv() should start reading
385 char* outBufStart = outBuf; //where in out output buffer iconv() should start writing
386
387 size_t returnV;
388 while(true)
389 {
390 //iconv() will update inBufStart, inBytesAvail, outBufStart and outBytesAvail
391 returnV = iconv(type, charPtrPtrAdapter(&inBufStart), &inBytesAvail, &outBufStart, &outBytesAvail);
392
393 if (returnV == (size_t)-1)
394 {
395 if (errno == E2BIG) //output buffer is not big enough
396 {
397 //save where iconv() ended converting, realloc might make outBufStart invalid
398 size_t bytesConverted = outBufSize - outBytesAvail;
399
400 //make buffer twice as big
401 outBufSize *= 2;
402 char* newBuf = (char*)realloc(outBuf, outBufSize);
403 if (!newBuf)
404 {
405 CLog::Log(LOGFATAL, "%s realloc failed with errno=%d(%s)", __FUNCTION__, errno,
406 strerror(errno));
407 break;
408 }
409 outBuf = newBuf;
410
411 //update the buffer pointer and counter
412 outBufStart = outBuf + bytesConverted;
413 outBytesAvail = outBufSize - bytesConverted;
414
415 //continue in the loop and convert the rest
416 continue;
417 }
418 else if (errno == EILSEQ) //An invalid multibyte sequence has been encountered in the input
419 {
420 if (failOnInvalidChar)
421 break;
422
423 //skip invalid byte
424 inBufStart++;
425 inBytesAvail--;
426 //continue in the loop and convert the rest
427 continue;
428 }
429 else if (errno == EINVAL) /* Invalid sequence at the end of input buffer */
430 {
431 if (!failOnInvalidChar)
432 returnV = 0; /* reset error status to use converted part */
433
434 break;
435 }
436 else //iconv() had some other error
437 {
438 CLog::Log(LOGERROR, "%s: iconv() failed, errno=%d (%s)",
439 __FUNCTION__, errno, strerror(errno));
440 }
441 }
442 break;
443 }
444
445 //complete the conversion (reset buffers), otherwise the current data will prefix the data on the next call
446 if (iconv(type, NULL, NULL, &outBufStart, &outBytesAvail) == (size_t)-1)
447 CLog::Log(LOGERROR, "%s failed cleanup errno=%d(%s)", __FUNCTION__, errno, strerror(errno));
448
449 if (returnV == (size_t)-1)
450 {
451 free(outBuf);
452 return false;
453 }
454 //we're done
455
456 const typename OUTPUT::size_type sizeInChars = (typename OUTPUT::size_type) (outBufSize - outBytesAvail) / sizeof(typename OUTPUT::value_type);
457 typename OUTPUT::const_pointer strPtr = (typename OUTPUT::const_pointer) outBuf;
458 /* Make sure that all buffer is assigned and string is stopped at end of buffer */
459 if (strPtr[sizeInChars-1] == 0 && strSource[strSource.length()-1] != 0)
460 strDest.assign(strPtr, sizeInChars-1);
461 else
462 strDest.assign(strPtr, sizeInChars);
463
464 free(outBuf);
465
466 return true;
467 }
468
logicalToVisualBiDi(const std::u32string & stringSrc,std::u32string & stringDst,FriBidiCharType base,const bool failOnBadString,int * visualToLogicalMap)469 bool CCharsetConverter::CInnerConverter::logicalToVisualBiDi(
470 const std::u32string& stringSrc,
471 std::u32string& stringDst,
472 FriBidiCharType base /*= FRIBIDI_TYPE_LTR*/,
473 const bool failOnBadString /*= false*/,
474 int* visualToLogicalMap /*= nullptr*/)
475 {
476 stringDst.clear();
477
478 const size_t srcLen = stringSrc.length();
479 if (srcLen == 0)
480 return true;
481
482 stringDst.reserve(srcLen);
483 size_t lineStart = 0;
484
485 // libfribidi is not threadsafe, so make sure we make it so
486 CSingleLock lock(m_critSectionFriBiDi);
487 do
488 {
489 size_t lineEnd = stringSrc.find('\n', lineStart);
490 if (lineEnd >= srcLen) // equal to 'lineEnd == std::string::npos'
491 lineEnd = srcLen;
492 else
493 lineEnd++; // include '\n'
494
495 const size_t lineLen = lineEnd - lineStart;
496
497 FriBidiChar* visual = (FriBidiChar*) malloc((lineLen + 1) * sizeof(FriBidiChar));
498 if (visual == NULL)
499 {
500 free(visual);
501 CLog::Log(LOGFATAL, "%s: can't allocate memory", __FUNCTION__);
502 return false;
503 }
504
505 bool bidiFailed = false;
506 FriBidiCharType baseCopy = base; // preserve same value for all lines, required because fribidi_log2vis will modify parameter value
507 if (fribidi_log2vis(reinterpret_cast<const FriBidiChar*>(stringSrc.c_str() + lineStart),
508 lineLen, &baseCopy, visual, nullptr,
509 !visualToLogicalMap ? nullptr : visualToLogicalMap + lineStart, nullptr))
510 {
511 // Removes bidirectional marks
512 const int newLen = fribidi_remove_bidi_marks(
513 visual, lineLen, nullptr, !visualToLogicalMap ? nullptr : visualToLogicalMap + lineStart,
514 nullptr);
515 if (newLen > 0)
516 stringDst.append((const char32_t*)visual, (size_t)newLen);
517 else if (newLen < 0)
518 bidiFailed = failOnBadString;
519 }
520 else
521 bidiFailed = failOnBadString;
522
523 free(visual);
524
525 if (bidiFailed)
526 return false;
527
528 lineStart = lineEnd;
529 } while (lineStart < srcLen);
530
531 return !stringDst.empty();
532 }
533
/* Table of selectable charsets: iconv charset name plus the caption returned to callers
   of getCharsetLabels(). The table is terminated by an entry with NULL members, which the
   lookup loops below rely on. */
static struct SCharsetMapping
{
  const char* charset; // charset name
  const char* caption; // human readable label
} g_charsets[] = {
    { "ISO-8859-1", "Western Europe (ISO)" }
  , { "ISO-8859-2", "Central Europe (ISO)" }
  , { "ISO-8859-3", "South Europe (ISO)" }
  , { "ISO-8859-4", "Baltic (ISO)" }
  , { "ISO-8859-5", "Cyrillic (ISO)" }
  , { "ISO-8859-6", "Arabic (ISO)" }
  , { "ISO-8859-7", "Greek (ISO)" }
  , { "ISO-8859-8", "Hebrew (ISO)" }
  , { "ISO-8859-9", "Turkish (ISO)" }
  , { "CP1250", "Central Europe (Windows)" }
  , { "CP1251", "Cyrillic (Windows)" }
  , { "CP1252", "Western Europe (Windows)" }
  , { "CP1253", "Greek (Windows)" }
  , { "CP1254", "Turkish (Windows)" }
  , { "CP1255", "Hebrew (Windows)" }
  , { "CP1256", "Arabic (Windows)" }
  , { "CP1257", "Baltic (Windows)" }
  , { "CP1258", "Vietnamese (Windows)" }
  , { "CP874", "Thai (Windows)" }
  , { "BIG5", "Chinese Traditional (Big5)" }
  , { "GBK", "Chinese Simplified (GBK)" }
  , { "SHIFT_JIS", "Japanese (Shift-JIS)" }
  , { "CP949", "Korean" }
  , { "BIG5-HKSCS", "Hong Kong (Big5-HKSCS)" }
  , { NULL, NULL } // end-of-table sentinel
};
565
/* Compiler-generated default constructor. */
CCharsetConverter::CCharsetConverter() = default;
567
OnSettingChanged(const std::shared_ptr<const CSetting> & setting)568 void CCharsetConverter::OnSettingChanged(const std::shared_ptr<const CSetting>& setting)
569 {
570 if (setting == NULL)
571 return;
572
573 const std::string& settingId = setting->GetId();
574 if (settingId == CSettings::SETTING_LOCALE_CHARSET)
575 resetUserCharset();
576 else if (settingId == CSettings::SETTING_SUBTITLES_CHARSET)
577 resetSubtitleCharset();
578 }
579
/* Intentionally empty: this implementation has nothing to clear. */
void CCharsetConverter::clear()
{
}
583
getCharsetLabels()584 std::vector<std::string> CCharsetConverter::getCharsetLabels()
585 {
586 std::vector<std::string> lab;
587 for(SCharsetMapping* c = g_charsets; c->charset; c++)
588 lab.emplace_back(c->caption);
589
590 return lab;
591 }
592
getCharsetLabelByName(const std::string & charsetName)593 std::string CCharsetConverter::getCharsetLabelByName(const std::string& charsetName)
594 {
595 for(SCharsetMapping* c = g_charsets; c->charset; c++)
596 {
597 if (StringUtils::EqualsNoCase(charsetName,c->charset))
598 return c->caption;
599 }
600
601 return "";
602 }
603
getCharsetNameByLabel(const std::string & charsetLabel)604 std::string CCharsetConverter::getCharsetNameByLabel(const std::string& charsetLabel)
605 {
606 for(SCharsetMapping* c = g_charsets; c->charset; c++)
607 {
608 if (StringUtils::EqualsNoCase(charsetLabel, c->caption))
609 return c->charset;
610 }
611
612 return "";
613 }
614
reset(void)615 void CCharsetConverter::reset(void)
616 {
617 for (CConverterType& conversion : CInnerConverter::m_stdConversion)
618 conversion.Reset();
619 }
620
resetSystemCharset(void)621 void CCharsetConverter::resetSystemCharset(void)
622 {
623 CInnerConverter::m_stdConversion[Utf8ToSystem].Reset();
624 CInnerConverter::m_stdConversion[SystemToUtf8].Reset();
625 }
626
resetUserCharset(void)627 void CCharsetConverter::resetUserCharset(void)
628 {
629 CInnerConverter::m_stdConversion[UserCharsetToUtf8].Reset();
630 CInnerConverter::m_stdConversion[UserCharsetToUtf8].Reset();
631 CInnerConverter::m_stdConversion[Utf32ToUserCharset].Reset();
632 resetSubtitleCharset();
633 }
634
/* Reset the subtitle-charset-to-UTF-8 converter so the subtitle charset is resolved
   again on its next use. */
void CCharsetConverter::resetSubtitleCharset(void)
{
  CInnerConverter::m_stdConversion[SubtitleCharsetToUtf8].Reset();
}
639
/* Make the converters pick up the current charset settings on their next use. */
void CCharsetConverter::reinitCharsetsFromSettings(void)
{
  resetUserCharset(); // this will also reinit Subtitle charsets
}
644
/* Convert a UTF-8 string to UTF-32 using the predefined Utf8ToUtf32 converter.
   Returns the result of the conversion; utf32StringDst is cleared before converting. */
bool CCharsetConverter::utf8ToUtf32(const std::string& utf8StringSrc, std::u32string& utf32StringDst, bool failOnBadChar /*= true*/)
{
  return CInnerConverter::stdConvert(Utf8ToUtf32, utf8StringSrc, utf32StringDst, failOnBadChar);
}
649
/* Convenience overload returning the converted string by value. The success flag of the
   conversion is not reported; the returned string is whatever the two-argument overload
   left in its destination. */
std::u32string CCharsetConverter::utf8ToUtf32(const std::string& utf8StringSrc, bool failOnBadChar /*= true*/)
{
  std::u32string converted;
  utf8ToUtf32(utf8StringSrc, converted, failOnBadChar);
  return converted;
}
656
utf8ToUtf32Visual(const std::string & utf8StringSrc,std::u32string & utf32StringDst,bool bVisualBiDiFlip,bool forceLTRReadingOrder,bool failOnBadChar)657 bool CCharsetConverter::utf8ToUtf32Visual(const std::string& utf8StringSrc, std::u32string& utf32StringDst, bool bVisualBiDiFlip /*= false*/, bool forceLTRReadingOrder /*= false*/, bool failOnBadChar /*= false*/)
658 {
659 if (bVisualBiDiFlip)
660 {
661 std::u32string converted;
662 if (!CInnerConverter::stdConvert(Utf8ToUtf32, utf8StringSrc, converted, failOnBadChar))
663 return false;
664
665 return CInnerConverter::logicalToVisualBiDi(converted, utf32StringDst, forceLTRReadingOrder ? FRIBIDI_TYPE_LTR : FRIBIDI_TYPE_PDF, failOnBadChar);
666 }
667 return CInnerConverter::stdConvert(Utf8ToUtf32, utf8StringSrc, utf32StringDst, failOnBadChar);
668 }
669
/* Convert a UTF-32 string to UTF-8 using the predefined Utf32ToUtf8 converter. */
bool CCharsetConverter::utf32ToUtf8(const std::u32string& utf32StringSrc, std::string& utf8StringDst, bool failOnBadChar /*= true*/)
{
  return CInnerConverter::stdConvert(Utf32ToUtf8, utf32StringSrc, utf8StringDst, failOnBadChar);
}
674
/* Convenience overload returning the converted string by value. The success flag of the
   conversion is not reported to the caller. */
std::string CCharsetConverter::utf32ToUtf8(const std::u32string& utf32StringSrc, bool failOnBadChar /*= false*/)
{
  std::string converted;
  utf32ToUtf8(utf32StringSrc, converted, failOnBadChar);
  return converted;
}
681
/* Convert a UTF-32 string to a wide string. When WCHAR_IS_UCS_4 is defined the code units
   are copied as-is (no iconv involved, always returns true, failOnBadChar unused);
   otherwise the predefined Utf32ToW converter is used. */
bool CCharsetConverter::utf32ToW(const std::u32string& utf32StringSrc, std::wstring& wStringDst, bool failOnBadChar /*= true*/)
{
#ifdef WCHAR_IS_UCS_4
  wStringDst.assign((const wchar_t*)utf32StringSrc.c_str(), utf32StringSrc.length());
  return true;
#else // !WCHAR_IS_UCS_4
  return CInnerConverter::stdConvert(Utf32ToW, utf32StringSrc, wStringDst, failOnBadChar);
#endif // !WCHAR_IS_UCS_4
}
691
/* Public entry point for the logical-to-visual reordering of a UTF-32 string.
   forceLTRReadingOrder selects FRIBIDI_TYPE_LTR instead of FRIBIDI_TYPE_PDF as base
   direction; the remaining arguments are forwarded unchanged. */
bool CCharsetConverter::utf32logicalToVisualBiDi(const std::u32string& logicalStringSrc,
                                                 std::u32string& visualStringDst,
                                                 bool forceLTRReadingOrder /*= false*/,
                                                 bool failOnBadString /*= false*/,
                                                 int* visualToLogicalMap /*= nullptr*/)
{
  return CInnerConverter::logicalToVisualBiDi(
      logicalStringSrc, visualStringDst, forceLTRReadingOrder ? FRIBIDI_TYPE_LTR : FRIBIDI_TYPE_PDF,
      failOnBadString, visualToLogicalMap);
}
702
/* Convert a wide string to UTF-32 using the predefined WToUtf32 converter. Unlike
   utf32ToW() there is deliberately no copy shortcut for WCHAR_IS_UCS_4; the preprocessor
   block below only carries the explanation. */
bool CCharsetConverter::wToUtf32(const std::wstring& wStringSrc, std::u32string& utf32StringDst, bool failOnBadChar /*= true*/)
{
#ifdef WCHAR_IS_UCS_4
  /* UCS-4 is almost equal to UTF-32, but UTF-32 has strict limits on possible values, while UCS-4 is usually unchecked.
   * With this "conversion" we ensure that output will be valid UTF-32 string. */
#endif
  return CInnerConverter::stdConvert(WToUtf32, wStringSrc, utf32StringDst, failOnBadChar);
}
711
712 // The bVisualBiDiFlip forces a flip of characters for hebrew/arabic languages, only set to false if the flipping
713 // of the string is already made or the string is not displayed in the GUI
utf8ToW(const std::string & utf8StringSrc,std::wstring & wStringDst,bool bVisualBiDiFlip,bool forceLTRReadingOrder,bool failOnBadChar)714 bool CCharsetConverter::utf8ToW(const std::string& utf8StringSrc, std::wstring& wStringDst, bool bVisualBiDiFlip /*= true*/,
715 bool forceLTRReadingOrder /*= false*/, bool failOnBadChar /*= false*/)
716 {
717 // Try to flip hebrew/arabic characters, if any
718 if (bVisualBiDiFlip)
719 {
720 wStringDst.clear();
721 std::u32string utf32str;
722 if (!CInnerConverter::stdConvert(Utf8ToUtf32, utf8StringSrc, utf32str, failOnBadChar))
723 return false;
724
725 std::u32string utf32flipped;
726 const bool bidiResult = CInnerConverter::logicalToVisualBiDi(utf32str, utf32flipped, forceLTRReadingOrder ? FRIBIDI_TYPE_LTR : FRIBIDI_TYPE_PDF, failOnBadChar);
727
728 return CInnerConverter::stdConvert(Utf32ToW, utf32flipped, wStringDst, failOnBadChar) && bidiResult;
729 }
730
731 return CInnerConverter::stdConvert(Utf8toW, utf8StringSrc, wStringDst, failOnBadChar);
732 }
733
/* Convert a string from the subtitle charset to UTF-8; invalid input characters do not
   abort the conversion (failOnInvalidChar is passed as false). */
bool CCharsetConverter::subtitleCharsetToUtf8(const std::string& stringSrc, std::string& utf8StringDst)
{
  return CInnerConverter::stdConvert(SubtitleCharsetToUtf8, stringSrc, utf8StringDst, false);
}
738
/* Convert a wide string to the charset named by enc, using a temporary iconv descriptor. */
bool CCharsetConverter::fromW(const std::wstring& wStringSrc,
                              std::string& stringDst, const std::string& enc)
{
  return CInnerConverter::customConvert(WCHAR_CHARSET, enc, wStringSrc, stringDst);
}
744
/* Convert a string in the charset named by enc to a wide string, using a temporary
   iconv descriptor. */
bool CCharsetConverter::toW(const std::string& stringSrc,
                            std::wstring& wStringDst, const std::string& enc)
{
  return CInnerConverter::customConvert(enc, WCHAR_CHARSET, stringSrc, wStringDst);
}
750
/* Convert a UTF-8 string to the user charset using the predefined Utf8ToUserCharset
   converter. */
bool CCharsetConverter::utf8ToStringCharset(const std::string& utf8StringSrc, std::string& stringDst)
{
  return CInnerConverter::stdConvert(Utf8ToUserCharset, utf8StringSrc, stringDst);
}
755
utf8ToStringCharset(std::string & stringSrcDst)756 bool CCharsetConverter::utf8ToStringCharset(std::string& stringSrcDst)
757 {
758 std::string strSrc(stringSrcDst);
759 return utf8ToStringCharset(strSrc, stringSrcDst);
760 }
761
ToUtf8(const std::string & strSourceCharset,const std::string & stringSrc,std::string & utf8StringDst,bool failOnBadChar)762 bool CCharsetConverter::ToUtf8(const std::string& strSourceCharset, const std::string& stringSrc, std::string& utf8StringDst, bool failOnBadChar /*= false*/)
763 {
764 if (strSourceCharset == "UTF-8")
765 { // simple case - no conversion necessary
766 utf8StringDst = stringSrc;
767 return true;
768 }
769
770 return CInnerConverter::customConvert(strSourceCharset, "UTF-8", stringSrc, utf8StringDst, failOnBadChar);
771 }
772
utf8To(const std::string & strDestCharset,const std::string & utf8StringSrc,std::string & stringDst)773 bool CCharsetConverter::utf8To(const std::string& strDestCharset, const std::string& utf8StringSrc, std::string& stringDst)
774 {
775 if (strDestCharset == "UTF-8")
776 { // simple case - no conversion necessary
777 stringDst = utf8StringSrc;
778 return true;
779 }
780
781 return CInnerConverter::customConvert(UTF8_SOURCE, strDestCharset, utf8StringSrc, stringDst);
782 }
783
/* Convert a UTF-8 string to the charset named by strDestCharset, storing the result in
   16-bit code units. No check is made here that strDestCharset really is a 16-bit charset. */
bool CCharsetConverter::utf8To(const std::string& strDestCharset, const std::string& utf8StringSrc, std::u16string& utf16StringDst)
{
  return CInnerConverter::customConvert(UTF8_SOURCE, strDestCharset, utf8StringSrc, utf16StringDst);
}
788
/* Convert a UTF-8 string to the charset named by strDestCharset, storing the result in
   32-bit code units. No check is made here that strDestCharset really is a 32-bit charset. */
bool CCharsetConverter::utf8To(const std::string& strDestCharset, const std::string& utf8StringSrc, std::u32string& utf32StringDst)
{
  return CInnerConverter::customConvert(UTF8_SOURCE, strDestCharset, utf8StringSrc, utf32StringDst);
}
793
unknownToUTF8(std::string & stringSrcDst)794 bool CCharsetConverter::unknownToUTF8(std::string& stringSrcDst)
795 {
796 std::string source(stringSrcDst);
797 return unknownToUTF8(source, stringSrcDst);
798 }
799
unknownToUTF8(const std::string & stringSrc,std::string & utf8StringDst,bool failOnBadChar)800 bool CCharsetConverter::unknownToUTF8(const std::string& stringSrc, std::string& utf8StringDst, bool failOnBadChar /*= false*/)
801 {
802 // checks whether it's utf8 already, and if not converts using the sourceCharset if given, else the string charset
803 if (CUtf8Utils::isValidUtf8(stringSrc))
804 {
805 utf8StringDst = stringSrc;
806 return true;
807 }
808 return CInnerConverter::stdConvert(UserCharsetToUtf8, stringSrc, utf8StringDst, failOnBadChar);
809 }
810
wToUTF8(const std::wstring & wStringSrc,std::string & utf8StringDst,bool failOnBadChar)811 bool CCharsetConverter::wToUTF8(const std::wstring& wStringSrc, std::string& utf8StringDst, bool failOnBadChar /*= false*/)
812 {
813 return CInnerConverter::stdConvert(WtoUtf8, wStringSrc, utf8StringDst, failOnBadChar);
814 }
815
utf16BEtoUTF8(const std::u16string & utf16StringSrc,std::string & utf8StringDst)816 bool CCharsetConverter::utf16BEtoUTF8(const std::u16string& utf16StringSrc, std::string& utf8StringDst)
817 {
818 return CInnerConverter::stdConvert(Utf16BEtoUtf8, utf16StringSrc, utf8StringDst);
819 }
820
utf16LEtoUTF8(const std::u16string & utf16StringSrc,std::string & utf8StringDst)821 bool CCharsetConverter::utf16LEtoUTF8(const std::u16string& utf16StringSrc,
822 std::string& utf8StringDst)
823 {
824 return CInnerConverter::stdConvert(Utf16LEtoUtf8, utf16StringSrc, utf8StringDst);
825 }
826
ucs2ToUTF8(const std::u16string & ucs2StringSrc,std::string & utf8StringDst)827 bool CCharsetConverter::ucs2ToUTF8(const std::u16string& ucs2StringSrc, std::string& utf8StringDst)
828 {
829 return CInnerConverter::stdConvert(Ucs2CharsetToUtf8, ucs2StringSrc,utf8StringDst);
830 }
831
utf16LEtoW(const std::u16string & utf16String,std::wstring & wString)832 bool CCharsetConverter::utf16LEtoW(const std::u16string& utf16String, std::wstring& wString)
833 {
834 return CInnerConverter::stdConvert(Utf16LEtoW, utf16String, wString);
835 }
836
utf32ToStringCharset(const std::u32string & utf32StringSrc,std::string & stringDst)837 bool CCharsetConverter::utf32ToStringCharset(const std::u32string& utf32StringSrc, std::string& stringDst)
838 {
839 return CInnerConverter::stdConvert(Utf32ToUserCharset, utf32StringSrc, stringDst);
840 }
841
utf8ToSystem(std::string & stringSrcDst,bool failOnBadChar)842 bool CCharsetConverter::utf8ToSystem(std::string& stringSrcDst, bool failOnBadChar /*= false*/)
843 {
844 std::string strSrc(stringSrcDst);
845 return CInnerConverter::stdConvert(Utf8ToSystem, strSrc, stringSrcDst, failOnBadChar);
846 }
847
systemToUtf8(const std::string & sysStringSrc,std::string & utf8StringDst,bool failOnBadChar)848 bool CCharsetConverter::systemToUtf8(const std::string& sysStringSrc, std::string& utf8StringDst, bool failOnBadChar /*= false*/)
849 {
850 return CInnerConverter::stdConvert(SystemToUtf8, sysStringSrc, utf8StringDst, failOnBadChar);
851 }
852
utf8logicalToVisualBiDi(const std::string & utf8StringSrc,std::string & utf8StringDst,bool failOnBadString)853 bool CCharsetConverter::utf8logicalToVisualBiDi(const std::string& utf8StringSrc, std::string& utf8StringDst, bool failOnBadString /*= false*/)
854 {
855 utf8StringDst.clear();
856 std::u32string utf32flipped;
857 if (!utf8ToUtf32Visual(utf8StringSrc, utf32flipped, true, true, failOnBadString))
858 return false;
859
860 return CInnerConverter::stdConvert(Utf32ToUtf8, utf32flipped, utf8StringDst, failOnBadString);
861 }
862
SettingOptionsCharsetsFiller(const SettingConstPtr & setting,std::vector<StringSettingOption> & list,std::string & current,void * data)863 void CCharsetConverter::SettingOptionsCharsetsFiller(const SettingConstPtr& setting,
864 std::vector<StringSettingOption>& list,
865 std::string& current,
866 void* data)
867 {
868 std::vector<std::string> vecCharsets = g_charsetConverter.getCharsetLabels();
869 sort(vecCharsets.begin(), vecCharsets.end(), sortstringbyname());
870
871 list.emplace_back(g_localizeStrings.Get(13278), "DEFAULT"); // "Default"
872 for (int i = 0; i < (int) vecCharsets.size(); ++i)
873 list.emplace_back(vecCharsets[i], g_charsetConverter.getCharsetNameByLabel(vecCharsets[i]));
874 }
875