1 // *****************************************************************************
2 // * This file is part of the FreeFileSync project. It is distributed under    *
3 // * GNU General Public License: http://www.gnu.org/licenses/gpl-3.0           *
4 // * Copyright (C) Zenju (zenju AT freefilesync DOT org) - All Rights Reserved *
5 // *****************************************************************************
6 
7 #ifndef UTF_H_01832479146991573473545
8 #define UTF_H_01832479146991573473545
9 
10 #include <cstdint>
11 #include <iterator>
12 #include "string_tools.h" //copyStringTo
13 
14 namespace zen
15 {
//convert between any(!) char- and wchar_t-based "string-like" types: a UTF-8 <-> wide conversion is applied only if
//source and target character types differ, otherwise the string is copied as is
template <class TargetString, class SourceString>
TargetString utfCvrtTo(const SourceString& str);

//bytes EF BB BF: the byte order mark U+FEFF encoded as UTF-8
const char BYTE_ORDER_MARK_UTF8[] = "\xEF\xBB\xBF";

//check for UTF-8 encoding errors
template <class CharString>
bool isValidUtf8(const CharString& str);

//---- explicit conversion: wide <-> utf8 ----

//example: std::string tmp = wideToUtf8<std::string>(L"abc");
template <class CharString, class WideString>
CharString wideToUtf8(const WideString& str);

//example: std::wstring tmp = utf8ToWide<std::wstring>("abc");
template <class WideString, class CharString>
WideString utf8ToWide(const CharString& str);

//---- access unicode characters in UTF-encoded string (char- or wchar_t-based) ----

//return number of code points for UTF-encoded string
template <class UtfString>
size_t unicodeLength(const UtfString& str);

//return position (counted in code units) of the code point with index "unicodePos" in UTF-encoded string;
//returns the string length if there is no such code point
template <class UtfString>
size_t findUnicodePos(const UtfString& str, size_t unicodePos);
38 
39 
40 
41 
42 
43 
44 
45 
46 
47 
48 
49 
50 
51 
52 
53 
54 
55 
56 //----------------------- implementation ----------------------------------
57 namespace implementation
58 {
//integer types carrying a decoded code point, a UTF-16 code unit and a UTF-8 code unit
using CodePoint = std::uint32_t;
using Char16    = std::uint16_t;
using Char8     = unsigned char;

//UTF-16 surrogates: lead range is [LEAD_SURROGATE, TRAIL_SURROGATE), trail range is [TRAIL_SURROGATE, TRAIL_SURROGATE_MAX]
const CodePoint LEAD_SURROGATE      = 0xD800;
const CodePoint TRAIL_SURROGATE     = 0xDC00; //== LEAD_SURROGATE_MAX + 1
const CodePoint TRAIL_SURROGATE_MAX = 0xDFFF;

const CodePoint REPLACEMENT_CHAR    = 0xFFFD;   //substituted for everything that cannot be decoded/encoded
const CodePoint CODE_POINT_MAX      = 0x10FFFF; //largest value accepted as a code point
69 
70 
71 template <class Function> inline
codePointToUtf16(CodePoint cp,Function writeOutput)72 void codePointToUtf16(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a Char16
73 {
74     //http://en.wikipedia.org/wiki/UTF-16
75 
76     if (cp < LEAD_SURROGATE)
77         writeOutput(static_cast<Char16>(cp));
78     else if (cp <= TRAIL_SURROGATE_MAX) //invalid code point
79         codePointToUtf16(REPLACEMENT_CHAR, writeOutput); //resolves to 1-character utf16
80     else if (cp < 0x10000)
81         writeOutput(static_cast<Char16>(cp));
82     else if (cp <= CODE_POINT_MAX)
83     {
84         cp -= 0x10000;
85         writeOutput(LEAD_SURROGATE  + static_cast<Char16>(cp >> 10));
86         writeOutput(TRAIL_SURROGATE + static_cast<Char16>(cp & 0x3ff));
87     }
88     else //invalid code point
89         codePointToUtf16(REPLACEMENT_CHAR, writeOutput); //resolves to 1-character utf16
90 }
91 
92 
93 inline
getUtf16Len(Char16 ch)94 size_t getUtf16Len(Char16 ch) //ch must be first code unit! returns 0 on error!
95 {
96     if (ch < LEAD_SURROGATE)
97         return 1;
98     else if (ch < TRAIL_SURROGATE)
99         return 2;
100     else if (ch <= TRAIL_SURROGATE_MAX)
101         return 0; //unexpected trail surrogate!
102     else
103         return 1;
104 }
105 
106 
107 template <class CharIterator, class Function> inline
utf16ToCodePoint(CharIterator first,CharIterator last,Function writeOutput)108 void utf16ToCodePoint(CharIterator first, CharIterator last, Function writeOutput) //"writeOutput" is a unary function taking a CodePoint
109 {
110     static_assert(sizeof(typename std::iterator_traits<CharIterator>::value_type) == 2, "");
111 
112     for ( ; first != last; ++first)
113     {
114         CodePoint cp = static_cast<Char16>(*first);
115         switch (getUtf16Len(static_cast<Char16>(cp)))
116         {
117             case 0: //invalid utf16 character
118                 cp = REPLACEMENT_CHAR;
119                 break;
120             case 1:
121                 break;
122             case 2:
123                 if (++first != last) //trail surrogate expected!
124                 {
125                     const Char16 ch = static_cast<Char16>(*first);
126                     if (TRAIL_SURROGATE <= ch && ch <= TRAIL_SURROGATE_MAX) //trail surrogate expected!
127                     {
128                         cp = ((cp - LEAD_SURROGATE) << 10) + (ch - TRAIL_SURROGATE) + 0x10000;
129                         break;
130                     }
131                 }
132                 --first;
133                 cp = REPLACEMENT_CHAR;
134                 break;
135         }
136         writeOutput(cp);
137     }
138 }
139 
140 
141 template <class Function> inline
codePointToUtf8(CodePoint cp,Function writeOutput)142 void codePointToUtf8(CodePoint cp, Function writeOutput) //"writeOutput" is a unary function taking a Char8
143 {
144     //http://en.wikipedia.org/wiki/UTF-8
145     //assert(cp < LEAD_SURROGATE || TRAIL_SURROGATE_MAX < cp); //code points [0xd800, 0xdfff] are reserved for UTF-16 and *should* not be encoded in UTF-8
146 
147     if (cp < 0x80)
148         writeOutput(static_cast<Char8>(cp));
149     else if (cp < 0x800)
150     {
151         writeOutput(static_cast<Char8>((cp >> 6  ) | 0xc0));
152         writeOutput(static_cast<Char8>((cp & 0x3f) | 0x80));
153     }
154     else if (cp < 0x10000)
155     {
156         writeOutput(static_cast<Char8>( (cp >> 12       ) | 0xe0));
157         writeOutput(static_cast<Char8>(((cp >> 6) & 0x3f) | 0x80));
158         writeOutput(static_cast<Char8>( (cp & 0x3f      ) | 0x80));
159     }
160     else if (cp <= CODE_POINT_MAX)
161     {
162         writeOutput(static_cast<Char8>( (cp >> 18        ) | 0xf0));
163         writeOutput(static_cast<Char8>(((cp >> 12) & 0x3f) | 0x80));
164         writeOutput(static_cast<Char8>(((cp >> 6)  & 0x3f) | 0x80));
165         writeOutput(static_cast<Char8>( (cp & 0x3f       ) | 0x80));
166     }
167     else //invalid code point
168         codePointToUtf8(REPLACEMENT_CHAR, writeOutput); //resolves to 3-byte utf8
169 }
170 
171 
//number of bytes of the UTF-8 sequence introduced by lead byte "ch"
//ch must be first code unit! returns 0 on error!
inline
size_t getUtf8Len(unsigned char ch)
{
    if ((ch & 0x80) == 0x00) //0xxxxxxx
        return 1;
    if ((ch & 0xe0) == 0xc0) //110xxxxx
        return 2;
    if ((ch & 0xf0) == 0xe0) //1110xxxx
        return 3;
    if ((ch & 0xf8) == 0xf0) //11110xxx
        return 4;
    return 0; //invalid begin of UTF8 encoding
}
185 
186 
187 template <class CharIterator> inline
decodeTrail(CharIterator & first,CharIterator last,CodePoint & cp)188 bool decodeTrail(CharIterator& first, CharIterator last, CodePoint& cp) //decode trailing surrogate byte
189 {
190     if (++first != last) //trail surrogate expected!
191     {
192         const Char8 ch = static_cast<Char8>(*first);
193         if (ch >> 6 == 0x2) //trail surrogate expected!
194         {
195             cp = (cp << 6) + (ch & 0x3f);
196             return true;
197         }
198     }
199     --first;
200     cp = REPLACEMENT_CHAR;
201     return false;
202 }
203 
204 template <class CharIterator, class Function> inline
utf8ToCodePoint(CharIterator first,CharIterator last,Function writeOutput)205 void utf8ToCodePoint(CharIterator first, CharIterator last, Function writeOutput) //"writeOutput" is a unary function taking a CodePoint
206 {
207     static_assert(sizeof(typename std::iterator_traits<CharIterator>::value_type) == 1, "");
208 
209     for ( ; first != last; ++first)
210     {
211         CodePoint cp = static_cast<Char8>(*first);
212         switch (getUtf8Len(static_cast<Char8>(cp)))
213         {
214             case 0: //invalid utf8 character
215                 cp = REPLACEMENT_CHAR;
216                 break;
217             case 1:
218                 break;
219             case 2:
220                 cp &= 0x1f;
221                 decodeTrail(first, last, cp);
222                 break;
223             case 3:
224                 cp &= 0xf;
225                 if (decodeTrail(first, last, cp))
226                     decodeTrail(first, last, cp);
227                 break;
228             case 4:
229                 cp &= 0x7;
230                 if (decodeTrail(first, last, cp))
231                     if (decodeTrail(first, last, cp))
232                         decodeTrail(first, last, cp);
233                 if (cp > CODE_POINT_MAX) cp = REPLACEMENT_CHAR;
234                 break;
235         }
236         writeOutput(cp);
237     }
238 }
239 
240 
241 template <class CharString> inline
unicodeLength(const CharString & str,char)242 size_t unicodeLength(const CharString& str, char) //utf8
243 {
244     using CharType = typename GetCharType<CharString>::Type;
245 
246     const CharType*       strFirst  = strBegin(str);
247     const CharType* const strLast   = strFirst + strLength(str);
248 
249     size_t len = 0;
250     while (strFirst < strLast) //[!]
251     {
252         ++len;
253         size_t utf8len = getUtf8Len(*strFirst);
254         if (utf8len == 0) ++utf8len; //invalid utf8 character
255         strFirst += utf8len;
256     }
257     return len;
258 }
259 
260 
261 template <class WideString> inline
unicodeLengthWide(const WideString & str,Int2Type<2>)262 size_t unicodeLengthWide(const WideString& str, Int2Type<2>) //windows: utf16-wchar_t
263 {
264     using CharType = typename GetCharType<WideString>::Type;
265 
266     const CharType*       strFirst = strBegin(str);
267     const CharType* const strLast  = strFirst + strLength(str);
268 
269     size_t len = 0;
270     while (strFirst < strLast) //[!]
271     {
272         ++len;
273         size_t utf16len = getUtf16Len(*strFirst);
274         if (utf16len == 0) ++utf16len; //invalid utf16 character
275         strFirst += utf16len;
276     }
277     return len;
278 }
279 
280 
281 template <class WideString> inline
unicodeLengthWide(const WideString & str,Int2Type<4>)282 size_t unicodeLengthWide(const WideString& str, Int2Type<4>) //other OS: utf32-wchar_t
283 {
284     return strLength(str);
285 }
286 
287 
288 template <class WideString> inline
unicodeLength(const WideString & str,wchar_t)289 size_t unicodeLength(const WideString& str, wchar_t)
290 {
291     return unicodeLengthWide(str, Int2Type<sizeof(wchar_t)>());
292 }
293 }
294 
295 
296 template <class UtfString> inline
unicodeLength(const UtfString & str)297 size_t unicodeLength(const UtfString& str) //return number of code points
298 {
299     return implementation::unicodeLength(str, typename GetCharType<UtfString>::Type());
300 }
301 
302 
303 namespace implementation
304 {
305 template <class CharString> inline
findUnicodePos(const CharString & str,size_t unicodePos,char)306 size_t findUnicodePos(const CharString& str, size_t unicodePos, char) //utf8-char
307 {
308     using CharType = typename GetCharType<CharString>::Type;
309 
310     const CharType* strFirst = strBegin(str);
311     const size_t strLen = strLength(str);
312 
313     size_t utfPos = 0;
314     while (unicodePos-- > 0)
315     {
316         if (utfPos >= strLen)
317             return strLen;
318 
319         size_t utf8len = getUtf8Len(strFirst[utfPos]);
320         if (utf8len == 0) ++utf8len; //invalid utf8 character
321         utfPos += utf8len;
322     }
323     if (utfPos >= strLen)
324         return strLen;
325     return utfPos;
326 }
327 
328 
329 template <class WideString> inline
findUnicodePosWide(const WideString & str,size_t unicodePos,Int2Type<2>)330 size_t findUnicodePosWide(const WideString& str, size_t unicodePos, Int2Type<2>) //windows: utf16-wchar_t
331 {
332     using CharType = typename GetCharType<WideString>::Type;
333 
334     const CharType* strFirst = strBegin(str);
335     const size_t strLen = strLength(str);
336 
337     size_t utfPos = 0;
338     while (unicodePos-- > 0)
339     {
340         if (utfPos >= strLen)
341             return strLen;
342 
343         size_t utf16len = getUtf16Len(strFirst[utfPos]);
344         if (utf16len == 0) ++utf16len; //invalid utf16 character
345         utfPos += utf16len;
346     }
347     if (utfPos >= strLen)
348         return strLen;
349     return utfPos;
350 }
351 
352 
353 template <class WideString> inline
findUnicodePosWide(const WideString & str,size_t unicodePos,Int2Type<4>)354 size_t findUnicodePosWide(const WideString& str, size_t unicodePos, Int2Type<4>) //other OS: utf32-wchar_t
355 {
356     return std::min(strLength(str), unicodePos);
357 }
358 
359 
360 template <class UtfString> inline
findUnicodePos(const UtfString & str,size_t unicodePos,wchar_t)361 size_t findUnicodePos(const UtfString& str, size_t unicodePos, wchar_t)
362 {
363     return findUnicodePosWide(str, unicodePos, Int2Type<sizeof(wchar_t)>());
364 }
365 }
366 
367 
368 template <class UtfString> inline
findUnicodePos(const UtfString & str,size_t unicodePos)369 size_t findUnicodePos(const UtfString& str, size_t unicodePos) //return position of unicode char in UTF-encoded string
370 {
371     return implementation::findUnicodePos(str, unicodePos, typename GetCharType<UtfString>::Type());
372 }
373 
374 //-------------------------------------------------------------------------------------------
375 
376 namespace implementation
377 {
378 template <class WideString, class CharString> inline
utf8ToWide(const CharString & str,Int2Type<2>)379 WideString utf8ToWide(const CharString& str, Int2Type<2>) //windows: convert utf8 to utf16-wchar_t
380 {
381     WideString output;
382     utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str),
383     [&](CodePoint cp) { codePointToUtf16(cp, [&](Char16 c) { output += static_cast<wchar_t>(c); }); });
384     return output;
385 }
386 
387 
388 template <class WideString, class CharString> inline
utf8ToWide(const CharString & str,Int2Type<4>)389 WideString utf8ToWide(const CharString& str, Int2Type<4>) //other OS: convert utf8 to utf32-wchar_t
390 {
391     WideString output;
392     utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str),
393     [&](CodePoint cp) { output += static_cast<wchar_t>(cp); });
394     return output;
395 }
396 
397 
398 template <class CharString, class WideString> inline
wideToUtf8(const WideString & str,Int2Type<2>)399 CharString wideToUtf8(const WideString& str, Int2Type<2>) //windows: convert utf16-wchar_t to utf8
400 {
401     CharString output;
402     utf16ToCodePoint(strBegin(str), strBegin(str) + strLength(str),
403     [&](CodePoint cp) { codePointToUtf8(cp, [&](char c) { output += c; }); });
404     return output;
405 }
406 
407 
408 template <class CharString, class WideString> inline
wideToUtf8(const WideString & str,Int2Type<4>)409 CharString wideToUtf8(const WideString& str, Int2Type<4>) //other OS: convert utf32-wchar_t to utf8
410 {
411     CharString output;
412     std::for_each(strBegin(str), strBegin(str) + strLength(str),
413     [&](CodePoint cp) { codePointToUtf8(cp, [&](char c) { output += c; }); });
414     return output;
415 }
416 }
417 
418 
419 template <class CharString> inline
isValidUtf8(const CharString & str)420 bool isValidUtf8(const CharString& str)
421 {
422     using namespace implementation;
423     bool valid = true;
424     utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str),
425                     [&](CodePoint cp)
426     {
427         if (cp == REPLACEMENT_CHAR)
428             valid = false; //perf: should we use an (expensive) exception for iteration break?
429     });
430     return valid;
431 }
432 
433 
434 template <class WideString, class CharString> inline
utf8ToWide(const CharString & str)435 WideString utf8ToWide(const CharString& str)
436 {
437     static_assert(IsSameType<typename GetCharType<CharString>::Type, char   >::value, "");
438     static_assert(IsSameType<typename GetCharType<WideString>::Type, wchar_t>::value, "");
439 
440     return implementation::utf8ToWide<WideString>(str, Int2Type<sizeof(wchar_t)>());
441 }
442 
443 
444 template <class CharString, class WideString> inline
wideToUtf8(const WideString & str)445 CharString wideToUtf8(const WideString& str)
446 {
447     static_assert(IsSameType<typename GetCharType<CharString>::Type, char   >::value, "");
448     static_assert(IsSameType<typename GetCharType<WideString>::Type, wchar_t>::value, "");
449 
450     return implementation::wideToUtf8<CharString>(str, Int2Type<sizeof(wchar_t)>());
451 }
452 
453 //-------------------------------------------------------------------------------------------
454 
455 template <class TargetString, class SourceString> inline
utfCvrtTo(const SourceString & str,char,wchar_t)456 TargetString utfCvrtTo(const SourceString& str, char, wchar_t) { return utf8ToWide<TargetString>(str); }
457 
458 template <class TargetString, class SourceString> inline
utfCvrtTo(const SourceString & str,wchar_t,char)459 TargetString utfCvrtTo(const SourceString& str, wchar_t, char) { return wideToUtf8<TargetString>(str); }
460 
461 template <class TargetString, class SourceString> inline
utfCvrtTo(const SourceString & str,char,char)462 TargetString utfCvrtTo(const SourceString& str, char, char) { return copyStringTo<TargetString>(str); }
463 
464 template <class TargetString, class SourceString> inline
utfCvrtTo(const SourceString & str,wchar_t,wchar_t)465 TargetString utfCvrtTo(const SourceString& str, wchar_t, wchar_t) { return copyStringTo<TargetString>(str); }
466 
467 template <class TargetString, class SourceString> inline
utfCvrtTo(const SourceString & str)468 TargetString utfCvrtTo(const SourceString& str)
469 {
470     return utfCvrtTo<TargetString>(str,
471                                    typename GetCharType<SourceString>::Type(),
472                                    typename GetCharType<TargetString>::Type());
473 }
474 }
475 
476 #endif //UTF_H_01832479146991573473545
477