// Scintilla source code edit control
/** @file UniConversion.h
 ** Functions to handle UTF-8 and UTF-16 strings.
 **/
// Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
// The License.txt file describes the conditions under which this software may be distributed.
7 
8 #ifndef UNICONVERSION_H
9 #define UNICONVERSION_H
10 
11 namespace Scintilla {
12 
// Largest number of bytes in the UTF-8 encoding of a single character.
constexpr int UTF8MaxBytes = 4;

// U+FFFD REPLACEMENT CHARACTER.
constexpr int unicodeReplacementChar = 0xFFFD;
16 
// Conversion and validation routines. Only the declarations are visible here, so
// the notes below are inferred from names and signatures -- confirm them against
// the definitions.
// No #include is visible in this header: size_t, std::string and std::wstring
// must already be declared by the including file.

// UTF-16 (held in wchar_t) to UTF-8.
// Presumably the number of UTF-8 bytes needed to encode the tlen elements at uptr.
size_t UTF8Length(const wchar_t *uptr, size_t tlen) noexcept;
// Presumably encodes tlen elements from uptr as UTF-8 into putf, a buffer of len bytes.
void UTF8FromUTF16(const wchar_t *uptr, size_t tlen, char *putf, size_t len);
// Presumably encodes the single character uch as UTF-8 into putf. No buffer length
// is passed -- TODO confirm how much room putf must provide.
void UTF8FromUTF32Character(int uch, char *putf) noexcept;

// UTF-8 to UTF-16.
// Presumably the number of UTF-16 code units needed for the len bytes at s.
size_t UTF16Length(const char *s, size_t len) noexcept;
// Presumably converts len bytes at s into tbuf (capacity tlen elements) and returns
// the number of elements produced -- TODO confirm behaviour when tbuf is too small.
size_t UTF16FromUTF8(const char *s, size_t len, wchar_t *tbuf, size_t tlen);

// UTF-8 to UTF-32.
// Presumably the number of UTF-32 values needed for the len bytes at s.
size_t UTF32Length(const char *s, size_t len) noexcept;
// Presumably converts len bytes at s into tbuf (capacity tlen elements) and returns
// the number of elements produced -- TODO confirm behaviour when tbuf is too small.
size_t UTF32FromUTF8(const char *s, size_t len, unsigned int *tbuf, size_t tlen);

// WStringFromUTF8 does the right thing when wchar_t is 2 or 4 bytes so
// works on both Windows and Unix.
std::wstring WStringFromUTF8(const char *s, size_t len);

// Presumably writes val to tbuf as UTF-16 and returns the number of elements
// written -- TODO confirm.
unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) noexcept;

// Presumably true when the len bytes at s form valid UTF-8.
bool UTF8IsValid(const char *s, size_t len) noexcept;
// Returns a new string derived from text; presumably invalid UTF-8 is repaired in
// the copy -- TODO confirm what the invalid bytes are replaced with.
std::string FixInvalidUTF8(const std::string &text);
30 
// Lookup table indexed by the first byte of a UTF-8 sequence. It is only declared
// here; its contents are defined elsewhere. UnicodeFromUTF8 treats the entry as
// the number of bytes in the sequence.
extern const unsigned char UTF8BytesOfLead[256];

// Decode the UTF-8 sequence that starts at us into a single character value.
// The sequence length is taken from UTF8BytesOfLead[us[0]]; any entry other than
// 1, 2 or 3 is decoded as a four byte sequence.
// Trail bytes are masked but not validated, and no length is passed in, so the
// caller must guarantee that every byte of the sequence is readable.
inline int UnicodeFromUTF8(const unsigned char *us) noexcept {
	switch (UTF8BytesOfLead[us[0]]) {
	case 1:
		// Single byte: returned unchanged.
		return us[0];
	case 2:
		// 5 payload bits from the lead byte, 6 from the trail byte.
		return ((us[0] & 0x1F) << 6) + (us[1] & 0x3F);
	case 3:
		// 4 payload bits from the lead byte, 6 from each trail byte.
		return ((us[0] & 0xF) << 12) + ((us[1] & 0x3F) << 6) + (us[2] & 0x3F);
	default:
		// 3 payload bits from the lead byte, 6 from each of three trail bytes.
		return ((us[0] & 0x7) << 18) + ((us[1] & 0x3F) << 12) + ((us[2] & 0x3F) << 6) + (us[3] & 0x3F);
	}
}
45 
// True when ch lies in 0x80..0xBF, the range used by UTF-8 trail (continuation) bytes.
inline constexpr bool UTF8IsTrailByte(unsigned char ch) noexcept {
	return (ch >= 0x80) && (ch < 0xc0);
}
49 
// True when ch is an ASCII value, that is 0..0x7F.
// The lower bound matters because ch is an int: a plain char holding a byte of
// 0x80 or above converts to a negative int where char is signed, and such a
// byte is not ASCII.
inline constexpr bool UTF8IsAscii(int ch) noexcept {
	return (ch >= 0) && (ch < 0x80);
}
53 
// Masks for the value returned by UTF8Classify. The note on UTF8DrawBytes says
// that UTF8Classify reports a length and an invalid flag; presumably
// (result & UTF8MaskWidth) is the length in bytes and (result & UTF8MaskInvalid)
// is set for an invalid sequence -- confirm against the definition.
enum { UTF8MaskWidth=0x7, UTF8MaskInvalid=0x8 };
int UTF8Classify(const unsigned char *us, size_t len) noexcept;
56 
// Similar to UTF8Classify but returns a length of 1 for invalid bytes
// instead of setting the invalid flag.
// Note that len is an int here, whereas UTF8Classify takes a size_t.
int UTF8DrawBytes(const unsigned char *us, int len) noexcept;
60 
// Line separator is U+2028 \xe2\x80\xa8
// Paragraph separator is U+2029 \xe2\x80\xa9
// Both are encoded in UTF8SeparatorLength bytes.
constexpr int UTF8SeparatorLength = 3;

// True when the bytes at us encode U+2028 or U+2029.
// The comparisons short-circuit, so us[1] and us[2] are only read when the bytes
// before them match; the caller must make sure those bytes are readable in that case.
inline constexpr bool UTF8IsSeparator(const unsigned char *us) noexcept {
	return (us[0] == 0xe2) && (us[1] == 0x80) && ((us[2] == 0xa8) || (us[2] == 0xa9));
}
67 
// NEL is U+0085 \xc2\x85
// It is encoded in UTF8NELLength bytes.
constexpr int UTF8NELLength = 2;

// True when the bytes at us encode U+0085.
// us[1] is only read when us[0] is 0xc2; the caller must make sure it is readable then.
inline constexpr bool UTF8IsNEL(const unsigned char *us) noexcept {
	return (us[0] == 0xc2) && (us[1] == 0x85);
}
73 
// Boundaries of the UTF-16 lead and trail surrogate ranges, and the first
// code point that lies beyond the Basic Multilingual Plane.
enum { SURROGATE_LEAD_FIRST = 0xD800 };
enum { SURROGATE_LEAD_LAST = 0xDBFF };
enum { SURROGATE_TRAIL_FIRST = 0xDC00 };
enum { SURROGATE_TRAIL_LAST = 0xDFFF };
enum { SUPPLEMENTAL_PLANE_FIRST = 0x10000 };

// Number of UTF-16 code units in the character that starts with uch:
// 2 when uch is a lead surrogate, otherwise 1.
// A trail surrogate on its own is counted as 1.
inline constexpr unsigned int UTF16CharLength(wchar_t uch) noexcept {
	return ((uch >= SURROGATE_LEAD_FIRST) && (uch <= SURROGATE_LEAD_LAST)) ? 2 : 1;
}
83 
// Number of UTF-16 code units for a character whose UTF-8 encoding occupies
// byteCount bytes: 1 when byteCount is below 4, otherwise 2.
// byteCount is not range checked, so 0 yields 1 and anything above 4 yields 2.
inline constexpr unsigned int UTF16LengthFromUTF8ByteCount(unsigned int byteCount) noexcept {
	return (byteCount < 4) ? 1 : 2;
}
87 
88 }
89 
90 #endif
91