xref: /reactos/sdk/lib/rtl/utf8.c (revision 1734f297)
1 /*
2  * PROJECT:     ReactOS Kernel - Vista+ APIs
3  * LICENSE:     GPL-2.0-or-later (https://spdx.org/licenses/GPL-2.0-or-later)
4  * PURPOSE:     Rtl functions of Vista+
5  * COPYRIGHT:   2016 Thomas Faber <thomas.faber@reactos.org>
6  */
7 
8 /* INCLUDES ******************************************************************/
9 
10 #include <rtl_vista.h>
11 
12 #define NDEBUG
13 #include <debug.h>
14 
15 /* FUNCTIONS *****************************************************************/
16 
17 /******************************************************************************
18  * RtlUnicodeToUTF8N [NTDLL.@]
19  */
20 NTSTATUS NTAPI RtlUnicodeToUTF8N(CHAR *utf8_dest, ULONG utf8_bytes_max,
21                                  ULONG *utf8_bytes_written,
22                                  const WCHAR *uni_src, ULONG uni_bytes)
23 {
24     NTSTATUS status;
25     ULONG i;
26     ULONG written;
27     ULONG ch;
28     BYTE utf8_ch[4];
29     ULONG utf8_ch_len;
30 
31     if (!uni_src)
32         return STATUS_INVALID_PARAMETER_4;
33     if (!utf8_bytes_written)
34         return STATUS_INVALID_PARAMETER;
35     if (utf8_dest && uni_bytes % sizeof(WCHAR))
36         return STATUS_INVALID_PARAMETER_5;
37 
38     written = 0;
39     status = STATUS_SUCCESS;
40 
41     for (i = 0; i < uni_bytes / sizeof(WCHAR); i++)
42     {
43         /* decode UTF-16 into ch */
44         ch = uni_src[i];
45         if (ch >= 0xdc00 && ch <= 0xdfff)
46         {
47             ch = 0xfffd;
48             status = STATUS_SOME_NOT_MAPPED;
49         }
50         else if (ch >= 0xd800 && ch <= 0xdbff)
51         {
52             if (i + 1 < uni_bytes / sizeof(WCHAR))
53             {
54                 ch -= 0xd800;
55                 ch <<= 10;
56                 if (uni_src[i + 1] >= 0xdc00 && uni_src[i + 1] <= 0xdfff)
57                 {
58                     ch |= uni_src[i + 1] - 0xdc00;
59                     ch += 0x010000;
60                     i++;
61                 }
62                 else
63                 {
64                     ch = 0xfffd;
65                     status = STATUS_SOME_NOT_MAPPED;
66                 }
67             }
68             else
69             {
70                 ch = 0xfffd;
71                 status = STATUS_SOME_NOT_MAPPED;
72             }
73         }
74 
75         /* encode ch as UTF-8 */
76         ASSERT(ch <= 0x10ffff);
77         if (ch < 0x80)
78         {
79             utf8_ch[0] = ch & 0x7f;
80             utf8_ch_len = 1;
81         }
82         else if (ch < 0x800)
83         {
84             utf8_ch[0] = 0xc0 | (ch >>  6 & 0x1f);
85             utf8_ch[1] = 0x80 | (ch >>  0 & 0x3f);
86             utf8_ch_len = 2;
87         }
88         else if (ch < 0x10000)
89         {
90             utf8_ch[0] = 0xe0 | (ch >> 12 & 0x0f);
91             utf8_ch[1] = 0x80 | (ch >>  6 & 0x3f);
92             utf8_ch[2] = 0x80 | (ch >>  0 & 0x3f);
93             utf8_ch_len = 3;
94         }
95         else if (ch < 0x200000)
96         {
97             utf8_ch[0] = 0xf0 | (ch >> 18 & 0x07);
98             utf8_ch[1] = 0x80 | (ch >> 12 & 0x3f);
99             utf8_ch[2] = 0x80 | (ch >>  6 & 0x3f);
100             utf8_ch[3] = 0x80 | (ch >>  0 & 0x3f);
101             utf8_ch_len = 4;
102         }
103 
104         if (!utf8_dest)
105         {
106             written += utf8_ch_len;
107             continue;
108         }
109 
110         if (utf8_bytes_max >= utf8_ch_len)
111         {
112             memcpy(utf8_dest, utf8_ch, utf8_ch_len);
113             utf8_dest += utf8_ch_len;
114             utf8_bytes_max -= utf8_ch_len;
115             written += utf8_ch_len;
116         }
117         else
118         {
119             utf8_bytes_max = 0;
120             status = STATUS_BUFFER_TOO_SMALL;
121         }
122     }
123 
124     *utf8_bytes_written = written;
125     return status;
126 }
127 
128 
129 /******************************************************************************
130  * RtlUTF8ToUnicodeN [NTDLL.@]
131  */
132 NTSTATUS NTAPI RtlUTF8ToUnicodeN(WCHAR *uni_dest, ULONG uni_bytes_max,
133                                  ULONG *uni_bytes_written,
134                                  const CHAR *utf8_src, ULONG utf8_bytes)
135 {
136     NTSTATUS status;
137     ULONG i, j;
138     ULONG written;
139     ULONG ch;
140     ULONG utf8_trail_bytes;
141     WCHAR utf16_ch[3];
142     ULONG utf16_ch_len;
143 
144     if (!utf8_src)
145         return STATUS_INVALID_PARAMETER_4;
146     if (!uni_bytes_written)
147         return STATUS_INVALID_PARAMETER;
148 
149     written = 0;
150     status = STATUS_SUCCESS;
151 
152     for (i = 0; i < utf8_bytes; i++)
153     {
154         /* read UTF-8 lead byte */
155         ch = (BYTE)utf8_src[i];
156         utf8_trail_bytes = 0;
157         if (ch >= 0xf5)
158         {
159             ch = 0xfffd;
160             status = STATUS_SOME_NOT_MAPPED;
161         }
162         else if (ch >= 0xf0)
163         {
164             ch &= 0x07;
165             utf8_trail_bytes = 3;
166         }
167         else if (ch >= 0xe0)
168         {
169             ch &= 0x0f;
170             utf8_trail_bytes = 2;
171         }
172         else if (ch >= 0xc2)
173         {
174             ch &= 0x1f;
175             utf8_trail_bytes = 1;
176         }
177         else if (ch >= 0x80)
178         {
179             /* overlong or trail byte */
180             ch = 0xfffd;
181             status = STATUS_SOME_NOT_MAPPED;
182         }
183 
184         /* read UTF-8 trail bytes */
185         if (i + utf8_trail_bytes < utf8_bytes)
186         {
187             for (j = 0; j < utf8_trail_bytes; j++)
188             {
189                 if ((utf8_src[i + 1] & 0xc0) == 0x80)
190                 {
191                     ch <<= 6;
192                     ch |= utf8_src[i + 1] & 0x3f;
193                     i++;
194                 }
195                 else
196                 {
197                     ch = 0xfffd;
198                     utf8_trail_bytes = 0;
199                     status = STATUS_SOME_NOT_MAPPED;
200                     break;
201                 }
202             }
203         }
204         else
205         {
206             ch = 0xfffd;
207             utf8_trail_bytes = 0;
208             status = STATUS_SOME_NOT_MAPPED;
209             i = utf8_bytes;
210         }
211 
212         /* encode ch as UTF-16 */
213         if ((ch > 0x10ffff) ||
214             (ch >= 0xd800 && ch <= 0xdfff) ||
215             (utf8_trail_bytes == 2 && ch < 0x00800) ||
216             (utf8_trail_bytes == 3 && ch < 0x10000))
217         {
218             /* invalid codepoint or overlong encoding */
219             utf16_ch[0] = 0xfffd;
220             utf16_ch[1] = 0xfffd;
221             utf16_ch[2] = 0xfffd;
222             utf16_ch_len = utf8_trail_bytes;
223             status = STATUS_SOME_NOT_MAPPED;
224         }
225         else if (ch >= 0x10000)
226         {
227             /* surrogate pair */
228             ch -= 0x010000;
229             utf16_ch[0] = 0xd800 + (ch >> 10 & 0x3ff);
230             utf16_ch[1] = 0xdc00 + (ch >>  0 & 0x3ff);
231             utf16_ch_len = 2;
232         }
233         else
234         {
235             /* single unit */
236             utf16_ch[0] = ch;
237             utf16_ch_len = 1;
238         }
239 
240         if (!uni_dest)
241         {
242             written += utf16_ch_len;
243             continue;
244         }
245 
246         for (j = 0; j < utf16_ch_len; j++)
247         {
248             if (uni_bytes_max >= sizeof(WCHAR))
249             {
250                 *uni_dest++ = utf16_ch[j];
251                 uni_bytes_max -= sizeof(WCHAR);
252                 written++;
253             }
254             else
255             {
256                 uni_bytes_max = 0;
257                 status = STATUS_BUFFER_TOO_SMALL;
258             }
259         }
260     }
261 
262     *uni_bytes_written = written * sizeof(WCHAR);
263     return status;
264 }
265