1 /*
2  * Copyright (c) 1998, 2016, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.  Oracle designates this
8  * particular file as subject to the "Classpath" exception as provided
9  * by Oracle in the LICENSE file that accompanied this code.
10  *
11  * This code is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14  * version 2 for more details (a copy is included in the LICENSE file that
15  * accompanied this code).
16  *
17  * You should have received a copy of the GNU General Public License version
18  * 2 along with this work; if not, write to the Free Software Foundation,
19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20  *
21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22  * or visit www.oracle.com if you need additional information or have any
23  * questions.
24  */
25 
26 #include <stdlib.h>
27 #include <ctype.h>
28 
29 #include "jni.h"
30 
31 #include "utf_util.h"
32 
33 
/*
 * Diagnostic macros.
 *   UTF_ERROR(m)   passes message m to utfError() together with the file
 *                  name and line number of the place the macro is used.
 *   UTF_ASSERT(x)  does nothing when x is non-zero; otherwise raises a
 *                  UTF_ERROR whose message is "ASSERT ERROR " followed by
 *                  the text of the expression x.
 */
#define UTF_ERROR(m) \
    utfError(__FILE__, __LINE__,  m)
#define UTF_ASSERT(x) \
    ( (x) ? (void)0 : UTF_ERROR("ASSERT ERROR " #x) )
37 
// Platform independent part
39 
/*
 * Report a fatal UTF conversion error and terminate the process.
 *   file, line   source location of the failure (UTF_ERROR supplies
 *                __FILE__ and __LINE__).
 *   message      text describing the failure.
 * Writes a single line to stderr and then calls abort(), so this function
 * does not return. The string parameters are const-qualified because the
 * callers pass string literals.
 */
static void utfError(const char *file, int line, const char *message) {
    (void)fprintf(stderr, "UTF ERROR [\"%s\":%d]: %s\n", file, line, message);
    abort();
}
44 
45 /* Determine length of this Standard UTF-8 in Modified UTF-8.
46  *    Validation is done of the basic UTF encoding rules, returns
47  *    length (no change) when errors are detected in the UTF encoding.
48  *
49  *    Note: Accepts Modified UTF-8 also, no verification on the
50  *          correctness of Standard UTF-8 is done. e,g, 0xC080 input is ok.
51  */
utf8sToUtf8mLength(jbyte * string,int length)52 int JNICALL utf8sToUtf8mLength(jbyte *string, int length) {
53   int newLength;
54   int i;
55 
56   newLength = 0;
57   for ( i = 0 ; i < length ; i++ ) {
58     unsigned byte;
59 
60     byte = (unsigned char)string[i];
61     if ( (byte & 0x80) == 0 ) { /* 1byte encoding */
62       newLength++;
63       if ( byte == 0 ) {
64         newLength++; /* We gain one byte in length on NULL bytes */
65       }
66     } else if ( (byte & 0xE0) == 0xC0 ) { /* 2byte encoding */
67       /* Check encoding of following bytes */
68       if ( (i+1) >= length || (string[i+1] & 0xC0) != 0x80 ) {
69         break; /* Error condition */
70       }
71       i++; /* Skip next byte */
72       newLength += 2;
73     } else if ( (byte & 0xF0) == 0xE0 ) { /* 3byte encoding */
74       /* Check encoding of following bytes */
75       if ( (i+2) >= length || (string[i+1] & 0xC0) != 0x80
76         || (string[i+2] & 0xC0) != 0x80 ) {
77         break; /* Error condition */
78         }
79         i += 2; /* Skip next two bytes */
80         newLength += 3;
81     } else if ( (byte & 0xF8) == 0xF0 ) { /* 4byte encoding */
82       /* Check encoding of following bytes */
83       if ( (i+3) >= length || (string[i+1] & 0xC0) != 0x80
84         || (string[i+2] & 0xC0) != 0x80
85         || (string[i+3] & 0xC0) != 0x80 ) {
86         break; /* Error condition */
87         }
88         i += 3; /* Skip next 3 bytes */
89         newLength += 6; /* 4byte encoding turns into 2 3byte ones */
90     } else {
91       break; /* Error condition */
92     }
93   }
94   if ( i != length ) {
95     /* Error in finding new length, return old length so no conversion */
96     /* FIXUP: ERROR_MESSAGE? */
97     return length;
98   }
99   return newLength;
100 }
101 
102 /* Convert Standard UTF-8 to Modified UTF-8.
103  *    Assumes the UTF-8 encoding was validated by utf8mLength() above.
104  *
105  *    Note: Accepts Modified UTF-8 also, no verification on the
106  *          correctness of Standard UTF-8 is done. e,g, 0xC080 input is ok.
107  */
utf8sToUtf8m(jbyte * string,int length,jbyte * newString,int newLength)108 void JNICALL utf8sToUtf8m(jbyte *string, int length, jbyte *newString, int newLength) {
109     int i;
110     int j;
111 
112     j = 0;
113     for ( i = 0 ; i < length ; i++ ) {
114         unsigned byte1;
115 
116         byte1 = (unsigned char)string[i];
117 
118         /* NULL bytes and bytes starting with 11110xxx are special */
119         if ( (byte1 & 0x80) == 0 ) { /* 1byte encoding */
120             if ( byte1 == 0 ) {
121                 /* Bits out: 11000000 10000000 */
122                 newString[j++] = (jbyte)0xC0;
123                 newString[j++] = (jbyte)0x80;
124             } else {
125                 /* Single byte */
126                 newString[j++] = byte1;
127             }
128         } else if ( (byte1 & 0xE0) == 0xC0 ) { /* 2byte encoding */
129             newString[j++] = byte1;
130             newString[j++] = string[++i];
131         } else if ( (byte1 & 0xF0) == 0xE0 ) { /* 3byte encoding */
132             newString[j++] = byte1;
133             newString[j++] = string[++i];
134             newString[j++] = string[++i];
135         } else if ( (byte1 & 0xF8) == 0xF0 ) { /* 4byte encoding */
136             /* Beginning of 4byte encoding, turn into 2 3byte encodings */
137             unsigned byte2, byte3, byte4, u21;
138 
139             /* Bits in: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
140             byte2 = (unsigned char)string[++i];
141             byte3 = (unsigned char)string[++i];
142             byte4 = (unsigned char)string[++i];
143             /* Reconstruct full 21bit value */
144             u21  = (byte1 & 0x07) << 18;
145             u21 += (byte2 & 0x3F) << 12;
146             u21 += (byte3 & 0x3F) << 6;
147             u21 += (byte4 & 0x3F);
148             /* Bits out: 11101101 1010xxxx 10xxxxxx */
149             newString[j++] = (jbyte)0xED;
150             newString[j++] = (jbyte)(0xA0 + (((u21 >> 16) - 1) & 0x0F));
151             newString[j++] = (jbyte)(0x80 + ((u21 >> 10) & 0x3F));
152             /* Bits out: 11101101 1011xxxx 10xxxxxx */
153             newString[j++] = (jbyte)0xED;
154             newString[j++] = (jbyte)(0xB0 + ((u21 >>  6) & 0x0F));
155             newString[j++] = byte4;
156         }
157     }
158     UTF_ASSERT(i==length);
159     UTF_ASSERT(j==newLength);
160     newString[j] = (jbyte)0;
161 }
162 
163 /* Given a Modified UTF-8 string, calculate the Standard UTF-8 length.
164  *   Basic validation of the UTF encoding rules is done, and length is
165  *   returned (no change) when errors are detected.
166  *
167  *   Note: No validation is made that this is indeed Modified UTF-8 coming in.
168  *
169  */
utf8mToUtf8sLength(jbyte * string,int length)170 int JNICALL utf8mToUtf8sLength(jbyte *string, int length) {
171     int newLength;
172     int i;
173 
174     newLength = 0;
175     for ( i = 0 ; i < length ; i++ ) {
176         unsigned byte1, byte2, byte3, byte4, byte5, byte6;
177 
178         byte1 = (unsigned char)string[i];
179         if ( (byte1 & 0x80) == 0 ) { /* 1byte encoding */
180             newLength++;
181         } else if ( (byte1 & 0xE0) == 0xC0 ) { /* 2byte encoding */
182             /* Check encoding of following bytes */
183             if ( (i+1) >= length || (string[i+1] & 0xC0) != 0x80 ) {
184                 break; /* Error condition */
185             }
186             byte2 = (unsigned char)string[++i];
187             if ( byte1 != 0xC0 || byte2 != 0x80 ) {
188                 newLength += 2; /* Normal 2byte encoding, not 0xC080 */
189             } else {
190                 newLength++;    /* We will turn 0xC080 into 0 */
191             }
192         } else if ( (byte1 & 0xF0) == 0xE0 ) { /* 3byte encoding */
193             /* Check encoding of following bytes */
194             if ( (i+2) >= length || (string[i+1] & 0xC0) != 0x80
195                                  || (string[i+2] & 0xC0) != 0x80 ) {
196                 break; /* Error condition */
197             }
198             byte2 = (unsigned char)string[++i];
199             byte3 = (unsigned char)string[++i];
200             newLength += 3;
201             /* Possible process a second 3byte encoding */
202             if ( (i+3) < length && byte1 == 0xED && (byte2 & 0xF0) == 0xA0 ) {
203                 /* See if this is a pair of 3byte encodings */
204                 byte4 = (unsigned char)string[i+1];
205                 byte5 = (unsigned char)string[i+2];
206                 byte6 = (unsigned char)string[i+3];
207                 if ( byte4 == 0xED && (byte5 & 0xF0) == 0xB0 ) {
208                     /* Check encoding of 3rd byte */
209                     if ( (byte6 & 0xC0) != 0x80 ) {
210                         break; /* Error condition */
211                     }
212                     newLength++; /* New string will have 4byte encoding */
213                     i += 3;       /* Skip next 3 bytes */
214                 }
215             }
216         } else {
217             break; /* Error condition */
218         }
219     }
220     if ( i != length ) {
221         /* Error in UTF encoding */
222         /*  FIXUP: ERROR_MESSAGE()? */
223         return length;
224     }
225     return newLength;
226 }
227 
228 /* Convert a Modified UTF-8 string into a Standard UTF-8 string
229  *   It is assumed that this string has been validated in terms of the
230  *   basic UTF encoding rules by utf8Length() above.
231  *
232  *   Note: No validation is made that this is indeed Modified UTF-8 coming in.
233  *
234  */
utf8mToUtf8s(jbyte * string,int length,jbyte * newString,int newLength)235 void JNICALL utf8mToUtf8s(jbyte *string, int length, jbyte *newString, int newLength) {
236     int i;
237     int j;
238 
239     j = 0;
240     for ( i = 0 ; i < length ; i++ ) {
241         unsigned byte1, byte2, byte3, byte4, byte5, byte6;
242 
243         byte1 = (unsigned char)string[i];
244         if ( (byte1 & 0x80) == 0 ) { /* 1byte encoding */
245             /* Single byte */
246             newString[j++] = byte1;
247         } else if ( (byte1 & 0xE0) == 0xC0 ) { /* 2byte encoding */
248             byte2 = (unsigned char)string[++i];
249             if ( byte1 != 0xC0 || byte2 != 0x80 ) {
250                 newString[j++] = byte1;
251                 newString[j++] = byte2;
252             } else {
253                 newString[j++] = 0;
254             }
255         } else if ( (byte1 & 0xF0) == 0xE0 ) { /* 3byte encoding */
256             byte2 = (unsigned char)string[++i];
257             byte3 = (unsigned char)string[++i];
258             if ( i+3 < length && byte1 == 0xED && (byte2 & 0xF0) == 0xA0 ) {
259                 /* See if this is a pair of 3byte encodings */
260                 byte4 = (unsigned char)string[i+1];
261                 byte5 = (unsigned char)string[i+2];
262                 byte6 = (unsigned char)string[i+3];
263                 if ( byte4 == 0xED && (byte5 & 0xF0) == 0xB0 ) {
264                     unsigned u21;
265 
266                     /* Bits in: 11101101 1010xxxx 10xxxxxx */
267                     /* Bits in: 11101101 1011xxxx 10xxxxxx */
268                     i += 3;
269 
270                     /* Reconstruct 21 bit code */
271                     u21  = ((byte2 & 0x0F) + 1) << 16;
272                     u21 += (byte3 & 0x3F) << 10;
273                     u21 += (byte5 & 0x0F) << 6;
274                     u21 += (byte6 & 0x3F);
275 
276                     /* Bits out: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
277 
278                     /* Convert to 4byte encoding */
279                     newString[j++] = 0xF0 + ((u21 >> 18) & 0x07);
280                     newString[j++] = 0x80 + ((u21 >> 12) & 0x3F);
281                     newString[j++] = 0x80 + ((u21 >>  6) & 0x3F);
282                     newString[j++] = 0x80 + (u21 & 0x3F);
283                     continue;
284                 }
285             }
286             /* Normal 3byte encoding */
287             newString[j++] = byte1;
288             newString[j++] = byte2;
289             newString[j++] = byte3;
290         }
291     }
292     UTF_ASSERT(i==length);
293     UTF_ASSERT(j==newLength);
294     newString[j] = 0;
295 }
296 
297 #ifdef _WIN32
298 // Microsoft Windows specific part
299 
300 #include <windows.h>
301 
getCodepage()302 static UINT getCodepage() {
303     LANGID langID;
304     LCID localeID;
305     TCHAR strCodePage[7];       // ANSI code page id
306 
307     static UINT intCodePage = -1;
308 
309     if (intCodePage == -1) {
310         // Firts call, get codepage from the os
311         langID = LANGIDFROMLCID(GetUserDefaultLCID());
312         localeID = MAKELCID(langID, SORT_DEFAULT);
313         if (GetLocaleInfo(localeID, LOCALE_IDEFAULTANSICODEPAGE,
314                          strCodePage, sizeof(strCodePage)/sizeof(TCHAR)) > 0 ) {
315             intCodePage = atoi(strCodePage);
316         }
317         else {
318             intCodePage = GetACP();
319         }
320     }
321 
322     return intCodePage;
323 }
324 
325 /*
326  * Get wide string  (assumes len>0)
327  */
getWideString(UINT codePage,char * str,int len,int * pwlen)328 static WCHAR* getWideString(UINT codePage, char* str, int len, int *pwlen) {
329     int wlen;
330     WCHAR* wstr;
331 
332     /* Convert the string to WIDE string */
333     wlen = MultiByteToWideChar(codePage, 0, str, len, NULL, 0);
334     *pwlen = wlen;
335     if (wlen <= 0) {
336         UTF_ERROR(("Can't get WIDE string length"));
337         return NULL;
338     }
339     wstr = (WCHAR*)malloc(wlen * sizeof(WCHAR));
340     if (wstr == NULL) {
341         UTF_ERROR(("Can't malloc() any space"));
342         return NULL;
343     }
344     if (MultiByteToWideChar(codePage, 0, str, len, wstr, wlen) == 0) {
345         UTF_ERROR(("Can't get WIDE string"));
346         return NULL;
347     }
348     return wstr;
349 }
350 
351 /*
352  * Convert UTF-8 to a platform string
353  * NOTE: outputBufSize includes the space for the trailing 0.
354  */
utf8ToPlatform(jbyte * utf8,int len,char * output,int outputBufSize)355 int JNICALL utf8ToPlatform(jbyte *utf8, int len, char* output, int outputBufSize) {
356     int wlen;
357     int plen;
358     WCHAR* wstr;
359     UINT codepage;
360     int outputMaxLen;
361 
362     UTF_ASSERT(utf8);
363     UTF_ASSERT(output);
364     UTF_ASSERT(len >= 0);
365     UTF_ASSERT(outputBufSize > len);
366     outputMaxLen = outputBufSize - 1; // leave space for trailing 0
367 
368     /* Zero length is ok, but we don't need to do much */
369     if ( len == 0 ) {
370         output[0] = 0;
371         return 0;
372     }
373 
374     /* Get WIDE string version (assumes len>0) */
375     wstr = getWideString(CP_UTF8, (char*)utf8, len, &wlen);
376     if ( wstr == NULL ) {
377         // Can't allocate WIDE string
378         goto just_copy_bytes;
379     }
380 
381     /* Convert WIDE string to MultiByte string */
382     codepage = getCodepage();
383     plen = WideCharToMultiByte(codepage, 0, wstr, wlen,
384                                output, outputMaxLen, NULL, NULL);
385     free(wstr);
386     if (plen <= 0) {
387         // Can't convert WIDE string to multi-byte
388         goto just_copy_bytes;
389     }
390     output[plen] = '\0';
391     return plen;
392 
393 just_copy_bytes:
394     (void)memcpy(output, utf8, len);
395     output[len] = 0;
396     return len;
397 }
398 
399 /*
400  * Convert Platform Encoding to UTF-8.
401  * NOTE: outputBufSize includes the space for the trailing 0.
402  */
utf8FromPlatform(char * str,int len,jbyte * output,int outputBufSize)403 int JNICALL utf8FromPlatform(char *str, int len, jbyte *output, int outputBufSize) {
404     int wlen;
405     int plen;
406     WCHAR* wstr;
407     UINT codepage;
408     int outputMaxLen;
409 
410     UTF_ASSERT(str);
411     UTF_ASSERT(output);
412     UTF_ASSERT(len >= 0);
413     UTF_ASSERT(outputBufSize > len);
414     outputMaxLen = outputBufSize - 1; // leave space for trailing 0
415 
416     /* Zero length is ok, but we don't need to do much */
417     if ( len == 0 ) {
418         output[0] = 0;
419         return 0;
420     }
421 
422     /* Get WIDE string version (assumes len>0) */
423     codepage = getCodepage();
424     wstr = getWideString(codepage, str, len, &wlen);
425     if ( wstr == NULL ) {
426         goto just_copy_bytes;
427     }
428 
429     /* Convert WIDE string to UTF-8 string */
430     plen = WideCharToMultiByte(CP_UTF8, 0, wstr, wlen,
431                                (char*)output, outputMaxLen, NULL, NULL);
432     free(wstr);
433     if (plen <= 0) {
434         UTF_ERROR(("Can't convert WIDE string to multi-byte"));
435         goto just_copy_bytes;
436     }
437     output[plen] = '\0';
438     return plen;
439 
440 just_copy_bytes:
441     (void)memcpy(output, str, len);
442     output[len] = 0;
443     return len;
444 }
445 
446 
447 #else
448 // *NIX specific part
449 
450 #include <iconv.h>
451 #include <locale.h>
452 #include <langinfo.h>
453 #include <string.h>
454 
/* Direction of a conversion requested from iconvConvert() */
typedef enum {
    TO_UTF8,    /* passed by utf8FromPlatform(): platform encoding to UTF-8 */
    FROM_UTF8   /* passed by utf8ToPlatform(): UTF-8 to platform encoding */
} conv_direction;
456 
457 /*
458  * Do iconv() conversion.
459  *    Returns length or -1 if output overflows.
460  * NOTE: outputBufSize includes the space for the trailing 0.
461  */
iconvConvert(conv_direction drn,char * bytes,size_t len,char * output,size_t outputBufSize)462 static int iconvConvert(conv_direction drn, char *bytes, size_t len, char *output, size_t outputBufSize) {
463 
464     static char *codeset = 0;
465     iconv_t func;
466     size_t bytes_converted;
467     size_t inLeft, outLeft;
468     char *inbuf, *outbuf;
469     int outputMaxLen;
470 
471     UTF_ASSERT(bytes);
472     UTF_ASSERT(output);
473     UTF_ASSERT(outputBufSize > len);
474     outputMaxLen = outputBufSize - 1; // leave space for trailing 0
475 
476     /* Zero length is ok, but we don't need to do much */
477     if ( len == 0 ) {
478         output[0] = 0;
479         return 0;
480     }
481 
482     if (codeset == NULL && codeset != (char *) -1) {
483         // locale is not initialized, do it now
484         if (setlocale(LC_ALL, "") != NULL) {
485             // nl_langinfo returns ANSI_X3.4-1968 by default
486             codeset = (char*)nl_langinfo(CODESET);
487         }
488 
489         if (codeset == NULL) {
490            // Not able to intialize process locale from platform one.
491            codeset = (char *) -1;
492         }
493     }
494 
495     if (codeset == (char *) -1) {
496       // There was an error during initialization, so just bail out
497       goto just_copy_bytes;
498     }
499 
500     func = (drn == TO_UTF8) ? iconv_open(codeset, "UTF-8") : iconv_open("UTF-8", codeset);
501     if (func == (iconv_t) -1) {
502         // Requested charset combination is not supported, conversion couldn't be done.
503         // make sure we will not try it again
504         codeset = (char *) -1;
505         goto just_copy_bytes;
506     }
507 
508     // perform conversion
509     inbuf = bytes;
510     outbuf = output;
511     inLeft = len;
512     outLeft = outputMaxLen;
513 
514     bytes_converted = iconv(func, (void*)&inbuf, &inLeft, &outbuf, &outLeft);
515     if (bytes_converted == (size_t) -1 || bytes_converted == 0 || inLeft != 0) {
516         // Input string is invalid, not able to convert entire string
517         // or some other iconv error happens.
518         iconv_close(func);
519         goto just_copy_bytes;
520     }
521 
522     iconv_close(func);
523     // Overwrite bytes_converted with value of actually stored bytes
524     bytes_converted = outputMaxLen-outLeft;
525     output[bytes_converted] = 0;
526     return bytes_converted;
527 
528 
529 just_copy_bytes:
530     (void)memcpy(output, bytes, len);
531     output[len] = 0;
532     return len;
533  }
534 
535 /*
536  * Convert UTF-8 to Platform Encoding.
537  *    Returns length or -1 if output overflows.
538  * NOTE: outputBufSize includes the space for the trailing 0.
539  */
utf8ToPlatform(jbyte * utf8,int len,char * output,int outputBufSize)540 int JNICALL utf8ToPlatform(jbyte *utf8, int len, char *output, int outputBufSize) {
541     return iconvConvert(FROM_UTF8, (char*)utf8, len, output, outputBufSize);
542 }
543 
544 /*
545  * Convert Platform Encoding to UTF-8.
546  *    Returns length or -1 if output overflows.
547  * NOTE: outputBufSize includes the space for the trailing 0.
548  */
utf8FromPlatform(char * str,int len,jbyte * output,int outputBufSize)549 int JNICALL utf8FromPlatform(char *str, int len, jbyte *output, int outputBufSize) {
550     return iconvConvert(TO_UTF8, str, len, (char*) output, outputBufSize);
551 }
552 
553 #endif
554