1 /*
2  *	Copyright 2017, University Corporation for Atmospheric Research
3  *      See netcdf/COPYRIGHT file for copying and redistribution conditions.
4  */
5 
6 #include "config.h"
7 #ifdef HAVE_STDLIB_H
8 #include <stdlib.h>
9 #endif
10 #ifdef HAVE_STRING_H
11 #include <string.h>
12 #endif
13 #include "netcdf.h"
14 #include "ncutf8.h"
15 #include "utf8proc.h"
16 
17 /* Provide a wrapper around whatever utf8 library we use. */
18 
19 /*
20  * Check validity of a UTF8 encoded null-terminated byte string.
21  * Return codes:
22  * NC_NOERR -- string is valid utf8
23  * NC_ENOMEM -- out of memory
24  * NC_EINVAL -- invalid argument or internal error
25  * NC_EBADNAME-- not valid utf8
26  */
27 
nc_utf8_validate(const unsigned char * name)28 int nc_utf8_validate(const unsigned char* name)
29 {
30     int ncstat = NC_NOERR;
31     const nc_utf8proc_uint8_t *str;
32     nc_utf8proc_ssize_t nchars = -1;
33     nc_utf8proc_int32_t codepoint;
34     nc_utf8proc_ssize_t count;
35 
36     str = (const nc_utf8proc_uint8_t*)name;
37     while(*str) {
38         count = nc_utf8proc_iterate(str,nchars,&codepoint);
39 	if(count < 0) {
40 	    switch (count) {
41 	    case UTF8PROC_ERROR_NOMEM:
42 	    case UTF8PROC_ERROR_OVERFLOW:
43 		ncstat = NC_ENOMEM;
44 		break;
45 	    case UTF8PROC_ERROR_INVALIDOPTS:
46 		ncstat = NC_EINVAL;
47 		break;
48 	    case UTF8PROC_ERROR_INVALIDUTF8:
49 	    case UTF8PROC_ERROR_NOTASSIGNED:
50 	    default:
51 		ncstat = NC_EBADNAME;
52 		break;
53 	    }
54 	    goto done;
55 	} else { /* move to next char */
56 	    str += count;
57 	}
58     }
59 done:
60     return ncstat;
61 }
62 
63 /*
64  * Returns a pointer to newly allocated memory of a
65  * normalized version of the null-terminated string 'str'.
66  * Normalized string is returned in normalp argument;
67  * caller must free.
68  * Return codes:
69  * NC_NOERR -- success
70  * NC_ENOMEM -- out of memory
71  * NC_EINVAL -- illegal argument or internal error
72  * NC_EBADNAME -- other failure
73  */
74 int
nc_utf8_normalize(const unsigned char * utf8,unsigned char ** normalp)75 nc_utf8_normalize(const unsigned char* utf8, unsigned char** normalp)
76 {
77     int ncstat = NC_NOERR;
78     const nc_utf8proc_uint8_t* str = (const nc_utf8proc_uint8_t*)utf8;
79     nc_utf8proc_uint8_t* retval = NULL;
80     nc_utf8proc_ssize_t count;
81     count = nc_utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_COMPOSE);
82     if(count < 0) {/* error */
83 	switch (count) {
84 	case UTF8PROC_ERROR_NOMEM:
85 	case UTF8PROC_ERROR_OVERFLOW:
86 	ncstat = NC_ENOMEM;
87 	    break;
88 	case UTF8PROC_ERROR_INVALIDOPTS:
89 	    ncstat = NC_EINVAL;
90 	    break;
91 	case UTF8PROC_ERROR_INVALIDUTF8:
92 	case UTF8PROC_ERROR_NOTASSIGNED:
93 	default:
94 	    ncstat = NC_EBADNAME;
95 	    break;
96 	}
97 	goto done;
98     } else
99 	if(normalp) *normalp = (unsigned char*)retval;
100 done:
101     return ncstat;
102 }
103 
104 /*
105  * Convert a normalized utf8 string to utf16. This is approximate
106  * because it just does the truncation version of conversion for
107  * each 32-bit codepoint to get the corresponding utf16.
108  * Return codes:
109  * NC_NOERR -- success
110  * NC_ENOMEM -- out of memory
111  * NC_EINVAL -- invalid argument or internal error
112  * NC_EBADNAME-- not valid utf16
113  */
114 
nc_utf8_to_utf16(const unsigned char * s8,unsigned short ** utf16p,size_t * len16p)115 int nc_utf8_to_utf16(const unsigned char* s8, unsigned short** utf16p, size_t* len16p)
116 {
117     int ncstat = NC_NOERR;
118     const nc_utf8proc_uint8_t *str;
119     nc_utf8proc_ssize_t nchars = -1;
120     nc_utf8proc_int32_t codepoint;
121     nc_utf8proc_ssize_t count;
122     size_t len8, len16;
123     unsigned short* utf16;
124     unsigned short* p16;
125 
126     len8 = strlen((char*)s8);
127     utf16 = (unsigned short*)malloc(sizeof(unsigned short)*(len8+1));
128     if(utf16 == NULL) {
129       ncstat = NC_ENOMEM;
130       goto done;
131     }
132     str = (const nc_utf8proc_uint8_t*)s8;
133     /* Walk the string and convert each codepoint */
134     p16 = utf16;
135     len16 = 0;
136     while(*str) {
137       count = nc_utf8proc_iterate(str,nchars,&codepoint);
138       if(count < 0) {
139 	    switch (count) {
140 	    case UTF8PROC_ERROR_NOMEM:
141 	    case UTF8PROC_ERROR_OVERFLOW:
142           ncstat = NC_ENOMEM;
143           break;
144 	    case UTF8PROC_ERROR_INVALIDOPTS:
145           ncstat = NC_EINVAL;
146           break;
147 	    case UTF8PROC_ERROR_INVALIDUTF8:
148 	    case UTF8PROC_ERROR_NOTASSIGNED:
149 	    default:
150           ncstat = NC_EBADNAME;
151           break;
152 	    }
153 	    goto done;
154       } else { /* move to next char */
155 	    /* Complain if top 16 bits not zero */
156 	    if((codepoint & 0xFFFF0000) != 0) {
157 	          ncstat = NC_EBADNAME;
158 	          goto done;
159 	    }
160 	    /* Truncate codepoint to 16 bits and store */
161 	    *p16++ = (unsigned short)(codepoint & 0x0000FFFF);
162 	    str += count;
163 	    len16++;
164       }
165     }
166     *p16++ = (unsigned short)0;
167     if(utf16p)
168       *utf16p = utf16;
169     else
170       free(utf16);
171 
172     if(len16p) *len16p = len16;
173  done:
174     if(ncstat) free(utf16);
175     return ncstat;
176 }
177