1 /*
2 * Copyright 2017, University Corporation for Atmospheric Research
3 * See netcdf/COPYRIGHT file for copying and redistribution conditions.
4 */
5
6 #include "config.h"
7 #ifdef HAVE_STDLIB_H
8 #include <stdlib.h>
9 #endif
10 #ifdef HAVE_STRING_H
11 #include <string.h>
12 #endif
13 #include "netcdf.h"
14 #include "ncutf8.h"
15 #include "utf8proc.h"
16
17 /* Provide a wrapper around whatever utf8 library we use. */
18
19 /*
20 * Check validity of a UTF8 encoded null-terminated byte string.
21 * Return codes:
22 * NC_NOERR -- string is valid utf8
23 * NC_ENOMEM -- out of memory
24 * NC_EINVAL -- invalid argument or internal error
25 * NC_EBADNAME-- not valid utf8
26 */
27
nc_utf8_validate(const unsigned char * name)28 int nc_utf8_validate(const unsigned char* name)
29 {
30 int ncstat = NC_NOERR;
31 const nc_utf8proc_uint8_t *str;
32 nc_utf8proc_ssize_t nchars = -1;
33 nc_utf8proc_int32_t codepoint;
34 nc_utf8proc_ssize_t count;
35
36 str = (const nc_utf8proc_uint8_t*)name;
37 while(*str) {
38 count = nc_utf8proc_iterate(str,nchars,&codepoint);
39 if(count < 0) {
40 switch (count) {
41 case UTF8PROC_ERROR_NOMEM:
42 case UTF8PROC_ERROR_OVERFLOW:
43 ncstat = NC_ENOMEM;
44 break;
45 case UTF8PROC_ERROR_INVALIDOPTS:
46 ncstat = NC_EINVAL;
47 break;
48 case UTF8PROC_ERROR_INVALIDUTF8:
49 case UTF8PROC_ERROR_NOTASSIGNED:
50 default:
51 ncstat = NC_EBADNAME;
52 break;
53 }
54 goto done;
55 } else { /* move to next char */
56 str += count;
57 }
58 }
59 done:
60 return ncstat;
61 }
62
63 /*
64 * Returns a pointer to newly allocated memory of a
65 * normalized version of the null-terminated string 'str'.
66 * Normalized string is returned in normalp argument;
67 * caller must free.
68 * Return codes:
69 * NC_NOERR -- success
70 * NC_ENOMEM -- out of memory
71 * NC_EINVAL -- illegal argument or internal error
72 * NC_EBADNAME -- other failure
73 */
74 int
nc_utf8_normalize(const unsigned char * utf8,unsigned char ** normalp)75 nc_utf8_normalize(const unsigned char* utf8, unsigned char** normalp)
76 {
77 int ncstat = NC_NOERR;
78 const nc_utf8proc_uint8_t* str = (const nc_utf8proc_uint8_t*)utf8;
79 nc_utf8proc_uint8_t* retval = NULL;
80 nc_utf8proc_ssize_t count;
81 count = nc_utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_COMPOSE);
82 if(count < 0) {/* error */
83 switch (count) {
84 case UTF8PROC_ERROR_NOMEM:
85 case UTF8PROC_ERROR_OVERFLOW:
86 ncstat = NC_ENOMEM;
87 break;
88 case UTF8PROC_ERROR_INVALIDOPTS:
89 ncstat = NC_EINVAL;
90 break;
91 case UTF8PROC_ERROR_INVALIDUTF8:
92 case UTF8PROC_ERROR_NOTASSIGNED:
93 default:
94 ncstat = NC_EBADNAME;
95 break;
96 }
97 goto done;
98 } else
99 if(normalp) *normalp = (unsigned char*)retval;
100 done:
101 return ncstat;
102 }
103
104 /*
105 * Convert a normalized utf8 string to utf16. This is approximate
106 * because it just does the truncation version of conversion for
107 * each 32-bit codepoint to get the corresponding utf16.
108 * Return codes:
109 * NC_NOERR -- success
110 * NC_ENOMEM -- out of memory
111 * NC_EINVAL -- invalid argument or internal error
112 * NC_EBADNAME-- not valid utf16
113 */
114
nc_utf8_to_utf16(const unsigned char * s8,unsigned short ** utf16p,size_t * len16p)115 int nc_utf8_to_utf16(const unsigned char* s8, unsigned short** utf16p, size_t* len16p)
116 {
117 int ncstat = NC_NOERR;
118 const nc_utf8proc_uint8_t *str;
119 nc_utf8proc_ssize_t nchars = -1;
120 nc_utf8proc_int32_t codepoint;
121 nc_utf8proc_ssize_t count;
122 size_t len8, len16;
123 unsigned short* utf16;
124 unsigned short* p16;
125
126 len8 = strlen((char*)s8);
127 utf16 = (unsigned short*)malloc(sizeof(unsigned short)*(len8+1));
128 if(utf16 == NULL) {
129 ncstat = NC_ENOMEM;
130 goto done;
131 }
132 str = (const nc_utf8proc_uint8_t*)s8;
133 /* Walk the string and convert each codepoint */
134 p16 = utf16;
135 len16 = 0;
136 while(*str) {
137 count = nc_utf8proc_iterate(str,nchars,&codepoint);
138 if(count < 0) {
139 switch (count) {
140 case UTF8PROC_ERROR_NOMEM:
141 case UTF8PROC_ERROR_OVERFLOW:
142 ncstat = NC_ENOMEM;
143 break;
144 case UTF8PROC_ERROR_INVALIDOPTS:
145 ncstat = NC_EINVAL;
146 break;
147 case UTF8PROC_ERROR_INVALIDUTF8:
148 case UTF8PROC_ERROR_NOTASSIGNED:
149 default:
150 ncstat = NC_EBADNAME;
151 break;
152 }
153 goto done;
154 } else { /* move to next char */
155 /* Complain if top 16 bits not zero */
156 if((codepoint & 0xFFFF0000) != 0) {
157 ncstat = NC_EBADNAME;
158 goto done;
159 }
160 /* Truncate codepoint to 16 bits and store */
161 *p16++ = (unsigned short)(codepoint & 0x0000FFFF);
162 str += count;
163 len16++;
164 }
165 }
166 *p16++ = (unsigned short)0;
167 if(utf16p)
168 *utf16p = utf16;
169 else
170 free(utf16);
171
172 if(len16p) *len16p = len16;
173 done:
174 if(ncstat) free(utf16);
175 return ncstat;
176 }
177