1 /* $NetBSD: unicode.c,v 1.4 2014/12/10 04:37:55 christos Exp $ */ 2 3 #ifndef lint 4 static char *rcsid = "Id: unicode.c,v 1.1 2003/06/04 00:26:16 marka Exp "; 5 #endif 6 7 /* 8 * Copyright (c) 2000,2001,2002 Japan Network Information Center. 9 * All rights reserved. 10 * 11 * By using this file, you agree to the terms and conditions set forth bellow. 12 * 13 * LICENSE TERMS AND CONDITIONS 14 * 15 * The following License Terms and Conditions apply, unless a different 16 * license is obtained from Japan Network Information Center ("JPNIC"), 17 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda, 18 * Chiyoda-ku, Tokyo 101-0047, Japan. 19 * 20 * 1. Use, Modification and Redistribution (including distribution of any 21 * modified or derived work) in source and/or binary forms is permitted 22 * under this License Terms and Conditions. 23 * 24 * 2. Redistribution of source code must retain the copyright notices as they 25 * appear in each source code file, this License Terms and Conditions. 26 * 27 * 3. Redistribution in binary form must reproduce the Copyright Notice, 28 * this License Terms and Conditions, in the documentation and/or other 29 * materials provided with the distribution. For the purposes of binary 30 * distribution the "Copyright Notice" refers to the following language: 31 * "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved." 32 * 33 * 4. The name of JPNIC may not be used to endorse or promote products 34 * derived from this Software without specific prior written approval of 35 * JPNIC. 36 * 37 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC 38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 40 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE 41 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 42 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 43 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 44 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 45 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 46 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 47 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 48 */ 49 50 #include <config.h> 51 52 #include <stddef.h> 53 #include <stdlib.h> 54 #include <string.h> 55 56 #include <idn/result.h> 57 #include <idn/logmacro.h> 58 #include <idn/assert.h> 59 #include <idn/unicode.h> 60 61 #define UNICODE_CURRENT "3.2.0" 62 63 #define UCS_MAX 0x10ffff 64 #define END_BIT 0x80000000 65 66 /* 67 * Some constants for Hangul decomposition/composition. 68 */ 69 #define SBase 0xac00 70 #define LBase 0x1100 71 #define VBase 0x1161 72 #define TBase 0x11a7 73 #define LCount 19 74 #define VCount 21 75 #define TCount 28 76 #define SLast (SBase + LCount * VCount * TCount) 77 78 /* 79 * Symbol composition macro. 80 */ 81 #define compose_sym(a, b) compose_symX(a, b) 82 #define compose_symX(a, b) a ## b 83 84 struct composition { 85 unsigned long c2; /* 2nd character */ 86 unsigned long comp; /* composed character */ 87 }; 88 89 #include "unicodedata_320.c" 90 #define VERSION v320 91 #include "unicode_template.c" 92 #undef VERSION 93 94 typedef int (*unicode_canonclassproc)(unsigned long v); 95 typedef int (*unicode_decomposeproc)(unsigned long c, 96 const unsigned long **seqp); 97 typedef int (*unicode_composeproc)(unsigned long c, 98 const struct composition **compp); 99 100 static struct idn__unicode_ops { 101 char *version; 102 unicode_canonclassproc canonclass_proc; 103 unicode_decomposeproc decompose_proc; 104 unicode_composeproc compose_proc; 105 } unicode_versions[] = { 106 #define MAKE_UNICODE_HANDLE(version, suffix) \ 107 { version, \ 108 compose_sym(canonclass_, suffix), \ 109 compose_sym(decompose_, suffix), \ 110 compose_sym(compose_, suffix) } 111 MAKE_UNICODE_HANDLE("3.2.0", v320), 112 { NULL }, 113 #undef MAKE_UNICODE_HANDLE 114 }; 115 116 idn_result_t 117 idn__unicode_create(const char *version, 118 idn__unicode_version_t *versionp) { 119 idn__unicode_version_t v; 120 121 assert(versionp != NULL); 122 TRACE(("idn__unicode_create(version=%-.50s)\n", 123 version == NULL ? "<NULL>" : version)); 124 125 if (version == NULL) 126 version = UNICODE_CURRENT; 127 128 for (v = unicode_versions; v->version != NULL; v++) { 129 if (strcmp(v->version, version) == 0) { 130 *versionp = v; 131 return (idn_success); 132 } 133 } 134 return (idn_notfound); 135 } 136 137 void 138 idn__unicode_destroy(idn__unicode_version_t version) { 139 assert(version != NULL); 140 TRACE(("idn__unicode_destroy()\n")); 141 /* Nothing to do */ 142 } 143 144 int 145 idn__unicode_canonicalclass(idn__unicode_version_t version, unsigned long c) { 146 if (c > UCS_MAX) 147 return (0); 148 149 return (*version->canonclass_proc)(c); 150 } 151 152 idn_result_t 153 idn__unicode_decompose(idn__unicode_version_t version, 154 int compat, unsigned long *v, size_t vlen, 155 unsigned long c, int *decomp_lenp) { 156 unsigned long *vorg = v; 157 int seqidx; 158 const unsigned long *seq; 159 160 assert(v != NULL && vlen >= 0 && decomp_lenp != NULL); 161 162 if (c > UCS_MAX) 163 return (idn_notfound); 164 165 /* 166 * First, check for Hangul. 167 */ 168 if (SBase <= c && c < SLast) { 169 int idx, t_offset, v_offset, l_offset; 170 171 idx = c - SBase; 172 t_offset = idx % TCount; 173 idx /= TCount; 174 v_offset = idx % VCount; 175 l_offset = idx / VCount; 176 if ((t_offset == 0 && vlen < 2) || (t_offset > 0 && vlen < 3)) 177 return (idn_buffer_overflow); 178 *v++ = LBase + l_offset; 179 *v++ = VBase + v_offset; 180 if (t_offset > 0) 181 *v++ = TBase + t_offset; 182 *decomp_lenp = v - vorg; 183 return (idn_success); 184 } 185 186 /* 187 * Look up decomposition table. If no decomposition is defined 188 * or if it is a compatibility decomosition when canonical 189 * decomposition requested, return 'idn_notfound'. 190 */ 191 seqidx = (*version->decompose_proc)(c, &seq); 192 if (seqidx == 0 || (compat == 0 && (seqidx & DECOMP_COMPAT) != 0)) 193 return (idn_notfound); 194 195 /* 196 * Copy the decomposed sequence. The end of the sequence are 197 * marked with END_BIT. 198 */ 199 do { 200 unsigned long c; 201 int dlen; 202 idn_result_t r; 203 204 c = *seq & ~END_BIT; 205 206 /* Decompose recursively. */ 207 r = idn__unicode_decompose(version, compat, v, vlen, c, &dlen); 208 if (r == idn_success) { 209 v += dlen; 210 vlen -= dlen; 211 } else if (r == idn_notfound) { 212 if (vlen < 1) 213 return (idn_buffer_overflow); 214 *v++ = c; 215 vlen--; 216 } else { 217 return (r); 218 } 219 220 } while ((*seq++ & END_BIT) == 0); 221 222 *decomp_lenp = v - vorg; 223 224 return (idn_success); 225 } 226 227 int 228 idn__unicode_iscompositecandidate(idn__unicode_version_t version, 229 unsigned long c) { 230 const struct composition *dummy; 231 232 if (c > UCS_MAX) 233 return (0); 234 235 /* Check for Hangul */ 236 if ((LBase <= c && c < LBase + LCount) || (SBase <= c && c < SLast)) 237 return (1); 238 239 /* 240 * Look up composition table. If there are no composition 241 * that begins with the given character, it is not a 242 * composition candidate. 243 */ 244 if ((*version->compose_proc)(c, &dummy) == 0) 245 return (0); 246 else 247 return (1); 248 } 249 250 idn_result_t 251 idn__unicode_compose(idn__unicode_version_t version, unsigned long c1, 252 unsigned long c2, unsigned long *compp) { 253 int n; 254 int lo, hi; 255 const struct composition *cseq; 256 257 assert(compp != NULL); 258 259 if (c1 > UCS_MAX || c2 > UCS_MAX) 260 return (idn_notfound); 261 262 /* 263 * Check for Hangul. 264 */ 265 if (LBase <= c1 && c1 < LBase + LCount && 266 VBase <= c2 && c2 < VBase + VCount) { 267 /* 268 * Hangul L and V. 269 */ 270 *compp = SBase + 271 ((c1 - LBase) * VCount + (c2 - VBase)) * TCount; 272 return (idn_success); 273 } else if (SBase <= c1 && c1 < SLast && 274 TBase <= c2 && c2 < TBase + TCount && 275 (c1 - SBase) % TCount == 0) { 276 /* 277 * Hangul LV and T. 278 */ 279 *compp = c1 + (c2 - TBase); 280 return (idn_success); 281 } 282 283 /* 284 * Look up composition table. If the result is 0, no composition 285 * is defined. Otherwise, upper 16bits of the result contains 286 * the number of composition that begins with 'c1', and the lower 287 * 16bits is the offset in 'compose_seq'. 288 */ 289 if ((n = (*version->compose_proc)(c1, &cseq)) == 0) 290 return (idn_notfound); 291 292 /* 293 * The composite sequences are sorted by the 2nd character 'c2'. 294 * So we can use binary search. 295 */ 296 lo = 0; 297 hi = n - 1; 298 while (lo <= hi) { 299 int mid = (lo + hi) / 2; 300 301 if (cseq[mid].c2 < c2) { 302 lo = mid + 1; 303 } else if (cseq[mid].c2 > c2) { 304 hi = mid - 1; 305 } else { 306 *compp = cseq[mid].comp; 307 return (idn_success); 308 } 309 } 310 return (idn_notfound); 311 } 312