1 /* Creation date: 2008-04-05T22:10:32Z
2 * Authors: Don
3 */
4
5 /*
6
7 Copyright (c) 2007-2010 Don Owens <don@regexguy.com>. All rights reserved.
8
9 This is free software; you can redistribute it and/or modify it under
10 the Perl Artistic license. You should have received a copy of the
11 Artistic license with this distribution, in the file named
12 "Artistic". You may also obtain a copy from
13 http://regexguy.com/license/Artistic
14
15 This program is distributed in the hope that it will be useful, but
16 WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
18
19 */
20
21 /* $Header: /repository/projects/libjsonevt/utf16.c,v 1.3 2009-02-23 17:46:55 don Exp $ */
22
23 #include "utf16.h"
24
/*
 * Store val through ptr unless ptr is NULL.
 *
 * The expansion is wrapped in do { ... } while (0) so that it forms a single
 * statement: "SAFE_SET_POINTER_VAL(p, v);" can then be used as the unbraced
 * body of an if that is followed by an else.  val is parenthesized so that
 * an argument containing low-precedence operators is assigned as a whole.
 *
 * ptr is evaluated twice; do not pass an expression with side effects.
 */
#define SAFE_SET_POINTER_VAL(ptr, val) do { if (ptr) { *(ptr) = (val); } } while (0)
26
/*
 * Decode a single code point from the start of a UTF-16 byte buffer.
 *
 * orig_buf          bytes to decode; at most the first 4 are read
 * buf_len           number of bytes available in orig_buf
 * ret_len           if non-NULL, receives the number of bytes consumed
 *                   (2 or 4), or 0 on failure
 * is_little_endian  non-zero if each 16-bit code unit is stored low byte
 *                   first, zero if it is stored high byte first
 *
 * Returns the decoded code point.  On failure returns 0 and stores 0 in
 * *ret_len.  A successfully decoded U+0000 also returns 0, so callers must
 * look at *ret_len to tell the two cases apart.
 *
 * Failure cases:
 *   - fewer than 2 bytes available
 *   - a lead surrogate (0xd800..0xdbff) with fewer than 4 bytes available
 *   - a lead surrogate whose following code unit is not a trail surrogate
 *     (0xdc00..0xdfff)
 *
 * A trail surrogate appearing as the first code unit is not treated as an
 * error here; it is returned as-is with a length of 2.
 */
uint32_t
utf16_bytes_to_unicode(const uint8_t *orig_buf, uint32_t buf_len, uint32_t *ret_len,
    uint32_t is_little_endian) {
    const uint8_t *s = orig_buf;
    /* offsets of the high and low byte within one 16-bit code unit */
    uint32_t hi = is_little_endian ? 1u : 0u;
    uint32_t lo = 1u - hi;
    uint32_t lead = 0;
    uint32_t trail = 0;
    uint32_t consumed = 0;
    uint32_t cp = 0;

    /* utf-16 requires at least two bytes for a code point */
    if (buf_len >= 2) {
        lead = ((uint32_t)s[hi] << 8) | s[lo];

        if ((lead & 0xfc00) != 0xd800) {
            /* not a lead surrogate: the code unit is the code point */
            cp = lead;
            consumed = 2;
        }
        else if (buf_len >= 4) {
            /* surrogate pair -- requires 4 bytes */
            trail = ((uint32_t)s[2 + hi] << 8) | s[2 + lo];

            /* only a trail surrogate may follow a lead surrogate */
            if ((trail & 0xfc00) == 0xdc00) {
                /* each surrogate carries 10 payload bits, lead first */
                cp = 0x010000 + (((lead & 0x03ff) << 10) | (trail & 0x03ff));
                consumed = 4;
            }
        }
    }

    if (ret_len) {
        *ret_len = consumed;
    }

    return cp;
}
80
/*
 * Encode one code point as UTF-16 bytes.
 *
 * cp                    code point to encode
 * out_buf               destination; must have room for 4 bytes
 * output_little_endian  non-zero to store each 16-bit code unit low byte
 *                       first, zero to store it high byte first
 *
 * Returns the number of bytes written: 2 for code points up to U+FFFF,
 * 4 for U+10000..U+10FFFF (surrogate pair).  For a code point that cannot
 * be encoded -- 0xd800..0xdfff, or anything above 0x10ffff -- stores 0 in
 * out_buf[0] and returns 0.
 */
uint32_t
utf16_unicode_to_bytes(uint32_t cp, uint8_t *out_buf, uint32_t output_little_endian) {
    uint8_t *d = out_buf;
    /* offsets of the high and low byte within one 16-bit code unit */
    uint32_t hi = output_little_endian ? 1u : 0u;
    uint32_t lo = 1u - hi;
    uint32_t lead = 0;
    uint32_t trail = 0;

    /* surrogate values and anything past U+10FFFF are ill-formed */
    if ((cp >= 0xd800 && cp <= 0xdfff) || cp > 0x10ffff) {
        *d = 0;
        return 0;
    }

    if (cp <= 0xffff) {
        /* single 16-bit code unit with the same value as the code point */
        d[hi] = (uint8_t)(cp >> 8);
        d[lo] = (uint8_t)(cp & 0xff);
        return 2;
    }

    /* use surrogate pairs: 20 payload bits, 10 in each code unit */
    cp -= 0x010000;
    lead = 0xd800 | (cp >> 10);
    trail = 0xdc00 | (cp & 0x03ff);

    d[hi] = (uint8_t)(lead >> 8);
    d[lo] = (uint8_t)(lead & 0xff);
    d[2 + hi] = (uint8_t)(trail >> 8);
    d[2 + lo] = (uint8_t)(trail & 0xff);

    return 4;
}
131
132
133