1 /* Creation date: 2008-04-05T22:10:32Z
2  * Authors: Don
3  */
4 
5 /*
6 
7  Copyright (c) 2007-2010 Don Owens <don@regexguy.com>.  All rights reserved.
8 
9  This is free software; you can redistribute it and/or modify it under
10  the Perl Artistic license.  You should have received a copy of the
11  Artistic license with this distribution, in the file named
12  "Artistic".  You may also obtain a copy from
13  http://regexguy.com/license/Artistic
14 
15  This program is distributed in the hope that it will be useful, but
16  WITHOUT ANY WARRANTY; without even the implied warranty of
17  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
18 
19 */
20 
21 /* $Header: /repository/projects/libjsonevt/utf16.c,v 1.3 2009-02-23 17:46:55 don Exp $ */
22 
23 #include "utf16.h"
24 
/* Store val through ptr only when ptr is non-NULL.  Wrapped in
 * do { } while (0) so the macro acts as a single statement and cannot
 * capture a following `else`. */
#define SAFE_SET_POINTER_VAL(ptr, val) do { if (ptr) { *(ptr) = (val); } } while (0)

/*
 * Decode a single code point from the start of a UTF-16 byte buffer.
 *
 *   orig_buf          bytes to decode
 *   buf_len           number of bytes available in orig_buf
 *   ret_len           optional (may be NULL); receives the number of bytes
 *                     consumed: 2, 4, or 0 on failure
 *   is_little_endian  nonzero if the buffer is little endian, zero for
 *                     big endian
 *
 * Returns the decoded code point.  On failure returns 0 and sets *ret_len
 * to 0; failure means the buffer holds fewer than 2 bytes, a high surrogate
 * is followed by fewer than 2 more bytes, or a high surrogate is not
 * followed by a low surrogate (0xDC00..0xDFFF).  A successfully decoded
 * U+0000 also returns 0, so callers must use *ret_len to detect failure.
 *
 * A low surrogate appearing as the first code unit is not treated as an
 * error: it is returned as-is with a length of 2.
 */
uint32_t
utf16_bytes_to_unicode(const uint8_t *orig_buf, uint32_t buf_len, uint32_t *ret_len,
    uint32_t is_little_endian) {

    const uint8_t *s = orig_buf;
    /* offsets of the most / least significant byte within a code unit */
    const uint32_t msb = is_little_endian ? 1 : 0;
    const uint32_t lsb = is_little_endian ? 0 : 1;
    uint32_t lead;
    uint32_t trail;

    if (buf_len < 2) {
        /* utf-16 requires at least two bytes for a code point */
        SAFE_SET_POINTER_VAL(ret_len, 0);
        return 0;
    }

    lead = ((uint32_t)s[msb] << 8) | s[lsb];

    if ((lead & 0xfc00) != 0xd800) {
        /* not a high surrogate -- the code unit is the code point */
        SAFE_SET_POINTER_VAL(ret_len, 2);
        return lead;
    }

    /* surrogate pair -- requires 4 bytes */
    if (buf_len < 4) {
        SAFE_SET_POINTER_VAL(ret_len, 0);
        return 0;
    }

    trail = ((uint32_t)s[2 + msb] << 8) | s[2 + lsb];

    if ((trail & 0xfc00) != 0xdc00) {
        /* high surrogate without a matching low surrogate is ill-formed */
        SAFE_SET_POINTER_VAL(ret_len, 0);
        return 0;
    }

    SAFE_SET_POINTER_VAL(ret_len, 4);

    /* 10 payload bits from each surrogate, offset past the BMP */
    return 0x010000 + (((lead & 0x03ff) << 10) | (trail & 0x03ff));
}
80 
/*
 * Encode a single Unicode code point as UTF-16 bytes.
 *
 *   cp                    code point to encode
 *   out_buf               destination; up to 4 bytes are written
 *   output_little_endian  nonzero to emit little endian code units, zero
 *                         for big endian (the default)
 *
 * Returns the number of bytes written: 2 for code points up to U+FFFF,
 * 4 for code points encoded as a surrogate pair.  If cp cannot be encoded
 * (it lies in the surrogate range 0xD800..0xDFFF or is above 0x10FFFF),
 * the first byte of out_buf is set to 0 and 0 is returned.
 */
uint32_t
utf16_unicode_to_bytes(uint32_t cp, uint8_t *out_buf, uint32_t output_little_endian) {
    uint8_t *d = out_buf;
    uint32_t lead;
    uint32_t trail;

    /* 0xd800 .. 0xdfff are ill-formed, and nothing above 0x10ffff can be
     * represented by a surrogate pair */
    if ((cp >= 0xd800 && cp <= 0xdfff) || cp > 0x10ffff) {
        *d = 0;
        return 0;
    }

    if (cp <= 0xffff) {
        /* single unsigned 16-bit code unit, so 2 bytes, with same value as the code point */
        if (output_little_endian) {
            *d++ = (uint8_t)(cp & 0xff);
            *d++ = (uint8_t)(cp >> 8);
        }
        else {
            *d++ = (uint8_t)(cp >> 8);
            *d++ = (uint8_t)(cp & 0xff);
        }
        return 2;
    }

    /* use surrogate pairs: the remaining 20 bits are split into two
     * 10-bit halves carried by the high and low surrogate */
    cp -= 0x010000;
    lead  = 0xd800 | (cp >> 10);
    trail = 0xdc00 | (cp & 0x03ff);

    if (output_little_endian) {
        *d++ = (uint8_t)(lead & 0xff);
        *d++ = (uint8_t)(lead >> 8);
        *d++ = (uint8_t)(trail & 0xff);
        *d++ = (uint8_t)(trail >> 8);
    }
    else {
        *d++ = (uint8_t)(lead >> 8);
        *d++ = (uint8_t)(lead & 0xff);
        *d++ = (uint8_t)(trail >> 8);
        *d++ = (uint8_t)(trail & 0xff);
    }
    return 4;
}
131 
132 
133