1 /*
2  * libid3tag - ID3 tag manipulation library
3  * Copyright (C) 2000-2004 Underbit Technologies, Inc.
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18  *
19  * $Id: utf16.c,v 1.9 2004/01/23 09:41:32 rob Exp $
20  */
21 
22 # ifdef HAVE_CONFIG_H
23 #  include "config.h"
24 # endif
25 
26 # include "global.h"
27 
28 # include <stdlib.h>
29 
30 # include "id3tag.h"
31 # include "utf16.h"
32 # include "ucs4.h"
33 
34 /*
35  * NAME:	utf16->length()
36  * DESCRIPTION:	return the number of ucs4 chars represented by a utf16 string
37  */
id3_utf16_length(id3_utf16_t const * utf16)38 id3_length_t id3_utf16_length(id3_utf16_t const *utf16)
39 {
40   id3_length_t length = 0;
41 
42   while (*utf16) {
43     if (utf16[0] < 0xd800 || utf16[0] > 0xdfff)
44       ++length;
45     else if (utf16[0] >= 0xd800 && utf16[0] <= 0xdbff &&
46 	     utf16[1] >= 0xdc00 && utf16[1] <= 0xdfff) {
47       ++length;
48       ++utf16;
49     }
50 
51     ++utf16;
52   }
53 
54   return length;
55 }
56 
57 /*
58  * NAME:	utf16->size()
59  * DESCRIPTION:	return the encoding size of a utf16 string
60  */
id3_utf16_size(id3_utf16_t const * utf16)61 id3_length_t id3_utf16_size(id3_utf16_t const *utf16)
62 {
63   id3_utf16_t const *ptr = utf16;
64 
65   while (*ptr)
66     ++ptr;
67 
68   return ptr - utf16 + 1;
69 }
70 
71 /*
72  * NAME:	utf16->ucs4duplicate()
73  * DESCRIPTION:	duplicate and decode a utf16 string into ucs4
74  */
id3_utf16_ucs4duplicate(id3_utf16_t const * utf16)75 id3_ucs4_t *id3_utf16_ucs4duplicate(id3_utf16_t const *utf16)
76 {
77   id3_ucs4_t *ucs4;
78 
79   ucs4 = malloc((id3_utf16_length(utf16) + 1) * sizeof(*ucs4));
80   if (ucs4)
81     id3_utf16_decode(utf16, ucs4);
82 
83   return release(ucs4);
84 }
85 
86 /*
87  * NAME:	utf16->decodechar()
88  * DESCRIPTION:	decode a series of utf16 chars into a single ucs4 char
89  */
id3_utf16_decodechar(id3_utf16_t const * utf16,id3_ucs4_t * ucs4)90 id3_length_t id3_utf16_decodechar(id3_utf16_t const *utf16, id3_ucs4_t *ucs4)
91 {
92   id3_utf16_t const *start = utf16;
93 
94   while (1) {
95     if (utf16[0] < 0xd800 || utf16[0] > 0xdfff) {
96       *ucs4 = utf16[0];
97       return utf16 - start + 1;
98     }
99     else if (utf16[0] >= 0xd800 && utf16[0] <= 0xdbff &&
100 	     utf16[1] >= 0xdc00 && utf16[1] <= 0xdfff) {
101       *ucs4 = (((utf16[0] & 0x03ffL) << 10) |
102 	       ((utf16[1] & 0x03ffL) <<  0)) + 0x00010000L;
103       return utf16 - start + 2;
104     }
105 
106     ++utf16;
107   }
108 }
109 
110 /*
111  * NAME:	utf16->encodechar()
112  * DESCRIPTION:	encode a single ucs4 char into a series of up to 2 utf16 chars
113  */
id3_utf16_encodechar(id3_utf16_t * utf16,id3_ucs4_t ucs4)114 id3_length_t id3_utf16_encodechar(id3_utf16_t *utf16, id3_ucs4_t ucs4)
115 {
116   if (ucs4 < 0x00010000L) {
117     utf16[0] = ucs4;
118 
119     return 1;
120   }
121   else if (ucs4 < 0x00110000L) {
122     ucs4 -= 0x00010000L;
123 
124     utf16[0] = ((ucs4 >> 10) & 0x3ff) | 0xd800;
125     utf16[1] = ((ucs4 >>  0) & 0x3ff) | 0xdc00;
126 
127     return 2;
128   }
129 
130   /* default */
131 
132   return id3_utf16_encodechar(utf16, ID3_UCS4_REPLACEMENTCHAR);
133 }
134 
135 /*
136  * NAME:	utf16->decode()
137  * DESCRIPTION:	decode a complete utf16 string into a ucs4 string
138  */
id3_utf16_decode(id3_utf16_t const * utf16,id3_ucs4_t * ucs4)139 void id3_utf16_decode(id3_utf16_t const *utf16, id3_ucs4_t *ucs4)
140 {
141   do
142     utf16 += id3_utf16_decodechar(utf16, ucs4);
143   while (*ucs4++);
144 }
145 
146 /*
147  * NAME:	utf16->encode()
148  * DESCRIPTION:	encode a complete ucs4 string into a utf16 string
149  */
id3_utf16_encode(id3_utf16_t * utf16,id3_ucs4_t const * ucs4)150 void id3_utf16_encode(id3_utf16_t *utf16, id3_ucs4_t const *ucs4)
151 {
152   do
153     utf16 += id3_utf16_encodechar(utf16, *ucs4);
154   while (*ucs4++);
155 }
156 
157 /*
158  * NAME:	utf16->put()
159  * DESCRIPTION:	serialize a single utf16 character
160  */
id3_utf16_put(id3_byte_t ** ptr,id3_utf16_t utf16,enum id3_utf16_byteorder byteorder)161 id3_length_t id3_utf16_put(id3_byte_t **ptr, id3_utf16_t utf16,
162 			   enum id3_utf16_byteorder byteorder)
163 {
164   if (ptr) {
165     switch (byteorder) {
166     default:
167     case ID3_UTF16_BYTEORDER_BE:
168       (*ptr)[0] = (utf16 >> 8) & 0xff;
169       (*ptr)[1] = (utf16 >> 0) & 0xff;
170       break;
171 
172     case ID3_UTF16_BYTEORDER_LE:
173       (*ptr)[0] = (utf16 >> 0) & 0xff;
174       (*ptr)[1] = (utf16 >> 8) & 0xff;
175       break;
176     }
177 
178     *ptr += 2;
179   }
180 
181   return 2;
182 }
183 
184 /*
185  * NAME:	utf16->get()
186  * DESCRIPTION:	deserialize a single utf16 character
187  */
id3_utf16_get(id3_byte_t const ** ptr,enum id3_utf16_byteorder byteorder)188 id3_utf16_t id3_utf16_get(id3_byte_t const **ptr,
189 			  enum id3_utf16_byteorder byteorder)
190 {
191   id3_utf16_t utf16;
192 
193   switch (byteorder) {
194   default:
195   case ID3_UTF16_BYTEORDER_BE:
196     utf16 =
197       ((*ptr)[0] << 8) |
198       ((*ptr)[1] << 0);
199     break;
200 
201   case ID3_UTF16_BYTEORDER_LE:
202     utf16 =
203       ((*ptr)[0] << 0) |
204       ((*ptr)[1] << 8);
205     break;
206   }
207 
208   *ptr += 2;
209 
210   return utf16;
211 }
212 
213 /*
214  * NAME:	utf16->serialize()
215  * DESCRIPTION:	serialize a ucs4 string using utf16 encoding
216  */
id3_utf16_serialize(id3_byte_t ** ptr,id3_ucs4_t const * ucs4,enum id3_utf16_byteorder byteorder,int terminate)217 id3_length_t id3_utf16_serialize(id3_byte_t **ptr, id3_ucs4_t const *ucs4,
218 				 enum id3_utf16_byteorder byteorder,
219 				 int terminate)
220 {
221   id3_length_t size = 0;
222   id3_utf16_t utf16[2], *out;
223 
224   if (byteorder == ID3_UTF16_BYTEORDER_ANY)
225     size += id3_utf16_put(ptr, 0xfeff, byteorder);
226 
227   while (*ucs4) {
228     switch (id3_utf16_encodechar(out = utf16, *ucs4++)) {
229     case 2: size += id3_utf16_put(ptr, *out++, byteorder);
230     case 1: size += id3_utf16_put(ptr, *out++, byteorder);
231     case 0: break;
232     }
233   }
234 
235   if (terminate)
236     size += id3_utf16_put(ptr, 0, byteorder);
237 
238   return size;
239 }
240 
241 /*
242  * NAME:	utf16->deserialize()
243  * DESCRIPTION:	deserialize a ucs4 string using utf16 encoding
244  */
id3_utf16_deserialize(id3_byte_t const ** ptr,id3_length_t length,enum id3_utf16_byteorder byteorder)245 id3_ucs4_t *id3_utf16_deserialize(id3_byte_t const **ptr, id3_length_t length,
246 				  enum id3_utf16_byteorder byteorder)
247 {
248   id3_byte_t const *end;
249   id3_utf16_t *utf16ptr, *utf16;
250   id3_ucs4_t *ucs4;
251 
252   end = *ptr + (length & ~1);
253 
254   utf16 = malloc((length / 2 + 1) * sizeof(*utf16));
255   if (utf16 == 0)
256     return 0;
257 
258   if (byteorder == ID3_UTF16_BYTEORDER_ANY && end - *ptr > 0) {
259     switch (((*ptr)[0] << 8) |
260 	    ((*ptr)[1] << 0)) {
261     case 0xfeff:
262       byteorder = ID3_UTF16_BYTEORDER_BE;
263       *ptr += 2;
264       break;
265 
266     case 0xfffe:
267       byteorder = ID3_UTF16_BYTEORDER_LE;
268       *ptr += 2;
269       break;
270     }
271   }
272 
273   utf16ptr = utf16;
274   while (end - *ptr > 0 && (*utf16ptr = id3_utf16_get(ptr, byteorder)))
275     ++utf16ptr;
276 
277   *utf16ptr = 0;
278 
279   ucs4 = malloc((id3_utf16_length(utf16) + 1) * sizeof(*ucs4));
280   if (ucs4)
281     id3_utf16_decode(utf16, ucs4);
282 
283   free(utf16);
284 
285   return ucs4;
286 }
287