1 /*
2  * libid3tag - ID3 tag manipulation library
3  * Copyright (C) 2000-2004 Underbit Technologies, Inc.
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18  *
19  * $Id: utf8.c,v 1.9 2004/01/23 09:41:32 rob Exp $
20  */
21 
22 # ifdef HAVE_CONFIG_H
23 #  include "config.h"
24 # endif
25 
26 # include "global.h"
27 
28 # include <stdlib.h>
29 
30 # include "id3tag.h"
31 # include "utf8.h"
32 # include "ucs4.h"
33 
34 /*
35  * NAME:	utf8->length()
36  * DESCRIPTION:	return the number of ucs4 chars represented by a utf8 string
37  */
id3_utf8_length(id3_utf8_t const * utf8)38 id3_length_t id3_utf8_length(id3_utf8_t const *utf8)
39 {
40   id3_length_t length = 0;
41 
42   while (*utf8) {
43     if ((utf8[0] & 0x80) == 0x00)
44       ++length;
45     else if ((utf8[0] & 0xe0) == 0xc0 &&
46 	     (utf8[1] & 0xc0) == 0x80) {
47       if (((utf8[0] & 0x1fL) << 6) >= 0x00000080L) {
48 	++length;
49 	utf8 += 1;
50       }
51     }
52     else if ((utf8[0] & 0xf0) == 0xe0 &&
53 	     (utf8[1] & 0xc0) == 0x80 &&
54 	     (utf8[2] & 0xc0) == 0x80) {
55       if ((((utf8[0] & 0x0fL) << 12) |
56 	   ((utf8[1] & 0x3fL) <<  6)) >= 0x00000800L) {
57 	++length;
58 	utf8 += 2;
59       }
60     }
61     else if ((utf8[0] & 0xf8) == 0xf0 &&
62 	     (utf8[1] & 0xc0) == 0x80 &&
63 	     (utf8[2] & 0xc0) == 0x80 &&
64 	     (utf8[3] & 0xc0) == 0x80) {
65       if ((((utf8[0] & 0x07L) << 18) |
66 	   ((utf8[1] & 0x3fL) << 12)) >= 0x00010000L) {
67 	++length;
68 	utf8 += 3;
69       }
70     }
71     else if ((utf8[0] & 0xfc) == 0xf8 &&
72 	     (utf8[1] & 0xc0) == 0x80 &&
73 	     (utf8[2] & 0xc0) == 0x80 &&
74 	     (utf8[3] & 0xc0) == 0x80 &&
75 	     (utf8[4] & 0xc0) == 0x80) {
76       if ((((utf8[0] & 0x03L) << 24) |
77 	   ((utf8[0] & 0x3fL) << 18)) >= 0x00200000L) {
78 	++length;
79 	utf8 += 4;
80       }
81     }
82     else if ((utf8[0] & 0xfe) == 0xfc &&
83 	     (utf8[1] & 0xc0) == 0x80 &&
84 	     (utf8[2] & 0xc0) == 0x80 &&
85 	     (utf8[3] & 0xc0) == 0x80 &&
86 	     (utf8[4] & 0xc0) == 0x80 &&
87 	     (utf8[5] & 0xc0) == 0x80) {
88       if ((((utf8[0] & 0x01L) << 30) |
89 	   ((utf8[0] & 0x3fL) << 24)) >= 0x04000000L) {
90 	++length;
91 	utf8 += 5;
92       }
93     }
94 
95     ++utf8;
96   }
97 
98   return length;
99 }
100 
101 /*
102  * NAME:	utf8->size()
103  * DESCRIPTION:	return the encoding size of a utf8 string
104  */
id3_utf8_size(id3_utf8_t const * utf8)105 id3_length_t id3_utf8_size(id3_utf8_t const *utf8)
106 {
107   id3_utf8_t const *ptr = utf8;
108 
109   while (*ptr)
110     ++ptr;
111 
112   return ptr - utf8 + 1;
113 }
114 
115 /*
116  * NAME:	utf8->ucs4duplicate()
117  * DESCRIPTION:	duplicate and decode a utf8 string into ucs4
118  */
id3_utf8_ucs4duplicate(id3_utf8_t const * utf8)119 id3_ucs4_t *id3_utf8_ucs4duplicate(id3_utf8_t const *utf8)
120 {
121   id3_ucs4_t *ucs4;
122 
123   ucs4 = malloc((id3_utf8_length(utf8) + 1) * sizeof(*ucs4));
124   if (ucs4)
125     id3_utf8_decode(utf8, ucs4);
126 
127   return release(ucs4);
128 }
129 
130 /*
131  * NAME:	utf8->decodechar()
132  * DESCRIPTION:	decode a series of utf8 chars into a single ucs4 char
133  */
id3_utf8_decodechar(id3_utf8_t const * utf8,id3_ucs4_t * ucs4)134 id3_length_t id3_utf8_decodechar(id3_utf8_t const *utf8, id3_ucs4_t *ucs4)
135 {
136   id3_utf8_t const *start = utf8;
137 
138   while (1) {
139     if ((utf8[0] & 0x80) == 0x00) {
140       *ucs4 = utf8[0];
141       return utf8 - start + 1;
142     }
143     else if ((utf8[0] & 0xe0) == 0xc0 &&
144 	     (utf8[1] & 0xc0) == 0x80) {
145       *ucs4 =
146 	((utf8[0] & 0x1fL) << 6) |
147 	((utf8[1] & 0x3fL) << 0);
148       if (*ucs4 >= 0x00000080L)
149 	return utf8 - start + 2;
150     }
151     else if ((utf8[0] & 0xf0) == 0xe0 &&
152 	     (utf8[1] & 0xc0) == 0x80 &&
153 	     (utf8[2] & 0xc0) == 0x80) {
154       *ucs4 =
155 	((utf8[0] & 0x0fL) << 12) |
156 	((utf8[1] & 0x3fL) <<  6) |
157 	((utf8[2] & 0x3fL) <<  0);
158       if (*ucs4 >= 0x00000800L)
159 	return utf8 - start + 3;
160     }
161     else if ((utf8[0] & 0xf8) == 0xf0 &&
162 	     (utf8[1] & 0xc0) == 0x80 &&
163 	     (utf8[2] & 0xc0) == 0x80 &&
164 	     (utf8[3] & 0xc0) == 0x80) {
165       *ucs4 =
166 	((utf8[0] & 0x07L) << 18) |
167 	((utf8[1] & 0x3fL) << 12) |
168 	((utf8[2] & 0x3fL) <<  6) |
169 	((utf8[3] & 0x3fL) <<  0);
170       if (*ucs4 >= 0x00010000L)
171 	return utf8 - start + 4;
172     }
173     else if ((utf8[0] & 0xfc) == 0xf8 &&
174 	     (utf8[1] & 0xc0) == 0x80 &&
175 	     (utf8[2] & 0xc0) == 0x80 &&
176 	     (utf8[3] & 0xc0) == 0x80 &&
177 	     (utf8[4] & 0xc0) == 0x80) {
178       *ucs4 =
179 	((utf8[0] & 0x03L) << 24) |
180 	((utf8[1] & 0x3fL) << 18) |
181 	((utf8[2] & 0x3fL) << 12) |
182 	((utf8[3] & 0x3fL) <<  6) |
183 	((utf8[4] & 0x3fL) <<  0);
184       if (*ucs4 >= 0x00200000L)
185 	return utf8 - start + 5;
186     }
187     else if ((utf8[0] & 0xfe) == 0xfc &&
188 	     (utf8[1] & 0xc0) == 0x80 &&
189 	     (utf8[2] & 0xc0) == 0x80 &&
190 	     (utf8[3] & 0xc0) == 0x80 &&
191 	     (utf8[4] & 0xc0) == 0x80 &&
192 	     (utf8[5] & 0xc0) == 0x80) {
193       *ucs4 =
194 	((utf8[0] & 0x01L) << 30) |
195 	((utf8[1] & 0x3fL) << 24) |
196 	((utf8[2] & 0x3fL) << 18) |
197 	((utf8[3] & 0x3fL) << 12) |
198 	((utf8[4] & 0x3fL) <<  6) |
199 	((utf8[5] & 0x3fL) <<  0);
200       if (*ucs4 >= 0x04000000L)
201 	return utf8 - start + 6;
202     }
203 
204     ++utf8;
205   }
206 }
207 
208 /*
209  * NAME:	utf8->encodechar()
210  * DESCRIPTION:	encode a single ucs4 char into a series of up to 6 utf8 chars
211  */
id3_utf8_encodechar(id3_utf8_t * utf8,id3_ucs4_t ucs4)212 id3_length_t id3_utf8_encodechar(id3_utf8_t *utf8, id3_ucs4_t ucs4)
213 {
214   if (ucs4 <= 0x0000007fL) {
215     utf8[0] = ucs4;
216 
217     return 1;
218   }
219   else if (ucs4 <= 0x000007ffL) {
220     utf8[0] = 0xc0 | ((ucs4 >>  6) & 0x1f);
221     utf8[1] = 0x80 | ((ucs4 >>  0) & 0x3f);
222 
223     return 2;
224   }
225   else if (ucs4 <= 0x0000ffffL) {
226     utf8[0] = 0xe0 | ((ucs4 >> 12) & 0x0f);
227     utf8[1] = 0x80 | ((ucs4 >>  6) & 0x3f);
228     utf8[2] = 0x80 | ((ucs4 >>  0) & 0x3f);
229 
230     return 3;
231   }
232   else if (ucs4 <= 0x001fffffL) {
233     utf8[0] = 0xf0 | ((ucs4 >> 18) & 0x07);
234     utf8[1] = 0x80 | ((ucs4 >> 12) & 0x3f);
235     utf8[2] = 0x80 | ((ucs4 >>  6) & 0x3f);
236     utf8[3] = 0x80 | ((ucs4 >>  0) & 0x3f);
237 
238     return 4;
239   }
240   else if (ucs4 <= 0x03ffffffL) {
241     utf8[0] = 0xf8 | ((ucs4 >> 24) & 0x03);
242     utf8[1] = 0x80 | ((ucs4 >> 18) & 0x3f);
243     utf8[2] = 0x80 | ((ucs4 >> 12) & 0x3f);
244     utf8[3] = 0x80 | ((ucs4 >>  6) & 0x3f);
245     utf8[4] = 0x80 | ((ucs4 >>  0) & 0x3f);
246 
247     return 5;
248   }
249   else if (ucs4 <= 0x7fffffffL) {
250     utf8[0] = 0xfc | ((ucs4 >> 30) & 0x01);
251     utf8[1] = 0x80 | ((ucs4 >> 24) & 0x3f);
252     utf8[2] = 0x80 | ((ucs4 >> 18) & 0x3f);
253     utf8[3] = 0x80 | ((ucs4 >> 12) & 0x3f);
254     utf8[4] = 0x80 | ((ucs4 >>  6) & 0x3f);
255     utf8[5] = 0x80 | ((ucs4 >>  0) & 0x3f);
256 
257     return 6;
258   }
259 
260   /* default */
261 
262   return id3_utf8_encodechar(utf8, ID3_UCS4_REPLACEMENTCHAR);
263 }
264 
265 /*
266  * NAME:	utf8->decode()
267  * DESCRIPTION:	decode a complete utf8 string into a ucs4 string
268  */
id3_utf8_decode(id3_utf8_t const * utf8,id3_ucs4_t * ucs4)269 void id3_utf8_decode(id3_utf8_t const *utf8, id3_ucs4_t *ucs4)
270 {
271   do
272     utf8 += id3_utf8_decodechar(utf8, ucs4);
273   while (*ucs4++);
274 }
275 
276 /*
277  * NAME:	utf8->encode()
278  * DESCRIPTION:	encode a complete ucs4 string into a utf8 string
279  */
id3_utf8_encode(id3_utf8_t * utf8,id3_ucs4_t const * ucs4)280 void id3_utf8_encode(id3_utf8_t *utf8, id3_ucs4_t const *ucs4)
281 {
282   do
283     utf8 += id3_utf8_encodechar(utf8, *ucs4);
284   while (*ucs4++);
285 }
286 
287 /*
288  * NAME:	utf8->put()
289  * DESCRIPTION:	serialize a single utf8 character
290  */
id3_utf8_put(id3_byte_t ** ptr,id3_utf8_t utf8)291 id3_length_t id3_utf8_put(id3_byte_t **ptr, id3_utf8_t utf8)
292 {
293   if (ptr)
294     *(*ptr)++ = utf8;
295 
296   return 1;
297 }
298 
299 /*
300  * NAME:	utf8->get()
301  * DESCRIPTION:	deserialize a single utf8 character
302  */
id3_utf8_get(id3_byte_t const ** ptr)303 id3_utf8_t id3_utf8_get(id3_byte_t const **ptr)
304 {
305   return *(*ptr)++;
306 }
307 
308 /*
309  * NAME:	utf8->serialize()
310  * DESCRIPTION:	serialize a ucs4 string using utf8 encoding
311  */
id3_utf8_serialize(id3_byte_t ** ptr,id3_ucs4_t const * ucs4,int terminate)312 id3_length_t id3_utf8_serialize(id3_byte_t **ptr, id3_ucs4_t const *ucs4,
313 				int terminate)
314 {
315   id3_length_t size = 0;
316   id3_utf8_t utf8[6], *out;
317 
318   while (*ucs4) {
319     switch (id3_utf8_encodechar(out = utf8, *ucs4++)) {
320     case 6: size += id3_utf8_put(ptr, *out++);
321     case 5: size += id3_utf8_put(ptr, *out++);
322     case 4: size += id3_utf8_put(ptr, *out++);
323     case 3: size += id3_utf8_put(ptr, *out++);
324     case 2: size += id3_utf8_put(ptr, *out++);
325     case 1: size += id3_utf8_put(ptr, *out++);
326     case 0: break;
327     }
328   }
329 
330   if (terminate)
331     size += id3_utf8_put(ptr, 0);
332 
333   return size;
334 }
335 
336 /*
337  * NAME:	utf8->deserialize()
338  * DESCRIPTION:	deserialize a ucs4 string using utf8 encoding
339  */
id3_utf8_deserialize(id3_byte_t const ** ptr,id3_length_t length)340 id3_ucs4_t *id3_utf8_deserialize(id3_byte_t const **ptr, id3_length_t length)
341 {
342   id3_byte_t const *end;
343   id3_utf8_t *utf8ptr, *utf8;
344   id3_ucs4_t *ucs4;
345 
346   end = *ptr + length;
347 
348   utf8 = malloc((length + 1) * sizeof(*utf8));
349   if (utf8 == 0)
350     return 0;
351 
352   utf8ptr = utf8;
353   while (end - *ptr > 0 && (*utf8ptr = id3_utf8_get(ptr)))
354     ++utf8ptr;
355 
356   *utf8ptr = 0;
357 
358   ucs4 = malloc((id3_utf8_length(utf8) + 1) * sizeof(*ucs4));
359   if (ucs4)
360     id3_utf8_decode(utf8, ucs4);
361 
362   free(utf8);
363 
364   return ucs4;
365 }
366