1 #include <gpac/utf.h>
2 /**
3 * This code has been adapted from http://www.ietf.org/rfc/rfc2640.txt
4 * Full Copyright Statement
5
6 Copyright (C) The Internet Society (1999). All Rights Reserved.
7
8 This document and translations of it may be copied and furnished to
9 others, and derivative works that comment on or otherwise explain it
10 or assist in its implementation may be prepared, copied, published
11 and distributed, in whole or in part, without restriction of any
12 kind, provided that the above copyright notice and this paragraph are
13 included on all such copies and derivative works. However, this
14 document itself may not be modified in any way, such as by removing
15 the copyright notice or references to the Internet Society or other
16 Internet organizations, except as needed for the purpose of
17 developing Internet standards in which case the procedures for
18 copyrights defined in the Internet Standards process must be
19 followed, or as required to translate it into languages other than
20 English.
21
22 The limited permissions granted above are perpetual and will not be
23 revoked by the Internet Society or its successors or assigns.
24
25 This document and the information contained herein is provided on an
26 "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
27 TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
28 BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
29 HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
30 MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
31
32 Acknowledgement
33
34 Funding for the RFC Editor function is currently provided by the
35 Internet Society.
36 */
37
/**
 * Decodes a UTF-8 byte buffer into UCS-4 code points.
 *
 * Sequences of 1 to 6 bytes are accepted (the original RFC 2279 layout that
 * the RFC 2640 sample code implements). No check is made for overlong
 * encodings, surrogates or values above U+10FFFF.
 *
 * \param ucs4_buf destination buffer, must not be NULL. The size is not passed
 *        in, so the caller must provide room for one entry per decoded
 *        character; utf8_len entries is always enough since every character
 *        consumes at least one input byte.
 * \param utf8_len number of bytes available in utf8_buf.
 * \param utf8_buf source UTF-8 bytes, must not be NULL. It does not need to be
 *        NUL-terminated; exactly utf8_len bytes are examined.
 * \return the number of code points written to ucs4_buf, or 0 if the input is
 *         malformed (invalid lead byte, sequence truncated by the end of the
 *         buffer, or invalid continuation byte). 0 is also returned for an
 *         empty input. On error, entries decoded before the faulty sequence
 *         have already been written to ucs4_buf.
 */
GF_EXPORT
u32 utf8_to_ucs4(u32 *ucs4_buf, u32 utf8_len, unsigned char *utf8_buf)
{
	const unsigned char *utf8_endbuf;
	u32 ucs_len = 0;
	assert( ucs4_buf );
	assert( utf8_buf );

	utf8_endbuf = utf8_buf + utf8_len;

	while (utf8_buf != utf8_endbuf) {
		const unsigned char lead = *utf8_buf;
		u32 seq_len, code, i;

		/* The lead byte gives the sequence length and the top payload bits */
		if ((lead & 0x80) == 0x00) {
			/* ASCII, no conversion needed */
			seq_len = 1;
			code = lead;
		} else if ((lead & 0xE0) == 0xC0) {
			seq_len = 2;
			code = lead & 0x1F;
		} else if ((lead & 0xF0) == 0xE0) {
			seq_len = 3;
			code = lead & 0x0F;
		} else if ((lead & 0xF8) == 0xF0) {
			seq_len = 4;
			code = lead & 0x07;
		} else if ((lead & 0xFC) == 0xF8) {
			seq_len = 5;
			code = lead & 0x03;
		} else if ((lead & 0xFE) == 0xFC) {
			seq_len = 6;
			code = lead & 0x01;
		} else {
			/* Stray continuation byte (10xxxxxx) or 0xFE / 0xFF */
			return 0;
		}

		/* Never read past the end of the input. This also guarantees that
		 * utf8_buf lands exactly on utf8_endbuf, so the loop terminates. */
		if ((u32) (utf8_endbuf - utf8_buf) < seq_len) {
			return 0;
		}

		/* Each continuation byte must be 10xxxxxx and adds 6 payload bits.
		 * Unsigned shifts replace the original signed multiplications, one
		 * of which used a wrong factor (2^28 instead of 2^24) for the second
		 * byte of a 6-byte sequence. */
		for (i = 1; i < seq_len; i++) {
			const unsigned char cont = utf8_buf[i];
			if ((cont & 0xC0) != 0x80) {
				return 0;
			}
			code = (code << 6) | (u32) (cont & 0x3F);
		}

		*ucs4_buf++ = code;
		utf8_buf += seq_len;
		ucs_len++;
	}
	return ucs_len;
}
109
110