1 #include <gpac/utf.h>
2 /**
3  * This code has been adapted from http://www.ietf.org/rfc/rfc2640.txt
4  * Full Copyright Statement
5 
6    Copyright (C) The Internet Society (1999).  All Rights Reserved.
7 
8    This document and translations of it may be copied and furnished to
9    others, and derivative works that comment on or otherwise explain it
10    or assist in its implementation may be prepared, copied, published
11    and distributed, in whole or in part, without restriction of any
12    kind, provided that the above copyright notice and this paragraph are
13    included on all such copies and derivative works.  However, this
14    document itself may not be modified in any way, such as by removing
15    the copyright notice or references to the Internet Society or other
16    Internet organizations, except as needed for the purpose of
17    developing Internet standards in which case the procedures for
18    copyrights defined in the Internet Standards process must be
19    followed, or as required to translate it into languages other than
20    English.
21 
22    The limited permissions granted above are perpetual and will not be
23    revoked by the Internet Society or its successors or assigns.
24 
25    This document and the information contained herein is provided on an
26    "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
27    TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
28    BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
29    HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
30    MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
31 
32 Acknowledgement
33 
34    Funding for the RFC Editor function is currently provided by the
35    Internet Society.
36  */
37 
/**
 * Decode a UTF-8 byte sequence into UCS-4 code points.
 *
 * Accepts the original 1- to 6-byte UTF-8 forms (values up to 0x7FFFFFFF).
 * Overlong encodings and surrogate values are not rejected; only the
 * structural shape of each sequence (lead byte followed by the right number
 * of 10xxxxxx continuation bytes) is checked.
 *
 * @param ucs4_buf  destination array; receives one u32 per decoded sequence.
 *                  The caller must provide room for up to utf8_len entries,
 *                  since no output size is passed to this function.
 * @param utf8_len  number of bytes available in utf8_buf.
 * @param utf8_buf  input bytes; not required to be NUL-terminated.
 * @return number of code points written to ucs4_buf, or 0 when the input is
 *         empty, contains an invalid lead byte, an invalid continuation
 *         byte, or ends in the middle of a multi-byte sequence. On error
 *         the entries decoded before the faulty sequence have already been
 *         written to ucs4_buf.
 */
GF_EXPORT
u32 utf8_to_ucs4(u32 *ucs4_buf, u32 utf8_len, unsigned char *utf8_buf)
{
	const unsigned char *src;
	const unsigned char *src_end;
	u32 ucs_len = 0;

	assert( ucs4_buf );
	assert( utf8_buf );

	src = utf8_buf;
	src_end = utf8_buf + utf8_len;

	while (src < src_end) {
		const unsigned char lead = *src;
		u32 code;
		u32 trail_count;
		u32 i;

		/* The lead byte gives both the sequence length and the top payload bits. */
		if ((lead & 0x80) == 0x00) {
			code = lead;
			trail_count = 0;
		} else if ((lead & 0xE0) == 0xC0) {
			code = lead & 0x1F;
			trail_count = 1;
		} else if ((lead & 0xF0) == 0xE0) {
			code = lead & 0x0F;
			trail_count = 2;
		} else if ((lead & 0xF8) == 0xF0) {
			code = lead & 0x07;
			trail_count = 3;
		} else if ((lead & 0xFC) == 0xF8) {
			code = lead & 0x03;
			trail_count = 4;
		} else if ((lead & 0xFE) == 0xFC) {
			code = lead & 0x01;
			trail_count = 5;
		} else {
			/* Stray continuation byte (10xxxxxx) or 0xFE/0xFF. */
			return 0;
		}

		/* Refuse a sequence that would extend past the end of the input
		 * instead of reading beyond the buffer. */
		if ((u32) (src_end - src) <= trail_count) {
			return 0;
		}
		src++;

		/* Each continuation byte contributes 6 payload bits. Accumulating
		 * in an unsigned value with shifts gives every byte its correct
		 * weight (including the second byte of a 6-byte sequence, 2^24)
		 * and avoids signed overflow. */
		for (i = 0; i < trail_count; i++) {
			if ((*src & 0xC0) != 0x80) {
				return 0;
			}
			code = (code << 6) | (u32) (*src & 0x3F);
			src++;
		}

		*ucs4_buf++ = code;
		ucs_len++;
	}
	return ucs_len;
}
109 
110