1 /* Conversion of files between different charsets and surfaces.
2 Copyright © 1996, 97, 98, 99, 00 Free Software Foundation, Inc.
3 Contributed by François Pinard <pinard@iro.umontreal.ca>, 1996.
4
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public License
7 as published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
9
10 This library is distributed in the hope that it will be
11 useful, but WITHOUT ANY WARRANTY; without even the implied warranty
12 of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the `recode' Library; see the file `COPYING.LIB'.
17 If not, write to the Free Software Foundation, Inc., 59 Temple Place -
18 Suite 330, Boston, MA 02111-1307, USA. */
19
20 #include "common.h"
21
22 /* Define HANDLE_32_BITS if you want conversion for 2^32 codes instead
23 of 2^31. But this would not be ISO-10646, which says 2^31. */
24
/* Fetch the next byte into `character' and require it to be a UTF-8
   continuation byte, that is, one whose two top bits are `10'.

   This macro deliberately ends with a dangling `else': the statement
   written right after it runs only when the byte is acceptable.
   Otherwise RECODE_INVALID_INPUT is reported through RETURN_IF_NOGO and,
   if that does not return, the macro either leaves the enclosing loop
   (on EOF) or restarts it (on any other byte, which is left in
   `character' for the next iteration).

   Because of the `break' and `continue', this may only be used directly
   inside the `while' loop of `transform_utf8_ucs[24]', where `character'
   and `subtask' are in scope.  */
#define GET_DATA_BYTE \
  character = get_byte (subtask);					\
  if (character == EOF							\
      || (character & (MASK (2) << 6)) != (1 << 7))			\
    {									\
      RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask);			\
      if (character == EOF)						\
	break;								\
      continue;								\
    }									\
  else
41
/* Fetch and check the next data byte with GET_DATA_BYTE and, when it is
   acceptable, merge its six low-order bits into `value', shifted left by
   POSITION bits.  An unacceptable byte or EOF is handled by
   GET_DATA_BYTE itself, in which case `value' is left untouched.

   POSITION is parenthesized in the expansion so that any expression may
   safely be given as the argument.  Like GET_DATA_BYTE, this macro is
   meant to be used only within the `while' loop in
   `transform_utf8_ucs[24]', where `character', `value' and `subtask' are
   in scope.  */
#define GET_DATA_BYTE_AT(Position) \
  GET_DATA_BYTE /* ... else */ value |= (MASK (6) & character) << (Position)
47
48 static bool
transform_ucs2_utf8(RECODE_SUBTASK subtask)49 transform_ucs2_utf8 (RECODE_SUBTASK subtask)
50 {
51 unsigned value;
52
53 while (get_ucs2 (&value, subtask))
54 {
55 if (value & ~MASK (7))
56 if (value & ~MASK (11))
57 {
58 /* 3 bytes - more than 11 bits, but not more than 16. */
59 put_byte ((MASK (3) << 5) | (MASK (6) & value >> 12), subtask);
60 put_byte ((1 << 7) | (MASK (6) & value >> 6), subtask);
61 put_byte ((1 << 7) | (MASK (6) & value), subtask);
62 }
63 else
64 {
65 /* 2 bytes - more than 7 bits, but not more than 11. */
66 put_byte ((MASK (2) << 6) | (MASK (6) & value >> 6), subtask);
67 put_byte ((1 << 7) | (MASK (6) & value), subtask);
68 }
69 else
70 /* 1 byte - not more than 7 bits (that is, ASCII). */
71 put_byte (value, subtask);
72 }
73
74 SUBTASK_RETURN (subtask);
75 }
76
77 static bool
transform_ucs4_utf8(RECODE_SUBTASK subtask)78 transform_ucs4_utf8 (RECODE_SUBTASK subtask)
79 {
80 unsigned value;
81
82 while (get_ucs4 (&value, subtask))
83 if (value & ~MASK (16))
84 if (value & ~MASK (26))
85 if (value & ~MASK (31))
86 {
87 #if HANDLE_32_BITS
88 /* 7 bytes - more than 31 bits (that is, exactly 32 :-). */
89 put_byte (MASK (7) << 1);
90 put_byte ((1 << 7) | (MASK (6) & value >> 30), subtask);
91 put_byte ((1 << 7) | (MASK (6) & value >> 24), subtask);
92 put_byte ((1 << 7) | (MASK (6) & value >> 18), subtask);
93 put_byte ((1 << 7) | (MASK (6) & value >> 12), subtask);
94 put_byte ((1 << 7) | (MASK (6) & value >> 6), subtask);
95 put_byte ((1 << 7) | (MASK (6) & value), subtask);
96 #else
97 RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask);
98 #endif
99 }
100 else
101 {
102 /* 6 bytes - more than 26 bits, but not more than 31. */
103 put_byte ((MASK (6) << 2) | (MASK (6) & value >> 30), subtask);
104 put_byte ((1 << 7) | (MASK (6) & value >> 24), subtask);
105 put_byte ((1 << 7) | (MASK (6) & value >> 18), subtask);
106 put_byte ((1 << 7) | (MASK (6) & value >> 12), subtask);
107 put_byte ((1 << 7) | (MASK (6) & value >> 6), subtask);
108 put_byte ((1 << 7) | (MASK (6) & value), subtask);
109 }
110 else if (value & ~MASK (21))
111 {
112 /* 5 bytes - more than 21 bits, but not more than 26. */
113 put_byte ((MASK (5) << 3) | (MASK (6) & value >> 24), subtask);
114 put_byte ((1 << 7) | (MASK (6) & value >> 18), subtask);
115 put_byte ((1 << 7) | (MASK (6) & value >> 12), subtask);
116 put_byte ((1 << 7) | (MASK (6) & value >> 6), subtask);
117 put_byte ((1 << 7) | (MASK (6) & value), subtask);
118 }
119 else
120 {
121 /* 4 bytes - more than 16 bits, but not more than 21. */
122 put_byte ((MASK (4) << 4) | (MASK (6) & value >> 18), subtask);
123 put_byte ((1 << 7) | (MASK (6) & value >> 12), subtask);
124 put_byte ((1 << 7) | (MASK (6) & value >> 6), subtask);
125 put_byte ((1 << 7) | (MASK (6) & value), subtask);
126 }
127 else if (value & ~MASK (7))
128 if (value & ~MASK (11))
129 {
130 /* 3 bytes - more than 11 bits, but not more than 16. */
131 put_byte ((MASK (3) << 5) | (MASK (6) & value >> 12), subtask);
132 put_byte ((1 << 7) | (MASK (6) & value >> 6), subtask);
133 put_byte ((1 << 7) | (MASK (6) & value), subtask);
134 }
135 else
136 {
137 /* 2 bytes - more than 7 bits, but not more than 11. */
138 put_byte ((MASK (2) << 6) | (MASK (6) & value >> 6), subtask);
139 put_byte ((1 << 7) | (MASK (6) & value), subtask);
140 }
141 else
142 /* 1 byte - not more than 7 bits (that is, ASCII). */
143 put_byte (value, subtask);
144
145 SUBTASK_RETURN (subtask);
146 }
147
148 /* FIXME: The UTF-8 decoding algorithms do not validate that the minimum
149 length surface was indeed used. This would be necessary for ensuring
150 that the recoding is exactly reversible. In fact, this minimum length
151 surface is also a requirement of UTF-8 specification. */
152
153 static bool
transform_utf8_ucs4(RECODE_SUBTASK subtask)154 transform_utf8_ucs4 (RECODE_SUBTASK subtask)
155 {
156 int character = get_byte (subtask);
157 unsigned value;
158
159 while (character != EOF)
160
161 /* Process one UTF-8 value. EOF is acceptable on first byte only. */
162
163 if ((character & MASK (4) << 4) == MASK (4) << 4)
164 if ((character & MASK (6) << 2) == MASK (6) << 2)
165 if ((character & MASK (7) << 1) == MASK (7) << 1)
166 {
167 /* 7 bytes - more than 31 bits (that is, exactly 32 :-). */
168 #if HANDLE_32_BITS
169 value = 0;
170 GET_DATA_BYTE_AT (30);
171 GET_DATA_BYTE_AT (24);
172 GET_DATA_BYTE_AT (18);
173 GET_DATA_BYTE_AT (12);
174 GET_DATA_BYTE_AT (6);
175 GET_DATA_BYTE_AT (0);
176 put_ucs4 (value, subtask);
177 character = get_byte (subtask);
178 #else
179 RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask);
180 character = get_byte (subtask);
181 #endif
182 }
183 else
184 {
185 /* 6 bytes - more than 26 bits, but not more than 31. */
186 value = (MASK (1) & character) << 30;
187 GET_DATA_BYTE_AT (24);
188 GET_DATA_BYTE_AT (18);
189 GET_DATA_BYTE_AT (12);
190 GET_DATA_BYTE_AT (6);
191 GET_DATA_BYTE_AT (0);
192 put_ucs4 (value, subtask);
193 character = get_byte (subtask);
194 }
195 else if ((character & MASK (5) << 3) == MASK (5) << 3)
196 {
197 /* 5 bytes - more than 21 bits, but not more than 26. */
198 value = (MASK (2) & character) << 24;
199 GET_DATA_BYTE_AT (18);
200 GET_DATA_BYTE_AT (12);
201 GET_DATA_BYTE_AT (6);
202 GET_DATA_BYTE_AT (0);
203 put_ucs4 (value, subtask);
204 character = get_byte (subtask);
205 }
206 else
207 {
208 /* 4 bytes - more than 16 bits, but not more than 21. */
209 value = (MASK (3) & character) << 18;
210 GET_DATA_BYTE_AT (12);
211 GET_DATA_BYTE_AT (6);
212 GET_DATA_BYTE_AT (0);
213 put_ucs4 (value, subtask);
214 character = get_byte (subtask);
215 }
216 else if ((character & MASK (2) << 6) == MASK (2) << 6)
217 if ((character & MASK (3) << 5) == MASK (3) << 5)
218 {
219 /* 3 bytes - more than 11 bits, but not more than 16. */
220 value = (MASK (4) & character) << 12;
221 GET_DATA_BYTE_AT (6);
222 GET_DATA_BYTE_AT (0);
223 put_ucs4 (value, subtask);
224 character = get_byte (subtask);
225 }
226 else
227 {
228 /* 2 bytes - more than 7 bits, but not more than 11. */
229 value = (MASK (5) & character) << 6;
230 GET_DATA_BYTE_AT (0);
231 put_ucs4 (value, subtask);
232 character = get_byte (subtask);
233 }
234 else if ((character & 1 << 7) == 1 << 7)
235 {
236 /* Valid only as a continuation byte. */
237 RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask);
238 character = get_byte (subtask);
239 }
240 else
241 {
242 /* 1 byte - not more than 7 bits (that is, ASCII). */
243 put_ucs4 (MASK (8) & character, subtask);
244 character = get_byte (subtask);
245 }
246
247 SUBTASK_RETURN (subtask);
248 }
249
250 bool
module_utf8(RECODE_OUTER outer)251 module_utf8 (RECODE_OUTER outer)
252 {
253 return
254 declare_single (outer, "ISO-10646-UCS-4", "UTF-8",
255 outer->quality_variable_to_variable,
256 NULL, transform_ucs4_utf8)
257 && declare_single (outer, "UTF-8", "ISO-10646-UCS-4",
258 outer->quality_variable_to_variable,
259 NULL, transform_utf8_ucs4)
260
261 && declare_alias (outer, "UTF-2", "UTF-8")
262 && declare_alias (outer, "UTF-FSS", "UTF-8")
263 && declare_alias (outer, "FSS_UTF", "UTF-8")
264 && declare_alias (outer, "TF-8", "UTF-8")
265 && declare_alias (outer, "u8", "UTF-8")
266
267 /* Simple UCS-2 does not have to go through UTF-16. */
268 && declare_single (outer, "ISO-10646-UCS-2", "UTF-8",
269 outer->quality_variable_to_variable,
270 NULL, transform_ucs2_utf8);
271 }
272
273 void
delmodule_utf8(RECODE_OUTER outer)274 delmodule_utf8 (RECODE_OUTER outer)
275 {
276 }
277