1 /* Conversion of files between different charsets and surfaces.
2    Copyright © 1996, 97, 98, 99, 00 Free Software Foundation, Inc.
3    Contributed by François Pinard <pinard@iro.umontreal.ca>, 1996.
4 
5    This library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public License
7    as published by the Free Software Foundation; either version 2 of the
8    License, or (at your option) any later version.
9 
10    This library is distributed in the hope that it will be
11    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
12    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14 
15    You should have received a copy of the GNU Lesser General Public
16    License along with the `recode' Library; see the file `COPYING.LIB'.
17    If not, write to the Free Software Foundation, Inc., 59 Temple Place -
18    Suite 330, Boston, MA 02111-1307, USA.  */
19 
20 #include "common.h"
21 
22 /* Define HANDLE_32_BITS if you want conversion for 2^32 codes instead
23    of 2^31.  But this would not be ISO-10646, which says 2^31.  */
24 
/* Fetch the next byte of a multi-byte sequence into `character' and make
   sure it is a continuation byte, that is, that its two top bits are
   exactly `10'.

   - On EOF, RECODE_INVALID_INPUT is reported and the enclosing loop is
     left with `break'.
   - On any other byte, RECODE_INVALID_INPUT is reported and the enclosing
     loop is restarted with `continue'; the offending byte stays in
     `character', so the loop gets to look at it again.
   - Otherwise, the statement written right after the macro is executed:
     the expansion deliberately ends on a dangling `else'.

   The EOF test has to come first, so EOF is never examined as data.
   Because of the `break' and `continue', this macro is meant to be used
   only within the `while' loop in `transform_utf8_ucs[24]', where
   `character' and `subtask' are in scope.  */
#define GET_DATA_BYTE \
  character = get_byte (subtask);					\
  if (character == EOF)							\
    {									\
      RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask);			\
      break;								\
    }									\
  else if ((character & (MASK (2) << 6)) != (1 << 7))			\
    {									\
      RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask);			\
      continue;								\
    }									\
  else
41 
/* Read next data byte and check its value, discard an illegal sequence.
   When the byte is acceptable, merge its low 6 bits into `value', shifted
   left by POSITION.  POSITION is parenthesized in the expansion, so any
   expression may safely be given as the shift count.  This macro expands
   GET_DATA_BYTE and so inherits its `break' and `continue': it is meant to
   be used only within the `while' loop in `transform_utf8_ucs[24]', where
   `character', `value' and `subtask' are in scope.  */
#define GET_DATA_BYTE_AT(Position) \
  GET_DATA_BYTE /* ... else */ value |= (MASK (6) & character) << (Position)
47 
48 static bool
transform_ucs2_utf8(RECODE_SUBTASK subtask)49 transform_ucs2_utf8 (RECODE_SUBTASK subtask)
50 {
51   unsigned value;
52 
53   while (get_ucs2 (&value, subtask))
54     {
55       if (value & ~MASK (7))
56 	if (value & ~MASK (11))
57 	  {
58 	    /* 3 bytes - more than 11 bits, but not more than 16.  */
59 	    put_byte ((MASK (3) << 5) | (MASK (6) & value >> 12), subtask);
60 	    put_byte ((1 << 7) | (MASK (6) & value >> 6), subtask);
61 	    put_byte ((1 << 7) | (MASK (6) & value), subtask);
62 	  }
63 	else
64 	  {
65 	    /* 2 bytes - more than 7 bits, but not more than 11.  */
66 	    put_byte ((MASK (2) << 6) | (MASK (6) & value >> 6), subtask);
67 	    put_byte ((1 << 7) | (MASK (6) & value), subtask);
68 	  }
69       else
70 	/* 1 byte - not more than 7 bits (that is, ASCII).  */
71 	put_byte (value, subtask);
72     }
73 
74   SUBTASK_RETURN (subtask);
75 }
76 
77 static bool
transform_ucs4_utf8(RECODE_SUBTASK subtask)78 transform_ucs4_utf8 (RECODE_SUBTASK subtask)
79 {
80   unsigned value;
81 
82   while (get_ucs4 (&value, subtask))
83     if (value & ~MASK (16))
84       if (value & ~MASK (26))
85 	if (value & ~MASK (31))
86 	  {
87 #if HANDLE_32_BITS
88 	    /* 7 bytes - more than 31 bits (that is, exactly 32 :-).  */
89 	    put_byte (MASK (7) << 1);
90 	    put_byte ((1 << 7) | (MASK (6) & value >> 30), subtask);
91 	    put_byte ((1 << 7) | (MASK (6) & value >> 24), subtask);
92 	    put_byte ((1 << 7) | (MASK (6) & value >> 18), subtask);
93 	    put_byte ((1 << 7) | (MASK (6) & value >> 12), subtask);
94 	    put_byte ((1 << 7) | (MASK (6) & value >> 6), subtask);
95 	    put_byte ((1 << 7) | (MASK (6) & value), subtask);
96 #else
97 	    RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask);
98 #endif
99 	  }
100   	else
101 	  {
102 	    /* 6 bytes - more than 26 bits, but not more than 31.  */
103 	    put_byte ((MASK (6) << 2) | (MASK (6) & value >> 30), subtask);
104 	    put_byte ((1 << 7) | (MASK (6) & value >> 24), subtask);
105 	    put_byte ((1 << 7) | (MASK (6) & value >> 18), subtask);
106 	    put_byte ((1 << 7) | (MASK (6) & value >> 12), subtask);
107 	    put_byte ((1 << 7) | (MASK (6) & value >> 6), subtask);
108 	    put_byte ((1 << 7) | (MASK (6) & value), subtask);
109 	  }
110       else if (value & ~MASK (21))
111 	{
112 	  /* 5 bytes - more than 21 bits, but not more than 26.  */
113 	  put_byte ((MASK (5) << 3) | (MASK (6) & value >> 24), subtask);
114 	  put_byte ((1 << 7) | (MASK (6) & value >> 18), subtask);
115 	  put_byte ((1 << 7) | (MASK (6) & value >> 12), subtask);
116 	  put_byte ((1 << 7) | (MASK (6) & value >> 6), subtask);
117 	  put_byte ((1 << 7) | (MASK (6) & value), subtask);
118 	}
119       else
120 	{
121 	  /* 4 bytes - more than 16 bits, but not more than 21.  */
122 	  put_byte ((MASK (4) << 4) | (MASK (6) & value >> 18), subtask);
123 	  put_byte ((1 << 7) | (MASK (6) & value >> 12), subtask);
124 	  put_byte ((1 << 7) | (MASK (6) & value >> 6), subtask);
125 	  put_byte ((1 << 7) | (MASK (6) & value), subtask);
126 	}
127     else if (value & ~MASK (7))
128       if (value & ~MASK (11))
129 	{
130 	  /* 3 bytes - more than 11 bits, but not more than 16.  */
131 	  put_byte ((MASK (3) << 5) | (MASK (6) & value >> 12), subtask);
132 	  put_byte ((1 << 7) | (MASK (6) & value >> 6), subtask);
133 	  put_byte ((1 << 7) | (MASK (6) & value), subtask);
134 	}
135       else
136 	{
137 	  /* 2 bytes - more than 7 bits, but not more than 11.  */
138 	  put_byte ((MASK (2) << 6) | (MASK (6) & value >> 6), subtask);
139 	  put_byte ((1 << 7) | (MASK (6) & value), subtask);
140 	}
141     else
142       /* 1 byte - not more than 7 bits (that is, ASCII).  */
143       put_byte (value, subtask);
144 
145   SUBTASK_RETURN (subtask);
146 }
147 
148 /* FIXME: The UTF-8 decoding algorithms do not validate that the minimum
149    length surface was indeed used.  This would be necessary for ensuring
150    that the recoding is exactly reversible.  In fact, this minimum length
151    surface is also a requirement of UTF-8 specification.  */
152 
153 static bool
transform_utf8_ucs4(RECODE_SUBTASK subtask)154 transform_utf8_ucs4 (RECODE_SUBTASK subtask)
155 {
156   int character = get_byte (subtask);
157   unsigned value;
158 
159   while (character != EOF)
160 
161     /* Process one UTF-8 value.  EOF is acceptable on first byte only.  */
162 
163     if ((character & MASK (4) << 4) == MASK (4) << 4)
164       if ((character & MASK (6) << 2) == MASK (6) << 2)
165 	if ((character & MASK (7) << 1) == MASK (7) << 1)
166 	  {
167 	    /* 7 bytes - more than 31 bits (that is, exactly 32 :-).  */
168 #if HANDLE_32_BITS
169 	    value = 0;
170 	    GET_DATA_BYTE_AT (30);
171 	    GET_DATA_BYTE_AT (24);
172 	    GET_DATA_BYTE_AT (18);
173 	    GET_DATA_BYTE_AT (12);
174 	    GET_DATA_BYTE_AT (6);
175 	    GET_DATA_BYTE_AT (0);
176 	    put_ucs4 (value, subtask);
177 	    character = get_byte (subtask);
178 #else
179 	    RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask);
180 	    character = get_byte (subtask);
181 #endif
182 	  }
183 	else
184 	  {
185 	    /* 6 bytes - more than 26 bits, but not more than 31.  */
186 	    value = (MASK (1) & character) << 30;
187 	    GET_DATA_BYTE_AT (24);
188 	    GET_DATA_BYTE_AT (18);
189 	    GET_DATA_BYTE_AT (12);
190 	    GET_DATA_BYTE_AT (6);
191 	    GET_DATA_BYTE_AT (0);
192 	    put_ucs4 (value, subtask);
193 	    character = get_byte (subtask);
194 	  }
195       else if ((character & MASK (5) << 3) == MASK (5) << 3)
196 	{
197 	  /* 5 bytes - more than 21 bits, but not more than 26.  */
198 	  value = (MASK (2) & character) << 24;
199 	  GET_DATA_BYTE_AT (18);
200 	  GET_DATA_BYTE_AT (12);
201 	  GET_DATA_BYTE_AT (6);
202 	  GET_DATA_BYTE_AT (0);
203 	  put_ucs4 (value, subtask);
204 	  character = get_byte (subtask);
205 	}
206       else
207 	{
208 	  /* 4 bytes - more than 16 bits, but not more than 21.  */
209 	  value = (MASK (3) & character) << 18;
210 	  GET_DATA_BYTE_AT (12);
211 	  GET_DATA_BYTE_AT (6);
212 	  GET_DATA_BYTE_AT (0);
213 	  put_ucs4 (value, subtask);
214 	  character = get_byte (subtask);
215 	}
216     else if ((character & MASK (2) << 6) == MASK (2) << 6)
217       if ((character & MASK (3) << 5) == MASK (3) << 5)
218 	{
219 	  /* 3 bytes - more than 11 bits, but not more than 16.  */
220 	  value = (MASK (4) & character) << 12;
221 	  GET_DATA_BYTE_AT (6);
222 	  GET_DATA_BYTE_AT (0);
223 	  put_ucs4 (value, subtask);
224 	  character = get_byte (subtask);
225 	}
226       else
227 	{
228 	  /* 2 bytes - more than 7 bits, but not more than 11.  */
229 	  value = (MASK (5) & character) << 6;
230 	  GET_DATA_BYTE_AT (0);
231 	  put_ucs4 (value, subtask);
232 	  character = get_byte (subtask);
233 	}
234     else if ((character & 1 << 7) == 1 << 7)
235       {
236 	/* Valid only as a continuation byte.  */
237 	RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask);
238 	character = get_byte (subtask);
239       }
240     else
241       {
242 	/* 1 byte - not more than 7 bits (that is, ASCII).  */
243 	put_ucs4 (MASK (8) & character, subtask);
244 	character = get_byte (subtask);
245       }
246 
247   SUBTASK_RETURN (subtask);
248 }
249 
250 bool
module_utf8(RECODE_OUTER outer)251 module_utf8 (RECODE_OUTER outer)
252 {
253   return
254     declare_single (outer, "ISO-10646-UCS-4", "UTF-8",
255 		    outer->quality_variable_to_variable,
256 		    NULL, transform_ucs4_utf8)
257     && declare_single (outer, "UTF-8", "ISO-10646-UCS-4",
258 		       outer->quality_variable_to_variable,
259 		       NULL, transform_utf8_ucs4)
260 
261     && declare_alias (outer, "UTF-2", "UTF-8")
262     && declare_alias (outer, "UTF-FSS", "UTF-8")
263     && declare_alias (outer, "FSS_UTF", "UTF-8")
264     && declare_alias (outer, "TF-8", "UTF-8")
265     && declare_alias (outer, "u8", "UTF-8")
266 
267     /* Simple UCS-2 does not have to go through UTF-16.  */
268     && declare_single (outer, "ISO-10646-UCS-2", "UTF-8",
269 		       outer->quality_variable_to_variable,
270 		       NULL, transform_ucs2_utf8);
271 }
272 
273 void
delmodule_utf8(RECODE_OUTER outer)274 delmodule_utf8 (RECODE_OUTER outer)
275 {
276 }
277