1------------------------------------------------------------------------------
2--                                                                          --
3--                         GNAT RUN-TIME COMPONENTS                         --
4--                                                                          --
5--                       S Y S T E M . W C H _ C N V                        --
6--                                                                          --
7--                                 B o d y                                  --
8--                                                                          --
9--          Copyright (C) 1992-2009, Free Software Foundation, Inc.         --
10--                                                                          --
11-- GNAT is free software;  you can  redistribute it  and/or modify it under --
12-- terms of the  GNU General Public License as published  by the Free Soft- --
13-- ware  Foundation;  either version 3,  or (at your option) any later ver- --
14-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
15-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
16-- or FITNESS FOR A PARTICULAR PURPOSE.                                     --
17--                                                                          --
18-- As a special exception under Section 7 of GPL version 3, you are granted --
19-- additional permissions described in the GCC Runtime Library Exception,   --
20-- version 3.1, as published by the Free Software Foundation.               --
21--                                                                          --
22-- You should have received a copy of the GNU General Public License and    --
23-- a copy of the GCC Runtime Library Exception along with this program;     --
24-- see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see    --
25-- <http://www.gnu.org/licenses/>.                                          --
26--                                                                          --
27-- GNAT was originally developed  by the GNAT team at  New York University. --
28-- Extensive contributions were provided by Ada Core Technologies Inc.      --
29--                                                                          --
30------------------------------------------------------------------------------
31
32pragma Compiler_Unit;
33
34with Interfaces;     use Interfaces;
35with System.WCh_Con; use System.WCh_Con;
36with System.WCh_JIS; use System.WCh_JIS;
37
38package body System.WCh_Cnv is
39
40   -----------------------------
41   -- Char_Sequence_To_UTF_32 --
42   -----------------------------
43
44   function Char_Sequence_To_UTF_32
45     (C  : Character;
46      EM : System.WCh_Con.WC_Encoding_Method) return UTF_32_Code
47   is
48      B1 : Unsigned_32;
49      C1 : Character;
50      U  : Unsigned_32;
51      W  : Unsigned_32;
52
53      procedure Get_Hex (N : Character);
54      --  If N is a hex character, then set B1 to 16 * B1 + character N.
55      --  Raise Constraint_Error if character N is not a hex character.
56
57      procedure Get_UTF_Byte;
58      pragma Inline (Get_UTF_Byte);
59      --  Used to interpret a 2#10xxxxxx# continuation byte in UTF-8 mode.
60      --  Reads a byte, and raises CE if the first two bits are not 10.
61      --  Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits.
62
63      -------------
64      -- Get_Hex --
65      -------------
66
67      procedure Get_Hex (N : Character) is
68         B2 : constant Unsigned_32 := Character'Pos (N);
69      begin
70         if B2 in Character'Pos ('0') .. Character'Pos ('9') then
71            B1 := B1 * 16 + B2 - Character'Pos ('0');
72         elsif B2 in Character'Pos ('A') .. Character'Pos ('F') then
73            B1 := B1 * 16 + B2 - (Character'Pos ('A') - 10);
74         elsif B2 in Character'Pos ('a') .. Character'Pos ('f') then
75            B1 := B1 * 16 + B2 - (Character'Pos ('a') - 10);
76         else
77            raise Constraint_Error;
78         end if;
79      end Get_Hex;
80
81      ------------------
82      -- Get_UTF_Byte --
83      ------------------
84
85      procedure Get_UTF_Byte is
86      begin
87         U := Unsigned_32 (Character'Pos (In_Char));
88
89         if (U and 2#11000000#) /= 2#10_000000# then
90            raise Constraint_Error;
91         end if;
92
93         W := Shift_Left (W, 6) or (U and 2#00111111#);
94      end Get_UTF_Byte;
95
96   --  Start of processing for Char_Sequence_To_Wide
97
98   begin
99      case EM is
100
101         when WCEM_Hex =>
102            if C /= ASCII.ESC then
103               return Character'Pos (C);
104
105            else
106               B1 := 0;
107               Get_Hex (In_Char);
108               Get_Hex (In_Char);
109               Get_Hex (In_Char);
110               Get_Hex (In_Char);
111
112               return UTF_32_Code (B1);
113            end if;
114
115         when WCEM_Upper =>
116            if C > ASCII.DEL then
117               return 256 * Character'Pos (C) + Character'Pos (In_Char);
118            else
119               return Character'Pos (C);
120            end if;
121
122         when WCEM_Shift_JIS =>
123            if C > ASCII.DEL then
124               return Wide_Character'Pos (Shift_JIS_To_JIS (C, In_Char));
125            else
126               return Character'Pos (C);
127            end if;
128
129         when WCEM_EUC =>
130            if C > ASCII.DEL then
131               return Wide_Character'Pos (EUC_To_JIS (C, In_Char));
132            else
133               return Character'Pos (C);
134            end if;
135
136         when WCEM_UTF8 =>
137
138            --  Note: for details of UTF8 encoding see RFC 3629
139
140            U := Unsigned_32 (Character'Pos (C));
141
142            --  16#00_0000#-16#00_007F#: 0xxxxxxx
143
144            if (U and 2#10000000#) = 2#00000000# then
145               return Character'Pos (C);
146
147            --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
148
149            elsif (U and 2#11100000#) = 2#110_00000# then
150               W := U and 2#00011111#;
151               Get_UTF_Byte;
152               return UTF_32_Code (W);
153
154            --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
155
156            elsif (U and 2#11110000#) = 2#1110_0000# then
157               W := U and 2#00001111#;
158               Get_UTF_Byte;
159               Get_UTF_Byte;
160               return UTF_32_Code (W);
161
162            --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
163
164            elsif (U and 2#11111000#) = 2#11110_000# then
165               W := U and 2#00000111#;
166
167               for K in 1 .. 3 loop
168                  Get_UTF_Byte;
169               end loop;
170
171               return UTF_32_Code (W);
172
173            --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
174            --                               10xxxxxx 10xxxxxx
175
176            elsif (U and 2#11111100#) = 2#111110_00# then
177               W := U and 2#00000011#;
178
179               for K in 1 .. 4 loop
180                  Get_UTF_Byte;
181               end loop;
182
183               return UTF_32_Code (W);
184
185            --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
186            --                               10xxxxxx 10xxxxxx 10xxxxxx
187
188            elsif (U and 2#11111110#) = 2#1111110_0# then
189               W := U and 2#00000001#;
190
191               for K in 1 .. 5 loop
192                  Get_UTF_Byte;
193               end loop;
194
195               return UTF_32_Code (W);
196
197            else
198               raise Constraint_Error;
199            end if;
200
201         when WCEM_Brackets =>
202            if C /= '[' then
203               return Character'Pos (C);
204            end if;
205
206            if In_Char /= '"' then
207               raise Constraint_Error;
208            end if;
209
210            B1 := 0;
211            Get_Hex (In_Char);
212            Get_Hex (In_Char);
213
214            C1 := In_Char;
215
216            if C1 /= '"' then
217               Get_Hex (C1);
218               Get_Hex (In_Char);
219
220               C1 := In_Char;
221
222               if C1 /= '"' then
223                  Get_Hex (C1);
224                  Get_Hex (In_Char);
225
226                  C1 := In_Char;
227
228                  if C1 /= '"' then
229                     Get_Hex (C1);
230                     Get_Hex (In_Char);
231
232                     if B1 > Unsigned_32 (UTF_32_Code'Last) then
233                        raise Constraint_Error;
234                     end if;
235
236                     if In_Char /= '"' then
237                        raise Constraint_Error;
238                     end if;
239                  end if;
240               end if;
241            end if;
242
243            if In_Char /= ']' then
244               raise Constraint_Error;
245            end if;
246
247            return UTF_32_Code (B1);
248
249      end case;
250   end Char_Sequence_To_UTF_32;
251
252   --------------------------------
253   -- Char_Sequence_To_Wide_Char --
254   --------------------------------
255
256   function Char_Sequence_To_Wide_Char
257     (C  : Character;
258      EM : System.WCh_Con.WC_Encoding_Method) return Wide_Character
259   is
260      function Char_Sequence_To_UTF is new Char_Sequence_To_UTF_32 (In_Char);
261
262      U : constant UTF_32_Code := Char_Sequence_To_UTF (C, EM);
263
264   begin
265      if U > 16#FFFF# then
266         raise Constraint_Error;
267      else
268         return Wide_Character'Val (U);
269      end if;
270   end Char_Sequence_To_Wide_Char;
271
272   -----------------------------
273   -- UTF_32_To_Char_Sequence --
274   -----------------------------
275
276   procedure UTF_32_To_Char_Sequence
277     (Val : UTF_32_Code;
278      EM  : System.WCh_Con.WC_Encoding_Method)
279   is
280      Hexc : constant array (UTF_32_Code range 0 .. 15) of Character :=
281               "0123456789ABCDEF";
282
283      C1, C2 : Character;
284      U      : Unsigned_32;
285
286   begin
287      --  Raise CE for invalid UTF_32_Code
288
289      if not Val'Valid then
290         raise Constraint_Error;
291      end if;
292
293      --  Processing depends on encoding mode
294
295      case EM is
296
297         when WCEM_Hex =>
298            if Val < 256 then
299               Out_Char (Character'Val (Val));
300            elsif Val <= 16#FFFF# then
301               Out_Char (ASCII.ESC);
302               Out_Char (Hexc (Val / (16**3)));
303               Out_Char (Hexc ((Val / (16**2)) mod 16));
304               Out_Char (Hexc ((Val / 16) mod 16));
305               Out_Char (Hexc (Val mod 16));
306            else
307               raise Constraint_Error;
308            end if;
309
310         when WCEM_Upper =>
311            if Val < 128 then
312               Out_Char (Character'Val (Val));
313            elsif Val < 16#8000# or else Val > 16#FFFF# then
314               raise Constraint_Error;
315            else
316               Out_Char (Character'Val (Val / 256));
317               Out_Char (Character'Val (Val mod 256));
318            end if;
319
320         when WCEM_Shift_JIS =>
321            if Val < 128 then
322               Out_Char (Character'Val (Val));
323            elsif Val <= 16#FFFF# then
324               JIS_To_Shift_JIS (Wide_Character'Val (Val), C1, C2);
325               Out_Char (C1);
326               Out_Char (C2);
327            else
328               raise Constraint_Error;
329            end if;
330
331         when WCEM_EUC =>
332            if Val < 128 then
333               Out_Char (Character'Val (Val));
334            elsif Val <= 16#FFFF# then
335               JIS_To_EUC (Wide_Character'Val (Val), C1, C2);
336               Out_Char (C1);
337               Out_Char (C2);
338            else
339               raise Constraint_Error;
340            end if;
341
342         when WCEM_UTF8 =>
343
344            --  Note: for details of UTF8 encoding see RFC 3629
345
346            U := Unsigned_32 (Val);
347
348            --  16#00_0000#-16#00_007F#: 0xxxxxxx
349
350            if U <= 16#00_007F# then
351               Out_Char (Character'Val (U));
352
353            --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
354
355            elsif U <= 16#00_07FF# then
356               Out_Char (Character'Val (2#11000000# or Shift_Right (U, 6)));
357               Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
358
359            --  16#00_0800#-16#00_FFFF#: 1110xxxx 10xxxxxx 10xxxxxx
360
361            elsif U <= 16#00_FFFF# then
362               Out_Char (Character'Val (2#11100000# or Shift_Right (U, 12)));
363               Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6)
364                                                          and 2#00111111#)));
365               Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
366
367            --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
368
369            elsif U <= 16#10_FFFF# then
370               Out_Char (Character'Val (2#11110000# or Shift_Right (U, 18)));
371               Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 12)
372                                                          and 2#00111111#)));
373               Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6)
374                                                          and 2#00111111#)));
375               Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
376
377            --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
378            --                               10xxxxxx 10xxxxxx
379
380            elsif U <= 16#03FF_FFFF# then
381               Out_Char (Character'Val (2#11111000# or Shift_Right (U, 24)));
382               Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 18)
383                                                          and 2#00111111#)));
384               Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 12)
385                                                          and 2#00111111#)));
386               Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6)
387                                                          and 2#00111111#)));
388               Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
389
390            --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
391            --                               10xxxxxx 10xxxxxx 10xxxxxx
392
393            elsif U <= 16#7FFF_FFFF# then
394               Out_Char (Character'Val (2#11111100# or Shift_Right (U, 30)));
395               Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 24)
396                                                          and 2#00111111#)));
397               Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 18)
398                                                          and 2#00111111#)));
399               Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 12)
400                                                          and 2#00111111#)));
401               Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6)
402                                                          and 2#00111111#)));
403               Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
404
405            else
406               raise Constraint_Error;
407            end if;
408
409         when WCEM_Brackets =>
410
411            --  Values in the range 0-255 are directly output. Note that there
412            --  is some issue with [ (16#5B#] since this will cause confusion
413            --  if the resulting string is interpreted using brackets encoding.
414
415            --  One possibility would be to always output [ as ["5B"] but in
416            --  practice this is undesirable, since for example normal use of
417            --  Wide_Text_IO for output (much more common than input), really
418            --  does want to be able to say something like
419
420            --     Put_Line ("Start of output [first run]");
421
422            --  and have it come out as intended, rather than contaminated by
423            --  a ["5B"] sequence in place of the left bracket.
424
425            if Val < 256 then
426               Out_Char (Character'Val (Val));
427
428            --  Otherwise use brackets notation for vales greater than 255
429
430            else
431               Out_Char ('[');
432               Out_Char ('"');
433
434               if Val > 16#FFFF# then
435                  if Val > 16#00FF_FFFF# then
436                     Out_Char (Hexc (Val / 16 ** 7));
437                     Out_Char (Hexc ((Val / 16 ** 6) mod 16));
438                  end if;
439
440                  Out_Char (Hexc ((Val / 16 ** 5) mod 16));
441                  Out_Char (Hexc ((Val / 16 ** 4) mod 16));
442               end if;
443
444               Out_Char (Hexc ((Val / 16 ** 3) mod 16));
445               Out_Char (Hexc ((Val / 16 ** 2) mod 16));
446               Out_Char (Hexc ((Val / 16) mod 16));
447               Out_Char (Hexc (Val mod 16));
448
449               Out_Char ('"');
450               Out_Char (']');
451            end if;
452      end case;
453   end UTF_32_To_Char_Sequence;
454
455   --------------------------------
456   -- Wide_Char_To_Char_Sequence --
457   --------------------------------
458
459   procedure Wide_Char_To_Char_Sequence
460     (WC : Wide_Character;
461      EM : System.WCh_Con.WC_Encoding_Method)
462   is
463      procedure UTF_To_Char_Sequence is new UTF_32_To_Char_Sequence (Out_Char);
464   begin
465      UTF_To_Char_Sequence (Wide_Character'Pos (WC), EM);
466   end Wide_Char_To_Char_Sequence;
467
468end System.WCh_Cnv;
469