1------------------------------------------------------------------------------
2--                                                                          --
3--                         GNAT RUN-TIME COMPONENTS                         --
4--                                                                          --
5--                       S Y S T E M . W C H _ C N V                        --
6--                                                                          --
7--                                 B o d y                                  --
8--                                                                          --
9--          Copyright (C) 1992-2018, Free Software Foundation, Inc.         --
10--                                                                          --
11-- GNAT is free software;  you can  redistribute it  and/or modify it under --
12-- terms of the  GNU General Public License as published  by the Free Soft- --
13-- ware  Foundation;  either version 3,  or (at your option) any later ver- --
14-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
15-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
16-- or FITNESS FOR A PARTICULAR PURPOSE.                                     --
17--                                                                          --
18-- As a special exception under Section 7 of GPL version 3, you are granted --
19-- additional permissions described in the GCC Runtime Library Exception,   --
20-- version 3.1, as published by the Free Software Foundation.               --
21--                                                                          --
22-- You should have received a copy of the GNU General Public License and    --
23-- a copy of the GCC Runtime Library Exception along with this program;     --
24-- see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see    --
25-- <http://www.gnu.org/licenses/>.                                          --
26--                                                                          --
27-- GNAT was originally developed  by the GNAT team at  New York University. --
28-- Extensive contributions were provided by Ada Core Technologies Inc.      --
29--                                                                          --
30------------------------------------------------------------------------------
31
32pragma Compiler_Unit_Warning;
33
34with Interfaces;     use Interfaces;
35with System.WCh_Con; use System.WCh_Con;
36with System.WCh_JIS; use System.WCh_JIS;
37
38package body System.WCh_Cnv is
39
40   -----------------------------
41   -- Char_Sequence_To_UTF_32 --
42   -----------------------------
43
44   function Char_Sequence_To_UTF_32
45     (C  : Character;
46      EM : System.WCh_Con.WC_Encoding_Method) return UTF_32_Code
47   is
48      B1 : Unsigned_32;
49      C1 : Character;
50      U  : Unsigned_32;
51      W  : Unsigned_32;
52
53      procedure Get_Hex (N : Character);
54      --  If N is a hex character, then set B1 to 16 * B1 + character N.
55      --  Raise Constraint_Error if character N is not a hex character.
56
57      procedure Get_UTF_Byte;
58      pragma Inline (Get_UTF_Byte);
59      --  Used to interpret a 2#10xxxxxx# continuation byte in UTF-8 mode.
60      --  Reads a byte, and raises CE if the first two bits are not 10.
61      --  Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits.
62
63      -------------
64      -- Get_Hex --
65      -------------
66
67      procedure Get_Hex (N : Character) is
68         B2 : constant Unsigned_32 := Character'Pos (N);
69      begin
70         if B2 in Character'Pos ('0') .. Character'Pos ('9') then
71            B1 := B1 * 16 + B2 - Character'Pos ('0');
72         elsif B2 in Character'Pos ('A') .. Character'Pos ('F') then
73            B1 := B1 * 16 + B2 - (Character'Pos ('A') - 10);
74         elsif B2 in Character'Pos ('a') .. Character'Pos ('f') then
75            B1 := B1 * 16 + B2 - (Character'Pos ('a') - 10);
76         else
77            raise Constraint_Error;
78         end if;
79      end Get_Hex;
80
81      ------------------
82      -- Get_UTF_Byte --
83      ------------------
84
85      procedure Get_UTF_Byte is
86      begin
87         U := Unsigned_32 (Character'Pos (In_Char));
88
89         if (U and 2#11000000#) /= 2#10_000000# then
90            raise Constraint_Error;
91         end if;
92
93         W := Shift_Left (W, 6) or (U and 2#00111111#);
94      end Get_UTF_Byte;
95
96   --  Start of processing for Char_Sequence_To_UTF_32
97
98   begin
99      case EM is
100         when WCEM_Hex =>
101            if C /= ASCII.ESC then
102               return Character'Pos (C);
103
104            else
105               B1 := 0;
106               Get_Hex (In_Char);
107               Get_Hex (In_Char);
108               Get_Hex (In_Char);
109               Get_Hex (In_Char);
110
111               return UTF_32_Code (B1);
112            end if;
113
114         when WCEM_Upper =>
115            if C > ASCII.DEL then
116               return 256 * Character'Pos (C) + Character'Pos (In_Char);
117            else
118               return Character'Pos (C);
119            end if;
120
121         when WCEM_Shift_JIS =>
122            if C > ASCII.DEL then
123               return Wide_Character'Pos (Shift_JIS_To_JIS (C, In_Char));
124            else
125               return Character'Pos (C);
126            end if;
127
128         when WCEM_EUC =>
129            if C > ASCII.DEL then
130               return Wide_Character'Pos (EUC_To_JIS (C, In_Char));
131            else
132               return Character'Pos (C);
133            end if;
134
135         when WCEM_UTF8 =>
136
137            --  Note: for details of UTF8 encoding see RFC 3629
138
139            U := Unsigned_32 (Character'Pos (C));
140
141            --  16#00_0000#-16#00_007F#: 0xxxxxxx
142
143            if (U and 2#10000000#) = 2#00000000# then
144               return Character'Pos (C);
145
146            --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
147
148            elsif (U and 2#11100000#) = 2#110_00000# then
149               W := U and 2#00011111#;
150               Get_UTF_Byte;
151               return UTF_32_Code (W);
152
153            --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
154
155            elsif (U and 2#11110000#) = 2#1110_0000# then
156               W := U and 2#00001111#;
157               Get_UTF_Byte;
158               Get_UTF_Byte;
159               return UTF_32_Code (W);
160
161            --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
162
163            elsif (U and 2#11111000#) = 2#11110_000# then
164               W := U and 2#00000111#;
165
166               for K in 1 .. 3 loop
167                  Get_UTF_Byte;
168               end loop;
169
170               return UTF_32_Code (W);
171
172            --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
173            --                               10xxxxxx 10xxxxxx
174
175            elsif (U and 2#11111100#) = 2#111110_00# then
176               W := U and 2#00000011#;
177
178               for K in 1 .. 4 loop
179                  Get_UTF_Byte;
180               end loop;
181
182               return UTF_32_Code (W);
183
184            --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
185            --                               10xxxxxx 10xxxxxx 10xxxxxx
186
187            elsif (U and 2#11111110#) = 2#1111110_0# then
188               W := U and 2#00000001#;
189
190               for K in 1 .. 5 loop
191                  Get_UTF_Byte;
192               end loop;
193
194               return UTF_32_Code (W);
195
196            else
197               raise Constraint_Error;
198            end if;
199
200         when WCEM_Brackets =>
201            if C /= '[' then
202               return Character'Pos (C);
203            end if;
204
205            if In_Char /= '"' then
206               raise Constraint_Error;
207            end if;
208
209            B1 := 0;
210            Get_Hex (In_Char);
211            Get_Hex (In_Char);
212
213            C1 := In_Char;
214
215            if C1 /= '"' then
216               Get_Hex (C1);
217               Get_Hex (In_Char);
218
219               C1 := In_Char;
220
221               if C1 /= '"' then
222                  Get_Hex (C1);
223                  Get_Hex (In_Char);
224
225                  C1 := In_Char;
226
227                  if C1 /= '"' then
228                     Get_Hex (C1);
229                     Get_Hex (In_Char);
230
231                     if B1 > Unsigned_32 (UTF_32_Code'Last) then
232                        raise Constraint_Error;
233                     end if;
234
235                     if In_Char /= '"' then
236                        raise Constraint_Error;
237                     end if;
238                  end if;
239               end if;
240            end if;
241
242            if In_Char /= ']' then
243               raise Constraint_Error;
244            end if;
245
246            return UTF_32_Code (B1);
247      end case;
248   end Char_Sequence_To_UTF_32;
249
250   --------------------------------
251   -- Char_Sequence_To_Wide_Char --
252   --------------------------------
253
254   function Char_Sequence_To_Wide_Char
255     (C  : Character;
256      EM : System.WCh_Con.WC_Encoding_Method) return Wide_Character
257   is
258      function Char_Sequence_To_UTF is new Char_Sequence_To_UTF_32 (In_Char);
259
260      U : constant UTF_32_Code := Char_Sequence_To_UTF (C, EM);
261
262   begin
263      if U > 16#FFFF# then
264         raise Constraint_Error;
265      else
266         return Wide_Character'Val (U);
267      end if;
268   end Char_Sequence_To_Wide_Char;
269
270   -----------------------------
271   -- UTF_32_To_Char_Sequence --
272   -----------------------------
273
274   procedure UTF_32_To_Char_Sequence
275     (Val : UTF_32_Code;
276      EM  : System.WCh_Con.WC_Encoding_Method)
277   is
278      Hexc : constant array (UTF_32_Code range 0 .. 15) of Character :=
279               "0123456789ABCDEF";
280
281      C1, C2 : Character;
282      U      : Unsigned_32;
283
284   begin
285      --  Raise CE for invalid UTF_32_Code
286
287      if not Val'Valid then
288         raise Constraint_Error;
289      end if;
290
291      --  Processing depends on encoding mode
292
293      case EM is
294         when WCEM_Hex =>
295            if Val < 256 then
296               Out_Char (Character'Val (Val));
297            elsif Val <= 16#FFFF# then
298               Out_Char (ASCII.ESC);
299               Out_Char (Hexc (Val / (16**3)));
300               Out_Char (Hexc ((Val / (16**2)) mod 16));
301               Out_Char (Hexc ((Val / 16) mod 16));
302               Out_Char (Hexc (Val mod 16));
303            else
304               raise Constraint_Error;
305            end if;
306
307         when WCEM_Upper =>
308            if Val < 128 then
309               Out_Char (Character'Val (Val));
310            elsif Val < 16#8000# or else Val > 16#FFFF# then
311               raise Constraint_Error;
312            else
313               Out_Char (Character'Val (Val / 256));
314               Out_Char (Character'Val (Val mod 256));
315            end if;
316
317         when WCEM_Shift_JIS =>
318            if Val < 128 then
319               Out_Char (Character'Val (Val));
320            elsif Val <= 16#FFFF# then
321               JIS_To_Shift_JIS (Wide_Character'Val (Val), C1, C2);
322               Out_Char (C1);
323               Out_Char (C2);
324            else
325               raise Constraint_Error;
326            end if;
327
328         when WCEM_EUC =>
329            if Val < 128 then
330               Out_Char (Character'Val (Val));
331            elsif Val <= 16#FFFF# then
332               JIS_To_EUC (Wide_Character'Val (Val), C1, C2);
333               Out_Char (C1);
334               Out_Char (C2);
335            else
336               raise Constraint_Error;
337            end if;
338
339         when WCEM_UTF8 =>
340
341            --  Note: for details of UTF8 encoding see RFC 3629
342
343            U := Unsigned_32 (Val);
344
345            --  16#00_0000#-16#00_007F#: 0xxxxxxx
346
347            if U <= 16#00_007F# then
348               Out_Char (Character'Val (U));
349
350            --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
351
352            elsif U <= 16#00_07FF# then
353               Out_Char (Character'Val (2#11000000# or Shift_Right (U, 6)));
354               Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
355
356            --  16#00_0800#-16#00_FFFF#: 1110xxxx 10xxxxxx 10xxxxxx
357
358            elsif U <= 16#00_FFFF# then
359               Out_Char (Character'Val (2#11100000# or Shift_Right (U, 12)));
360               Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6)
361                                                          and 2#00111111#)));
362               Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
363
364            --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
365
366            elsif U <= 16#10_FFFF# then
367               Out_Char (Character'Val (2#11110000# or Shift_Right (U, 18)));
368               Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 12)
369                                                          and 2#00111111#)));
370               Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6)
371                                                          and 2#00111111#)));
372               Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
373
374            --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
375            --                               10xxxxxx 10xxxxxx
376
377            elsif U <= 16#03FF_FFFF# then
378               Out_Char (Character'Val (2#11111000# or Shift_Right (U, 24)));
379               Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 18)
380                                                          and 2#00111111#)));
381               Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 12)
382                                                          and 2#00111111#)));
383               Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6)
384                                                          and 2#00111111#)));
385               Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
386
387            --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
388            --                               10xxxxxx 10xxxxxx 10xxxxxx
389
390            elsif U <= 16#7FFF_FFFF# then
391               Out_Char (Character'Val (2#11111100# or Shift_Right (U, 30)));
392               Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 24)
393                                                          and 2#00111111#)));
394               Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 18)
395                                                          and 2#00111111#)));
396               Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 12)
397                                                          and 2#00111111#)));
398               Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6)
399                                                          and 2#00111111#)));
400               Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
401
402            else
403               raise Constraint_Error;
404            end if;
405
406         when WCEM_Brackets =>
407
408            --  Values in the range 0-255 are directly output. Note that there
409            --  is an issue with [ (16#5B#) since this will cause confusion
410            --  if the resulting string is interpreted using brackets encoding.
411
412            --  One possibility would be to always output [ as ["5B"] but in
413            --  practice this is undesirable, since for example normal use of
414            --  Wide_Text_IO for output (much more common than input), really
415            --  does want to be able to say something like
416
417            --     Put_Line ("Start of output [first run]");
418
419            --  and have it come out as intended, rather than contaminated by
420            --  a ["5B"] sequence in place of the left bracket.
421
422            if Val < 256 then
423               Out_Char (Character'Val (Val));
424
425            --  Otherwise use brackets notation for vales greater than 255
426
427            else
428               Out_Char ('[');
429               Out_Char ('"');
430
431               if Val > 16#FFFF# then
432                  if Val > 16#00FF_FFFF# then
433                     Out_Char (Hexc (Val / 16 ** 7));
434                     Out_Char (Hexc ((Val / 16 ** 6) mod 16));
435                  end if;
436
437                  Out_Char (Hexc ((Val / 16 ** 5) mod 16));
438                  Out_Char (Hexc ((Val / 16 ** 4) mod 16));
439               end if;
440
441               Out_Char (Hexc ((Val / 16 ** 3) mod 16));
442               Out_Char (Hexc ((Val / 16 ** 2) mod 16));
443               Out_Char (Hexc ((Val / 16) mod 16));
444               Out_Char (Hexc (Val mod 16));
445
446               Out_Char ('"');
447               Out_Char (']');
448            end if;
449      end case;
450   end UTF_32_To_Char_Sequence;
451
452   --------------------------------
453   -- Wide_Char_To_Char_Sequence --
454   --------------------------------
455
456   procedure Wide_Char_To_Char_Sequence
457     (WC : Wide_Character;
458      EM : System.WCh_Con.WC_Encoding_Method)
459   is
460      procedure UTF_To_Char_Sequence is new UTF_32_To_Char_Sequence (Out_Char);
461   begin
462      UTF_To_Char_Sequence (Wide_Character'Pos (WC), EM);
463   end Wide_Char_To_Char_Sequence;
464
465end System.WCh_Cnv;
466