1------------------------------------------------------------------------------
2--                                                                          --
3--                         GNAT RUN-TIME COMPONENTS                         --
4--                                                                          --
5--                ADA.STRINGS.UTF_ENCODING.WIDE_WIDE_STRINGS                --
6--                                                                          --
7--                                 B o d y                                  --
8--                                                                          --
9--          Copyright (C) 2010-2012, Free Software Foundation, Inc.         --
10--                                                                          --
11-- GNAT is free software;  you can  redistribute it  and/or modify it under --
12-- terms of the  GNU General Public License as published  by the Free Soft- --
13-- ware  Foundation;  either version 3,  or (at your option) any later ver- --
14-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
15-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
16-- or FITNESS FOR A PARTICULAR PURPOSE.                                     --
17--                                                                          --
18-- As a special exception under Section 7 of GPL version 3, you are granted --
19-- additional permissions described in the GCC Runtime Library Exception,   --
20-- version 3.1, as published by the Free Software Foundation.               --
21--                                                                          --
22-- You should have received a copy of the GNU General Public License and    --
23-- a copy of the GCC Runtime Library Exception along with this program;     --
24-- see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see    --
25-- <http://www.gnu.org/licenses/>.                                          --
26--                                                                          --
27-- GNAT was originally developed  by the GNAT team at  New York University. --
28-- Extensive contributions were provided by Ada Core Technologies Inc.      --
29--                                                                          --
30------------------------------------------------------------------------------
31
32package body Ada.Strings.UTF_Encoding.Wide_Wide_Strings is
33   use Interfaces;
34
35   ------------
36   -- Decode --
37   ------------
38
39   --  Decode UTF-8/UTF-16BE/UTF-16LE input to Wide_Wide_String
40
41   function Decode
42     (Item         : UTF_String;
43      Input_Scheme : Encoding_Scheme) return Wide_Wide_String
44   is
45   begin
46      if Input_Scheme = UTF_8 then
47         return Decode (Item);
48      else
49         return Decode (To_UTF_16 (Item, Input_Scheme));
50      end if;
51   end Decode;
52
53   --  Decode UTF-8 input to Wide_Wide_String
54
55   function Decode (Item : UTF_8_String) return Wide_Wide_String is
56      Result : Wide_Wide_String (1 .. Item'Length);
57      --  Result string (worst case is same length as input)
58
59      Len : Natural := 0;
60      --  Length of result stored so far
61
62      Iptr : Natural;
63      --  Input string pointer
64
65      C : Unsigned_8;
66      R : Unsigned_32;
67
68      procedure Get_Continuation;
69      --  Reads a continuation byte of the form 10xxxxxx, shifts R left by 6
70      --  bits, and or's in the xxxxxx to the low order 6 bits. On return Ptr
71      --  is incremented. Raises exception if continuation byte does not exist
72      --  or is invalid.
73
74      ----------------------
75      -- Get_Continuation --
76      ----------------------
77
78      procedure Get_Continuation is
79      begin
80         if Iptr > Item'Last then
81            Raise_Encoding_Error (Iptr - 1);
82
83         else
84            C := To_Unsigned_8 (Item (Iptr));
85            Iptr := Iptr + 1;
86
87            if C not in 2#10_000000# .. 2#10_111111# then
88               Raise_Encoding_Error (Iptr - 1);
89            else
90               R := Shift_Left (R, 6) or Unsigned_32 (C and 2#00_111111#);
91            end if;
92         end if;
93      end Get_Continuation;
94
95   --  Start of processing for Decode
96
97   begin
98      Iptr := Item'First;
99
100      --  Skip BOM at start
101
102      if Item'Length >= 3
103        and then Item (Iptr .. Iptr + 2) = BOM_8
104      then
105         Iptr := Iptr + 3;
106
107      --  Error if bad BOM
108
109      elsif Item'Length >= 2
110        and then (Item (Iptr .. Iptr + 1) = BOM_16BE
111                    or else
112                  Item (Iptr .. Iptr + 1) = BOM_16LE)
113      then
114         Raise_Encoding_Error (Iptr);
115      end if;
116
117      --  Loop through input characters
118
119      while Iptr <= Item'Last loop
120         C := To_Unsigned_8 (Item (Iptr));
121         Iptr := Iptr + 1;
122
123         --  Codes in the range 16#00# - 16#7F# are represented as
124         --    0xxxxxxx
125
126         if C <= 16#7F# then
127            R := Unsigned_32 (C);
128
129         --  No initial code can be of the form 10xxxxxx. Such codes are used
130         --  only for continuations.
131
132         elsif C <= 2#10_111111# then
133            Raise_Encoding_Error (Iptr - 1);
134
135         --  Codes in the range 16#80# - 16#7FF# are represented as
136         --    110yyyxx 10xxxxxx
137
138         elsif C <= 2#110_11111# then
139            R := Unsigned_32 (C and 2#000_11111#);
140            Get_Continuation;
141
142         --  Codes in the range 16#800# - 16#FFFF# are represented as
143         --    1110yyyy 10yyyyxx 10xxxxxx
144
145         elsif C <= 2#1110_1111# then
146            R := Unsigned_32 (C and 2#0000_1111#);
147            Get_Continuation;
148            Get_Continuation;
149
150         --  Codes in the range 16#10000# - 16#10FFFF# are represented as
151         --    11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
152
153         elsif C <= 2#11110_111# then
154            R := Unsigned_32 (C and 2#00000_111#);
155            Get_Continuation;
156            Get_Continuation;
157            Get_Continuation;
158
159         --  Any other code is an error
160
161         else
162            Raise_Encoding_Error (Iptr - 1);
163         end if;
164
165         Len := Len + 1;
166         Result (Len) := Wide_Wide_Character'Val (R);
167      end loop;
168
169      return Result (1 .. Len);
170   end Decode;
171
172   --  Decode UTF-16 input to Wide_Wide_String
173
174   function Decode (Item : UTF_16_Wide_String) return Wide_Wide_String is
175      Result : Wide_Wide_String (1 .. Item'Length);
176      --  Result cannot be longer than the input string
177
178      Len : Natural := 0;
179      --  Length of result
180
181      Iptr : Natural;
182      --  Pointer to next element in Item
183
184      C : Unsigned_16;
185      R : Unsigned_32;
186
187   begin
188      --  Skip UTF-16 BOM at start
189
190      Iptr := Item'First;
191
192      if Iptr <= Item'Last and then Item (Iptr) = BOM_16 (1) then
193         Iptr := Iptr + 1;
194      end if;
195
196      --  Loop through input characters
197
198      while Iptr <= Item'Last loop
199         C := To_Unsigned_16 (Item (Iptr));
200         Iptr := Iptr + 1;
201
202         --  Codes in the range 16#0000#..16#D7FF# or 16#E000#..16#FFFD#
203         --  represent their own value.
204
205         if C <= 16#D7FF# or else C in 16#E000# .. 16#FFFD# then
206            Len := Len + 1;
207            Result (Len) := Wide_Wide_Character'Val (C);
208
209         --  Codes in the range 16#D800#..16#DBFF# represent the first of the
210         --  two surrogates used to encode the range 16#01_000#..16#10_FFFF".
211         --  The first surrogate provides 10 high order bits of the result.
212
213         elsif C <= 16#DBFF# then
214            R := Shift_Left ((Unsigned_32 (C) - 16#D800#), 10);
215
216            --  Error if at end of string
217
218            if Iptr > Item'Last then
219               Raise_Encoding_Error (Iptr - 1);
220
221            --  Otherwise next character must be valid low order surrogate
222            --  which provides the low 10 order bits of the result.
223
224            else
225               C := To_Unsigned_16 (Item (Iptr));
226               Iptr := Iptr + 1;
227
228               if C not in 16#DC00# .. 16#DFFF# then
229                  Raise_Encoding_Error (Iptr - 1);
230
231               else
232                  R := R or (Unsigned_32 (C) mod 2 ** 10);
233
234               --  The final adjustment is to add 16#01_0000 to get the
235               --  result back in the required 21 bit range.
236
237                  R := R + 16#01_0000#;
238                  Len := Len + 1;
239                  Result (Len) := Wide_Wide_Character'Val (R);
240               end if;
241            end if;
242
243         --  Remaining codes are invalid
244
245         else
246            Raise_Encoding_Error (Iptr - 1);
247         end if;
248      end loop;
249
250      return Result (1 .. Len);
251   end Decode;
252
253   ------------
254   -- Encode --
255   ------------
256
257   --  Encode Wide_Wide_String in UTF-8, UTF-16BE or UTF-16LE
258
259   function Encode
260     (Item          : Wide_Wide_String;
261      Output_Scheme : Encoding_Scheme;
262      Output_BOM    : Boolean  := False) return UTF_String
263   is
264   begin
265      if Output_Scheme = UTF_8 then
266         return Encode (Item, Output_BOM);
267      else
268         return From_UTF_16 (Encode (Item), Output_Scheme, Output_BOM);
269      end if;
270   end Encode;
271
272   --  Encode Wide_Wide_String in UTF-8
273
274   function Encode
275     (Item       : Wide_Wide_String;
276      Output_BOM : Boolean  := False) return UTF_8_String
277   is
278      Result : String (1 .. 4 * Item'Length + 3);
279      --  Worst case is four bytes per input byte + space for BOM
280
281      Len  : Natural;
282      --  Number of output codes stored in Result
283
284      C : Unsigned_32;
285      --  Single input character
286
287      procedure Store (C : Unsigned_32);
288      pragma Inline (Store);
289      --  Store one output code (input is in range 0 .. 255)
290
291      -----------
292      -- Store --
293      -----------
294
295      procedure Store (C : Unsigned_32) is
296      begin
297         Len := Len + 1;
298         Result (Len) := Character'Val (C);
299      end Store;
300
301   --  Start of processing for Encode
302
303   begin
304      --  Output BOM if required
305
306      if Output_BOM then
307         Result (1 .. 3) := BOM_8;
308         Len := 3;
309      else
310         Len := 0;
311      end if;
312
313      --  Loop through characters of input
314
315      for Iptr in Item'Range loop
316         C := To_Unsigned_32 (Item (Iptr));
317
318         --  Codes in the range 16#00#..16#7F# are represented as
319         --    0xxxxxxx
320
321         if C <= 16#7F# then
322            Store (C);
323
324         --  Codes in the range 16#80#..16#7FF# are represented as
325         --    110yyyxx 10xxxxxx
326
327         elsif C <= 16#7FF# then
328            Store (2#110_00000# or Shift_Right (C, 6));
329            Store (2#10_000000# or (C and 2#00_111111#));
330
331         --  Codes in the range 16#800#..16#D7FF# or 16#E000#..16#FFFD# are
332         --  represented as
333         --    1110yyyy 10yyyyxx 10xxxxxx
334
335         elsif C <= 16#D7FF# or else C in 16#E000# .. 16#FFFD# then
336            Store (2#1110_0000# or Shift_Right (C, 12));
337            Store (2#10_000000# or
338                     Shift_Right (C and 2#111111_000000#, 6));
339            Store (2#10_000000# or (C and 2#00_111111#));
340
341         --  Codes in the range 16#10000# - 16#10FFFF# are represented as
342         --    11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
343
344         elsif C in 16#1_0000# .. 16#10_FFFF# then
345            Store (2#11110_000# or
346                     Shift_Right (C, 18));
347            Store (2#10_000000# or
348                     Shift_Right (C and 2#111111_000000_000000#, 12));
349            Store (2#10_000000# or
350                     Shift_Right (C and 2#111111_000000#, 6));
351            Store (2#10_000000# or
352                     (C and 2#00_111111#));
353
354         --  All other codes are invalid
355
356         else
357            Raise_Encoding_Error (Iptr);
358         end if;
359      end loop;
360
361      return Result (1 .. Len);
362   end Encode;
363
364   --  Encode Wide_Wide_String in UTF-16
365
366   function Encode
367     (Item       : Wide_Wide_String;
368      Output_BOM : Boolean  := False) return UTF_16_Wide_String
369   is
370      Result : UTF_16_Wide_String (1 .. 2 * Item'Length + 1);
371      --  Worst case is each input character generates two output characters
372      --  plus one for possible BOM.
373
374      Len : Integer;
375      --  Length of output string
376
377      C : Unsigned_32;
378
379   begin
380      --  Output BOM if needed
381
382      if Output_BOM then
383         Result (1) := BOM_16 (1);
384         Len := 1;
385      else
386         Len := 0;
387      end if;
388
389      --  Loop through input characters encoding them
390
391      for Iptr in Item'Range loop
392         C := To_Unsigned_32 (Item (Iptr));
393
394         --  Codes in the range 16#00_0000#..16#00_D7FF# or 16#E000#..16#FFFD#
395         --  are output unchanged
396
397         if C <= 16#00_D7FF# or else C in 16#E000# .. 16#FFFD# then
398            Len := Len + 1;
399            Result (Len) := Wide_Character'Val (C);
400
401         --  Codes in the range 16#01_0000#..16#10_FFFF# are output using two
402         --  surrogate characters. First 16#1_0000# is subtracted from the code
403         --  point to give a 20-bit value. This is then split into two separate
404         --  10-bit values each of which is represented as a surrogate with the
405         --  most significant half placed in the first surrogate. The ranges of
406         --  values used for the two surrogates are 16#D800#-16#DBFF# for the
407         --  first, most significant surrogate and 16#DC00#-16#DFFF# for the
408         --  second, least significant surrogate.
409
410         elsif C in 16#1_0000# ..  16#10_FFFF# then
411            C := C - 16#1_0000#;
412
413            Len := Len + 1;
414            Result (Len) := Wide_Character'Val (16#D800# + C / 2 ** 10);
415
416            Len := Len + 1;
417            Result (Len) := Wide_Character'Val (16#DC00# + C mod 2 ** 10);
418
419         --  All other codes are invalid
420
421         else
422            Raise_Encoding_Error (Iptr);
423         end if;
424      end loop;
425
426      return Result (1 .. Len);
427   end Encode;
428
429end Ada.Strings.UTF_Encoding.Wide_Wide_Strings;
430