1------------------------------------------------------------------------------
2--                                                                          --
3--                         GNAT RUN-TIME COMPONENTS                         --
4--                                                                          --
5--                   ADA.STRINGS.UTF_ENCODING.WIDE_STRINGS                  --
6--                                                                          --
7--                                 B o d y                                  --
8--                                                                          --
9--          Copyright (C) 2010-2012, Free Software Foundation, Inc.         --
10--                                                                          --
11-- GNAT is free software;  you can  redistribute it  and/or modify it under --
12-- terms of the  GNU General Public License as published  by the Free Soft- --
13-- ware  Foundation;  either version 3,  or (at your option) any later ver- --
14-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
15-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
16-- or FITNESS FOR A PARTICULAR PURPOSE.                                     --
17--                                                                          --
18-- As a special exception under Section 7 of GPL version 3, you are granted --
19-- additional permissions described in the GCC Runtime Library Exception,   --
20-- version 3.1, as published by the Free Software Foundation.               --
21--                                                                          --
22-- You should have received a copy of the GNU General Public License and    --
23-- a copy of the GCC Runtime Library Exception along with this program;     --
24-- see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see    --
25-- <http://www.gnu.org/licenses/>.                                          --
26--                                                                          --
27-- GNAT was originally developed  by the GNAT team at  New York University. --
28-- Extensive contributions were provided by Ada Core Technologies Inc.      --
29--                                                                          --
30------------------------------------------------------------------------------
31
32package body Ada.Strings.UTF_Encoding.Wide_Strings is
33   use Interfaces;
34
35   ------------
36   -- Decode --
37   ------------
38
39   --  Decode UTF-8/UTF-16BE/UTF-16LE input to Wide_String
40
41   function Decode
42     (Item         : UTF_String;
43      Input_Scheme : Encoding_Scheme) return Wide_String
44   is
45   begin
46      if Input_Scheme = UTF_8 then
47         return Decode (Item);
48      else
49         return Decode (To_UTF_16 (Item, Input_Scheme));
50      end if;
51   end Decode;
52
53   --  Decode UTF-8 input to Wide_String
54
55   function Decode (Item : UTF_8_String) return Wide_String is
56      Result : Wide_String (1 .. Item'Length);
57      --  Result string (worst case is same length as input)
58
59      Len : Natural := 0;
60      --  Length of result stored so far
61
62      Iptr : Natural;
63      --  Input Item pointer
64
65      C : Unsigned_8;
66      R : Unsigned_16;
67
68      procedure Get_Continuation;
69      --  Reads a continuation byte of the form 10xxxxxx, shifts R left by 6
70      --  bits, and or's in the xxxxxx to the low order 6 bits. On return Ptr
71      --  is incremented. Raises exception if continuation byte does not exist
72      --  or is invalid.
73
74      ----------------------
75      -- Get_Continuation --
76      ----------------------
77
78      procedure Get_Continuation is
79      begin
80         if Iptr > Item'Last then
81            Raise_Encoding_Error (Iptr - 1);
82
83         else
84            C := To_Unsigned_8 (Item (Iptr));
85            Iptr := Iptr + 1;
86
87            if C not in 2#10_000000# .. 2#10_111111# then
88               Raise_Encoding_Error (Iptr - 1);
89            else
90               R := Shift_Left (R, 6) or Unsigned_16 (C and 2#00_111111#);
91            end if;
92         end if;
93      end Get_Continuation;
94
95   --  Start of processing for Decode
96
97   begin
98      Iptr := Item'First;
99
100      --  Skip BOM at start
101
102      if Item'Length >= 3
103        and then Item (Iptr .. Iptr + 2) = BOM_8
104      then
105         Iptr := Iptr + 3;
106
107      --  Error if bad BOM
108
109      elsif Item'Length >= 2
110        and then (Item (Iptr .. Iptr + 1) = BOM_16BE
111                    or else
112                  Item (Iptr .. Iptr + 1) = BOM_16LE)
113      then
114         Raise_Encoding_Error (Iptr);
115      end if;
116
117      while Iptr <= Item'Last loop
118         C := To_Unsigned_8 (Item (Iptr));
119         Iptr := Iptr + 1;
120
121         --  Codes in the range 16#00# - 16#7F# are represented as
122         --    0xxxxxxx
123
124         if C <= 16#7F# then
125            R := Unsigned_16 (C);
126
127         --  No initial code can be of the form 10xxxxxx. Such codes are used
128         --  only for continuations.
129
130         elsif C <= 2#10_111111# then
131            Raise_Encoding_Error (Iptr - 1);
132
133         --  Codes in the range 16#80# - 16#7FF# are represented as
134         --    110yyyxx 10xxxxxx
135
136         elsif C <= 2#110_11111# then
137            R := Unsigned_16 (C and 2#000_11111#);
138            Get_Continuation;
139
140         --  Codes in the range 16#800# - 16#FFFF# are represented as
141         --    1110yyyy 10yyyyxx 10xxxxxx
142
143         elsif C <= 2#1110_1111# then
144            R := Unsigned_16 (C and 2#0000_1111#);
145            Get_Continuation;
146            Get_Continuation;
147
148         --  Codes in the range 16#10000# - 16#10FFFF# are represented as
149         --    11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
150
151         --  Such codes are out of range for Wide_String output
152
153         else
154            Raise_Encoding_Error (Iptr - 1);
155         end if;
156
157         Len := Len + 1;
158         Result (Len) := Wide_Character'Val (R);
159      end loop;
160
161      return Result (1 .. Len);
162   end Decode;
163
164   --  Decode UTF-16 input to Wide_String
165
166   function Decode (Item : UTF_16_Wide_String) return Wide_String is
167      Result : Wide_String (1 .. Item'Length);
168      --  Result is same length as input (possibly minus 1 if BOM present)
169
170      Len : Natural := 0;
171      --  Length of result
172
173      Iptr : Natural;
174      --  Index of next Item element
175
176      C : Unsigned_16;
177
178   begin
179      --  Skip UTF-16 BOM at start
180
181      Iptr := Item'First;
182
183      if Item'Length > 0 and then Item (Iptr) = BOM_16 (1) then
184         Iptr := Iptr + 1;
185      end if;
186
187      --  Loop through input characters
188
189      while Iptr <= Item'Last loop
190         C := To_Unsigned_16 (Item (Iptr));
191         Iptr := Iptr + 1;
192
193         --  Codes in the range 16#0000#..16#D7FF# or 16#E000#..16#FFFD#
194         --  represent their own value.
195
196         if C <= 16#D7FF# or else C in 16#E000# .. 16#FFFD# then
197            Len := Len + 1;
198            Result (Len) := Wide_Character'Val (C);
199
200         --  Codes in the range 16#D800#..16#DBFF# represent the first of the
201         --  two surrogates used to encode the range 16#01_000#..16#10_FFFF".
202         --  Such codes are out of range for 16-bit output.
203
204         --  The case of input in the range 16#DC00#..16#DFFF# must never
205         --  occur, since it means we have a second surrogate character with
206         --  no corresponding first surrogate.
207
208         --  Codes in the range 16#FFFE# .. 16#FFFF# are also invalid since
209         --  they conflict with codes used for BOM values.
210
211         --  Thus all remaining codes are invalid
212
213         else
214            Raise_Encoding_Error (Iptr - 1);
215         end if;
216      end loop;
217
218      return Result (1 .. Len);
219   end Decode;
220
221   ------------
222   -- Encode --
223   ------------
224
225   --  Encode Wide_String in UTF-8, UTF-16BE or UTF-16LE
226
227   function Encode
228     (Item          : Wide_String;
229      Output_Scheme : Encoding_Scheme;
230      Output_BOM    : Boolean  := False) return UTF_String
231   is
232   begin
233      --  Case of UTF_8
234
235      if Output_Scheme = UTF_8 then
236         return Encode (Item, Output_BOM);
237
238      --  Case of UTF_16LE or UTF_16BE, use UTF-16 intermediary
239
240      else
241         return From_UTF_16 (UTF_16_Wide_String'(Encode (Item)),
242                             Output_Scheme, Output_BOM);
243      end if;
244   end Encode;
245
246   --  Encode Wide_String in UTF-8
247
248   function Encode
249     (Item       : Wide_String;
250      Output_BOM : Boolean  := False) return UTF_8_String
251   is
252      Result : UTF_8_String (1 .. 3 * Item'Length + 3);
253      --  Worst case is three bytes per input byte + space for BOM
254
255      Len : Natural;
256      --  Number of output codes stored in Result
257
258      C : Unsigned_16;
259      --  Single input character
260
261      procedure Store (C : Unsigned_16);
262      pragma Inline (Store);
263      --  Store one output code, C is in the range 0 .. 255
264
265      -----------
266      -- Store --
267      -----------
268
269      procedure Store (C : Unsigned_16) is
270      begin
271         Len := Len + 1;
272         Result (Len) := Character'Val (C);
273      end Store;
274
275   --  Start of processing for UTF8_Encode
276
277   begin
278      --  Output BOM if required
279
280      if Output_BOM then
281         Result (1 .. 3) := BOM_8;
282         Len := 3;
283      else
284         Len := 0;
285      end if;
286
287      --  Loop through characters of input
288
289      for J in Item'Range loop
290         C := To_Unsigned_16 (Item (J));
291
292         --  Codes in the range 16#00# - 16#7F# are represented as
293         --    0xxxxxxx
294
295         if C <= 16#7F# then
296            Store (C);
297
298         --  Codes in the range 16#80# - 16#7FF# are represented as
299         --    110yyyxx 10xxxxxx
300
301         elsif C <= 16#7FF# then
302            Store (2#110_00000# or Shift_Right (C, 6));
303            Store (2#10_000000# or (C and 2#00_111111#));
304
305         --  Codes in the range 16#800# - 16#FFFF# are represented as
306         --    1110yyyy 10yyyyxx 10xxxxxx
307
308         else
309            Store (2#1110_0000# or Shift_Right (C, 12));
310            Store (2#10_000000# or
311                     Shift_Right (C and 2#111111_000000#, 6));
312            Store (2#10_000000# or (C and 2#00_111111#));
313         end if;
314      end loop;
315
316      return Result (1 .. Len);
317   end Encode;
318
319   --  Encode Wide_String in UTF-16
320
321   function Encode
322     (Item       : Wide_String;
323      Output_BOM : Boolean  := False) return UTF_16_Wide_String
324   is
325      Result : UTF_16_Wide_String
326                 (1 .. Item'Length + Boolean'Pos (Output_BOM));
327      --  Output is same length as input + possible BOM
328
329      Len : Integer;
330      --  Length of output string
331
332      C : Unsigned_16;
333
334   begin
335      --  Output BOM if required
336
337      if Output_BOM then
338         Result (1) := BOM_16 (1);
339         Len := 1;
340      else
341         Len := 0;
342      end if;
343
344      --  Loop through input characters encoding them
345
346      for Iptr in Item'Range loop
347         C := To_Unsigned_16 (Item (Iptr));
348
349         --  Codes in the range 16#0000#..16#D7FF# or 16#E000#..16#FFFD# are
350         --  output unchanged.
351
352         if C <= 16#D7FF# or else C in 16#E000# .. 16#FFFD# then
353            Len := Len + 1;
354            Result (Len) := Wide_Character'Val (C);
355
356         --  Codes in the range 16#D800#..16#DFFF# should never appear in the
357         --  input, since no valid Unicode characters are in this range (which
358         --  would conflict with the UTF-16 surrogate encodings). Similarly
359         --  codes in the range 16#FFFE#..16#FFFF conflict with BOM codes.
360         --  Thus all remaining codes are illegal.
361
362         else
363            Raise_Encoding_Error (Iptr);
364         end if;
365      end loop;
366
367      return Result;
368   end Encode;
369
370end Ada.Strings.UTF_Encoding.Wide_Strings;
371