1------------------------------------------------------------------------------
2--                                                                          --
3--                         GNAT RUN-TIME COMPONENTS                         --
4--                                                                          --
5--                     ADA.STRINGS.UTF_ENCODING.STRINGS                     --
6--                                                                          --
7--                                 B o d y                                  --
8--                                                                          --
9--          Copyright (C) 2010-2013, Free Software Foundation, Inc.         --
10--                                                                          --
11-- GNAT is free software;  you can  redistribute it  and/or modify it under --
12-- terms of the  GNU General Public License as published  by the Free Soft- --
13-- ware  Foundation;  either version 3,  or (at your option) any later ver- --
14-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
15-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
16-- or FITNESS FOR A PARTICULAR PURPOSE.                                     --
17--                                                                          --
18-- As a special exception under Section 7 of GPL version 3, you are granted --
19-- additional permissions described in the GCC Runtime Library Exception,   --
20-- version 3.1, as published by the Free Software Foundation.               --
21--                                                                          --
22-- You should have received a copy of the GNU General Public License and    --
23-- a copy of the GCC Runtime Library Exception along with this program;     --
24-- see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see    --
25-- <http://www.gnu.org/licenses/>.                                          --
26--                                                                          --
27-- GNAT was originally developed  by the GNAT team at  New York University. --
28-- Extensive contributions were provided by Ada Core Technologies Inc.      --
29--                                                                          --
30------------------------------------------------------------------------------
31
32package body Ada.Strings.UTF_Encoding.Strings is
33   use Interfaces;
34
35   ------------
36   -- Decode --
37   ------------
38
39   --  Decode UTF-8/UTF-16BE/UTF-16LE input to String
40
41   function Decode
42     (Item         : UTF_String;
43      Input_Scheme : Encoding_Scheme) return String
44   is
45   begin
46      if Input_Scheme = UTF_8 then
47         return Decode (Item);
48      else
49         return Decode (To_UTF_16 (Item, Input_Scheme));
50      end if;
51   end Decode;
52
53   --  Decode UTF-8 input to String
54
55   function Decode (Item : UTF_8_String) return String is
56      Result : String (1 .. Item'Length);
57      --  Result string (worst case is same length as input)
58
59      Len : Natural := 0;
60      --  Length of result stored so far
61
62      Iptr : Natural;
63      --  Input Item pointer
64
65      C : Unsigned_8;
66      R : Unsigned_16;
67
68      procedure Get_Continuation;
69      --  Reads a continuation byte of the form 10xxxxxx, shifts R left
70      --  by 6 bits, and or's in the xxxxxx to the low order 6 bits. On
71      --  return Ptr is incremented. Raises exception if continuation
72      --  byte does not exist or is invalid.
73
74      ----------------------
75      -- Get_Continuation --
76      ----------------------
77
78      procedure Get_Continuation is
79      begin
80         if Iptr > Item'Last then
81            Raise_Encoding_Error (Iptr - 1);
82
83         else
84            C := To_Unsigned_8 (Item (Iptr));
85            Iptr := Iptr + 1;
86
87            if C not in 2#10_000000# .. 2#10_111111# then
88               Raise_Encoding_Error (Iptr - 1);
89            else
90               R := Shift_Left (R, 6) or Unsigned_16 (C and 2#00_111111#);
91            end if;
92         end if;
93      end Get_Continuation;
94
95   --  Start of processing for Decode
96
97   begin
98      Iptr := Item'First;
99
100      --  Skip BOM at start
101
102      if Item'Length >= 3
103        and then Item (Iptr .. Iptr + 2) = BOM_8
104      then
105         Iptr := Iptr + 3;
106
107      --  Error if bad BOM
108
109      elsif Item'Length >= 2
110        and then (Item (Iptr .. Iptr + 1) = BOM_16BE
111                    or else
112                  Item (Iptr .. Iptr + 1) = BOM_16LE)
113      then
114         Raise_Encoding_Error (Iptr);
115      end if;
116
117      while Iptr <= Item'Last loop
118         C := To_Unsigned_8 (Item (Iptr));
119         Iptr := Iptr + 1;
120
121         --  Codes in the range 16#00# - 16#7F# are represented as
122         --    0xxxxxxx
123
124         if C <= 16#7F# then
125            R := Unsigned_16 (C);
126
127         --  No initial code can be of the form 10xxxxxx. Such codes are used
128         --  only for continuations.
129
130         elsif C <= 2#10_111111# then
131            Raise_Encoding_Error (Iptr - 1);
132
133         --  Codes in the range 16#80# - 16#7FF# are represented as
134         --    110yyyxx 10xxxxxx
135
136         elsif C <= 2#110_11111# then
137            R := Unsigned_16 (C and 2#000_11111#);
138            Get_Continuation;
139
140         --  Codes in the range 16#800# - 16#FFFF# are represented as
141         --    1110yyyy 10yyyyxx 10xxxxxx
142
143         --  Such codes are out of range for type Character
144
145         --  Codes in the range 16#10000# - 16#10FFFF# are represented as
146         --    11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
147
148         --  Such codes are out of range for Wide_String output
149
150         --  Thus all remaining cases raise Encoding_Error
151
152         else
153            Raise_Encoding_Error (Iptr - 1);
154         end if;
155
156         Len := Len + 1;
157
158         --  The value may still be out of range of Standard.Character. We make
159         --  the check explicit because the library is typically compiled with
160         --  range checks disabled.
161
162         if R > Character'Pos (Character'Last) then
163            Raise_Encoding_Error (Iptr - 1);
164         end if;
165
166         Result (Len) := Character'Val (R);
167      end loop;
168
169      return Result (1 .. Len);
170   end Decode;
171
172   --  Decode UTF-16 input to String
173
174   function Decode (Item : UTF_16_Wide_String) return String is
175      Result : String (1 .. Item'Length);
176      --  Result is same length as input (possibly minus 1 if BOM present)
177
178      Len : Natural := 0;
179      --  Length of result
180
181      Iptr : Natural;
182      --  Index of next Item element
183
184      C : Unsigned_16;
185
186   begin
187      --  Skip UTF-16 BOM at start
188
189      Iptr := Item'First;
190
191      if Item'Length > 0 and then Item (Iptr) = BOM_16 (1) then
192         Iptr := Iptr + 1;
193      end if;
194
195      --  Loop through input characters
196
197      while Iptr <= Item'Last loop
198         C := To_Unsigned_16 (Item (Iptr));
199         Iptr := Iptr + 1;
200
201         --  Codes in the range 16#0000#..16#00FF# represent their own value
202
203         if C <= 16#00FF# then
204            Len := Len + 1;
205            Result (Len) := Character'Val (C);
206
207         --  All other codes are invalid, either they are invalid UTF-16
208         --  encoding sequences, or they represent values that are out of
209         --  range for type Character.
210
211         else
212            Raise_Encoding_Error (Iptr - 1);
213         end if;
214      end loop;
215
216      return Result (1 .. Len);
217   end Decode;
218
219   ------------
220   -- Encode --
221   ------------
222
223   --  Encode String in UTF-8, UTF-16BE or UTF-16LE
224
225   function Encode
226     (Item          : String;
227      Output_Scheme : Encoding_Scheme;
228      Output_BOM    : Boolean  := False) return UTF_String
229   is
230   begin
231      --  Case of UTF_8
232
233      if Output_Scheme = UTF_8 then
234         return Encode (Item, Output_BOM);
235
236      --  Case of UTF_16LE or UTF_16BE, use UTF-16 intermediary
237
238      else
239         return From_UTF_16 (UTF_16_Wide_String'(Encode (Item)),
240                             Output_Scheme, Output_BOM);
241      end if;
242   end Encode;
243
244   --  Encode String in UTF-8
245
246   function Encode
247     (Item       : String;
248      Output_BOM : Boolean  := False) return UTF_8_String
249   is
250      Result : UTF_8_String (1 .. 3 * Item'Length + 3);
251      --  Worst case is three bytes per input byte + space for BOM
252
253      Len : Natural;
254      --  Number of output codes stored in Result
255
256      C : Unsigned_8;
257      --  Single input character
258
259      procedure Store (C : Unsigned_8);
260      pragma Inline (Store);
261      --  Store one output code, C is in the range 0 .. 255
262
263      -----------
264      -- Store --
265      -----------
266
267      procedure Store (C : Unsigned_8) is
268      begin
269         Len := Len + 1;
270         Result (Len) := Character'Val (C);
271      end Store;
272
273   --  Start of processing for UTF8_Encode
274
275   begin
276      --  Output BOM if required
277
278      if Output_BOM then
279         Result (1 .. 3) := BOM_8;
280         Len := 3;
281      else
282         Len := 0;
283      end if;
284
285      --  Loop through characters of input
286
287      for J in Item'Range loop
288         C := To_Unsigned_8 (Item (J));
289
290         --  Codes in the range 16#00# - 16#7F# are represented as
291         --    0xxxxxxx
292
293         if C <= 16#7F# then
294            Store (C);
295
296         --  Codes in the range 16#80# - 16#7FF# are represented as
297         --    110yyyxx 10xxxxxx
298
299         --  For type character of course, the limit is 16#FF# in any case
300
301         else
302            Store (2#110_00000# or Shift_Right (C, 6));
303            Store (2#10_000000# or (C and 2#00_111111#));
304         end if;
305      end loop;
306
307      return Result (1 .. Len);
308   end Encode;
309
310   --  Encode String in UTF-16
311
312   function Encode
313     (Item       : String;
314      Output_BOM : Boolean  := False) return UTF_16_Wide_String
315   is
316      Result : UTF_16_Wide_String
317                 (1 .. Item'Length + Boolean'Pos (Output_BOM));
318      --  Output is same length as input + possible BOM
319
320      Len : Integer;
321      --  Length of output string
322
323      C : Unsigned_8;
324
325   begin
326      --  Output BOM if required
327
328      if Output_BOM then
329         Result (1) := BOM_16 (1);
330         Len := 1;
331      else
332         Len := 0;
333      end if;
334
335      --  Loop through input characters encoding them
336
337      for Iptr in Item'Range loop
338         C := To_Unsigned_8 (Item (Iptr));
339
340         --  Codes in the range 16#0000#..16#00FF# are output unchanged. This
341         --  includes all possible cases of Character values.
342
343         Len := Len + 1;
344         Result (Len) := Wide_Character'Val (C);
345      end loop;
346
347      return Result;
348   end Encode;
349
350end Ada.Strings.UTF_Encoding.Strings;
351