1------------------------------------------------------------------------------
2--                                                                          --
3--                         GNAT RUN-TIME COMPONENTS                         --
4--                                                                          --
5--                     ADA.STRINGS.UTF_ENCODING.STRINGS                     --
6--                                                                          --
7--                                 B o d y                                  --
8--                                                                          --
9--             Copyright (C) 2010-2012, Free Software Foundation, Inc.      --
10--                                                                          --
11-- GNAT is free software;  you can  redistribute it  and/or modify it under --
12-- terms of the  GNU General Public License as published  by the Free Soft- --
13-- ware  Foundation;  either version 3,  or (at your option) any later ver- --
14-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
15-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
16-- or FITNESS FOR A PARTICULAR PURPOSE.                                     --
17--                                                                          --
18-- As a special exception under Section 7 of GPL version 3, you are granted --
19-- additional permissions described in the GCC Runtime Library Exception,   --
20-- version 3.1, as published by the Free Software Foundation.               --
21--                                                                          --
22-- You should have received a copy of the GNU General Public License and    --
23-- a copy of the GCC Runtime Library Exception along with this program;     --
24-- see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see    --
25-- <http://www.gnu.org/licenses/>.                                          --
26--                                                                          --
27-- GNAT was originally developed  by the GNAT team at  New York University. --
28-- Extensive contributions were provided by Ada Core Technologies Inc.      --
29--                                                                          --
30------------------------------------------------------------------------------
31
32package body Ada.Strings.UTF_Encoding.Strings is
33   use Interfaces;
34
35   ------------
36   -- Decode --
37   ------------
38
39   --  Decode UTF-8/UTF-16BE/UTF-16LE input to String
40
41   function Decode
42     (Item         : UTF_String;
43      Input_Scheme : Encoding_Scheme) return String
44   is
45   begin
46      if Input_Scheme = UTF_8 then
47         return Decode (Item);
48      else
49         return Decode (To_UTF_16 (Item, Input_Scheme));
50      end if;
51   end Decode;
52
53   --  Decode UTF-8 input to String
54
55   function Decode (Item : UTF_8_String) return String is
56      Result : String (1 .. Item'Length);
57      --  Result string (worst case is same length as input)
58
59      Len : Natural := 0;
60      --  Length of result stored so far
61
62      Iptr : Natural;
63      --  Input Item pointer
64
65      C : Unsigned_8;
66      R : Unsigned_16;
67
68      procedure Get_Continuation;
69      --  Reads a continuation byte of the form 10xxxxxx, shifts R left
70      --  by 6 bits, and or's in the xxxxxx to the low order 6 bits. On
71      --  return Ptr is incremented. Raises exception if continuation
72      --  byte does not exist or is invalid.
73
74      ----------------------
75      -- Get_Continuation --
76      ----------------------
77
78      procedure Get_Continuation is
79      begin
80         if Iptr > Item'Last then
81            Raise_Encoding_Error (Iptr - 1);
82
83         else
84            C := To_Unsigned_8 (Item (Iptr));
85            Iptr := Iptr + 1;
86
87            if C not in 2#10_000000# .. 2#10_111111# then
88               Raise_Encoding_Error (Iptr - 1);
89            else
90               R := Shift_Left (R, 6) or Unsigned_16 (C and 2#00_111111#);
91            end if;
92         end if;
93      end Get_Continuation;
94
95   --  Start of processing for Decode
96
97   begin
98      Iptr := Item'First;
99
100      --  Skip BOM at start
101
102      if Item'Length >= 3
103        and then Item (Iptr .. Iptr + 2) = BOM_8
104      then
105         Iptr := Iptr + 3;
106
107      --  Error if bad BOM
108
109      elsif Item'Length >= 2
110        and then (Item (Iptr .. Iptr + 1) = BOM_16BE
111                    or else
112                  Item (Iptr .. Iptr + 1) = BOM_16LE)
113      then
114         Raise_Encoding_Error (Iptr);
115      end if;
116
117      while Iptr <= Item'Last loop
118         C := To_Unsigned_8 (Item (Iptr));
119         Iptr := Iptr + 1;
120
121         --  Codes in the range 16#00# - 16#7F# are represented as
122         --    0xxxxxxx
123
124         if C <= 16#7F# then
125            R := Unsigned_16 (C);
126
127         --  No initial code can be of the form 10xxxxxx. Such codes are used
128         --  only for continuations.
129
130         elsif C <= 2#10_111111# then
131            Raise_Encoding_Error (Iptr - 1);
132
133         --  Codes in the range 16#80# - 16#7FF# are represented as
134         --    110yyyxx 10xxxxxx
135
136         elsif C <= 2#110_11111# then
137            R := Unsigned_16 (C and 2#000_11111#);
138            Get_Continuation;
139
140         --  Codes in the range 16#800# - 16#FFFF# are represented as
141         --    1110yyyy 10yyyyxx 10xxxxxx
142
143         --  Such codes are out of range for type Character
144
145         --  Codes in the range 16#10000# - 16#10FFFF# are represented as
146         --    11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
147
148         --  Such codes are out of range for Wide_String output
149
150         --  Thus all remaining cases raise Encoding_Error
151
152         else
153            Raise_Encoding_Error (Iptr - 1);
154         end if;
155
156         Len := Len + 1;
157         Result (Len) := Character'Val (R);
158      end loop;
159
160      return Result (1 .. Len);
161   end Decode;
162
163   --  Decode UTF-16 input to String
164
165   function Decode (Item : UTF_16_Wide_String) return String is
166      Result : String (1 .. Item'Length);
167      --  Result is same length as input (possibly minus 1 if BOM present)
168
169      Len : Natural := 0;
170      --  Length of result
171
172      Iptr : Natural;
173      --  Index of next Item element
174
175      C : Unsigned_16;
176
177   begin
178      --  Skip UTF-16 BOM at start
179
180      Iptr := Item'First;
181
182      if Item'Length > 0 and then Item (Iptr) = BOM_16 (1) then
183         Iptr := Iptr + 1;
184      end if;
185
186      --  Loop through input characters
187
188      while Iptr <= Item'Last loop
189         C := To_Unsigned_16 (Item (Iptr));
190         Iptr := Iptr + 1;
191
192         --  Codes in the range 16#0000#..16#00FF# represent their own value
193
194         if C <= 16#00FF# then
195            Len := Len + 1;
196            Result (Len) := Character'Val (C);
197
198         --  All other codes are invalid, either they are invalid UTF-16
199         --  encoding sequences, or they represent values that are out of
200         --  range for type Character.
201
202         else
203            Raise_Encoding_Error (Iptr - 1);
204         end if;
205      end loop;
206
207      return Result (1 .. Len);
208   end Decode;
209
210   ------------
211   -- Encode --
212   ------------
213
214   --  Encode String in UTF-8, UTF-16BE or UTF-16LE
215
216   function Encode
217     (Item          : String;
218      Output_Scheme : Encoding_Scheme;
219      Output_BOM    : Boolean  := False) return UTF_String
220   is
221   begin
222      --  Case of UTF_8
223
224      if Output_Scheme = UTF_8 then
225         return Encode (Item, Output_BOM);
226
227      --  Case of UTF_16LE or UTF_16BE, use UTF-16 intermediary
228
229      else
230         return From_UTF_16 (UTF_16_Wide_String'(Encode (Item)),
231                             Output_Scheme, Output_BOM);
232      end if;
233   end Encode;
234
235   --  Encode String in UTF-8
236
237   function Encode
238     (Item       : String;
239      Output_BOM : Boolean  := False) return UTF_8_String
240   is
241      Result : UTF_8_String (1 .. 3 * Item'Length + 3);
242      --  Worst case is three bytes per input byte + space for BOM
243
244      Len : Natural;
245      --  Number of output codes stored in Result
246
247      C : Unsigned_8;
248      --  Single input character
249
250      procedure Store (C : Unsigned_8);
251      pragma Inline (Store);
252      --  Store one output code, C is in the range 0 .. 255
253
254      -----------
255      -- Store --
256      -----------
257
258      procedure Store (C : Unsigned_8) is
259      begin
260         Len := Len + 1;
261         Result (Len) := Character'Val (C);
262      end Store;
263
264   --  Start of processing for UTF8_Encode
265
266   begin
267      --  Output BOM if required
268
269      if Output_BOM then
270         Result (1 .. 3) := BOM_8;
271         Len := 3;
272      else
273         Len := 0;
274      end if;
275
276      --  Loop through characters of input
277
278      for J in Item'Range loop
279         C := To_Unsigned_8 (Item (J));
280
281         --  Codes in the range 16#00# - 16#7F# are represented as
282         --    0xxxxxxx
283
284         if C <= 16#7F# then
285            Store (C);
286
287         --  Codes in the range 16#80# - 16#7FF# are represented as
288         --    110yyyxx 10xxxxxx
289
290         --  For type character of course, the limit is 16#FF# in any case
291
292         else
293            Store (2#110_00000# or Shift_Right (C, 6));
294            Store (2#10_000000# or (C and 2#00_111111#));
295         end if;
296      end loop;
297
298      return Result (1 .. Len);
299   end Encode;
300
301   --  Encode String in UTF-16
302
303   function Encode
304     (Item       : String;
305      Output_BOM : Boolean  := False) return UTF_16_Wide_String
306   is
307      Result : UTF_16_Wide_String
308                 (1 .. Item'Length + Boolean'Pos (Output_BOM));
309      --  Output is same length as input + possible BOM
310
311      Len : Integer;
312      --  Length of output string
313
314      C : Unsigned_8;
315
316   begin
317      --  Output BOM if required
318
319      if Output_BOM then
320         Result (1) := BOM_16 (1);
321         Len := 1;
322      else
323         Len := 0;
324      end if;
325
326      --  Loop through input characters encoding them
327
328      for Iptr in Item'Range loop
329         C := To_Unsigned_8 (Item (Iptr));
330
331         --  Codes in the range 16#0000#..16#00FF# are output unchanged. This
332         --  includes all possible cases of Character values.
333
334         Len := Len + 1;
335         Result (Len) := Wide_Character'Val (C);
336      end loop;
337
338      return Result;
339   end Encode;
340
341end Ada.Strings.UTF_Encoding.Strings;
342