1------------------------------------------------------------------------------ 2-- -- 3-- GNAT RUN-TIME COMPONENTS -- 4-- -- 5-- ADA.STRINGS.UTF_ENCODING.CONVERSIONS -- 6-- -- 7-- B o d y -- 8-- -- 9-- Copyright (C) 2010-2013, Free Software Foundation, Inc. -- 10-- -- 11-- GNAT is free software; you can redistribute it and/or modify it under -- 12-- terms of the GNU General Public License as published by the Free Soft- -- 13-- ware Foundation; either version 3, or (at your option) any later ver- -- 14-- sion. GNAT is distributed in the hope that it will be useful, but WITH- -- 15-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -- 16-- or FITNESS FOR A PARTICULAR PURPOSE. -- 17-- -- 18-- As a special exception under Section 7 of GPL version 3, you are granted -- 19-- additional permissions described in the GCC Runtime Library Exception, -- 20-- version 3.1, as published by the Free Software Foundation. -- 21-- -- 22-- You should have received a copy of the GNU General Public License and -- 23-- a copy of the GCC Runtime Library Exception along with this program; -- 24-- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see -- 25-- <http://www.gnu.org/licenses/>. -- 26-- -- 27-- GNAT was originally developed by the GNAT team at New York University. -- 28-- Extensive contributions were provided by Ada Core Technologies Inc. -- 29-- -- 30------------------------------------------------------------------------------ 31 32package body Ada.Strings.UTF_Encoding.Conversions is 33 use Interfaces; 34 35 -- Convert from UTF-8/UTF-16BE/LE to UTF-8/UTF-16BE/LE 36 37 function Convert 38 (Item : UTF_String; 39 Input_Scheme : Encoding_Scheme; 40 Output_Scheme : Encoding_Scheme; 41 Output_BOM : Boolean := False) return UTF_String 42 is 43 begin 44 -- Nothing to do if identical schemes, but for UTF_8 we need to 45 -- exclude overlong encodings, so need to do the full conversion. 46 47 if Input_Scheme = Output_Scheme 48 and then Input_Scheme /= UTF_8 49 then 50 return Item; 51 52 -- For remaining cases, one or other of the operands is UTF-16BE/LE 53 -- encoded, so go through UTF-16 intermediate. 54 55 else 56 return Convert (UTF_16_Wide_String'(Convert (Item, Input_Scheme)), 57 Output_Scheme, Output_BOM); 58 end if; 59 end Convert; 60 61 -- Convert from UTF-8/UTF-16BE/LE to UTF-16 62 63 function Convert 64 (Item : UTF_String; 65 Input_Scheme : Encoding_Scheme; 66 Output_BOM : Boolean := False) return UTF_16_Wide_String 67 is 68 begin 69 if Input_Scheme = UTF_8 then 70 return Convert (Item, Output_BOM); 71 else 72 return To_UTF_16 (Item, Input_Scheme, Output_BOM); 73 end if; 74 end Convert; 75 76 -- Convert from UTF-8 to UTF-16 77 78 function Convert 79 (Item : UTF_8_String; 80 Output_BOM : Boolean := False) return UTF_16_Wide_String 81 is 82 Result : UTF_16_Wide_String (1 .. Item'Length + 1); 83 -- Maximum length of result, including possible BOM 84 85 Len : Natural := 0; 86 -- Number of characters stored so far in Result 87 88 Iptr : Natural; 89 -- Next character to process in Item 90 91 C : Unsigned_8; 92 -- Input UTF-8 code 93 94 R : Unsigned_16; 95 -- Output UTF-16 code 96 97 procedure Get_Continuation; 98 -- Reads a continuation byte of the form 10xxxxxx, shifts R left by 6 99 -- bits, and or's in the xxxxxx to the low order 6 bits. On return Ptr 100 -- is incremented. Raises exception if continuation byte does not exist 101 -- or is invalid. 102 103 ---------------------- 104 -- Get_Continuation -- 105 ---------------------- 106 107 procedure Get_Continuation is 108 begin 109 if Iptr > Item'Last then 110 Raise_Encoding_Error (Iptr - 1); 111 112 else 113 C := To_Unsigned_8 (Item (Iptr)); 114 Iptr := Iptr + 1; 115 116 if C < 2#10_000000# or else C > 2#10_111111# then 117 Raise_Encoding_Error (Iptr - 1); 118 119 else 120 R := 121 Shift_Left (R, 6) or Unsigned_16 (C and 2#00_111111#); 122 end if; 123 end if; 124 end Get_Continuation; 125 126 -- Start of processing for Convert 127 128 begin 129 -- Output BOM if required 130 131 if Output_BOM then 132 Len := Len + 1; 133 Result (Len) := BOM_16 (1); 134 end if; 135 136 -- Skip OK BOM 137 138 Iptr := Item'First; 139 140 if Item'Length >= 3 and then Item (Iptr .. Iptr + 2) = BOM_8 then 141 Iptr := Iptr + 3; 142 143 -- Error if bad BOM 144 145 elsif Item'Length >= 2 146 and then (Item (Iptr .. Iptr + 1) = BOM_16BE 147 or else 148 Item (Iptr .. Iptr + 1) = BOM_16LE) 149 then 150 Raise_Encoding_Error (Iptr); 151 152 -- No BOM present 153 154 else 155 Iptr := Item'First; 156 end if; 157 158 while Iptr <= Item'Last loop 159 C := To_Unsigned_8 (Item (Iptr)); 160 Iptr := Iptr + 1; 161 162 -- Codes in the range 16#00# - 16#7F# 163 -- UTF-8: 0xxxxxxx 164 -- UTF-16: 00000000_0xxxxxxx 165 166 if C <= 16#7F# then 167 Len := Len + 1; 168 Result (Len) := Wide_Character'Val (C); 169 170 -- No initial code can be of the form 10xxxxxx. Such codes are used 171 -- only for continuations. 172 173 elsif C <= 2#10_111111# then 174 Raise_Encoding_Error (Iptr - 1); 175 176 -- Codes in the range 16#80# - 16#7FF# 177 -- UTF-8: 110yyyxx 10xxxxxx 178 -- UTF-16: 00000yyy_xxxxxxxx 179 180 elsif C <= 2#110_11111# then 181 R := Unsigned_16 (C and 2#000_11111#); 182 Get_Continuation; 183 Len := Len + 1; 184 Result (Len) := Wide_Character'Val (R); 185 186 -- Codes in the range 16#800# - 16#FFFF# 187 -- UTF-8: 1110yyyy 10yyyyxx 10xxxxxx 188 -- UTF-16: yyyyyyyy_xxxxxxxx 189 190 elsif C <= 2#1110_1111# then 191 R := Unsigned_16 (C and 2#0000_1111#); 192 Get_Continuation; 193 Get_Continuation; 194 Len := Len + 1; 195 Result (Len) := Wide_Character'Val (R); 196 197 -- Make sure that we don't have a result in the forbidden range 198 -- reserved for UTF-16 surrogate characters. 199 200 if R in 16#D800# .. 16#DF00# then 201 Raise_Encoding_Error (Iptr - 3); 202 end if; 203 204 -- Codes in the range 16#10000# - 16#10FFFF# 205 -- UTF-8: 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx 206 -- UTF-16: 110110zz_zzyyyyyy 110111yy_xxxxxxxx 207 -- Note: zzzz in the output is input zzzzz - 1 208 209 elsif C <= 2#11110_111# then 210 R := Unsigned_16 (C and 2#00000_111#); 211 Get_Continuation; 212 213 -- R now has zzzzzyyyy 214 215 R := R - 2#0000_1_0000#; 216 217 -- R now has zzzzyyyy (zzzz minus one for the output) 218 219 Get_Continuation; 220 221 -- R now has zzzzyyyyyyyyxx 222 223 Len := Len + 1; 224 Result (Len) := 225 Wide_Character'Val 226 (2#110110_00_0000_0000# or Shift_Right (R, 4)); 227 228 R := R and 2#1111#; 229 Get_Continuation; 230 Len := Len + 1; 231 Result (Len) := 232 Wide_Character'Val (2#110111_00_0000_0000# or R); 233 234 -- Any other code is an error 235 236 else 237 Raise_Encoding_Error (Iptr - 1); 238 end if; 239 end loop; 240 241 return Result (1 .. Len); 242 end Convert; 243 244 -- Convert from UTF-16 to UTF-8/UTF-16-BE/LE 245 246 function Convert 247 (Item : UTF_16_Wide_String; 248 Output_Scheme : Encoding_Scheme; 249 Output_BOM : Boolean := False) return UTF_String 250 is 251 begin 252 if Output_Scheme = UTF_8 then 253 return Convert (Item, Output_BOM); 254 else 255 return From_UTF_16 (Item, Output_Scheme, Output_BOM); 256 end if; 257 end Convert; 258 259 -- Convert from UTF-16 to UTF-8 260 261 function Convert 262 (Item : UTF_16_Wide_String; 263 Output_BOM : Boolean := False) return UTF_8_String 264 is 265 Result : UTF_8_String (1 .. 3 * Item'Length + 3); 266 -- Worst case is 3 output codes for each input code + BOM space 267 268 Len : Natural; 269 -- Number of result codes stored 270 271 Iptr : Natural; 272 -- Pointer to next input character 273 274 C1, C2 : Unsigned_16; 275 276 zzzzz : Unsigned_16; 277 yyyyyyyy : Unsigned_16; 278 xxxxxxxx : Unsigned_16; 279 -- Components of double length case 280 281 begin 282 Iptr := Item'First; 283 284 -- Skip BOM at start of input 285 286 if Item'Length > 0 and then Item (Iptr) = BOM_16 (1) then 287 Iptr := Iptr + 1; 288 end if; 289 290 -- Generate output BOM if required 291 292 if Output_BOM then 293 Result (1 .. 3) := BOM_8; 294 Len := 3; 295 else 296 Len := 0; 297 end if; 298 299 -- Loop through input 300 301 while Iptr <= Item'Last loop 302 C1 := To_Unsigned_16 (Item (Iptr)); 303 Iptr := Iptr + 1; 304 305 -- Codes in the range 16#0000# - 16#007F# 306 -- UTF-16: 000000000xxxxxxx 307 -- UTF-8: 0xxxxxxx 308 309 if C1 <= 16#007F# then 310 Result (Len + 1) := Character'Val (C1); 311 Len := Len + 1; 312 313 -- Codes in the range 16#80# - 16#7FF# 314 -- UTF-16: 00000yyyxxxxxxxx 315 -- UTF-8: 110yyyxx 10xxxxxx 316 317 elsif C1 <= 16#07FF# then 318 Result (Len + 1) := 319 Character'Val 320 (2#110_00000# or Shift_Right (C1, 6)); 321 Result (Len + 2) := 322 Character'Val 323 (2#10_000000# or (C1 and 2#00_111111#)); 324 Len := Len + 2; 325 326 -- Codes in the range 16#800# - 16#D7FF# or 16#E000# - 16#FFFF# 327 -- UTF-16: yyyyyyyyxxxxxxxx 328 -- UTF-8: 1110yyyy 10yyyyxx 10xxxxxx 329 330 elsif C1 <= 16#D7FF# or else C1 >= 16#E000# then 331 Result (Len + 1) := 332 Character'Val 333 (2#1110_0000# or Shift_Right (C1, 12)); 334 Result (Len + 2) := 335 Character'Val 336 (2#10_000000# or (Shift_Right (C1, 6) and 2#00_111111#)); 337 Result (Len + 3) := 338 Character'Val 339 (2#10_000000# or (C1 and 2#00_111111#)); 340 Len := Len + 3; 341 342 -- Codes in the range 16#10000# - 16#10FFFF# 343 -- UTF-16: 110110zzzzyyyyyy 110111yyxxxxxxxx 344 -- UTF-8: 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx 345 -- Note: zzzzz in the output is input zzzz + 1 346 347 elsif C1 <= 2#110110_11_11111111# then 348 if Iptr > Item'Last then 349 Raise_Encoding_Error (Iptr - 1); 350 else 351 C2 := To_Unsigned_16 (Item (Iptr)); 352 Iptr := Iptr + 1; 353 end if; 354 355 if (C2 and 2#111111_00_00000000#) /= 2#110111_00_00000000# then 356 Raise_Encoding_Error (Iptr - 1); 357 end if; 358 359 zzzzz := (Shift_Right (C1, 6) and 2#1111#) + 1; 360 yyyyyyyy := ((Shift_Left (C1, 2) and 2#111111_00#) 361 or 362 (Shift_Right (C2, 8) and 2#000000_11#)); 363 xxxxxxxx := C2 and 2#11111111#; 364 365 Result (Len + 1) := 366 Character'Val 367 (2#11110_000# or (Shift_Right (zzzzz, 2))); 368 Result (Len + 2) := 369 Character'Val 370 (2#10_000000# or Shift_Left (zzzzz and 2#11#, 4) 371 or Shift_Right (yyyyyyyy, 4)); 372 Result (Len + 3) := 373 Character'Val 374 (2#10_000000# or Shift_Left (yyyyyyyy and 2#1111#, 4) 375 or Shift_Right (xxxxxxxx, 6)); 376 Result (Len + 4) := 377 Character'Val 378 (2#10_000000# or (xxxxxxxx and 2#00_111111#)); 379 Len := Len + 4; 380 381 -- Error if input in 16#DC00# - 16#DFFF# (2nd surrogate with no 1st) 382 383 else 384 Raise_Encoding_Error (Iptr - 2); 385 end if; 386 end loop; 387 388 return Result (1 .. Len); 389 end Convert; 390 391end Ada.Strings.UTF_Encoding.Conversions; 392