1------------------------------------------------------------------------------ 2-- -- 3-- GNAT RUN-TIME COMPONENTS -- 4-- -- 5-- ADA.STRINGS.UTF_ENCODING.CONVERSIONS -- 6-- -- 7-- B o d y -- 8-- -- 9-- Copyright (C) 2010-2012, Free Software Foundation, Inc. -- 10-- -- 11-- GNAT is free software; you can redistribute it and/or modify it under -- 12-- terms of the GNU General Public License as published by the Free Soft- -- 13-- ware Foundation; either version 3, or (at your option) any later ver- -- 14-- sion. GNAT is distributed in the hope that it will be useful, but WITH- -- 15-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -- 16-- or FITNESS FOR A PARTICULAR PURPOSE. -- 17-- -- 18-- As a special exception under Section 7 of GPL version 3, you are granted -- 19-- additional permissions described in the GCC Runtime Library Exception, -- 20-- version 3.1, as published by the Free Software Foundation. -- 21-- -- 22-- You should have received a copy of the GNU General Public License and -- 23-- a copy of the GCC Runtime Library Exception along with this program; -- 24-- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see -- 25-- <http://www.gnu.org/licenses/>. -- 26-- -- 27-- GNAT was originally developed by the GNAT team at New York University. -- 28-- Extensive contributions were provided by Ada Core Technologies Inc. -- 29-- -- 30------------------------------------------------------------------------------ 31 32package body Ada.Strings.UTF_Encoding.Conversions is 33 use Interfaces; 34 35 -- Convert from UTF-8/UTF-16BE/LE to UTF-8/UTF-16BE/LE 36 37 function Convert 38 (Item : UTF_String; 39 Input_Scheme : Encoding_Scheme; 40 Output_Scheme : Encoding_Scheme; 41 Output_BOM : Boolean := False) return UTF_String 42 is 43 begin 44 -- Nothing to do if identical schemes 45 46 if Input_Scheme = Output_Scheme then 47 return Item; 48 49 -- For remaining cases, one or other of the operands is UTF-16BE/LE 50 -- encoded, so go through UTF-16 intermediate. 51 52 else 53 return Convert (UTF_16_Wide_String'(Convert (Item, Input_Scheme)), 54 Output_Scheme, Output_BOM); 55 end if; 56 end Convert; 57 58 -- Convert from UTF-8/UTF-16BE/LE to UTF-16 59 60 function Convert 61 (Item : UTF_String; 62 Input_Scheme : Encoding_Scheme; 63 Output_BOM : Boolean := False) return UTF_16_Wide_String 64 is 65 begin 66 if Input_Scheme = UTF_8 then 67 return Convert (Item, Output_BOM); 68 else 69 return To_UTF_16 (Item, Input_Scheme, Output_BOM); 70 end if; 71 end Convert; 72 73 -- Convert from UTF-8 to UTF-16 74 75 function Convert 76 (Item : UTF_8_String; 77 Output_BOM : Boolean := False) return UTF_16_Wide_String 78 is 79 Result : UTF_16_Wide_String (1 .. Item'Length + 1); 80 -- Maximum length of result, including possible BOM 81 82 Len : Natural := 0; 83 -- Number of characters stored so far in Result 84 85 Iptr : Natural; 86 -- Next character to process in Item 87 88 C : Unsigned_8; 89 -- Input UTF-8 code 90 91 R : Unsigned_16; 92 -- Output UTF-16 code 93 94 procedure Get_Continuation; 95 -- Reads a continuation byte of the form 10xxxxxx, shifts R left by 6 96 -- bits, and or's in the xxxxxx to the low order 6 bits. On return Ptr 97 -- is incremented. Raises exception if continuation byte does not exist 98 -- or is invalid. 99 100 ---------------------- 101 -- Get_Continuation -- 102 ---------------------- 103 104 procedure Get_Continuation is 105 begin 106 if Iptr > Item'Last then 107 Raise_Encoding_Error (Iptr - 1); 108 109 else 110 C := To_Unsigned_8 (Item (Iptr)); 111 Iptr := Iptr + 1; 112 113 if C < 2#10_000000# or else C > 2#10_111111# then 114 Raise_Encoding_Error (Iptr - 1); 115 116 else 117 R := 118 Shift_Left (R, 6) or Unsigned_16 (C and 2#00_111111#); 119 end if; 120 end if; 121 end Get_Continuation; 122 123 -- Start of processing for Convert 124 125 begin 126 -- Output BOM if required 127 128 if Output_BOM then 129 Len := Len + 1; 130 Result (Len) := BOM_16 (1); 131 end if; 132 133 -- Skip OK BOM 134 135 Iptr := Item'First; 136 137 if Item'Length >= 3 and then Item (Iptr .. Iptr + 2) = BOM_8 then 138 Iptr := Iptr + 3; 139 140 -- Error if bad BOM 141 142 elsif Item'Length >= 2 143 and then (Item (Iptr .. Iptr + 1) = BOM_16BE 144 or else 145 Item (Iptr .. Iptr + 1) = BOM_16LE) 146 then 147 Raise_Encoding_Error (Iptr); 148 149 -- No BOM present 150 151 else 152 Iptr := Item'First; 153 end if; 154 155 while Iptr <= Item'Last loop 156 C := To_Unsigned_8 (Item (Iptr)); 157 Iptr := Iptr + 1; 158 159 -- Codes in the range 16#00# - 16#7F# 160 -- UTF-8: 0xxxxxxx 161 -- UTF-16: 00000000_0xxxxxxx 162 163 if C <= 16#7F# then 164 Len := Len + 1; 165 Result (Len) := Wide_Character'Val (C); 166 167 -- No initial code can be of the form 10xxxxxx. Such codes are used 168 -- only for continuations. 169 170 elsif C <= 2#10_111111# then 171 Raise_Encoding_Error (Iptr - 1); 172 173 -- Codes in the range 16#80# - 16#7FF# 174 -- UTF-8: 110yyyxx 10xxxxxx 175 -- UTF-16: 00000yyy_xxxxxxxx 176 177 elsif C <= 2#110_11111# then 178 R := Unsigned_16 (C and 2#000_11111#); 179 Get_Continuation; 180 Len := Len + 1; 181 Result (Len) := Wide_Character'Val (R); 182 183 -- Codes in the range 16#800# - 16#FFFF# 184 -- UTF-8: 1110yyyy 10yyyyxx 10xxxxxx 185 -- UTF-16: yyyyyyyy_xxxxxxxx 186 187 elsif C <= 2#1110_1111# then 188 R := Unsigned_16 (C and 2#0000_1111#); 189 Get_Continuation; 190 Get_Continuation; 191 Len := Len + 1; 192 Result (Len) := Wide_Character'Val (R); 193 194 -- Make sure that we don't have a result in the forbidden range 195 -- reserved for UTF-16 surrogate characters. 196 197 if R in 16#D800# .. 16#DF00# then 198 Raise_Encoding_Error (Iptr - 3); 199 end if; 200 201 -- Codes in the range 16#10000# - 16#10FFFF# 202 -- UTF-8: 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx 203 -- UTF-16: 110110zz_zzyyyyyy 110111yy_xxxxxxxx 204 -- Note: zzzz in the output is input zzzzz - 1 205 206 elsif C <= 2#11110_111# then 207 R := Unsigned_16 (C and 2#00000_111#); 208 Get_Continuation; 209 210 -- R now has zzzzzyyyy 211 212 R := R - 2#0000_1_0000#; 213 214 -- R now has zzzzyyyy (zzzz minus one for the output) 215 216 Get_Continuation; 217 218 -- R now has zzzzyyyyyyyyxx 219 220 Len := Len + 1; 221 Result (Len) := 222 Wide_Character'Val 223 (2#110110_00_0000_0000# or Shift_Right (R, 4)); 224 225 R := R and 2#1111#; 226 Get_Continuation; 227 Len := Len + 1; 228 Result (Len) := 229 Wide_Character'Val (2#110111_00_0000_0000# or R); 230 231 -- Any other code is an error 232 233 else 234 Raise_Encoding_Error (Iptr - 1); 235 end if; 236 end loop; 237 238 return Result (1 .. Len); 239 end Convert; 240 241 -- Convert from UTF-16 to UTF-8/UTF-16-BE/LE 242 243 function Convert 244 (Item : UTF_16_Wide_String; 245 Output_Scheme : Encoding_Scheme; 246 Output_BOM : Boolean := False) return UTF_String 247 is 248 begin 249 if Output_Scheme = UTF_8 then 250 return Convert (Item, Output_BOM); 251 else 252 return From_UTF_16 (Item, Output_Scheme, Output_BOM); 253 end if; 254 end Convert; 255 256 -- Convert from UTF-16 to UTF-8 257 258 function Convert 259 (Item : UTF_16_Wide_String; 260 Output_BOM : Boolean := False) return UTF_8_String 261 is 262 Result : UTF_8_String (1 .. 3 * Item'Length + 3); 263 -- Worst case is 3 output codes for each input code + BOM space 264 265 Len : Natural; 266 -- Number of result codes stored 267 268 Iptr : Natural; 269 -- Pointer to next input character 270 271 C1, C2 : Unsigned_16; 272 273 zzzzz : Unsigned_16; 274 yyyyyyyy : Unsigned_16; 275 xxxxxxxx : Unsigned_16; 276 -- Components of double length case 277 278 begin 279 Iptr := Item'First; 280 281 -- Skip BOM at start of input 282 283 if Item'Length > 0 and then Item (Iptr) = BOM_16 (1) then 284 Iptr := Iptr + 1; 285 end if; 286 287 -- Generate output BOM if required 288 289 if Output_BOM then 290 Result (1 .. 3) := BOM_8; 291 Len := 3; 292 else 293 Len := 0; 294 end if; 295 296 -- Loop through input 297 298 while Iptr <= Item'Last loop 299 C1 := To_Unsigned_16 (Item (Iptr)); 300 Iptr := Iptr + 1; 301 302 -- Codes in the range 16#0000# - 16#007F# 303 -- UTF-16: 000000000xxxxxxx 304 -- UTF-8: 0xxxxxxx 305 306 if C1 <= 16#007F# then 307 Result (Len + 1) := Character'Val (C1); 308 Len := Len + 1; 309 310 -- Codes in the range 16#80# - 16#7FF# 311 -- UTF-16: 00000yyyxxxxxxxx 312 -- UTF-8: 110yyyxx 10xxxxxx 313 314 elsif C1 <= 16#07FF# then 315 Result (Len + 1) := 316 Character'Val 317 (2#110_00000# or Shift_Right (C1, 6)); 318 Result (Len + 2) := 319 Character'Val 320 (2#10_000000# or (C1 and 2#00_111111#)); 321 Len := Len + 2; 322 323 -- Codes in the range 16#800# - 16#D7FF# or 16#E000# - 16#FFFF# 324 -- UTF-16: yyyyyyyyxxxxxxxx 325 -- UTF-8: 1110yyyy 10yyyyxx 10xxxxxx 326 327 elsif C1 <= 16#D7FF# or else C1 >= 16#E000# then 328 Result (Len + 1) := 329 Character'Val 330 (2#1110_0000# or Shift_Right (C1, 12)); 331 Result (Len + 2) := 332 Character'Val 333 (2#10_000000# or (Shift_Right (C1, 6) and 2#00_111111#)); 334 Result (Len + 3) := 335 Character'Val 336 (2#10_000000# or (C1 and 2#00_111111#)); 337 Len := Len + 3; 338 339 -- Codes in the range 16#10000# - 16#10FFFF# 340 -- UTF-16: 110110zzzzyyyyyy 110111yyxxxxxxxx 341 -- UTF-8: 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx 342 -- Note: zzzzz in the output is input zzzz + 1 343 344 elsif C1 <= 2#110110_11_11111111# then 345 if Iptr > Item'Last then 346 Raise_Encoding_Error (Iptr - 1); 347 else 348 C2 := To_Unsigned_16 (Item (Iptr)); 349 Iptr := Iptr + 1; 350 end if; 351 352 if (C2 and 2#111111_00_00000000#) /= 2#110111_00_00000000# then 353 Raise_Encoding_Error (Iptr - 1); 354 end if; 355 356 zzzzz := (Shift_Right (C1, 6) and 2#1111#) + 1; 357 yyyyyyyy := ((Shift_Left (C1, 2) and 2#111111_00#) 358 or 359 (Shift_Right (C2, 8) and 2#000000_11#)); 360 xxxxxxxx := C2 and 2#11111111#; 361 362 Result (Len + 1) := 363 Character'Val 364 (2#11110_000# or (Shift_Right (zzzzz, 2))); 365 Result (Len + 2) := 366 Character'Val 367 (2#10_000000# or Shift_Left (zzzzz and 2#11#, 4) 368 or Shift_Right (yyyyyyyy, 4)); 369 Result (Len + 3) := 370 Character'Val 371 (2#10_000000# or Shift_Left (yyyyyyyy and 2#1111#, 4) 372 or Shift_Right (xxxxxxxx, 6)); 373 Result (Len + 4) := 374 Character'Val 375 (2#10_000000# or (xxxxxxxx and 2#00_111111#)); 376 Len := Len + 4; 377 378 -- Error if input in 16#DC00# - 16#DFFF# (2nd surrogate with no 1st) 379 380 else 381 Raise_Encoding_Error (Iptr - 2); 382 end if; 383 end loop; 384 385 return Result (1 .. Len); 386 end Convert; 387 388end Ada.Strings.UTF_Encoding.Conversions; 389