1------------------------------------------------------------------------------ 2-- -- 3-- GNAT RUN-TIME COMPONENTS -- 4-- -- 5-- ADA.STRINGS.UTF_ENCODING.WIDE_WIDE_STRINGS -- 6-- -- 7-- B o d y -- 8-- -- 9-- Copyright (C) 2010-2019, Free Software Foundation, Inc. -- 10-- -- 11-- GNAT is free software; you can redistribute it and/or modify it under -- 12-- terms of the GNU General Public License as published by the Free Soft- -- 13-- ware Foundation; either version 3, or (at your option) any later ver- -- 14-- sion. GNAT is distributed in the hope that it will be useful, but WITH- -- 15-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -- 16-- or FITNESS FOR A PARTICULAR PURPOSE. -- 17-- -- 18-- As a special exception under Section 7 of GPL version 3, you are granted -- 19-- additional permissions described in the GCC Runtime Library Exception, -- 20-- version 3.1, as published by the Free Software Foundation. -- 21-- -- 22-- You should have received a copy of the GNU General Public License and -- 23-- a copy of the GCC Runtime Library Exception along with this program; -- 24-- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see -- 25-- <http://www.gnu.org/licenses/>. -- 26-- -- 27-- GNAT was originally developed by the GNAT team at New York University. -- 28-- Extensive contributions were provided by Ada Core Technologies Inc. -- 29-- -- 30------------------------------------------------------------------------------ 31 32package body Ada.Strings.UTF_Encoding.Wide_Wide_Strings is 33 use Interfaces; 34 35 ------------ 36 -- Decode -- 37 ------------ 38 39 -- Decode UTF-8/UTF-16BE/UTF-16LE input to Wide_Wide_String 40 41 function Decode 42 (Item : UTF_String; 43 Input_Scheme : Encoding_Scheme) return Wide_Wide_String 44 is 45 begin 46 if Input_Scheme = UTF_8 then 47 return Decode (Item); 48 else 49 return Decode (To_UTF_16 (Item, Input_Scheme)); 50 end if; 51 end Decode; 52 53 -- Decode UTF-8 input to Wide_Wide_String 54 55 function Decode (Item : UTF_8_String) return Wide_Wide_String is 56 Result : Wide_Wide_String (1 .. Item'Length); 57 -- Result string (worst case is same length as input) 58 59 Len : Natural := 0; 60 -- Length of result stored so far 61 62 Iptr : Natural; 63 -- Input string pointer 64 65 C : Unsigned_8; 66 R : Unsigned_32; 67 68 procedure Get_Continuation; 69 -- Reads a continuation byte of the form 10xxxxxx, shifts R left by 6 70 -- bits, and or's in the xxxxxx to the low order 6 bits. On return Ptr 71 -- is incremented. Raises exception if continuation byte does not exist 72 -- or is invalid. 73 74 ---------------------- 75 -- Get_Continuation -- 76 ---------------------- 77 78 procedure Get_Continuation is 79 begin 80 if Iptr > Item'Last then 81 Raise_Encoding_Error (Iptr - 1); 82 83 else 84 C := To_Unsigned_8 (Item (Iptr)); 85 Iptr := Iptr + 1; 86 87 if C not in 2#10_000000# .. 2#10_111111# then 88 Raise_Encoding_Error (Iptr - 1); 89 else 90 R := Shift_Left (R, 6) or Unsigned_32 (C and 2#00_111111#); 91 end if; 92 end if; 93 end Get_Continuation; 94 95 -- Start of processing for Decode 96 97 begin 98 Iptr := Item'First; 99 100 -- Skip BOM at start 101 102 if Item'Length >= 3 103 and then Item (Iptr .. Iptr + 2) = BOM_8 104 then 105 Iptr := Iptr + 3; 106 107 -- Error if bad BOM 108 109 elsif Item'Length >= 2 110 and then (Item (Iptr .. Iptr + 1) = BOM_16BE 111 or else 112 Item (Iptr .. Iptr + 1) = BOM_16LE) 113 then 114 Raise_Encoding_Error (Iptr); 115 end if; 116 117 -- Loop through input characters 118 119 while Iptr <= Item'Last loop 120 C := To_Unsigned_8 (Item (Iptr)); 121 Iptr := Iptr + 1; 122 123 -- Codes in the range 16#00# - 16#7F# are represented as 124 -- 0xxxxxxx 125 126 if C <= 16#7F# then 127 R := Unsigned_32 (C); 128 129 -- No initial code can be of the form 10xxxxxx. Such codes are used 130 -- only for continuations. 131 132 elsif C <= 2#10_111111# then 133 Raise_Encoding_Error (Iptr - 1); 134 135 -- Codes in the range 16#80# - 16#7FF# are represented as 136 -- 110yyyxx 10xxxxxx 137 138 elsif C <= 2#110_11111# then 139 R := Unsigned_32 (C and 2#000_11111#); 140 Get_Continuation; 141 142 -- Codes in the range 16#800# - 16#FFFF# are represented as 143 -- 1110yyyy 10yyyyxx 10xxxxxx 144 145 elsif C <= 2#1110_1111# then 146 R := Unsigned_32 (C and 2#0000_1111#); 147 Get_Continuation; 148 Get_Continuation; 149 150 -- Codes in the range 16#10000# - 16#10FFFF# are represented as 151 -- 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx 152 153 elsif C <= 2#11110_111# then 154 R := Unsigned_32 (C and 2#00000_111#); 155 Get_Continuation; 156 Get_Continuation; 157 Get_Continuation; 158 159 -- Any other code is an error 160 161 else 162 Raise_Encoding_Error (Iptr - 1); 163 end if; 164 165 Len := Len + 1; 166 Result (Len) := Wide_Wide_Character'Val (R); 167 end loop; 168 169 return Result (1 .. Len); 170 end Decode; 171 172 -- Decode UTF-16 input to Wide_Wide_String 173 174 function Decode (Item : UTF_16_Wide_String) return Wide_Wide_String is 175 Result : Wide_Wide_String (1 .. Item'Length); 176 -- Result cannot be longer than the input string 177 178 Len : Natural := 0; 179 -- Length of result 180 181 Iptr : Natural; 182 -- Pointer to next element in Item 183 184 C : Unsigned_16; 185 R : Unsigned_32; 186 187 begin 188 -- Skip UTF-16 BOM at start 189 190 Iptr := Item'First; 191 192 if Iptr <= Item'Last and then Item (Iptr) = BOM_16 (1) then 193 Iptr := Iptr + 1; 194 end if; 195 196 -- Loop through input characters 197 198 while Iptr <= Item'Last loop 199 C := To_Unsigned_16 (Item (Iptr)); 200 Iptr := Iptr + 1; 201 202 -- Codes in the range 16#0000#..16#D7FF# or 16#E000#..16#FFFD# 203 -- represent their own value. 204 205 if C <= 16#D7FF# or else C in 16#E000# .. 16#FFFD# then 206 Len := Len + 1; 207 Result (Len) := Wide_Wide_Character'Val (C); 208 209 -- Codes in the range 16#D800#..16#DBFF# represent the first of the 210 -- two surrogates used to encode the range 16#01_000#..16#10_FFFF". 211 -- The first surrogate provides 10 high order bits of the result. 212 213 elsif C <= 16#DBFF# then 214 R := Shift_Left ((Unsigned_32 (C) - 16#D800#), 10); 215 216 -- Error if at end of string 217 218 if Iptr > Item'Last then 219 Raise_Encoding_Error (Iptr - 1); 220 221 -- Otherwise next character must be valid low order surrogate 222 -- which provides the low 10 order bits of the result. 223 224 else 225 C := To_Unsigned_16 (Item (Iptr)); 226 Iptr := Iptr + 1; 227 228 if C not in 16#DC00# .. 16#DFFF# then 229 Raise_Encoding_Error (Iptr - 1); 230 231 else 232 R := R or (Unsigned_32 (C) mod 2 ** 10); 233 234 -- The final adjustment is to add 16#01_0000 to get the 235 -- result back in the required 21 bit range. 236 237 R := R + 16#01_0000#; 238 Len := Len + 1; 239 Result (Len) := Wide_Wide_Character'Val (R); 240 end if; 241 end if; 242 243 -- Remaining codes are invalid 244 245 else 246 Raise_Encoding_Error (Iptr - 1); 247 end if; 248 end loop; 249 250 return Result (1 .. Len); 251 end Decode; 252 253 ------------ 254 -- Encode -- 255 ------------ 256 257 -- Encode Wide_Wide_String in UTF-8, UTF-16BE or UTF-16LE 258 259 function Encode 260 (Item : Wide_Wide_String; 261 Output_Scheme : Encoding_Scheme; 262 Output_BOM : Boolean := False) return UTF_String 263 is 264 begin 265 if Output_Scheme = UTF_8 then 266 return Encode (Item, Output_BOM); 267 else 268 return From_UTF_16 (Encode (Item), Output_Scheme, Output_BOM); 269 end if; 270 end Encode; 271 272 -- Encode Wide_Wide_String in UTF-8 273 274 function Encode 275 (Item : Wide_Wide_String; 276 Output_BOM : Boolean := False) return UTF_8_String 277 is 278 Result : String (1 .. 4 * Item'Length + 3); 279 -- Worst case is four bytes per input byte + space for BOM 280 281 Len : Natural; 282 -- Number of output codes stored in Result 283 284 C : Unsigned_32; 285 -- Single input character 286 287 procedure Store (C : Unsigned_32); 288 pragma Inline (Store); 289 -- Store one output code (input is in range 0 .. 255) 290 291 ----------- 292 -- Store -- 293 ----------- 294 295 procedure Store (C : Unsigned_32) is 296 begin 297 Len := Len + 1; 298 Result (Len) := Character'Val (C); 299 end Store; 300 301 -- Start of processing for Encode 302 303 begin 304 -- Output BOM if required 305 306 if Output_BOM then 307 Result (1 .. 3) := BOM_8; 308 Len := 3; 309 else 310 Len := 0; 311 end if; 312 313 -- Loop through characters of input 314 315 for Iptr in Item'Range loop 316 C := To_Unsigned_32 (Item (Iptr)); 317 318 -- Codes in the range 16#00#..16#7F# are represented as 319 -- 0xxxxxxx 320 321 if C <= 16#7F# then 322 Store (C); 323 324 -- Codes in the range 16#80#..16#7FF# are represented as 325 -- 110yyyxx 10xxxxxx 326 327 elsif C <= 16#7FF# then 328 Store (2#110_00000# or Shift_Right (C, 6)); 329 Store (2#10_000000# or (C and 2#00_111111#)); 330 331 -- Codes in the range 16#800#..16#D7FF# or 16#E000#..16#FFFD# are 332 -- represented as 333 -- 1110yyyy 10yyyyxx 10xxxxxx 334 335 elsif C <= 16#D7FF# or else C in 16#E000# .. 16#FFFD# then 336 Store (2#1110_0000# or Shift_Right (C, 12)); 337 Store (2#10_000000# or 338 Shift_Right (C and 2#111111_000000#, 6)); 339 Store (2#10_000000# or (C and 2#00_111111#)); 340 341 -- Codes in the range 16#10000# - 16#10FFFF# are represented as 342 -- 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx 343 344 elsif C in 16#1_0000# .. 16#10_FFFF# then 345 Store (2#11110_000# or 346 Shift_Right (C, 18)); 347 Store (2#10_000000# or 348 Shift_Right (C and 2#111111_000000_000000#, 12)); 349 Store (2#10_000000# or 350 Shift_Right (C and 2#111111_000000#, 6)); 351 Store (2#10_000000# or 352 (C and 2#00_111111#)); 353 354 -- All other codes are invalid 355 356 else 357 Raise_Encoding_Error (Iptr); 358 end if; 359 end loop; 360 361 return Result (1 .. Len); 362 end Encode; 363 364 -- Encode Wide_Wide_String in UTF-16 365 366 function Encode 367 (Item : Wide_Wide_String; 368 Output_BOM : Boolean := False) return UTF_16_Wide_String 369 is 370 Result : UTF_16_Wide_String (1 .. 2 * Item'Length + 1); 371 -- Worst case is each input character generates two output characters 372 -- plus one for possible BOM. 373 374 Len : Integer; 375 -- Length of output string 376 377 C : Unsigned_32; 378 379 begin 380 -- Output BOM if needed 381 382 if Output_BOM then 383 Result (1) := BOM_16 (1); 384 Len := 1; 385 else 386 Len := 0; 387 end if; 388 389 -- Loop through input characters encoding them 390 391 for Iptr in Item'Range loop 392 C := To_Unsigned_32 (Item (Iptr)); 393 394 -- Codes in the range 16#00_0000#..16#00_D7FF# or 16#E000#..16#FFFD# 395 -- are output unchanged 396 397 if C <= 16#00_D7FF# or else C in 16#E000# .. 16#FFFD# then 398 Len := Len + 1; 399 Result (Len) := Wide_Character'Val (C); 400 401 -- Codes in the range 16#01_0000#..16#10_FFFF# are output using two 402 -- surrogate characters. First 16#1_0000# is subtracted from the code 403 -- point to give a 20-bit value. This is then split into two separate 404 -- 10-bit values each of which is represented as a surrogate with the 405 -- most significant half placed in the first surrogate. The ranges of 406 -- values used for the two surrogates are 16#D800#-16#DBFF# for the 407 -- first, most significant surrogate and 16#DC00#-16#DFFF# for the 408 -- second, least significant surrogate. 409 410 elsif C in 16#1_0000# .. 16#10_FFFF# then 411 C := C - 16#1_0000#; 412 413 Len := Len + 1; 414 Result (Len) := Wide_Character'Val (16#D800# + C / 2 ** 10); 415 416 Len := Len + 1; 417 Result (Len) := Wide_Character'Val (16#DC00# + C mod 2 ** 10); 418 419 -- All other codes are invalid 420 421 else 422 Raise_Encoding_Error (Iptr); 423 end if; 424 end loop; 425 426 return Result (1 .. Len); 427 end Encode; 428 429end Ada.Strings.UTF_Encoding.Wide_Wide_Strings; 430