------------------------------------------------------------------------------
--                                                                          --
--                         GNAT RUN-TIME COMPONENTS                         --
--                                                                          --
--              A D A . S T R I N G S . U T F _ E N C O D I N G             --
--                                                                          --
--                                 S p e c                                  --
--                                                                          --
-- This specification is derived from the Ada Reference Manual for use with --
-- GNAT. The copyright notice above, and the license provisions that follow --
-- apply solely to the contents of the part following the private keyword.  --
--                                                                          --
-- GNAT is free software;  you can  redistribute it  and/or modify it under --
-- terms of the  GNU General Public License as published  by the Free Soft- --
-- ware  Foundation;  either version 3,  or (at your option) any later ver- --
-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
-- or FITNESS FOR A PARTICULAR PURPOSE.                                     --
--                                                                          --
-- As a special exception under Section 7 of GPL version 3, you are granted --
-- additional permissions described in the GCC Runtime Library Exception,   --
-- version 3.1, as published by the Free Software Foundation.               --
--                                                                          --
-- You should have received a copy of the GNU General Public License and    --
-- a copy of the GCC Runtime Library Exception along with this program;     --
-- see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see    --
-- <http://www.gnu.org/licenses/>.                                          --
--                                                                          --
-- GNAT was originally developed  by the GNAT team at  New York University. --
-- Extensive contributions were provided by Ada Core Technologies Inc.      --
--                                                                          --
------------------------------------------------------------------------------

--  This is one of the Ada 2012 packages defined in AI05-0137-1. It is a parent
--  package that contains declarations used in the child packages for handling
--  UTF encoded strings. Note: this package is consistent with Ada 95, and may
--  be used in Ada 95 or Ada 2005 mode.

with Interfaces;
with Unchecked_Conversion;

package Ada.Strings.UTF_Encoding is
   pragma Pure (UTF_Encoding);

   subtype UTF_String is String;
   --  Used to represent a string of 8-bit values containing a sequence of
   --  values encoded in one of three ways (UTF-8, UTF-16BE, or UTF-16LE).
   --  Typically used in connection with a Scheme parameter indicating which
   --  of the encodings applies. This is not strictly a String value in the
   --  sense defined in the Ada RM, but in practice type String accommodates
   --  all possible 256 codes, and can be used to hold any sequence of 8-bit
   --  codes. We use String directly rather than create a new type so that
   --  all existing facilities for manipulating type String (e.g. the child
   --  packages of Ada.Strings) are available for manipulation of UTF_Strings.

   type Encoding_Scheme is (UTF_8, UTF_16BE, UTF_16LE);
   --  Used to specify which of three possible encodings apply to a UTF_String

   subtype UTF_8_String is String;
   --  Similar to UTF_String but specifically represents a UTF-8 encoded string

   subtype UTF_16_Wide_String is Wide_String;
   --  This is similar to UTF_8_String but is used to represent a Wide_String
   --  value which is a sequence of 16-bit values encoded using UTF-16. Again
   --  this is not strictly a Wide_String in the sense of the Ada RM, but the
   --  type Wide_String can be used to represent a sequence of arbitrary 16-bit
   --  values, and it is more convenient to use Wide_String than a new type.

   Encoding_Error : exception;
   --  This exception is raised in the following situations:
   --    a) A UTF encoded string contains an invalid encoding sequence
   --    b) A UTF-16BE or UTF-16LE input string has an odd length
   --    c) An incorrect character value is present in the Input string
   --    d) The result for a Wide_Character output exceeds 16#FFFF#
   --  The exception message has the index value where the error occurred.

   --  The BOM (BYTE_ORDER_MARK) values defined here are used at the start of
   --  a string to indicate the encoding. The convention in this package is
   --  that on input a correct BOM is ignored and an incorrect BOM causes an
   --  Encoding_Error exception. On output, the output string may or may not
   --  include a BOM depending on the setting of Output_BOM.

   BOM_8 : constant UTF_8_String :=
             Character'Val (16#EF#) &
             Character'Val (16#BB#) &
             Character'Val (16#BF#);

   BOM_16BE : constant UTF_String :=
                Character'Val (16#FE#) &
                Character'Val (16#FF#);

   BOM_16LE : constant UTF_String :=
                Character'Val (16#FF#) &
                Character'Val (16#FE#);

   BOM_16 : constant UTF_16_Wide_String :=
              (1 => Wide_Character'Val (16#FEFF#));

   function Encoding
     (Item    : UTF_String;
      Default : Encoding_Scheme := UTF_8) return Encoding_Scheme;
   --  This function inspects a UTF_String value to determine whether it
   --  starts with a BOM for UTF-8, UTF-16BE, or UTF-16LE. If so, the result
   --  is the scheme corresponding to the BOM. If no valid BOM is present
   --  then the result is the specified Default value.

private

   --  Bit-pattern conversions from the three character types to unsigned
   --  integers of matching width (8, 16 and 32 bits). Being declared in the
   --  private part, they are visible to the body of this package and to its
   --  child units, but not to clients.

   function To_Unsigned_8 is new
     Unchecked_Conversion (Character, Interfaces.Unsigned_8);

   function To_Unsigned_16 is new
     Unchecked_Conversion (Wide_Character, Interfaces.Unsigned_16);

   function To_Unsigned_32 is new
     Unchecked_Conversion (Wide_Wide_Character, Interfaces.Unsigned_32);

   subtype UTF_XE_Encoding is Encoding_Scheme range UTF_16BE .. UTF_16LE;
   --  Subtype containing only UTF_16BE and UTF_16LE entries

   --  Utility routines for converting between UTF-16 and UTF-16LE/BE

   function From_UTF_16
     (Item          : UTF_16_Wide_String;
      Output_Scheme : UTF_XE_Encoding;
      Output_BOM    : Boolean := False) return UTF_String;
   --  The input string Item is encoded in UTF-16. The output is encoded using
   --  Output_Scheme (which is either UTF-16LE or UTF-16BE). There are no error
   --  cases. The output starts with BOM_16BE/LE if Output_BOM is True.

   function To_UTF_16
     (Item         : UTF_String;
      Input_Scheme : UTF_XE_Encoding;
      Output_BOM   : Boolean := False) return UTF_16_Wide_String;
   --  The input string Item is encoded using Input_Scheme which is either
   --  UTF-16LE or UTF-16BE. The output is the corresponding UTF-16 wide
   --  string. Encoding_Error is raised if the length of the input is odd.
   --  The output starts with BOM_16 if Output_BOM is True.

   procedure Raise_Encoding_Error (Index : Natural);
   pragma No_Return (Raise_Encoding_Error);
   --  Raise Encoding_Error exception for bad encoding in input item. The
   --  parameter Index is the index of the location in Item for the error.

end Ada.Strings.UTF_Encoding;