------------------------------------------------------------------------------
--                                                                          --
--                            Matreshka Project                             --
--                                                                          --
--                              XML Processor                               --
--                                                                          --
--                        Runtime Library Component                         --
--                                                                          --
------------------------------------------------------------------------------
--                                                                          --
-- Copyright © 2011, Vadim Godunko <vgodunko@gmail.com>                     --
-- All rights reserved.                                                     --
--                                                                          --
-- Redistribution and use in source and binary forms, with or without       --
-- modification, are permitted provided that the following conditions       --
-- are met:                                                                 --
--                                                                          --
--  * Redistributions of source code must retain the above copyright        --
--    notice, this list of conditions and the following disclaimer.         --
--                                                                          --
--  * Redistributions in binary form must reproduce the above copyright     --
--    notice, this list of conditions and the following disclaimer in the   --
--    documentation and/or other materials provided with the distribution.  --
--                                                                          --
--  * Neither the name of the Vadim Godunko, IE nor the names of its        --
--    contributors may be used to endorse or promote products derived from  --
--    this software without specific prior written permission.              --
--                                                                          --
-- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS      --
-- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT        --
-- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR    --
-- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT     --
-- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,   --
-- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED --
-- TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR   --
-- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF   --
-- LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING     --
-- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS       --
-- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.             --
--                                                                          --
------------------------------------------------------------------------------
--  $Revision: 2098 $ $Date: 2011-08-19 21:18:33 +0400 (Fri, 19 Aug 2011) $
------------------------------------------------------------------------------
--  Normalization utilities for public and system identifiers.
------------------------------------------------------------------------------
with Ada.Streams;

with League.Characters.Latin;
with League.Text_Codecs;

package body Matreshka.XML_Catalogs.Normalization is

   use League.Characters.Latin;
   use type League.Characters.Universal_Character;

   PublicId_URN_Namespace : constant League.Strings.Universal_String
     := League.Strings.To_Universal_String ("urn:publicid:");
   --  Leading string that marks a URN in the publicid URN namespace.

   UTF8_Codec : constant League.Text_Codecs.Text_Codec
     := League.Text_Codecs.Codec
         (League.Strings.To_Universal_String ("utf-8"));
   --  Codec to encode URI.

   ---------------------------------
   -- Normalize_Public_Identifier --
   ---------------------------------

   --  Returns Public_Identifier with every run of white space (space,
   --  carriage return, line feed, character tabulation) replaced by one
   --  space character and with leading and trailing white space removed.

   function Normalize_Public_Identifier
    (Public_Identifier : League.Strings.Universal_String)
       return League.Strings.Universal_String
   is
      Result   : League.Strings.Universal_String;
      Char     : League.Characters.Universal_Character;
      Is_Space : Boolean;
      --  True when the previously processed character was white space (or
      --  when nothing has been processed yet).

   begin
      --  [XML Catalogs] 6.2. Public Identifier Normalization
      --
      --  In order to accurately and interoperably compare public
      --  identifiers, catalog processors must perform normalization on
      --  public identifiers in both the catalog and the input passed to
      --  them.
      --
      --  All strings of white space in public identifiers must be
      --  normalized to single space characters (#x20), and leading and
      --  trailing white space must be removed.

      --  Starting in the "previous character was white space" state makes
      --  the loop drop leading white space without a special case.

      Is_Space := True;

      for J in 1 .. Public_Identifier.Length loop
         Char := Public_Identifier.Element (J);

         if Char = Space
           or Char = Carriage_Return
           or Char = Line_Feed
           or Char = Character_Tabulation
         then
            --  Only the first character of a white space run produces
            --  output.

            if not Is_Space then
               Is_Space := True;
               Result.Append (Space);
            end if;

         else
            Is_Space := False;
            Result.Append (Char);
         end if;
      end loop;

      --  When the input ends with white space, the loop has appended one
      --  trailing space; remove it.

      if Is_Space and not Result.Is_Empty then
         return Result.Slice (1, Result.Length - 1);

      else
         return Result;
      end if;
   end Normalize_Public_Identifier;

   -------------------
   -- Normalize_URI --
   -------------------

   --  Returns URI encoded with UTF8_Codec, where every resulting octet
   --  that is listed as disallowed below is replaced by %HH with uppercase
   --  hexadecimal digits; all other octets are copied as characters with
   --  the same code.

   function Normalize_URI
    (URI : League.Strings.Universal_String)
       return League.Strings.Universal_String
   is
      use type Ada.Streams.Stream_Element;

      --  [XML Catalogs] 6.3. System Identifier and URI Normalization
      --
      --  "In order to accurately and interoperably compare system
      --  identifiers and URI references, catalog processors must perform
      --  normalization. The normalization described in this section must
      --  be performed on system identifiers and URI references passed as
      --  input to the resolver and on strings in the catalog that are
      --  compared to them.
      --
      --  URI references require encoding and escaping of certain
      --  characters. The disallowed characters include all non-ASCII
      --  characters, plus the excluded characters listed in Section 2.4 of
      --  [RFC 2396], except for the number sign (#) and percent sign (%)
      --  characters and the square bracket characters re-allowed in [RFC
      --  2732]. These characters are summarized in Table 1, “Excluded
      --  US-ASCII Characters”.
      --
      --  Table 1. Excluded US-ASCII Characters
      --
      --  Hex Value Character  Hex Value Character  Hex Value Character
      --  00        NUL        0F        SI         1E        RS
      --  01        SOH        10        DLE        1F        US
      --  02        STX        11        DC1        20        (space)
      --  03        ETX        12        DC2        22        "
      --  04        EOT        13        DC3        3C        <
      --  05        ENQ        14        DC4        3E        >
      --  06        ACK        15        NAK        5C        \
      --  07        BEL        16        SYN        5E        ^
      --  08        BS         17        ETB        60        `
      --  09        HT         18        CAN        7B        {
      --  0A        LF         19        EM         7C        |
      --  0B        VT         1A        SUB        7D        }
      --  0C        FF         1B        ESC        7F        DEL
      --  0D        CR         1C        FS
      --  0E        SO         1D        GS
      --
      --  Catalog processors must escape disallowed characters as follows:
      --
      --  1. Each disallowed character is converted to UTF-8 [RFC 2279] as
      --  one or more bytes.
      --
      --  2. Any octets corresponding to a disallowed character are escaped
      --  with the URI escaping mechanism (that is, converted to %HH, where
      --  HH is the hexadecimal notation of the octet value). If escaping
      --  must be performed, uppercase hexadecimal characters should be
      --  used.
      --
      --  3. The original character is replaced by the resulting character
      --  sequence. Note that this normalization process is idempotent:
      --  repeated normalization does not change a normalized URI
      --  reference."

      --  The actual algorithm differs slightly: it converts the whole URI
      --  into UTF-8 and then escapes all disallowed octets. This produces
      --  an equivalent result, because all ASCII characters are mapped to
      --  elements with the same codes, and all non-ASCII characters are
      --  mapped to multibyte sequences with codes outside of the ASCII
      --  range.

      Hex     : constant
        array (Ada.Streams.Stream_Element range 0 .. 15)
          of Wide_Wide_Character
            := "0123456789ABCDEF";
      --  Uppercase hexadecimal digit for each four-bit value.

      Encoded : constant Ada.Streams.Stream_Element_Array
        := UTF8_Codec.Encode (URI).To_Stream_Element_Array;
      Result  : League.Strings.Universal_String;

   begin
      for J in Encoded'Range loop
         case Encoded (J) is
            when 16#00# .. 16#20#     --  NUL .. (space)
              | 16#22#                --  "
              | 16#3C#                --  <
              | 16#3E#                --  >
              | 16#5C#                --  \
              | 16#5E#                --  ^
              | 16#60#                --  `
              | 16#7B#                --  {
              | 16#7C#                --  |
              | 16#7D#                --  }
              | 16#7F#                --  DEL
              | 16#80# .. 16#FF#      --  non-ASCII codes
            =>
               --  Disallowed octet: emit %HH, high digit first.

               Result.Append ('%');
               Result.Append (Hex (Encoded (J) / 16));
               Result.Append (Hex (Encoded (J) mod 16));

            when others =>
               --  Allowed ASCII octet: emit the character with that code.

               Result.Append (Wide_Wide_Character'Val (Encoded (J)));
         end case;
      end loop;

      return Result;
   end Normalize_URI;

   ----------------
   -- Unwrap_URN --
   ----------------

   --  When URI starts with "urn:publicid:", sets Unwrapped to True and
   --  Identifier to the normalized public identifier obtained by
   --  transcribing the remainder of URI according to the table below.
   --  Otherwise sets Unwrapped to False and Identifier to the empty
   --  string.

   procedure Unwrap_URN
    (URI        : League.Strings.Universal_String;
     Identifier : out League.Strings.Universal_String;
     Unwrapped  : out Boolean)
   is
      --  [XML Catalogs] 6.4. URN "Unwrapping"
      --
      --  "This OASIS Standard requires processors to implement special
      --  treatment of URNs in the publicid URN Namespace ([RFC 3151]).
      --
      --  URNs of this form must, in some contexts, be "unwrapped" by the
      --  Catalog processor. This unwrapping translates the URN form of the
      --  public identifier back into the standard ISO 8879 form for the
      --  purposes of subsequent catalog processing.
      --
      --  Unwrapping a urn:publicid: URN is accomplished by transcribing
      --  characters in the URN according to the following table after
      --  discarding the leading urn:publicid: string:
      --
      --  URN Characters   Public Identifier Characters
      --  +                " " (space)
      --  :                //
      --  ;                ::
      --  %2B              +
      --  %3A              :
      --  %2F              /
      --  %3B              ;
      --  %27              '
      --  %3F              ?
      --  %23              #
      --  %25              %
      --
      --  URNs in the publicid namespace should always represent normalized
      --  public identifiers (Section 6.2, “Public Identifier
      --  Normalization”). In the event that an unwrapped public identifier
      --  is not normalized, the catalog processor must normalize it."

      Char   : League.Characters.Universal_Character;
      Char_1 : League.Characters.Universal_Character;
      Char_2 : League.Characters.Universal_Character;
      J      : Natural;

   begin
      Identifier := League.Strings.Empty_Universal_String;

      if not URI.Starts_With (PublicId_URN_Namespace) then
         --  Return when URI is not in publicid URN namespace.

         Unwrapped := False;

         return;
      end if;

      --  Start right after the "urn:publicid:" prefix.

      J := PublicId_URN_Namespace.Length + 1;

      while J <= URI.Length loop
         Char := URI.Element (J);

         if Char = Plus_Sign then
            Identifier.Append (Space);

         elsif Char = Colon then
            Identifier.Append (Solidus);
            Identifier.Append (Solidus);

         elsif Char = Semicolon then
            Identifier.Append (Colon);
            Identifier.Append (Colon);

         elsif Char = Percent_Sign
           and then J + 2 <= URI.Length
         then
            Char_1 := URI.Element (J + 1);
            Char_2 := URI.Element (J + 2);

            --  The hexadecimal letter of an escape is accepted in either
            --  case: URN lexical equivalence ([RFC 2141], section 5)
            --  ignores the case of %-escapes, so "%2b" and "%2B" must
            --  unwrap to the same character.

            if Char_1 = Digit_Two
              and (Char_2 = Latin_Capital_Letter_B
                     or Char_2 = Latin_Small_Letter_B)
            then
               Identifier.Append (Plus_Sign);

            elsif Char_1 = Digit_Three
              and (Char_2 = Latin_Capital_Letter_A
                     or Char_2 = Latin_Small_Letter_A)
            then
               Identifier.Append (Colon);

            elsif Char_1 = Digit_Two
              and (Char_2 = Latin_Capital_Letter_F
                     or Char_2 = Latin_Small_Letter_F)
            then
               Identifier.Append (Solidus);

            elsif Char_1 = Digit_Three
              and (Char_2 = Latin_Capital_Letter_B
                     or Char_2 = Latin_Small_Letter_B)
            then
               Identifier.Append (Semicolon);

            elsif Char_1 = Digit_Two and Char_2 = Digit_Seven then
               Identifier.Append (Apostrophe);

            elsif Char_1 = Digit_Three
              and (Char_2 = Latin_Capital_Letter_F
                     or Char_2 = Latin_Small_Letter_F)
            then
               Identifier.Append (Question_Mark);

            elsif Char_1 = Digit_Two and Char_2 = Digit_Three then
               Identifier.Append (Number_Sign);

            elsif Char_1 = Digit_Two and Char_2 = Digit_Five then
               Identifier.Append (Percent_Sign);

            else
               --  Not one of the escapes from the table: copy the three
               --  characters unchanged.

               Identifier.Append (Char);
               Identifier.Append (Char_1);
               Identifier.Append (Char_2);
            end if;

            --  Skip the two characters following the percent sign; the
            --  percent sign itself is skipped by the increment below.

            J := J + 2;

         else
            --  Any other character, including a percent sign with fewer
            --  than two characters after it, is copied unchanged.

            Identifier.Append (Char);
         end if;

         J := J + 1;
      end loop;

      Identifier := Normalize_Public_Identifier (Identifier);
      Unwrapped  := True;
   end Unwrap_URN;

end Matreshka.XML_Catalogs.Normalization;