1------------------------------------------------------------------------------
2--                                                                          --
3--                            Matreshka Project                             --
4--                                                                          --
5--                               XML Processor                              --
6--                                                                          --
7--                        Runtime Library Component                         --
8--                                                                          --
9------------------------------------------------------------------------------
10--                                                                          --
11-- Copyright © 2011, Vadim Godunko <vgodunko@gmail.com>                     --
12-- All rights reserved.                                                     --
13--                                                                          --
14-- Redistribution and use in source and binary forms, with or without       --
15-- modification, are permitted provided that the following conditions       --
16-- are met:                                                                 --
17--                                                                          --
18--  * Redistributions of source code must retain the above copyright        --
19--    notice, this list of conditions and the following disclaimer.         --
20--                                                                          --
21--  * Redistributions in binary form must reproduce the above copyright     --
22--    notice, this list of conditions and the following disclaimer in the   --
23--    documentation and/or other materials provided with the distribution.  --
24--                                                                          --
25--  * Neither the name of the Vadim Godunko, IE nor the names of its        --
26--    contributors may be used to endorse or promote products derived from  --
27--    this software without specific prior written permission.              --
28--                                                                          --
29-- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS      --
30-- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT        --
31-- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR    --
32-- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT     --
33-- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,   --
34-- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED --
35-- TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR   --
36-- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF   --
37-- LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING     --
38-- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS       --
39-- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.             --
40--                                                                          --
41------------------------------------------------------------------------------
42--  $Revision: 2098 $ $Date: 2011-08-19 21:18:33 +0400 (Fri, 19 Aug 2011) $
43------------------------------------------------------------------------------
44--  Normalization utilities for public and system identifiers.
45------------------------------------------------------------------------------
46with Ada.Streams;
47
48with League.Characters.Latin;
49with League.Text_Codecs;
50
51package body Matreshka.XML_Catalogs.Normalization is
52
53   use League.Characters.Latin;
54   use type League.Characters.Universal_Character;
55
   PublicId_URN_Namespace : constant League.Strings.Universal_String
     := League.Strings.To_Universal_String ("urn:publicid:");
   --  Leading string of URNs in the publicid URN namespace. Unwrap_URN tests
   --  its argument for this prefix and starts transcription right after it.

   UTF8_Codec : constant League.Text_Codecs.Text_Codec
     := League.Text_Codecs.Codec
         (League.Strings.To_Universal_String ("utf-8"));
   --  Codec used by Normalize_URI to encode the URI into UTF-8 octets before
   --  the disallowed octets are percent-escaped.
63
64   ---------------------------------
65   -- Normalize_Public_Identifier --
66   ---------------------------------
67
68   function Normalize_Public_Identifier
69    (Public_Identifier : League.Strings.Universal_String)
70       return League.Strings.Universal_String
71   is
72      Result   : League.Strings.Universal_String;
73      Char     : League.Characters.Universal_Character;
74      Is_Space : Boolean;
75
76   begin
77      --  [XML Catalogs] 6.2. Public Identifier Normalization
78      --
79      --  In order to accurately and interoperably compare public identifiers,
80      --  catalog processors must perform normalization on public identifiers
81      --  in both the catalog and the input passed to them.
82      --
83      --  All strings of white space in public identifiers must be normalized
84      --  to single space characters (#x20), and leading and trailing white
85      --  space must be removed.
86
87      Is_Space := True;
88
89      for J in 1 .. Public_Identifier.Length loop
90         Char := Public_Identifier.Element (J);
91
92         if Char = Space
93           or Char = Carriage_Return
94           or Char = Line_Feed
95           or Char = Character_Tabulation
96         then
97            if not Is_Space then
98               Is_Space := True;
99               Result.Append (Space);
100            end if;
101
102         else
103            Is_Space := False;
104            Result.Append (Public_Identifier.Element (J));
105         end if;
106      end loop;
107
108      if Is_Space and not Result.Is_Empty then
109         return Result.Slice (1, Result.Length - 1);
110
111      else
112         return Result;
113      end if;
114   end Normalize_Public_Identifier;
115
116   -------------------
117   -- Normalize_URI --
118   -------------------
119
120   function Normalize_URI
121    (URI : League.Strings.Universal_String)
122       return League.Strings.Universal_String
123   is
124      use type Ada.Streams.Stream_Element;
125
126      --  [XML Catalogs] 6.3. System Identifier and URI Normalization
127      --
128      --  "In order to accurately and interoperably compare system identifiers
129      --  and URI references, catalog processors must perform normalization.
130      --  The normalization described in this section must be performed on
131      --  system identifiers and URI references passed as input to the resolver
132      --  and on strings in the catalog that are compared to them.
133      --
134      --  URI references require encoding and escaping of certain characters.
135      --  The disallowed characters include all non-ASCII characters, plus the
136      --  excluded characters listed in Section 2.4 of [RFC 2396], except for
137      --  the number sign (#) and percent sign (%) characters and the square
138      --  bracket characters re-allowed in [RFC 2732]. These characters are
139      --  summarized in Table 1, “Excluded US-ASCII Characters”.
140      --
141      --  Table 1. Excluded US-ASCII Characters
142      --  Hex Value  Character  Hex Value  Character  Hex Value  Character
143      --     00         NUL        0F         SI         1E          RS
144      --     01         SOH        10         DLE        1F          US
145      --     02         STX        11         DC1        20       (space)
146      --     03         ETX        12         DC2        22          "
147      --     04         EOT        13         DC3        3C          <
148      --     05         ENQ        14         DC4        3E          >
149      --     06         ACK        15         NAK        5C          \
150      --     07         BEL        16         SYN        5E          ^
151      --     08         BS         17         ETB        60          `
152      --     09         HT         18         CAN        7B          {
153      --     0A         LF         19         EM         7C          |
154      --     0B         VT         1A         SUB        7D          }
155      --     0C         FF         1B         ESC        7F          DEL
156      --     0D         CR         1C         FS
157      --     0E         SO         1D         GS
158      --
159      --  Catalog processors must escape disallowed characters as follows:
160      --
161      --  1. Each disallowed character is converted to UTF-8 [RFC 2279] as one
162      --  or more bytes.
163      --
164      --  2. Any octets corresponding to a disallowed character are escaped
165      --  with the URI escaping mechanism (that is, converted to %HH, where HH
166      --  is the hexadecimal notation of the octet value). If escaping must be
167      --  performed, uppercase hexadecimal characters should be used.
168      --
169      --  3. The original character is replaced by the resulting character
170      --  sequence.  Note that this normalization process is idempotent:
171      --  repeated normalization does not change a normalized URI reference."
172
173      --  Actual algoriphm is slightly differ: it converts URI into UTF-8 and
174      --  escape all disallowed characters. This produce equivalent result,
175      --  because all ASCII characters are mapped to the elements with the same
176      --  codes, and all non-ASCII characters are mapped to multibyte sequences
177      --  with codes outside of ASCII range.
178
179      Hex     : constant
180        array (Ada.Streams.Stream_Element range 0 .. 15) of Wide_Wide_Character
181          := "0123456789ABCDEF";
182      Encoded : constant Ada.Streams.Stream_Element_Array
183        := UTF8_Codec.Encode (URI).To_Stream_Element_Array;
184      Result  : League.Strings.Universal_String;
185
186   begin
187      for J in Encoded'Range loop
188         case Encoded (J) is
189            when 16#00# .. 16#20#  --  NUL .. (space)
190             | 16#22#              --  "
191             | 16#3C#              --  <
192             | 16#3E#              --  >
193             | 16#5C#              --  \
194             | 16#5E#              --  ^
195             | 16#60#              --  `
196             | 16#7B#              --  {
197             | 16#7C#              --  |
198             | 16#7D#              --  }
199             | 16#7F#              --  DEL
200             | 16#80# .. 16#FF#    --  non-ASCII codes
201            =>
202               Result.Append ('%');
203               Result.Append (Hex (Encoded (J) / 16));
204               Result.Append (Hex (Encoded (J) mod 16));
205
206            when others =>
207               Result.Append (Wide_Wide_Character'Val (Encoded (J)));
208         end case;
209      end loop;
210
211      return Result;
212   end Normalize_URI;
213
214   ----------------
215   -- Unwrap_URN --
216   ----------------
217
218   procedure Unwrap_URN
219    (URI        : League.Strings.Universal_String;
220     Identifier : out League.Strings.Universal_String;
221     Unwrapped  : out Boolean)
222   is
223      --  [XML Catalogs] 6.4. URN "Unwrapping"
224      --
225      --  This OASIS Standard requires processors to implement special
226      --  treatment of URNs in the publicid URN Namespace ([RFC 3151]).
227      --
228      --  URNs of this form must, in some contexts, be "unwrapped" by the
229      --  Catalog processor. This unwrapping translates the URN form of the
230      --  public identifier back into the standard ISO 8879 form for the
231      --  purposes of subsequent catalog processing.
232      --
233      --  Unwrapping a urn:publicid: URN is accomplished by transcribing
234      --  characters in the URN according to the fol- lowing table after
235      --  discarding the leading urn:publicid: string:
236      --
237      --  URN Characters  Public Identifier Characters
238      --         +               " " (space)
239      --         :                    //
240      --         ;                    ::
241      --        %2B                   +
242      --        %3A                   :
243      --        %2F                   /
244      --        %3B                   ;
245      --        %27                   '
246      --        %3F                   ?
247      --        %23                   #
248      --        %25                   %
249      --
250      --  URNs in the publicid namespace should always represent normalized
251      --  public identifiers (Section 6.2, “Public Identifier Normalization”).
252      --  In the event that an unwrapped public identifier is not normalized,
253      --  the catalog processor must normalize it."
254
255      Char   : League.Characters.Universal_Character;
256      Char_1 : League.Characters.Universal_Character;
257      Char_2 : League.Characters.Universal_Character;
258      J      : Natural;
259
260   begin
261      Identifier := League.Strings.Empty_Universal_String;
262
263      if not URI.Starts_With (PublicId_URN_Namespace) then
264         --  Return when URI is not in publicid URN namespace.
265
266         Unwrapped := False;
267
268         return;
269      end if;
270
271      J := PublicId_URN_Namespace.Length + 1;
272
273      while J <= URI.Length loop
274         Char := URI.Element (J);
275
276         if Char = Plus_Sign then
277            Identifier.Append (Space);
278
279         elsif Char = Colon then
280            Identifier.Append (Solidus);
281            Identifier.Append (Solidus);
282
283         elsif Char = Semicolon then
284            Identifier.Append (Colon);
285            Identifier.Append (Colon);
286
287         elsif Char = Percent_Sign
288           and then J + 2 <= URI.Length
289         then
290            Char_1 := URI.Element (J + 1);
291            Char_2 := URI.Element (J + 2);
292
293            if Char_1 = Digit_Two and Char_2 = Latin_Capital_Letter_B then
294               Identifier.Append (Plus_Sign);
295
296            elsif Char_1 = Digit_Three and Char_2 = Latin_Capital_Letter_A then
297               Identifier.Append (Colon);
298
299            elsif Char_1 = Digit_Two and Char_2 = Latin_Capital_Letter_F then
300               Identifier.Append (Solidus);
301
302            elsif Char_1 = Digit_Three and Char_2 = Latin_Capital_Letter_B then
303               Identifier.Append (Semicolon);
304
305            elsif Char_1 = Digit_Two and Char_2 = Digit_Seven then
306               Identifier.Append (Apostrophe);
307
308            elsif Char_1 = Digit_Three and Char_2 = Latin_Capital_Letter_F then
309               Identifier.Append (Question_Mark);
310
311            elsif Char_1 = Digit_Two and Char_2 = Digit_Three then
312               Identifier.Append (Number_Sign);
313
314            elsif Char_1 = Digit_Two and Char_2 = Digit_Five then
315               Identifier.Append (Percent_Sign);
316
317            else
318               Identifier.Append (Char);
319               Identifier.Append (Char_1);
320               Identifier.Append (Char_2);
321            end if;
322
323            J := J + 2;
324
325         else
326            Identifier.Append (Char);
327         end if;
328
329         J := J + 1;
330      end loop;
331
332      Identifier := Normalize_Public_Identifier (Identifier);
333      Unwrapped := True;
334   end Unwrap_URN;
335
336end Matreshka.XML_Catalogs.Normalization;
337