gcc/ada/g-decstr.adb

------------------------------------------------------------------------------
--                                                                          --
--                         GNAT RUN-TIME COMPONENTS                         --
--                                                                          --
--                    G N A T . D E C O D E _ S T R I N G                   --
--                                                                          --
--                                 B o d y                                  --
--                                                                          --
--                     Copyright (C) 2007-2013, AdaCore                     --
--                                                                          --
-- GNAT is free software;  you can  redistribute it  and/or modify it under --
-- terms of the  GNU General Public License as published  by the Free Soft- --
-- ware  Foundation;  either version 3,  or (at your option) any later ver- --
-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
-- or FITNESS FOR A PARTICULAR PURPOSE.                                     --
--                                                                          --
-- As a special exception under Section 7 of GPL version 3, you are granted --
-- additional permissions described in the GCC Runtime Library Exception,   --
-- version 3.1, as published by the Free Software Foundation.               --
--                                                                          --
-- You should have received a copy of the GNU General Public License and    --
-- a copy of the GCC Runtime Library Exception along with this program;     --
-- see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see    --
-- <http://www.gnu.org/licenses/>.                                          --
--                                                                          --
-- GNAT was originally developed  by the GNAT team at  New York University. --
-- Extensive contributions were provided by Ada Core Technologies Inc.      --
--                                                                          --
------------------------------------------------------------------------------

--  This package provides a utility routine for converting from an encoded
--  string to a corresponding Wide_String or Wide_Wide_String value.

with Interfaces; use Interfaces;

with System.WCh_Cnv; use System.WCh_Cnv;
with System.WCh_Con; use System.WCh_Con;

package body GNAT.Decode_String is

   -----------------------
   -- Local Subprograms --
   -----------------------

   procedure Bad;
   pragma No_Return (Bad);
   --  Raises Constraint_Error to report a bad encoding or a character code
   --  that is out of range. Never returns to the caller.

   procedure Past_End;
   pragma No_Return (Past_End);
   --  Raises Constraint_Error to report an attempt to move or read past the
   --  end of the string. Never returns to the caller.

   ---------
   -- Bad --
   ---------

   procedure Bad is
      Message : constant String := "bad encoding or character out of range";
      --  Text attached to the exception occurrence

   begin
      raise Constraint_Error with Message;
   end Bad;

   ---------------------------
   -- Decode_Wide_Character --
   ---------------------------

   procedure Decode_Wide_Character
     (Input  : String;
      Ptr    : in out Natural;
      Result : out Wide_Character)
   is
      Max_Wide_Code : constant := 16#FFFF#;
      --  Largest code that can be represented as a Wide_Character

      Decoded : Wide_Wide_Character;
      --  Character obtained from the general decoding routine

      Code : Natural;
      --  Position value of Decoded

   begin
      --  Decode using the Wide_Wide_Character routine (which also advances
      --  Ptr), then narrow the result, rejecting codes that do not fit.

      Decode_Wide_Wide_Character (Input, Ptr, Decoded);
      Code := Wide_Wide_Character'Pos (Decoded);

      if Code <= Max_Wide_Code then
         Result := Wide_Character'Val (Code);
      else
         Bad;
      end if;
   end Decode_Wide_Character;

   ------------------------
   -- Decode_Wide_String --
   ------------------------

   function Decode_Wide_String (S : String) return Wide_String is
      Buffer : Wide_String (1 .. S'Length);
      --  Work area for the decoded characters. Each decoded character
      --  consumes at least one character of S, so S'Length is sufficient.

      Count : Natural;
      --  Number of characters actually stored in Buffer

   begin
      --  Delegate to the procedure form, then return only the filled part

      Decode_Wide_String (S, Buffer, Count);
      return Buffer (1 .. Count);
   end Decode_Wide_String;

   procedure Decode_Wide_String
     (S      : String;
      Result : out Wide_String;
      Length : out Natural)
   is
      Ptr : Natural;
      --  Index in S of the next encoded character to be decoded

   begin
      --  Decodes all of S into Result, storing characters starting at
      --  Result (Result'First). On return Length is the number of characters
      --  stored. Past_End raises Constraint_Error if Result is too small,
      --  and decoding errors propagate from Decode_Wide_Character.

      Ptr := S'First;
      Length := 0;

      while Ptr <= S'Last loop

         --  Test against the capacity of Result, not its upper bound, so
         --  that a Result whose lower bound is not 1 is handled correctly.

         if Length >= Result'Length then
            Past_End;
         end if;

         Length := Length + 1;

         --  Length is a count, so offset it from Result'First to obtain the
         --  index of the slot to fill.

         Decode_Wide_Character
           (S, Ptr, Result (Result'First + (Length - 1)));
      end loop;
   end Decode_Wide_String;

   --------------------------------
   -- Decode_Wide_Wide_Character --
   --------------------------------

   procedure Decode_Wide_Wide_Character
     (Input  : String;
      Ptr    : in out Natural;
      Result : out Wide_Wide_Character)
   is
      Lead : Character;
      --  First character of the encoded sequence

      function In_Char return Character;
      pragma Inline (In_Char);
      --  Returns Input (Ptr) and advances Ptr by one. Calls Past_End if Ptr
      --  is already beyond Input'Last.

      -------------
      -- In_Char --
      -------------

      function In_Char return Character is
      begin
         if Ptr > Input'Last then
            Past_End;
         end if;

         Ptr := Ptr + 1;
         return Input (Ptr - 1);
      end In_Char;

   --  Start of processing for Decode_Wide_Wide_Character

   begin
      Lead := In_Char;

      --  All encoding methods other than UTF-8

      if Encoding_Method /= WCEM_UTF8 then
         Non_UTF8 : declare
            Bare_Bracket : Boolean;
            --  True for the brackets case of a [ that does not start a ["
            --  sequence.

            function Decode_Sequence is
              new Char_Sequence_To_UTF_32 (In_Char);

         begin
            --  For brackets encoding, a [ that is not followed by a quotation
            --  mark must be returned unchanged, without calling the general
            --  conversion routine.

            Bare_Bracket :=
              Encoding_Method = WCEM_Brackets
                and then Lead = '['
                and then (Ptr > Input'Last or else Input (Ptr) /= '"');

            if Bare_Bracket then
               Result := '[';

            --  Everything else, including [" with brackets encoding

            else
               Result :=
                 Wide_Wide_Character'Val
                   (Decode_Sequence (Lead, Encoding_Method));
            end if;
         end Non_UTF8;

      --  Special fast processing for the UTF-8 case

      else
         UTF8 : declare
            Lead_Code : constant Unsigned_32 :=
                          Unsigned_32 (Character'Pos (Lead));
            --  Code of the first byte of the sequence

            Code : Unsigned_32;
            --  Accumulates the decoded value

            procedure Finish
              (Trailing : Positive;
               Lo, Hi   : Unsigned_32);
            pragma Inline (Finish);
            --  Reads Trailing bytes, each of which must have the form
            --  2#10xxxxxx# (otherwise Bad is called), shifting Code left six
            --  bits and or'ing in the xxxxxx bits for each one. Then calls
            --  Bad unless Code is in Lo .. Hi, and finally stores the
            --  corresponding character in Result.

            ------------
            -- Finish --
            ------------

            procedure Finish
              (Trailing : Positive;
               Lo, Hi   : Unsigned_32)
            is
               Byte : Unsigned_32;

            begin
               for K in 1 .. Trailing loop
                  Byte := Unsigned_32 (Character'Pos (In_Char));

                  if (Byte and 2#11000000#) /= 2#10_000000# then
                     Bad;
                  end if;

                  Code := Shift_Left (Code, 6) or (Byte and 2#00111111#);
               end loop;

               if Code not in Lo .. Hi then
                  Bad;
               end if;

               Result := Wide_Wide_Character'Val (Code);
            end Finish;

         --  Start of processing for UTF8 case

         begin
            --  Note: for details of UTF8 encoding see RFC 3629

            --  16#00_0000#-16#00_007F#: 0xxxxxxx

            if (Lead_Code and 2#10000000#) = 2#00000000# then
               Result := Wide_Wide_Character'Val (Lead_Code);

            --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx

            elsif (Lead_Code and 2#11100000#) = 2#110_00000# then
               Code := Lead_Code and 2#00011111#;
               Finish (1, 16#00_0080#, 16#00_07FF#);

            --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx

            elsif (Lead_Code and 2#11110000#) = 2#1110_0000# then
               Code := Lead_Code and 2#00001111#;
               Finish (2, 16#00_0800#, 16#00_FFFF#);

            --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

            elsif (Lead_Code and 2#11111000#) = 2#11110_000# then
               Code := Lead_Code and 2#00000111#;
               Finish (3, 16#01_0000#, 16#10_FFFF#);

            --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
            --                               10xxxxxx 10xxxxxx

            elsif (Lead_Code and 2#11111100#) = 2#111110_00# then
               Code := Lead_Code and 2#00000011#;
               Finish (4, 16#0020_0000#, 16#03FF_FFFF#);

            --  Any other first byte is rejected. This includes the six-byte
            --  form 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
            --  (16#0400_0000#-16#7FFF_FFFF#), which this routine does not
            --  accept.

            else
               Bad;
            end if;
         end UTF8;
      end if;
   end Decode_Wide_Wide_Character;

   -----------------------------
   -- Decode_Wide_Wide_String --
   -----------------------------

   function Decode_Wide_Wide_String (S : String) return Wide_Wide_String is
      Buffer : Wide_Wide_String (1 .. S'Length);
      --  Work area for the decoded characters. Each decoded character
      --  consumes at least one character of S, so S'Length is sufficient.

      Count : Natural;
      --  Number of characters actually stored in Buffer

   begin
      --  Delegate to the procedure form, then return only the filled part

      Decode_Wide_Wide_String (S, Buffer, Count);
      return Buffer (1 .. Count);
   end Decode_Wide_Wide_String;

   procedure Decode_Wide_Wide_String
     (S      : String;
      Result : out Wide_Wide_String;
      Length : out Natural)
   is
      Ptr : Natural;
      --  Index in S of the next encoded character to be decoded

   begin
      --  Decodes all of S into Result, storing characters starting at
      --  Result (Result'First). On return Length is the number of characters
      --  stored. Past_End raises Constraint_Error if Result is too small,
      --  and decoding errors propagate from Decode_Wide_Wide_Character.

      Ptr := S'First;
      Length := 0;

      while Ptr <= S'Last loop

         --  Test against the capacity of Result, not its upper bound, so
         --  that a Result whose lower bound is not 1 is handled correctly.

         if Length >= Result'Length then
            Past_End;
         end if;

         Length := Length + 1;

         --  Length is a count, so offset it from Result'First to obtain the
         --  index of the slot to fill.

         Decode_Wide_Wide_Character
           (S, Ptr, Result (Result'First + (Length - 1)));
      end loop;
   end Decode_Wide_Wide_String;

   -------------------------
   -- Next_Wide_Character --
   -------------------------

   procedure Next_Wide_Character (Input : String; Ptr : in out Natural) is
      Unused : Wide_Character;
      pragma Unreferenced (Unused);
      --  Receives the decoded character, which is not needed here

   begin
      --  Decoding one character advances Ptr past its encoding; any error
      --  raised by the decoder propagates to the caller.

      Decode_Wide_Character (Input, Ptr, Unused);
   end Next_Wide_Character;

   ------------------------------
   -- Next_Wide_Wide_Character --
   ------------------------------

   procedure Next_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
      Unused : Wide_Wide_Character;
      pragma Unreferenced (Unused);
      --  Receives the decoded character, which is not needed here

   begin
      --  Decoding one character advances Ptr past its encoding; any error
      --  raised by the decoder propagates to the caller.

      Decode_Wide_Wide_Character (Input, Ptr, Unused);
   end Next_Wide_Wide_Character;

   --------------
   -- Past_End --
   --------------

   procedure Past_End is
      Message : constant String := "past end of string";
      --  Text attached to the exception occurrence

   begin
      raise Constraint_Error with Message;
   end Past_End;

   -------------------------
   -- Prev_Wide_Character --
   -------------------------

   procedure Prev_Wide_Character (Input : String; Ptr : in out Natural) is

      --  Steps Ptr backwards over one encoded wide character of Input. On
      --  entry Ptr is the index just past the character to be skipped; on
      --  return it is the index of the first character of that encoding.
      --  Constraint_Error is raised (through Past_End or Bad) if Ptr is
      --  beyond Input'Last + 1, if there is nothing before Ptr, or if the
      --  preceding encoding is found to be invalid.

   begin
      if Ptr > Input'Last + 1 then
         Past_End;
      end if;

      --  Special efficient encoding for UTF-8 case

      if Encoding_Method = WCEM_UTF8 then
         UTF8 : declare
            U : Unsigned_32;

            procedure Getc;
            pragma Inline (Getc);
            --  Gets the character at Input (Ptr - 1) and returns code in U as
            --  Unsigned_32 value. On return Ptr is decremented by one.

            procedure Skip_UTF_Byte;
            pragma Inline (Skip_UTF_Byte);
            --  Checks that U is 2#10xxxxxx# and then calls Getc

            ----------
            -- Getc --
            ----------

            procedure Getc is
            begin
               if Ptr <= Input'First then
                  Past_End;
               else
                  Ptr := Ptr - 1;
                  U := Unsigned_32 (Character'Pos (Input (Ptr)));
               end if;
            end Getc;

            -------------------
            -- Skip_UTF_Byte --
            -------------------

            procedure Skip_UTF_Byte is
            begin
               if (U and 2#11000000#) = 2#10_000000# then
                  Getc;
               else
                  Bad;
               end if;
            end Skip_UTF_Byte;

         --  Start of processing for UTF-8 case

         begin
            --  16#00_0000#-16#00_007F#: 0xxxxxxx

            Getc;

            if (U and 2#10000000#) = 2#00000000# then
               return;

            --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx

            else
               Skip_UTF_Byte;

               if (U and 2#11100000#) = 2#110_00000# then
                  return;

               --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx

               else
                  Skip_UTF_Byte;

                  if (U and 2#11110000#) = 2#1110_0000# then
                     return;

                     --  Any other code is invalid, note that this includes:

                     --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
                     --                           10xxxxxx

                     --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
                     --                               10xxxxxx 10xxxxxx
                     --                               10xxxxxx

                     --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
                     --                               10xxxxxx 10xxxxxx
                     --                               10xxxxxx 10xxxxxx

                     --  since Wide_Character does not allow codes > 16#FFFF#

                  else
                     Bad;
                  end if;
               end if;
            end if;
         end UTF8;

      --  Special efficient encoding for brackets case

      elsif Encoding_Method = WCEM_Brackets then
         Brackets : declare
            P : Natural;
            S : Natural;

         begin
            --  See if we have "] at end positions

            if Ptr > Input'First + 1
              and then Input (Ptr - 1) = ']'
              and then Input (Ptr - 2) = '"'
            then
               P := Ptr - 2;

               --  Loop back looking for [" at start

               while P >= Ptr - 10 loop
                  if P <= Input'First + 1 then
                     Bad;

                  elsif Input (P - 1) = '"'
                    and then Input (P - 2) = '['
                  then
                     --  Found ["..."], scan forward to check it

                     S := P - 2;
                     P := S;
                     Next_Wide_Character (Input, P);

                     --  OK if at original pointer, else error

                     if P = Ptr then
                        Ptr := S;
                        return;
                     else
                        Bad;
                     end if;
                  end if;

                  P := P - 1;
               end loop;

               --  Falling through loop means more than 8 chars between the
               --  enclosing brackets (or simply a missing left bracket)

               Bad;

            --  Here if no bracket sequence present

            else
               --  Use <= (as the other encoding cases do) so that a Ptr
               --  value below Input'First is rejected rather than being
               --  decremented further.

               if Ptr <= Input'First then
                  Past_End;
               else
                  Ptr := Ptr - 1;
               end if;
            end if;
         end Brackets;

      --  Non-UTF-8/Brackets. These are the inefficient cases where we have to
      --  go to the start of the string and skip forwards till Ptr matches.

      else
         --  This test is made before entering the block below, since the
         --  handler of that block would otherwise intercept the exception
         --  raised by Past_End and report it as a bad encoding.

         if Ptr <= Input'First then
            Past_End;
         end if;

         Non_UTF_Brackets : declare
            Discard : Wide_Character;
            PtrS    : Natural;
            PtrP    : Natural;

         begin
            PtrS := Input'First;

            --  PtrP remembers where each character starts, PtrS where it
            --  ends. We are done when a character ends exactly at Ptr.

            loop
               PtrP := PtrS;
               Decode_Wide_Character (Input, PtrS, Discard);

               if PtrS = Ptr then
                  Ptr := PtrP;
                  return;

               elsif PtrS > Ptr then
                  Bad;
               end if;
            end loop;

         exception

            --  Any failure during the forward scan is reported as a bad
            --  encoding.

            when Constraint_Error =>
               Bad;
         end Non_UTF_Brackets;
      end if;
   end Prev_Wide_Character;

   ------------------------------
   -- Prev_Wide_Wide_Character --
   ------------------------------

   procedure Prev_Wide_Wide_Character (Input : String; Ptr : in out Natural) is

      --  Steps Ptr backwards over one encoded wide wide character of Input.
      --  On entry Ptr is the index just past the character to be skipped; on
      --  return it is the index of the first character of that encoding.
      --  Constraint_Error is raised (through Past_End or Bad) if Ptr is
      --  beyond Input'Last + 1, if there is nothing before Ptr, or if the
      --  preceding encoding is found to be invalid.

   begin
      if Ptr > Input'Last + 1 then
         Past_End;
      end if;

      --  Special efficient encoding for UTF-8 case

      if Encoding_Method = WCEM_UTF8 then
         UTF8 : declare
            U : Unsigned_32;

            procedure Getc;
            pragma Inline (Getc);
            --  Gets the character at Input (Ptr - 1) and returns code in U as
            --  Unsigned_32 value. On return Ptr is decremented by one.

            procedure Skip_UTF_Byte;
            pragma Inline (Skip_UTF_Byte);
            --  Checks that U is 2#10xxxxxx# and then calls Getc

            ----------
            -- Getc --
            ----------

            procedure Getc is
            begin
               if Ptr <= Input'First then
                  Past_End;
               else
                  Ptr := Ptr - 1;
                  U := Unsigned_32 (Character'Pos (Input (Ptr)));
               end if;
            end Getc;

            -------------------
            -- Skip_UTF_Byte --
            -------------------

            procedure Skip_UTF_Byte is
            begin
               if (U and 2#11000000#) = 2#10_000000# then
                  Getc;
               else
                  Bad;
               end if;
            end Skip_UTF_Byte;

         --  Start of processing for UTF-8 case

         begin
            --  16#00_0000#-16#00_007F#: 0xxxxxxx

            Getc;

            if (U and 2#10000000#) = 2#00000000# then
               return;

            --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx

            else
               Skip_UTF_Byte;

               if (U and 2#11100000#) = 2#110_00000# then
                  return;

               --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx

               else
                  Skip_UTF_Byte;

                  if (U and 2#11110000#) = 2#1110_0000# then
                     return;

                  --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
                  --                           10xxxxxx

                  else
                     Skip_UTF_Byte;

                     if (U and 2#11111000#) = 2#11110_000# then
                        return;

                     --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
                     --                               10xxxxxx 10xxxxxx
                     --                               10xxxxxx

                     else
                        Skip_UTF_Byte;

                        if (U and 2#11111100#) = 2#111110_00# then
                           return;

                        --  Any other code is invalid, note that this includes:

                        --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
                        --                               10xxxxxx 10xxxxxx
                        --                               10xxxxxx 10xxxxxx

                        --  since Wide_Wide_Character does not allow codes
                        --  greater than 16#03FF_FFFF#

                        else
                           Bad;
                        end if;
                     end if;
                  end if;
               end if;
            end if;
         end UTF8;

      --  Special efficient encoding for brackets case

      elsif Encoding_Method = WCEM_Brackets then
         Brackets : declare
            P : Natural;
            S : Natural;

         begin
            --  See if we have "] at end positions

            if Ptr > Input'First + 1
              and then Input (Ptr - 1) = ']'
              and then Input (Ptr - 2) = '"'
            then
               P := Ptr - 2;

               --  Loop back looking for [" at start

               while P >= Ptr - 10 loop
                  if P <= Input'First + 1 then
                     Bad;

                  elsif Input (P - 1) = '"'
                    and then Input (P - 2) = '['
                  then
                     --  Found ["..."], scan forward to check it

                     S := P - 2;
                     P := S;
                     Next_Wide_Wide_Character (Input, P);

                     --  OK if at original pointer, else error

                     if P = Ptr then
                        Ptr := S;
                        return;
                     else
                        Bad;
                     end if;
                  end if;

                  P := P - 1;
               end loop;

               --  Falling through loop means more than 8 chars between the
               --  enclosing brackets (or simply a missing left bracket)

               Bad;

            --  Here if no bracket sequence present

            else
               --  Use <= (as the other encoding cases do) so that a Ptr
               --  value below Input'First is rejected rather than being
               --  decremented further.

               if Ptr <= Input'First then
                  Past_End;
               else
                  Ptr := Ptr - 1;
               end if;
            end if;
         end Brackets;

      --  Non-UTF-8/Brackets. These are the inefficient cases where we have to
      --  go to the start of the string and skip forwards till Ptr matches.

      else
         --  This test is made before entering the block below, since the
         --  handler of that block would otherwise intercept the exception
         --  raised by Past_End and report it as a bad encoding.

         if Ptr <= Input'First then
            Past_End;
         end if;

         Non_UTF8_Brackets : declare
            Discard : Wide_Wide_Character;
            PtrS    : Natural;
            PtrP    : Natural;

         begin
            PtrS := Input'First;

            --  PtrP remembers where each character starts, PtrS where it
            --  ends. We are done when a character ends exactly at Ptr.

            loop
               PtrP := PtrS;
               Decode_Wide_Wide_Character (Input, PtrS, Discard);

               if PtrS = Ptr then
                  Ptr := PtrP;
                  return;

               elsif PtrS > Ptr then
                  Bad;
               end if;
            end loop;

         exception

            --  Any failure during the forward scan is reported as a bad
            --  encoding.

            when Constraint_Error =>
               Bad;
         end Non_UTF8_Brackets;
      end if;
   end Prev_Wide_Wide_Character;

   --------------------------
   -- Validate_Wide_String --
   --------------------------

   function Validate_Wide_String (S : String) return Boolean is
      Index : Natural;
      --  Position in S of the next encoded character to examine

   begin
      --  The initial assignment is deliberately a statement rather than a
      --  declaration initialization, so that it is covered by the handler
      --  below.

      Index := S'First;

      --  Skip over encoded characters one at a time until S is exhausted

      loop
         exit when Index > S'Last;
         Next_Wide_Character (S, Index);
      end loop;

      return True;

   exception

      --  Any Constraint_Error raised above means that S is not valid

      when Constraint_Error =>
         return False;
   end Validate_Wide_String;

   -------------------------------
   -- Validate_Wide_Wide_String --
   -------------------------------

   function Validate_Wide_Wide_String (S : String) return Boolean is
      Index : Natural;
      --  Position in S of the next encoded character to examine

   begin
      --  The initial assignment is deliberately a statement rather than a
      --  declaration initialization, so that it is covered by the handler
      --  below.

      Index := S'First;

      --  Skip over encoded characters one at a time until S is exhausted

      loop
         exit when Index > S'Last;
         Next_Wide_Wide_Character (S, Index);
      end loop;

      return True;

   exception

      --  Any Constraint_Error raised above means that S is not valid

      when Constraint_Error =>
         return False;
   end Validate_Wide_Wide_String;

end GNAT.Decode_String;