1------------------------------------------------------------------------------
2--                                                                          --
3--                         GNAT RUN-TIME COMPONENTS                         --
4--                                                                          --
5--                    G N A T . D E C O D E _ S T R I N G                   --
6--                                                                          --
7--                                 S p e c                                  --
8--                                                                          --
9--                     Copyright (C) 2007-2014, AdaCore                     --
10--                                                                          --
11-- GNAT is free software;  you can  redistribute it  and/or modify it under --
12-- terms of the  GNU General Public License as published  by the Free Soft- --
13-- ware  Foundation;  either version 3,  or (at your option) any later ver- --
14-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
15-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
16-- or FITNESS FOR A PARTICULAR PURPOSE.                                     --
17--                                                                          --
18-- As a special exception under Section 7 of GPL version 3, you are granted --
19-- additional permissions described in the GCC Runtime Library Exception,   --
20-- version 3.1, as published by the Free Software Foundation.               --
21--                                                                          --
22-- You should have received a copy of the GNU General Public License and    --
23-- a copy of the GCC Runtime Library Exception along with this program;     --
24-- see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see    --
25-- <http://www.gnu.org/licenses/>.                                          --
26--                                                                          --
27-- GNAT was originally developed  by the GNAT team at  New York University. --
28-- Extensive contributions were provided by Ada Core Technologies Inc.      --
29--                                                                          --
30------------------------------------------------------------------------------
31
32--  This package provides a utility routine for converting from an encoded
33--  string to a corresponding Wide_String or Wide_Wide_String value.
34
35with Interfaces; use Interfaces;
36
37with System.WCh_Cnv; use System.WCh_Cnv;
38with System.WCh_Con; use System.WCh_Con;
39
40package body GNAT.Decode_String is
41
42   -----------------------
43   -- Local Subprograms --
44   -----------------------
45
46   procedure Bad;
47   pragma No_Return (Bad);
48   --  Raise error for bad encoding
49
50   procedure Past_End;
51   pragma No_Return (Past_End);
52   --  Raise error for off end of string
53
54   ---------
55   -- Bad --
56   ---------
57
58   procedure Bad is
59   begin
60      raise Constraint_Error with
61        "bad encoding or character out of range";
62   end Bad;
63
64   ---------------------------
65   -- Decode_Wide_Character --
66   ---------------------------
67
68   procedure Decode_Wide_Character
69     (Input  : String;
70      Ptr    : in out Natural;
71      Result : out Wide_Character)
72   is
73      Char : Wide_Wide_Character;
74   begin
75      Decode_Wide_Wide_Character (Input, Ptr, Char);
76
77      if Wide_Wide_Character'Pos (Char) > 16#FFFF# then
78         Bad;
79      else
80         Result := Wide_Character'Val (Wide_Wide_Character'Pos (Char));
81      end if;
82   end Decode_Wide_Character;
83
84   ------------------------
85   -- Decode_Wide_String --
86   ------------------------
87
88   function Decode_Wide_String (S : String) return Wide_String is
89      Result : Wide_String (1 .. S'Length);
90      Length : Natural;
91   begin
92      Decode_Wide_String (S, Result, Length);
93      return Result (1 .. Length);
94   end Decode_Wide_String;
95
96   procedure Decode_Wide_String
97     (S      : String;
98      Result : out Wide_String;
99      Length : out Natural)
100   is
101      Ptr : Natural;
102
103   begin
104      Ptr := S'First;
105      Length := 0;
106      while Ptr <= S'Last loop
107         if Length >= Result'Last then
108            Past_End;
109         end if;
110
111         Length := Length + 1;
112         Decode_Wide_Character (S, Ptr, Result (Length));
113      end loop;
114   end Decode_Wide_String;
115
116   --------------------------------
117   -- Decode_Wide_Wide_Character --
118   --------------------------------
119
120   procedure Decode_Wide_Wide_Character
121     (Input  : String;
122      Ptr    : in out Natural;
123      Result : out Wide_Wide_Character)
124   is
125      C : Character;
126
127      function In_Char return Character;
128      pragma Inline (In_Char);
129      --  Function to get one input character
130
131      -------------
132      -- In_Char --
133      -------------
134
135      function In_Char return Character is
136      begin
137         if Ptr <= Input'Last then
138            Ptr := Ptr + 1;
139            return Input (Ptr - 1);
140         else
141            Past_End;
142         end if;
143      end In_Char;
144
145   --  Start of processing for Decode_Wide_Wide_Character
146
147   begin
148      C := In_Char;
149
150      --  Special fast processing for UTF-8 case
151
152      if Encoding_Method = WCEM_UTF8 then
153         UTF8 : declare
154            U : Unsigned_32;
155            W : Unsigned_32;
156
157            procedure Get_UTF_Byte;
158            pragma Inline (Get_UTF_Byte);
159            --  Used to interpret 2#10xxxxxx# continuation byte in UTF-8 mode.
160            --  Reads a byte, and raises CE if the first two bits are not 10.
161            --  Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits.
162
163            ------------------
164            -- Get_UTF_Byte --
165            ------------------
166
167            procedure Get_UTF_Byte is
168            begin
169               U := Unsigned_32 (Character'Pos (In_Char));
170
171               if (U and 2#11000000#) /= 2#10_000000# then
172                  Bad;
173               end if;
174
175               W := Shift_Left (W, 6) or (U and 2#00111111#);
176            end Get_UTF_Byte;
177
178         --  Start of processing for UTF8 case
179
180         begin
181            --  Note: for details of UTF8 encoding see RFC 3629
182
183            U := Unsigned_32 (Character'Pos (C));
184
185            --  16#00_0000#-16#00_007F#: 0xxxxxxx
186
187            if (U and 2#10000000#) = 2#00000000# then
188               Result := Wide_Wide_Character'Val (Character'Pos (C));
189
190            --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
191
192            elsif (U and 2#11100000#) = 2#110_00000# then
193               W := U and 2#00011111#;
194               Get_UTF_Byte;
195
196               if W not in 16#00_0080# .. 16#00_07FF# then
197                  Bad;
198               end if;
199
200               Result := Wide_Wide_Character'Val (W);
201
202            --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
203
204            elsif (U and 2#11110000#) = 2#1110_0000# then
205               W := U and 2#00001111#;
206               Get_UTF_Byte;
207               Get_UTF_Byte;
208
209               if W not in 16#00_0800# .. 16#00_FFFF# then
210                  Bad;
211               end if;
212
213               Result := Wide_Wide_Character'Val (W);
214
215            --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
216
217            elsif (U and 2#11111000#) = 2#11110_000# then
218               W := U and 2#00000111#;
219
220               for K in 1 .. 3 loop
221                  Get_UTF_Byte;
222               end loop;
223
224               if W not in 16#01_0000# .. 16#10_FFFF# then
225                  Bad;
226               end if;
227
228               Result := Wide_Wide_Character'Val (W);
229
230            --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
231            --                               10xxxxxx 10xxxxxx
232
233            elsif (U and 2#11111100#) = 2#111110_00# then
234               W := U and 2#00000011#;
235
236               for K in 1 .. 4 loop
237                  Get_UTF_Byte;
238               end loop;
239
240               if W not in 16#0020_0000# .. 16#03FF_FFFF# then
241                  Bad;
242               end if;
243
244               Result := Wide_Wide_Character'Val (W);
245
246            --  All other cases are invalid, note that this includes:
247
248            --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
249            --                               10xxxxxx 10xxxxxx 10xxxxxx
250
251            --  since Wide_Wide_Character does not include code values
252            --  greater than 16#03FF_FFFF#.
253
254            else
255               Bad;
256            end if;
257         end UTF8;
258
259      --  All encoding functions other than UTF-8
260
261      else
262         Non_UTF8 : declare
263            function Char_Sequence_To_UTF is
264              new Char_Sequence_To_UTF_32 (In_Char);
265
266         begin
267            --  For brackets, must test for specific case of [ not followed by
268            --  quotation, where we must not call Char_Sequence_To_UTF, but
269            --  instead just return the bracket unchanged.
270
271            if Encoding_Method = WCEM_Brackets
272              and then C = '['
273              and then (Ptr > Input'Last or else Input (Ptr) /= '"')
274            then
275               Result := '[';
276
277            --  All other cases including [" with Brackets
278
279            else
280               Result :=
281                 Wide_Wide_Character'Val
282                   (Char_Sequence_To_UTF (C, Encoding_Method));
283            end if;
284         end Non_UTF8;
285      end if;
286   end Decode_Wide_Wide_Character;
287
288   -----------------------------
289   -- Decode_Wide_Wide_String --
290   -----------------------------
291
292   function Decode_Wide_Wide_String (S : String) return Wide_Wide_String is
293      Result : Wide_Wide_String (1 .. S'Length);
294      Length : Natural;
295   begin
296      Decode_Wide_Wide_String (S, Result, Length);
297      return Result (1 .. Length);
298   end Decode_Wide_Wide_String;
299
300   procedure Decode_Wide_Wide_String
301     (S      : String;
302      Result : out Wide_Wide_String;
303      Length : out Natural)
304   is
305      Ptr : Natural;
306
307   begin
308      Ptr := S'First;
309      Length := 0;
310      while Ptr <= S'Last loop
311         if Length >= Result'Last then
312            Past_End;
313         end if;
314
315         Length := Length + 1;
316         Decode_Wide_Wide_Character (S, Ptr, Result (Length));
317      end loop;
318   end Decode_Wide_Wide_String;
319
320   -------------------------
321   -- Next_Wide_Character --
322   -------------------------
323
324   procedure Next_Wide_Character (Input : String; Ptr : in out Natural) is
325      Discard : Wide_Character;
326   begin
327      Decode_Wide_Character (Input, Ptr, Discard);
328   end Next_Wide_Character;
329
330   ------------------------------
331   -- Next_Wide_Wide_Character --
332   ------------------------------
333
334   procedure Next_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
335      Discard : Wide_Wide_Character;
336   begin
337      Decode_Wide_Wide_Character (Input, Ptr, Discard);
338   end Next_Wide_Wide_Character;
339
340   --------------
341   -- Past_End --
342   --------------
343
344   procedure Past_End is
345   begin
346      raise Constraint_Error with "past end of string";
347   end Past_End;
348
349   -------------------------
350   -- Prev_Wide_Character --
351   -------------------------
352
353   procedure Prev_Wide_Character (Input : String; Ptr : in out Natural) is
354   begin
355      if Ptr > Input'Last + 1 then
356         Past_End;
357      end if;
358
359      --  Special efficient encoding for UTF-8 case
360
361      if Encoding_Method = WCEM_UTF8 then
362         UTF8 : declare
363            U : Unsigned_32;
364
365            procedure Getc;
366            pragma Inline (Getc);
367            --  Gets the character at Input (Ptr - 1) and returns code in U as
368            --  Unsigned_32 value. On return Ptr is decremented by one.
369
370            procedure Skip_UTF_Byte;
371            pragma Inline (Skip_UTF_Byte);
372            --  Checks that U is 2#10xxxxxx# and then calls Get
373
374            ----------
375            -- Getc --
376            ----------
377
378            procedure Getc is
379            begin
380               if Ptr <= Input'First then
381                  Past_End;
382               else
383                  Ptr := Ptr - 1;
384                  U := Unsigned_32 (Character'Pos (Input (Ptr)));
385               end if;
386            end Getc;
387
388            -------------------
389            -- Skip_UTF_Byte --
390            -------------------
391
392            procedure Skip_UTF_Byte is
393            begin
394               if (U and 2#11000000#) = 2#10_000000# then
395                  Getc;
396               else
397                  Bad;
398               end if;
399            end Skip_UTF_Byte;
400
401         --  Start of processing for UTF-8 case
402
403         begin
404            --  16#00_0000#-16#00_007F#: 0xxxxxxx
405
406            Getc;
407
408            if (U and 2#10000000#) = 2#00000000# then
409               return;
410
411            --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
412
413            else
414               Skip_UTF_Byte;
415
416               if (U and 2#11100000#) = 2#110_00000# then
417                  return;
418
419               --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
420
421               else
422                  Skip_UTF_Byte;
423
424                  if (U and 2#11110000#) = 2#1110_0000# then
425                     return;
426
427                     --  Any other code is invalid, note that this includes:
428
429                     --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
430                     --                           10xxxxxx
431
432                     --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
433                     --                               10xxxxxx 10xxxxxx
434                     --                               10xxxxxx
435
436                     --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
437                     --                               10xxxxxx 10xxxxxx
438                     --                               10xxxxxx 10xxxxxx
439
440                     --  since Wide_Character does not allow codes > 16#FFFF#
441
442                  else
443                     Bad;
444                  end if;
445               end if;
446            end if;
447         end UTF8;
448
449      --  Special efficient encoding for brackets case
450
451      elsif Encoding_Method = WCEM_Brackets then
452         Brackets : declare
453            P : Natural;
454            S : Natural;
455
456         begin
457            --  See if we have "] at end positions
458
459            if Ptr > Input'First + 1
460              and then Input (Ptr - 1) = ']'
461              and then Input (Ptr - 2) = '"'
462            then
463               P := Ptr - 2;
464
465               --  Loop back looking for [" at start
466
467               while P >= Ptr - 10 loop
468                  if P <= Input'First + 1 then
469                     Bad;
470
471                  elsif Input (P - 1) = '"'
472                    and then Input (P - 2) = '['
473                  then
474                     --  Found ["..."], scan forward to check it
475
476                     S := P - 2;
477                     P := S;
478                     Next_Wide_Character (Input, P);
479
480                     --  OK if at original pointer, else error
481
482                     if P = Ptr then
483                        Ptr := S;
484                        return;
485                     else
486                        Bad;
487                     end if;
488                  end if;
489
490                  P := P - 1;
491               end loop;
492
493               --  Falling through loop means more than 8 chars between the
494               --  enclosing brackets (or simply a missing left bracket)
495
496               Bad;
497
498            --  Here if no bracket sequence present
499
500            else
501               if Ptr = Input'First then
502                  Past_End;
503               else
504                  Ptr := Ptr - 1;
505               end if;
506            end if;
507         end Brackets;
508
509      --  Non-UTF-8/Brackets. These are the inefficient cases where we have to
510      --  go to the start of the string and skip forwards till Ptr matches.
511
512      else
513         Non_UTF_Brackets : declare
514            Discard : Wide_Character;
515            PtrS    : Natural;
516            PtrP    : Natural;
517
518         begin
519            PtrS := Input'First;
520
521            if Ptr <= PtrS then
522               Past_End;
523            end if;
524
525            loop
526               PtrP := PtrS;
527               Decode_Wide_Character (Input, PtrS, Discard);
528
529               if PtrS = Ptr then
530                  Ptr := PtrP;
531                  return;
532
533               elsif PtrS > Ptr then
534                  Bad;
535               end if;
536            end loop;
537
538         exception
539            when Constraint_Error =>
540               Bad;
541         end Non_UTF_Brackets;
542      end if;
543   end Prev_Wide_Character;
544
545   ------------------------------
546   -- Prev_Wide_Wide_Character --
547   ------------------------------
548
549   procedure Prev_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
550   begin
551      if Ptr > Input'Last + 1 then
552         Past_End;
553      end if;
554
555      --  Special efficient encoding for UTF-8 case
556
557      if Encoding_Method = WCEM_UTF8 then
558         UTF8 : declare
559            U : Unsigned_32;
560
561            procedure Getc;
562            pragma Inline (Getc);
563            --  Gets the character at Input (Ptr - 1) and returns code in U as
564            --  Unsigned_32 value. On return Ptr is decremented by one.
565
566            procedure Skip_UTF_Byte;
567            pragma Inline (Skip_UTF_Byte);
568            --  Checks that U is 2#10xxxxxx# and then calls Get
569
570            ----------
571            -- Getc --
572            ----------
573
574            procedure Getc is
575            begin
576               if Ptr <= Input'First then
577                  Past_End;
578               else
579                  Ptr := Ptr - 1;
580                  U := Unsigned_32 (Character'Pos (Input (Ptr)));
581               end if;
582            end Getc;
583
584            -------------------
585            -- Skip_UTF_Byte --
586            -------------------
587
588            procedure Skip_UTF_Byte is
589            begin
590               if (U and 2#11000000#) = 2#10_000000# then
591                  Getc;
592               else
593                  Bad;
594               end if;
595            end Skip_UTF_Byte;
596
597         --  Start of processing for UTF-8 case
598
599         begin
600            --  16#00_0000#-16#00_007F#: 0xxxxxxx
601
602            Getc;
603
604            if (U and 2#10000000#) = 2#00000000# then
605               return;
606
607            --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
608
609            else
610               Skip_UTF_Byte;
611
612               if (U and 2#11100000#) = 2#110_00000# then
613                  return;
614
615               --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
616
617               else
618                  Skip_UTF_Byte;
619
620                  if (U and 2#11110000#) = 2#1110_0000# then
621                     return;
622
623                  --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
624                  --                           10xxxxxx
625
626                  else
627                     Skip_UTF_Byte;
628
629                     if (U and 2#11111000#) = 2#11110_000# then
630                        return;
631
632                     --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
633                     --                               10xxxxxx 10xxxxxx
634                     --                               10xxxxxx
635
636                     else
637                        Skip_UTF_Byte;
638
639                        if (U and 2#11111100#) = 2#111110_00# then
640                           return;
641
642                        --  Any other code is invalid, note that this includes:
643
644                        --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
645                        --                               10xxxxxx 10xxxxxx
646                        --                               10xxxxxx 10xxxxxx
647
648                        --  since Wide_Wide_Character does not allow codes
649                        --  greater than 16#03FF_FFFF#
650
651                        else
652                           Bad;
653                        end if;
654                     end if;
655                  end if;
656               end if;
657            end if;
658         end UTF8;
659
660      --  Special efficient encoding for brackets case
661
662      elsif Encoding_Method = WCEM_Brackets then
663         Brackets : declare
664            P : Natural;
665            S : Natural;
666
667         begin
668            --  See if we have "] at end positions
669
670            if Ptr > Input'First + 1
671              and then Input (Ptr - 1) = ']'
672              and then Input (Ptr - 2) = '"'
673            then
674               P := Ptr - 2;
675
676               --  Loop back looking for [" at start
677
678               while P >= Ptr - 10 loop
679                  if P <= Input'First + 1 then
680                     Bad;
681
682                  elsif Input (P - 1) = '"'
683                    and then Input (P - 2) = '['
684                  then
685                     --  Found ["..."], scan forward to check it
686
687                     S := P - 2;
688                     P := S;
689                     Next_Wide_Wide_Character (Input, P);
690
691                     --  OK if at original pointer, else error
692
693                     if P = Ptr then
694                        Ptr := S;
695                        return;
696                     else
697                        Bad;
698                     end if;
699                  end if;
700
701                  P := P - 1;
702               end loop;
703
704               --  Falling through loop means more than 8 chars between the
705               --  enclosing brackets (or simply a missing left bracket)
706
707               Bad;
708
709            --  Here if no bracket sequence present
710
711            else
712               if Ptr = Input'First then
713                  Past_End;
714               else
715                  Ptr := Ptr - 1;
716               end if;
717            end if;
718         end Brackets;
719
720      --  Non-UTF-8/Brackets. These are the inefficient cases where we have to
721      --  go to the start of the string and skip forwards till Ptr matches.
722
723      else
724         Non_UTF8_Brackets : declare
725            Discard : Wide_Wide_Character;
726            PtrS    : Natural;
727            PtrP    : Natural;
728
729         begin
730            PtrS := Input'First;
731
732            if Ptr <= PtrS then
733               Past_End;
734            end if;
735
736            loop
737               PtrP := PtrS;
738               Decode_Wide_Wide_Character (Input, PtrS, Discard);
739
740               if PtrS = Ptr then
741                  Ptr := PtrP;
742                  return;
743
744               elsif PtrS > Ptr then
745                  Bad;
746               end if;
747            end loop;
748
749         exception
750            when Constraint_Error =>
751               Bad;
752         end Non_UTF8_Brackets;
753      end if;
754   end Prev_Wide_Wide_Character;
755
756   --------------------------
757   -- Validate_Wide_String --
758   --------------------------
759
760   function Validate_Wide_String (S : String) return Boolean is
761      Ptr : Natural;
762
763   begin
764      Ptr := S'First;
765      while Ptr <= S'Last loop
766         Next_Wide_Character (S, Ptr);
767      end loop;
768
769      return True;
770
771   exception
772      when Constraint_Error =>
773         return False;
774   end Validate_Wide_String;
775
776   -------------------------------
777   -- Validate_Wide_Wide_String --
778   -------------------------------
779
780   function Validate_Wide_Wide_String (S : String) return Boolean is
781      Ptr : Natural;
782
783   begin
784      Ptr := S'First;
785      while Ptr <= S'Last loop
786         Next_Wide_Wide_Character (S, Ptr);
787      end loop;
788
789      return True;
790
791   exception
792      when Constraint_Error =>
793         return False;
794   end Validate_Wide_Wide_String;
795
796end GNAT.Decode_String;
797