1------------------------------------------------------------------------------
2--                                                                          --
3--                         GNAT RUN-TIME COMPONENTS                         --
4--                                                                          --
5--                    G N A T . D E C O D E _ S T R I N G                   --
6--                                                                          --
7--                                 S p e c                                  --
8--                                                                          --
9--                     Copyright (C) 2007-2013, AdaCore                     --
10--                                                                          --
11-- GNAT is free software;  you can  redistribute it  and/or modify it under --
12-- terms of the  GNU General Public License as published  by the Free Soft- --
13-- ware  Foundation;  either version 3,  or (at your option) any later ver- --
14-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
15-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
16-- or FITNESS FOR A PARTICULAR PURPOSE.                                     --
17--                                                                          --
18-- As a special exception under Section 7 of GPL version 3, you are granted --
19-- additional permissions described in the GCC Runtime Library Exception,   --
20-- version 3.1, as published by the Free Software Foundation.               --
21--                                                                          --
22-- You should have received a copy of the GNU General Public License and    --
23-- a copy of the GCC Runtime Library Exception along with this program;     --
24-- see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see    --
25-- <http://www.gnu.org/licenses/>.                                          --
26--                                                                          --
27-- GNAT was originally developed  by the GNAT team at  New York University. --
28-- Extensive contributions were provided by Ada Core Technologies Inc.      --
29--                                                                          --
30------------------------------------------------------------------------------
31
32--  This package provides a utility routine for converting from an encoded
33--  string to a corresponding Wide_String or Wide_Wide_String value.
34
35with Interfaces; use Interfaces;
36
37with System.WCh_Cnv; use System.WCh_Cnv;
38with System.WCh_Con; use System.WCh_Con;
39
40package body GNAT.Decode_String is
41
42   -----------------------
43   -- Local Subprograms --
44   -----------------------
45
46   procedure Bad;
47   pragma No_Return (Bad);
48   --  Raise error for bad encoding
49
50   procedure Past_End;
51   pragma No_Return (Past_End);
52   --  Raise error for off end of string
53
54   ---------
55   -- Bad --
56   ---------
57
58   procedure Bad is
59   begin
60      raise Constraint_Error with
61        "bad encoding or character out of range";
62   end Bad;
63
64   ---------------------------
65   -- Decode_Wide_Character --
66   ---------------------------
67
68   procedure Decode_Wide_Character
69     (Input  : String;
70      Ptr    : in out Natural;
71      Result : out Wide_Character)
72   is
73      Char : Wide_Wide_Character;
74   begin
75      Decode_Wide_Wide_Character (Input, Ptr, Char);
76
77      if Wide_Wide_Character'Pos (Char) > 16#FFFF# then
78         Bad;
79      else
80         Result := Wide_Character'Val (Wide_Wide_Character'Pos (Char));
81      end if;
82   end Decode_Wide_Character;
83
84   ------------------------
85   -- Decode_Wide_String --
86   ------------------------
87
88   function Decode_Wide_String (S : String) return Wide_String is
89      Result : Wide_String (1 .. S'Length);
90      Length : Natural;
91   begin
92      Decode_Wide_String (S, Result, Length);
93      return Result (1 .. Length);
94   end Decode_Wide_String;
95
96   procedure Decode_Wide_String
97     (S      : String;
98      Result : out Wide_String;
99      Length : out Natural)
100   is
101      Ptr : Natural;
102
103   begin
104      Ptr := S'First;
105      Length := 0;
106      while Ptr <= S'Last loop
107         if Length >= Result'Last then
108            Past_End;
109         end if;
110
111         Length := Length + 1;
112         Decode_Wide_Character (S, Ptr, Result (Length));
113      end loop;
114   end Decode_Wide_String;
115
116   --------------------------------
117   -- Decode_Wide_Wide_Character --
118   --------------------------------
119
120   procedure Decode_Wide_Wide_Character
121     (Input  : String;
122      Ptr    : in out Natural;
123      Result : out Wide_Wide_Character)
124   is
125      C : Character;
126
127      function In_Char return Character;
128      pragma Inline (In_Char);
129      --  Function to get one input character
130
131      -------------
132      -- In_Char --
133      -------------
134
135      function In_Char return Character is
136      begin
137         if Ptr <= Input'Last then
138            Ptr := Ptr + 1;
139            return Input (Ptr - 1);
140         else
141            Past_End;
142         end if;
143      end In_Char;
144
145   --  Start of processing for Decode_Wide_Wide_Character
146
147   begin
148      C := In_Char;
149
150      --  Special fast processing for UTF-8 case
151
152      if Encoding_Method = WCEM_UTF8 then
153         UTF8 : declare
154            U : Unsigned_32;
155            W : Unsigned_32;
156
157            procedure Get_UTF_Byte;
158            pragma Inline (Get_UTF_Byte);
159            --  Used to interpret 2#10xxxxxx# continuation byte in UTF-8 mode.
160            --  Reads a byte, and raises CE if the first two bits are not 10.
161            --  Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits.
162
163            ------------------
164            -- Get_UTF_Byte --
165            ------------------
166
167            procedure Get_UTF_Byte is
168            begin
169               U := Unsigned_32 (Character'Pos (In_Char));
170
171               if (U and 2#11000000#) /= 2#10_000000# then
172                  Bad;
173               end if;
174
175               W := Shift_Left (W, 6) or (U and 2#00111111#);
176            end Get_UTF_Byte;
177
178         --  Start of processing for UTF8 case
179
180         begin
181            --  Note: for details of UTF8 encoding see RFC 3629
182
183            U := Unsigned_32 (Character'Pos (C));
184
185            --  16#00_0000#-16#00_007F#: 0xxxxxxx
186
187            if (U and 2#10000000#) = 2#00000000# then
188               Result := Wide_Wide_Character'Val (Character'Pos (C));
189
190            --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
191
192            elsif (U and 2#11100000#) = 2#110_00000# then
193               W := U and 2#00011111#;
194               Get_UTF_Byte;
195
196               if W not in 16#00_0080# .. 16#00_07FF# then
197                  Bad;
198               end if;
199
200               Result := Wide_Wide_Character'Val (W);
201
202            --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
203
204            elsif (U and 2#11110000#) = 2#1110_0000# then
205               W := U and 2#00001111#;
206               Get_UTF_Byte;
207               Get_UTF_Byte;
208
209               if W not in 16#00_0800# .. 16#00_FFFF# then
210                  Bad;
211               end if;
212
213               Result := Wide_Wide_Character'Val (W);
214
215            --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
216
217            elsif (U and 2#11111000#) = 2#11110_000# then
218               W := U and 2#00000111#;
219
220               for K in 1 .. 3 loop
221                  Get_UTF_Byte;
222               end loop;
223
224               if W not in 16#01_0000# .. 16#10_FFFF# then
225                  Bad;
226               end if;
227
228               Result := Wide_Wide_Character'Val (W);
229
230            --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
231            --                               10xxxxxx 10xxxxxx
232
233            elsif (U and 2#11111100#) = 2#111110_00# then
234               W := U and 2#00000011#;
235
236               for K in 1 .. 4 loop
237                  Get_UTF_Byte;
238               end loop;
239
240               if W not in 16#0020_0000# .. 16#03FF_FFFF# then
241                  Bad;
242               end if;
243
244               Result := Wide_Wide_Character'Val (W);
245
246            --  All other cases are invalid, note that this includes:
247
248            --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
249            --                               10xxxxxx 10xxxxxx 10xxxxxx
250
251            --  since Wide_Wide_Character does not include code values
252            --  greater than 16#03FF_FFFF#.
253
254            else
255               Bad;
256            end if;
257         end UTF8;
258
259      --  All encoding functions other than UTF-8
260
261      else
262         Non_UTF8 : declare
263            function Char_Sequence_To_UTF is
264              new Char_Sequence_To_UTF_32 (In_Char);
265
266         begin
267            --  For brackets, must test for specific case of [ not followed by
268            --  quotation, where we must not call Char_Sequence_To_UTF, but
269            --  instead just return the bracket unchanged.
270
271            if Encoding_Method = WCEM_Brackets
272              and then C = '['
273              and then (Ptr > Input'Last or else Input (Ptr) /= '"')
274            then
275               Result := '[';
276
277            --  All other cases including [" with Brackets
278
279            else
280               Result :=
281                 Wide_Wide_Character'Val
282                   (Char_Sequence_To_UTF (C, Encoding_Method));
283            end if;
284         end Non_UTF8;
285      end if;
286   end Decode_Wide_Wide_Character;
287
288   -----------------------------
289   -- Decode_Wide_Wide_String --
290   -----------------------------
291
292   function Decode_Wide_Wide_String (S : String) return Wide_Wide_String is
293      Result : Wide_Wide_String (1 .. S'Length);
294      Length : Natural;
295   begin
296      Decode_Wide_Wide_String (S, Result, Length);
297      return Result (1 .. Length);
298   end Decode_Wide_Wide_String;
299
300   procedure Decode_Wide_Wide_String
301     (S      : String;
302      Result : out Wide_Wide_String;
303      Length : out Natural)
304   is
305      Ptr : Natural;
306
307   begin
308      Ptr := S'First;
309      Length := 0;
310      while Ptr <= S'Last loop
311         if Length >= Result'Last then
312            Past_End;
313         end if;
314
315         Length := Length + 1;
316         Decode_Wide_Wide_Character (S, Ptr, Result (Length));
317      end loop;
318   end Decode_Wide_Wide_String;
319
320   -------------------------
321   -- Next_Wide_Character --
322   -------------------------
323
324   procedure Next_Wide_Character (Input : String; Ptr : in out Natural) is
325      Discard : Wide_Character;
326      pragma Unreferenced (Discard);
327   begin
328      Decode_Wide_Character (Input, Ptr, Discard);
329   end Next_Wide_Character;
330
331   ------------------------------
332   -- Next_Wide_Wide_Character --
333   ------------------------------
334
335   procedure Next_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
336      Discard : Wide_Wide_Character;
337      pragma Unreferenced (Discard);
338   begin
339      Decode_Wide_Wide_Character (Input, Ptr, Discard);
340   end Next_Wide_Wide_Character;
341
342   --------------
343   -- Past_End --
344   --------------
345
346   procedure Past_End is
347   begin
348      raise Constraint_Error with "past end of string";
349   end Past_End;
350
351   -------------------------
352   -- Prev_Wide_Character --
353   -------------------------
354
355   procedure Prev_Wide_Character (Input : String; Ptr : in out Natural) is
356   begin
357      if Ptr > Input'Last + 1 then
358         Past_End;
359      end if;
360
361      --  Special efficient encoding for UTF-8 case
362
363      if Encoding_Method = WCEM_UTF8 then
364         UTF8 : declare
365            U : Unsigned_32;
366
367            procedure Getc;
368            pragma Inline (Getc);
369            --  Gets the character at Input (Ptr - 1) and returns code in U as
370            --  Unsigned_32 value. On return Ptr is decremented by one.
371
372            procedure Skip_UTF_Byte;
373            pragma Inline (Skip_UTF_Byte);
374            --  Checks that U is 2#10xxxxxx# and then calls Get
375
376            ----------
377            -- Getc --
378            ----------
379
380            procedure Getc is
381            begin
382               if Ptr <= Input'First then
383                  Past_End;
384               else
385                  Ptr := Ptr - 1;
386                  U := Unsigned_32 (Character'Pos (Input (Ptr)));
387               end if;
388            end Getc;
389
390            -------------------
391            -- Skip_UTF_Byte --
392            -------------------
393
394            procedure Skip_UTF_Byte is
395            begin
396               if (U and 2#11000000#) = 2#10_000000# then
397                  Getc;
398               else
399                  Bad;
400               end if;
401            end Skip_UTF_Byte;
402
403         --  Start of processing for UTF-8 case
404
405         begin
406            --  16#00_0000#-16#00_007F#: 0xxxxxxx
407
408            Getc;
409
410            if (U and 2#10000000#) = 2#00000000# then
411               return;
412
413            --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
414
415            else
416               Skip_UTF_Byte;
417
418               if (U and 2#11100000#) = 2#110_00000# then
419                  return;
420
421               --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
422
423               else
424                  Skip_UTF_Byte;
425
426                  if (U and 2#11110000#) = 2#1110_0000# then
427                     return;
428
429                     --  Any other code is invalid, note that this includes:
430
431                     --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
432                     --                           10xxxxxx
433
434                     --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
435                     --                               10xxxxxx 10xxxxxx
436                     --                               10xxxxxx
437
438                     --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
439                     --                               10xxxxxx 10xxxxxx
440                     --                               10xxxxxx 10xxxxxx
441
442                     --  since Wide_Character does not allow codes > 16#FFFF#
443
444                  else
445                     Bad;
446                  end if;
447               end if;
448            end if;
449         end UTF8;
450
451      --  Special efficient encoding for brackets case
452
453      elsif Encoding_Method = WCEM_Brackets then
454         Brackets : declare
455            P : Natural;
456            S : Natural;
457
458         begin
459            --  See if we have "] at end positions
460
461            if Ptr > Input'First + 1
462              and then Input (Ptr - 1) = ']'
463              and then Input (Ptr - 2) = '"'
464            then
465               P := Ptr - 2;
466
467               --  Loop back looking for [" at start
468
469               while P >= Ptr - 10 loop
470                  if P <= Input'First + 1 then
471                     Bad;
472
473                  elsif Input (P - 1) = '"'
474                    and then Input (P - 2) = '['
475                  then
476                     --  Found ["..."], scan forward to check it
477
478                     S := P - 2;
479                     P := S;
480                     Next_Wide_Character (Input, P);
481
482                     --  OK if at original pointer, else error
483
484                     if P = Ptr then
485                        Ptr := S;
486                        return;
487                     else
488                        Bad;
489                     end if;
490                  end if;
491
492                  P := P - 1;
493               end loop;
494
495               --  Falling through loop means more than 8 chars between the
496               --  enclosing brackets (or simply a missing left bracket)
497
498               Bad;
499
500            --  Here if no bracket sequence present
501
502            else
503               if Ptr = Input'First then
504                  Past_End;
505               else
506                  Ptr := Ptr - 1;
507               end if;
508            end if;
509         end Brackets;
510
511      --  Non-UTF-8/Brackets. These are the inefficient cases where we have to
512      --  go to the start of the string and skip forwards till Ptr matches.
513
514      else
515         Non_UTF_Brackets : declare
516            Discard : Wide_Character;
517            PtrS    : Natural;
518            PtrP    : Natural;
519
520         begin
521            PtrS := Input'First;
522
523            if Ptr <= PtrS then
524               Past_End;
525            end if;
526
527            loop
528               PtrP := PtrS;
529               Decode_Wide_Character (Input, PtrS, Discard);
530
531               if PtrS = Ptr then
532                  Ptr := PtrP;
533                  return;
534
535               elsif PtrS > Ptr then
536                  Bad;
537               end if;
538            end loop;
539
540         exception
541            when Constraint_Error =>
542               Bad;
543         end Non_UTF_Brackets;
544      end if;
545   end Prev_Wide_Character;
546
547   ------------------------------
548   -- Prev_Wide_Wide_Character --
549   ------------------------------
550
551   procedure Prev_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
552   begin
553      if Ptr > Input'Last + 1 then
554         Past_End;
555      end if;
556
557      --  Special efficient encoding for UTF-8 case
558
559      if Encoding_Method = WCEM_UTF8 then
560         UTF8 : declare
561            U : Unsigned_32;
562
563            procedure Getc;
564            pragma Inline (Getc);
565            --  Gets the character at Input (Ptr - 1) and returns code in U as
566            --  Unsigned_32 value. On return Ptr is decremented by one.
567
568            procedure Skip_UTF_Byte;
569            pragma Inline (Skip_UTF_Byte);
570            --  Checks that U is 2#10xxxxxx# and then calls Get
571
572            ----------
573            -- Getc --
574            ----------
575
576            procedure Getc is
577            begin
578               if Ptr <= Input'First then
579                  Past_End;
580               else
581                  Ptr := Ptr - 1;
582                  U := Unsigned_32 (Character'Pos (Input (Ptr)));
583               end if;
584            end Getc;
585
586            -------------------
587            -- Skip_UTF_Byte --
588            -------------------
589
590            procedure Skip_UTF_Byte is
591            begin
592               if (U and 2#11000000#) = 2#10_000000# then
593                  Getc;
594               else
595                  Bad;
596               end if;
597            end Skip_UTF_Byte;
598
599         --  Start of processing for UTF-8 case
600
601         begin
602            --  16#00_0000#-16#00_007F#: 0xxxxxxx
603
604            Getc;
605
606            if (U and 2#10000000#) = 2#00000000# then
607               return;
608
609            --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
610
611            else
612               Skip_UTF_Byte;
613
614               if (U and 2#11100000#) = 2#110_00000# then
615                  return;
616
617               --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
618
619               else
620                  Skip_UTF_Byte;
621
622                  if (U and 2#11110000#) = 2#1110_0000# then
623                     return;
624
625                  --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
626                  --                           10xxxxxx
627
628                  else
629                     Skip_UTF_Byte;
630
631                     if (U and 2#11111000#) = 2#11110_000# then
632                        return;
633
634                     --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
635                     --                               10xxxxxx 10xxxxxx
636                     --                               10xxxxxx
637
638                     else
639                        Skip_UTF_Byte;
640
641                        if (U and 2#11111100#) = 2#111110_00# then
642                           return;
643
644                        --  Any other code is invalid, note that this includes:
645
646                        --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
647                        --                               10xxxxxx 10xxxxxx
648                        --                               10xxxxxx 10xxxxxx
649
650                        --  since Wide_Wide_Character does not allow codes
651                        --  greater than 16#03FF_FFFF#
652
653                        else
654                           Bad;
655                        end if;
656                     end if;
657                  end if;
658               end if;
659            end if;
660         end UTF8;
661
662      --  Special efficient encoding for brackets case
663
664      elsif Encoding_Method = WCEM_Brackets then
665         Brackets : declare
666            P : Natural;
667            S : Natural;
668
669         begin
670            --  See if we have "] at end positions
671
672            if Ptr > Input'First + 1
673              and then Input (Ptr - 1) = ']'
674              and then Input (Ptr - 2) = '"'
675            then
676               P := Ptr - 2;
677
678               --  Loop back looking for [" at start
679
680               while P >= Ptr - 10 loop
681                  if P <= Input'First + 1 then
682                     Bad;
683
684                  elsif Input (P - 1) = '"'
685                    and then Input (P - 2) = '['
686                  then
687                     --  Found ["..."], scan forward to check it
688
689                     S := P - 2;
690                     P := S;
691                     Next_Wide_Wide_Character (Input, P);
692
693                     --  OK if at original pointer, else error
694
695                     if P = Ptr then
696                        Ptr := S;
697                        return;
698                     else
699                        Bad;
700                     end if;
701                  end if;
702
703                  P := P - 1;
704               end loop;
705
706               --  Falling through loop means more than 8 chars between the
707               --  enclosing brackets (or simply a missing left bracket)
708
709               Bad;
710
711            --  Here if no bracket sequence present
712
713            else
714               if Ptr = Input'First then
715                  Past_End;
716               else
717                  Ptr := Ptr - 1;
718               end if;
719            end if;
720         end Brackets;
721
722      --  Non-UTF-8/Brackets. These are the inefficient cases where we have to
723      --  go to the start of the string and skip forwards till Ptr matches.
724
725      else
726         Non_UTF8_Brackets : declare
727            Discard : Wide_Wide_Character;
728            PtrS    : Natural;
729            PtrP    : Natural;
730
731         begin
732            PtrS := Input'First;
733
734            if Ptr <= PtrS then
735               Past_End;
736            end if;
737
738            loop
739               PtrP := PtrS;
740               Decode_Wide_Wide_Character (Input, PtrS, Discard);
741
742               if PtrS = Ptr then
743                  Ptr := PtrP;
744                  return;
745
746               elsif PtrS > Ptr then
747                  Bad;
748               end if;
749            end loop;
750
751         exception
752            when Constraint_Error =>
753               Bad;
754         end Non_UTF8_Brackets;
755      end if;
756   end Prev_Wide_Wide_Character;
757
758   --------------------------
759   -- Validate_Wide_String --
760   --------------------------
761
762   function Validate_Wide_String (S : String) return Boolean is
763      Ptr : Natural;
764
765   begin
766      Ptr := S'First;
767      while Ptr <= S'Last loop
768         Next_Wide_Character (S, Ptr);
769      end loop;
770
771      return True;
772
773   exception
774      when Constraint_Error =>
775         return False;
776   end Validate_Wide_String;
777
778   -------------------------------
779   -- Validate_Wide_Wide_String --
780   -------------------------------
781
782   function Validate_Wide_Wide_String (S : String) return Boolean is
783      Ptr : Natural;
784
785   begin
786      Ptr := S'First;
787      while Ptr <= S'Last loop
788         Next_Wide_Wide_Character (S, Ptr);
789      end loop;
790
791      return True;
792
793   exception
794      when Constraint_Error =>
795         return False;
796   end Validate_Wide_Wide_String;
797
798end GNAT.Decode_String;
799