1------------------------------------------------------------------------------ 2-- -- 3-- GNAT RUN-TIME COMPONENTS -- 4-- -- 5-- G N A T . D E C O D E _ S T R I N G -- 6-- -- 7-- S p e c -- 8-- -- 9-- Copyright (C) 2007-2013, AdaCore -- 10-- -- 11-- GNAT is free software; you can redistribute it and/or modify it under -- 12-- terms of the GNU General Public License as published by the Free Soft- -- 13-- ware Foundation; either version 3, or (at your option) any later ver- -- 14-- sion. GNAT is distributed in the hope that it will be useful, but WITH- -- 15-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -- 16-- or FITNESS FOR A PARTICULAR PURPOSE. -- 17-- -- 18-- As a special exception under Section 7 of GPL version 3, you are granted -- 19-- additional permissions described in the GCC Runtime Library Exception, -- 20-- version 3.1, as published by the Free Software Foundation. -- 21-- -- 22-- You should have received a copy of the GNU General Public License and -- 23-- a copy of the GCC Runtime Library Exception along with this program; -- 24-- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see -- 25-- <http://www.gnu.org/licenses/>. -- 26-- -- 27-- GNAT was originally developed by the GNAT team at New York University. -- 28-- Extensive contributions were provided by Ada Core Technologies Inc. -- 29-- -- 30------------------------------------------------------------------------------ 31 32-- This package provides a utility routine for converting from an encoded 33-- string to a corresponding Wide_String or Wide_Wide_String value. 34 35with Interfaces; use Interfaces; 36 37with System.WCh_Cnv; use System.WCh_Cnv; 38with System.WCh_Con; use System.WCh_Con; 39 40package body GNAT.Decode_String is 41 42 ----------------------- 43 -- Local Subprograms -- 44 ----------------------- 45 46 procedure Bad; 47 pragma No_Return (Bad); 48 -- Raise error for bad encoding 49 50 procedure Past_End; 51 pragma No_Return (Past_End); 52 -- Raise error for off end of string 53 54 --------- 55 -- Bad -- 56 --------- 57 58 procedure Bad is 59 begin 60 raise Constraint_Error with 61 "bad encoding or character out of range"; 62 end Bad; 63 64 --------------------------- 65 -- Decode_Wide_Character -- 66 --------------------------- 67 68 procedure Decode_Wide_Character 69 (Input : String; 70 Ptr : in out Natural; 71 Result : out Wide_Character) 72 is 73 Char : Wide_Wide_Character; 74 begin 75 Decode_Wide_Wide_Character (Input, Ptr, Char); 76 77 if Wide_Wide_Character'Pos (Char) > 16#FFFF# then 78 Bad; 79 else 80 Result := Wide_Character'Val (Wide_Wide_Character'Pos (Char)); 81 end if; 82 end Decode_Wide_Character; 83 84 ------------------------ 85 -- Decode_Wide_String -- 86 ------------------------ 87 88 function Decode_Wide_String (S : String) return Wide_String is 89 Result : Wide_String (1 .. S'Length); 90 Length : Natural; 91 begin 92 Decode_Wide_String (S, Result, Length); 93 return Result (1 .. Length); 94 end Decode_Wide_String; 95 96 procedure Decode_Wide_String 97 (S : String; 98 Result : out Wide_String; 99 Length : out Natural) 100 is 101 Ptr : Natural; 102 103 begin 104 Ptr := S'First; 105 Length := 0; 106 while Ptr <= S'Last loop 107 if Length >= Result'Last then 108 Past_End; 109 end if; 110 111 Length := Length + 1; 112 Decode_Wide_Character (S, Ptr, Result (Length)); 113 end loop; 114 end Decode_Wide_String; 115 116 -------------------------------- 117 -- Decode_Wide_Wide_Character -- 118 -------------------------------- 119 120 procedure Decode_Wide_Wide_Character 121 (Input : String; 122 Ptr : in out Natural; 123 Result : out Wide_Wide_Character) 124 is 125 C : Character; 126 127 function In_Char return Character; 128 pragma Inline (In_Char); 129 -- Function to get one input character 130 131 ------------- 132 -- In_Char -- 133 ------------- 134 135 function In_Char return Character is 136 begin 137 if Ptr <= Input'Last then 138 Ptr := Ptr + 1; 139 return Input (Ptr - 1); 140 else 141 Past_End; 142 end if; 143 end In_Char; 144 145 -- Start of processing for Decode_Wide_Wide_Character 146 147 begin 148 C := In_Char; 149 150 -- Special fast processing for UTF-8 case 151 152 if Encoding_Method = WCEM_UTF8 then 153 UTF8 : declare 154 U : Unsigned_32; 155 W : Unsigned_32; 156 157 procedure Get_UTF_Byte; 158 pragma Inline (Get_UTF_Byte); 159 -- Used to interpret 2#10xxxxxx# continuation byte in UTF-8 mode. 160 -- Reads a byte, and raises CE if the first two bits are not 10. 161 -- Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits. 162 163 ------------------ 164 -- Get_UTF_Byte -- 165 ------------------ 166 167 procedure Get_UTF_Byte is 168 begin 169 U := Unsigned_32 (Character'Pos (In_Char)); 170 171 if (U and 2#11000000#) /= 2#10_000000# then 172 Bad; 173 end if; 174 175 W := Shift_Left (W, 6) or (U and 2#00111111#); 176 end Get_UTF_Byte; 177 178 -- Start of processing for UTF8 case 179 180 begin 181 -- Note: for details of UTF8 encoding see RFC 3629 182 183 U := Unsigned_32 (Character'Pos (C)); 184 185 -- 16#00_0000#-16#00_007F#: 0xxxxxxx 186 187 if (U and 2#10000000#) = 2#00000000# then 188 Result := Wide_Wide_Character'Val (Character'Pos (C)); 189 190 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx 191 192 elsif (U and 2#11100000#) = 2#110_00000# then 193 W := U and 2#00011111#; 194 Get_UTF_Byte; 195 196 if W not in 16#00_0080# .. 16#00_07FF# then 197 Bad; 198 end if; 199 200 Result := Wide_Wide_Character'Val (W); 201 202 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx 203 204 elsif (U and 2#11110000#) = 2#1110_0000# then 205 W := U and 2#00001111#; 206 Get_UTF_Byte; 207 Get_UTF_Byte; 208 209 if W not in 16#00_0800# .. 16#00_FFFF# then 210 Bad; 211 end if; 212 213 Result := Wide_Wide_Character'Val (W); 214 215 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 216 217 elsif (U and 2#11111000#) = 2#11110_000# then 218 W := U and 2#00000111#; 219 220 for K in 1 .. 3 loop 221 Get_UTF_Byte; 222 end loop; 223 224 if W not in 16#01_0000# .. 16#10_FFFF# then 225 Bad; 226 end if; 227 228 Result := Wide_Wide_Character'Val (W); 229 230 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx 231 -- 10xxxxxx 10xxxxxx 232 233 elsif (U and 2#11111100#) = 2#111110_00# then 234 W := U and 2#00000011#; 235 236 for K in 1 .. 4 loop 237 Get_UTF_Byte; 238 end loop; 239 240 if W not in 16#0020_0000# .. 16#03FF_FFFF# then 241 Bad; 242 end if; 243 244 Result := Wide_Wide_Character'Val (W); 245 246 -- All other cases are invalid, note that this includes: 247 248 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx 249 -- 10xxxxxx 10xxxxxx 10xxxxxx 250 251 -- since Wide_Wide_Character does not include code values 252 -- greater than 16#03FF_FFFF#. 253 254 else 255 Bad; 256 end if; 257 end UTF8; 258 259 -- All encoding functions other than UTF-8 260 261 else 262 Non_UTF8 : declare 263 function Char_Sequence_To_UTF is 264 new Char_Sequence_To_UTF_32 (In_Char); 265 266 begin 267 -- For brackets, must test for specific case of [ not followed by 268 -- quotation, where we must not call Char_Sequence_To_UTF, but 269 -- instead just return the bracket unchanged. 270 271 if Encoding_Method = WCEM_Brackets 272 and then C = '[' 273 and then (Ptr > Input'Last or else Input (Ptr) /= '"') 274 then 275 Result := '['; 276 277 -- All other cases including [" with Brackets 278 279 else 280 Result := 281 Wide_Wide_Character'Val 282 (Char_Sequence_To_UTF (C, Encoding_Method)); 283 end if; 284 end Non_UTF8; 285 end if; 286 end Decode_Wide_Wide_Character; 287 288 ----------------------------- 289 -- Decode_Wide_Wide_String -- 290 ----------------------------- 291 292 function Decode_Wide_Wide_String (S : String) return Wide_Wide_String is 293 Result : Wide_Wide_String (1 .. S'Length); 294 Length : Natural; 295 begin 296 Decode_Wide_Wide_String (S, Result, Length); 297 return Result (1 .. Length); 298 end Decode_Wide_Wide_String; 299 300 procedure Decode_Wide_Wide_String 301 (S : String; 302 Result : out Wide_Wide_String; 303 Length : out Natural) 304 is 305 Ptr : Natural; 306 307 begin 308 Ptr := S'First; 309 Length := 0; 310 while Ptr <= S'Last loop 311 if Length >= Result'Last then 312 Past_End; 313 end if; 314 315 Length := Length + 1; 316 Decode_Wide_Wide_Character (S, Ptr, Result (Length)); 317 end loop; 318 end Decode_Wide_Wide_String; 319 320 ------------------------- 321 -- Next_Wide_Character -- 322 ------------------------- 323 324 procedure Next_Wide_Character (Input : String; Ptr : in out Natural) is 325 Discard : Wide_Character; 326 pragma Unreferenced (Discard); 327 begin 328 Decode_Wide_Character (Input, Ptr, Discard); 329 end Next_Wide_Character; 330 331 ------------------------------ 332 -- Next_Wide_Wide_Character -- 333 ------------------------------ 334 335 procedure Next_Wide_Wide_Character (Input : String; Ptr : in out Natural) is 336 Discard : Wide_Wide_Character; 337 pragma Unreferenced (Discard); 338 begin 339 Decode_Wide_Wide_Character (Input, Ptr, Discard); 340 end Next_Wide_Wide_Character; 341 342 -------------- 343 -- Past_End -- 344 -------------- 345 346 procedure Past_End is 347 begin 348 raise Constraint_Error with "past end of string"; 349 end Past_End; 350 351 ------------------------- 352 -- Prev_Wide_Character -- 353 ------------------------- 354 355 procedure Prev_Wide_Character (Input : String; Ptr : in out Natural) is 356 begin 357 if Ptr > Input'Last + 1 then 358 Past_End; 359 end if; 360 361 -- Special efficient encoding for UTF-8 case 362 363 if Encoding_Method = WCEM_UTF8 then 364 UTF8 : declare 365 U : Unsigned_32; 366 367 procedure Getc; 368 pragma Inline (Getc); 369 -- Gets the character at Input (Ptr - 1) and returns code in U as 370 -- Unsigned_32 value. On return Ptr is decremented by one. 371 372 procedure Skip_UTF_Byte; 373 pragma Inline (Skip_UTF_Byte); 374 -- Checks that U is 2#10xxxxxx# and then calls Get 375 376 ---------- 377 -- Getc -- 378 ---------- 379 380 procedure Getc is 381 begin 382 if Ptr <= Input'First then 383 Past_End; 384 else 385 Ptr := Ptr - 1; 386 U := Unsigned_32 (Character'Pos (Input (Ptr))); 387 end if; 388 end Getc; 389 390 ------------------- 391 -- Skip_UTF_Byte -- 392 ------------------- 393 394 procedure Skip_UTF_Byte is 395 begin 396 if (U and 2#11000000#) = 2#10_000000# then 397 Getc; 398 else 399 Bad; 400 end if; 401 end Skip_UTF_Byte; 402 403 -- Start of processing for UTF-8 case 404 405 begin 406 -- 16#00_0000#-16#00_007F#: 0xxxxxxx 407 408 Getc; 409 410 if (U and 2#10000000#) = 2#00000000# then 411 return; 412 413 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx 414 415 else 416 Skip_UTF_Byte; 417 418 if (U and 2#11100000#) = 2#110_00000# then 419 return; 420 421 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx 422 423 else 424 Skip_UTF_Byte; 425 426 if (U and 2#11110000#) = 2#1110_0000# then 427 return; 428 429 -- Any other code is invalid, note that this includes: 430 431 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 432 -- 10xxxxxx 433 434 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 435 -- 10xxxxxx 10xxxxxx 436 -- 10xxxxxx 437 438 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 439 -- 10xxxxxx 10xxxxxx 440 -- 10xxxxxx 10xxxxxx 441 442 -- since Wide_Character does not allow codes > 16#FFFF# 443 444 else 445 Bad; 446 end if; 447 end if; 448 end if; 449 end UTF8; 450 451 -- Special efficient encoding for brackets case 452 453 elsif Encoding_Method = WCEM_Brackets then 454 Brackets : declare 455 P : Natural; 456 S : Natural; 457 458 begin 459 -- See if we have "] at end positions 460 461 if Ptr > Input'First + 1 462 and then Input (Ptr - 1) = ']' 463 and then Input (Ptr - 2) = '"' 464 then 465 P := Ptr - 2; 466 467 -- Loop back looking for [" at start 468 469 while P >= Ptr - 10 loop 470 if P <= Input'First + 1 then 471 Bad; 472 473 elsif Input (P - 1) = '"' 474 and then Input (P - 2) = '[' 475 then 476 -- Found ["..."], scan forward to check it 477 478 S := P - 2; 479 P := S; 480 Next_Wide_Character (Input, P); 481 482 -- OK if at original pointer, else error 483 484 if P = Ptr then 485 Ptr := S; 486 return; 487 else 488 Bad; 489 end if; 490 end if; 491 492 P := P - 1; 493 end loop; 494 495 -- Falling through loop means more than 8 chars between the 496 -- enclosing brackets (or simply a missing left bracket) 497 498 Bad; 499 500 -- Here if no bracket sequence present 501 502 else 503 if Ptr = Input'First then 504 Past_End; 505 else 506 Ptr := Ptr - 1; 507 end if; 508 end if; 509 end Brackets; 510 511 -- Non-UTF-8/Brackets. These are the inefficient cases where we have to 512 -- go to the start of the string and skip forwards till Ptr matches. 513 514 else 515 Non_UTF_Brackets : declare 516 Discard : Wide_Character; 517 PtrS : Natural; 518 PtrP : Natural; 519 520 begin 521 PtrS := Input'First; 522 523 if Ptr <= PtrS then 524 Past_End; 525 end if; 526 527 loop 528 PtrP := PtrS; 529 Decode_Wide_Character (Input, PtrS, Discard); 530 531 if PtrS = Ptr then 532 Ptr := PtrP; 533 return; 534 535 elsif PtrS > Ptr then 536 Bad; 537 end if; 538 end loop; 539 540 exception 541 when Constraint_Error => 542 Bad; 543 end Non_UTF_Brackets; 544 end if; 545 end Prev_Wide_Character; 546 547 ------------------------------ 548 -- Prev_Wide_Wide_Character -- 549 ------------------------------ 550 551 procedure Prev_Wide_Wide_Character (Input : String; Ptr : in out Natural) is 552 begin 553 if Ptr > Input'Last + 1 then 554 Past_End; 555 end if; 556 557 -- Special efficient encoding for UTF-8 case 558 559 if Encoding_Method = WCEM_UTF8 then 560 UTF8 : declare 561 U : Unsigned_32; 562 563 procedure Getc; 564 pragma Inline (Getc); 565 -- Gets the character at Input (Ptr - 1) and returns code in U as 566 -- Unsigned_32 value. On return Ptr is decremented by one. 567 568 procedure Skip_UTF_Byte; 569 pragma Inline (Skip_UTF_Byte); 570 -- Checks that U is 2#10xxxxxx# and then calls Get 571 572 ---------- 573 -- Getc -- 574 ---------- 575 576 procedure Getc is 577 begin 578 if Ptr <= Input'First then 579 Past_End; 580 else 581 Ptr := Ptr - 1; 582 U := Unsigned_32 (Character'Pos (Input (Ptr))); 583 end if; 584 end Getc; 585 586 ------------------- 587 -- Skip_UTF_Byte -- 588 ------------------- 589 590 procedure Skip_UTF_Byte is 591 begin 592 if (U and 2#11000000#) = 2#10_000000# then 593 Getc; 594 else 595 Bad; 596 end if; 597 end Skip_UTF_Byte; 598 599 -- Start of processing for UTF-8 case 600 601 begin 602 -- 16#00_0000#-16#00_007F#: 0xxxxxxx 603 604 Getc; 605 606 if (U and 2#10000000#) = 2#00000000# then 607 return; 608 609 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx 610 611 else 612 Skip_UTF_Byte; 613 614 if (U and 2#11100000#) = 2#110_00000# then 615 return; 616 617 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx 618 619 else 620 Skip_UTF_Byte; 621 622 if (U and 2#11110000#) = 2#1110_0000# then 623 return; 624 625 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 626 -- 10xxxxxx 627 628 else 629 Skip_UTF_Byte; 630 631 if (U and 2#11111000#) = 2#11110_000# then 632 return; 633 634 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 635 -- 10xxxxxx 10xxxxxx 636 -- 10xxxxxx 637 638 else 639 Skip_UTF_Byte; 640 641 if (U and 2#11111100#) = 2#111110_00# then 642 return; 643 644 -- Any other code is invalid, note that this includes: 645 646 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 647 -- 10xxxxxx 10xxxxxx 648 -- 10xxxxxx 10xxxxxx 649 650 -- since Wide_Wide_Character does not allow codes 651 -- greater than 16#03FF_FFFF# 652 653 else 654 Bad; 655 end if; 656 end if; 657 end if; 658 end if; 659 end if; 660 end UTF8; 661 662 -- Special efficient encoding for brackets case 663 664 elsif Encoding_Method = WCEM_Brackets then 665 Brackets : declare 666 P : Natural; 667 S : Natural; 668 669 begin 670 -- See if we have "] at end positions 671 672 if Ptr > Input'First + 1 673 and then Input (Ptr - 1) = ']' 674 and then Input (Ptr - 2) = '"' 675 then 676 P := Ptr - 2; 677 678 -- Loop back looking for [" at start 679 680 while P >= Ptr - 10 loop 681 if P <= Input'First + 1 then 682 Bad; 683 684 elsif Input (P - 1) = '"' 685 and then Input (P - 2) = '[' 686 then 687 -- Found ["..."], scan forward to check it 688 689 S := P - 2; 690 P := S; 691 Next_Wide_Wide_Character (Input, P); 692 693 -- OK if at original pointer, else error 694 695 if P = Ptr then 696 Ptr := S; 697 return; 698 else 699 Bad; 700 end if; 701 end if; 702 703 P := P - 1; 704 end loop; 705 706 -- Falling through loop means more than 8 chars between the 707 -- enclosing brackets (or simply a missing left bracket) 708 709 Bad; 710 711 -- Here if no bracket sequence present 712 713 else 714 if Ptr = Input'First then 715 Past_End; 716 else 717 Ptr := Ptr - 1; 718 end if; 719 end if; 720 end Brackets; 721 722 -- Non-UTF-8/Brackets. These are the inefficient cases where we have to 723 -- go to the start of the string and skip forwards till Ptr matches. 724 725 else 726 Non_UTF8_Brackets : declare 727 Discard : Wide_Wide_Character; 728 PtrS : Natural; 729 PtrP : Natural; 730 731 begin 732 PtrS := Input'First; 733 734 if Ptr <= PtrS then 735 Past_End; 736 end if; 737 738 loop 739 PtrP := PtrS; 740 Decode_Wide_Wide_Character (Input, PtrS, Discard); 741 742 if PtrS = Ptr then 743 Ptr := PtrP; 744 return; 745 746 elsif PtrS > Ptr then 747 Bad; 748 end if; 749 end loop; 750 751 exception 752 when Constraint_Error => 753 Bad; 754 end Non_UTF8_Brackets; 755 end if; 756 end Prev_Wide_Wide_Character; 757 758 -------------------------- 759 -- Validate_Wide_String -- 760 -------------------------- 761 762 function Validate_Wide_String (S : String) return Boolean is 763 Ptr : Natural; 764 765 begin 766 Ptr := S'First; 767 while Ptr <= S'Last loop 768 Next_Wide_Character (S, Ptr); 769 end loop; 770 771 return True; 772 773 exception 774 when Constraint_Error => 775 return False; 776 end Validate_Wide_String; 777 778 ------------------------------- 779 -- Validate_Wide_Wide_String -- 780 ------------------------------- 781 782 function Validate_Wide_Wide_String (S : String) return Boolean is 783 Ptr : Natural; 784 785 begin 786 Ptr := S'First; 787 while Ptr <= S'Last loop 788 Next_Wide_Wide_Character (S, Ptr); 789 end loop; 790 791 return True; 792 793 exception 794 when Constraint_Error => 795 return False; 796 end Validate_Wide_Wide_String; 797 798end GNAT.Decode_String; 799