1------------------------------------------------------------------------------ 2-- -- 3-- GNAT RUN-TIME COMPONENTS -- 4-- -- 5-- G N A T . D E C O D E _ S T R I N G -- 6-- -- 7-- S p e c -- 8-- -- 9-- Copyright (C) 2007-2014, AdaCore -- 10-- -- 11-- GNAT is free software; you can redistribute it and/or modify it under -- 12-- terms of the GNU General Public License as published by the Free Soft- -- 13-- ware Foundation; either version 3, or (at your option) any later ver- -- 14-- sion. GNAT is distributed in the hope that it will be useful, but WITH- -- 15-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -- 16-- or FITNESS FOR A PARTICULAR PURPOSE. -- 17-- -- 18-- As a special exception under Section 7 of GPL version 3, you are granted -- 19-- additional permissions described in the GCC Runtime Library Exception, -- 20-- version 3.1, as published by the Free Software Foundation. -- 21-- -- 22-- You should have received a copy of the GNU General Public License and -- 23-- a copy of the GCC Runtime Library Exception along with this program; -- 24-- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see -- 25-- <http://www.gnu.org/licenses/>. -- 26-- -- 27-- GNAT was originally developed by the GNAT team at New York University. -- 28-- Extensive contributions were provided by Ada Core Technologies Inc. -- 29-- -- 30------------------------------------------------------------------------------ 31 32-- This package provides a utility routine for converting from an encoded 33-- string to a corresponding Wide_String or Wide_Wide_String value. 34 35with Interfaces; use Interfaces; 36 37with System.WCh_Cnv; use System.WCh_Cnv; 38with System.WCh_Con; use System.WCh_Con; 39 40package body GNAT.Decode_String is 41 42 ----------------------- 43 -- Local Subprograms -- 44 ----------------------- 45 46 procedure Bad; 47 pragma No_Return (Bad); 48 -- Raise error for bad encoding 49 50 procedure Past_End; 51 pragma No_Return (Past_End); 52 -- Raise error for off end of string 53 54 --------- 55 -- Bad -- 56 --------- 57 58 procedure Bad is 59 begin 60 raise Constraint_Error with 61 "bad encoding or character out of range"; 62 end Bad; 63 64 --------------------------- 65 -- Decode_Wide_Character -- 66 --------------------------- 67 68 procedure Decode_Wide_Character 69 (Input : String; 70 Ptr : in out Natural; 71 Result : out Wide_Character) 72 is 73 Char : Wide_Wide_Character; 74 begin 75 Decode_Wide_Wide_Character (Input, Ptr, Char); 76 77 if Wide_Wide_Character'Pos (Char) > 16#FFFF# then 78 Bad; 79 else 80 Result := Wide_Character'Val (Wide_Wide_Character'Pos (Char)); 81 end if; 82 end Decode_Wide_Character; 83 84 ------------------------ 85 -- Decode_Wide_String -- 86 ------------------------ 87 88 function Decode_Wide_String (S : String) return Wide_String is 89 Result : Wide_String (1 .. S'Length); 90 Length : Natural; 91 begin 92 Decode_Wide_String (S, Result, Length); 93 return Result (1 .. Length); 94 end Decode_Wide_String; 95 96 procedure Decode_Wide_String 97 (S : String; 98 Result : out Wide_String; 99 Length : out Natural) 100 is 101 Ptr : Natural; 102 103 begin 104 Ptr := S'First; 105 Length := 0; 106 while Ptr <= S'Last loop 107 if Length >= Result'Last then 108 Past_End; 109 end if; 110 111 Length := Length + 1; 112 Decode_Wide_Character (S, Ptr, Result (Length)); 113 end loop; 114 end Decode_Wide_String; 115 116 -------------------------------- 117 -- Decode_Wide_Wide_Character -- 118 -------------------------------- 119 120 procedure Decode_Wide_Wide_Character 121 (Input : String; 122 Ptr : in out Natural; 123 Result : out Wide_Wide_Character) 124 is 125 C : Character; 126 127 function In_Char return Character; 128 pragma Inline (In_Char); 129 -- Function to get one input character 130 131 ------------- 132 -- In_Char -- 133 ------------- 134 135 function In_Char return Character is 136 begin 137 if Ptr <= Input'Last then 138 Ptr := Ptr + 1; 139 return Input (Ptr - 1); 140 else 141 Past_End; 142 end if; 143 end In_Char; 144 145 -- Start of processing for Decode_Wide_Wide_Character 146 147 begin 148 C := In_Char; 149 150 -- Special fast processing for UTF-8 case 151 152 if Encoding_Method = WCEM_UTF8 then 153 UTF8 : declare 154 U : Unsigned_32; 155 W : Unsigned_32; 156 157 procedure Get_UTF_Byte; 158 pragma Inline (Get_UTF_Byte); 159 -- Used to interpret 2#10xxxxxx# continuation byte in UTF-8 mode. 160 -- Reads a byte, and raises CE if the first two bits are not 10. 161 -- Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits. 162 163 ------------------ 164 -- Get_UTF_Byte -- 165 ------------------ 166 167 procedure Get_UTF_Byte is 168 begin 169 U := Unsigned_32 (Character'Pos (In_Char)); 170 171 if (U and 2#11000000#) /= 2#10_000000# then 172 Bad; 173 end if; 174 175 W := Shift_Left (W, 6) or (U and 2#00111111#); 176 end Get_UTF_Byte; 177 178 -- Start of processing for UTF8 case 179 180 begin 181 -- Note: for details of UTF8 encoding see RFC 3629 182 183 U := Unsigned_32 (Character'Pos (C)); 184 185 -- 16#00_0000#-16#00_007F#: 0xxxxxxx 186 187 if (U and 2#10000000#) = 2#00000000# then 188 Result := Wide_Wide_Character'Val (Character'Pos (C)); 189 190 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx 191 192 elsif (U and 2#11100000#) = 2#110_00000# then 193 W := U and 2#00011111#; 194 Get_UTF_Byte; 195 196 if W not in 16#00_0080# .. 16#00_07FF# then 197 Bad; 198 end if; 199 200 Result := Wide_Wide_Character'Val (W); 201 202 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx 203 204 elsif (U and 2#11110000#) = 2#1110_0000# then 205 W := U and 2#00001111#; 206 Get_UTF_Byte; 207 Get_UTF_Byte; 208 209 if W not in 16#00_0800# .. 16#00_FFFF# then 210 Bad; 211 end if; 212 213 Result := Wide_Wide_Character'Val (W); 214 215 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 216 217 elsif (U and 2#11111000#) = 2#11110_000# then 218 W := U and 2#00000111#; 219 220 for K in 1 .. 3 loop 221 Get_UTF_Byte; 222 end loop; 223 224 if W not in 16#01_0000# .. 16#10_FFFF# then 225 Bad; 226 end if; 227 228 Result := Wide_Wide_Character'Val (W); 229 230 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx 231 -- 10xxxxxx 10xxxxxx 232 233 elsif (U and 2#11111100#) = 2#111110_00# then 234 W := U and 2#00000011#; 235 236 for K in 1 .. 4 loop 237 Get_UTF_Byte; 238 end loop; 239 240 if W not in 16#0020_0000# .. 16#03FF_FFFF# then 241 Bad; 242 end if; 243 244 Result := Wide_Wide_Character'Val (W); 245 246 -- All other cases are invalid, note that this includes: 247 248 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx 249 -- 10xxxxxx 10xxxxxx 10xxxxxx 250 251 -- since Wide_Wide_Character does not include code values 252 -- greater than 16#03FF_FFFF#. 253 254 else 255 Bad; 256 end if; 257 end UTF8; 258 259 -- All encoding functions other than UTF-8 260 261 else 262 Non_UTF8 : declare 263 function Char_Sequence_To_UTF is 264 new Char_Sequence_To_UTF_32 (In_Char); 265 266 begin 267 -- For brackets, must test for specific case of [ not followed by 268 -- quotation, where we must not call Char_Sequence_To_UTF, but 269 -- instead just return the bracket unchanged. 270 271 if Encoding_Method = WCEM_Brackets 272 and then C = '[' 273 and then (Ptr > Input'Last or else Input (Ptr) /= '"') 274 then 275 Result := '['; 276 277 -- All other cases including [" with Brackets 278 279 else 280 Result := 281 Wide_Wide_Character'Val 282 (Char_Sequence_To_UTF (C, Encoding_Method)); 283 end if; 284 end Non_UTF8; 285 end if; 286 end Decode_Wide_Wide_Character; 287 288 ----------------------------- 289 -- Decode_Wide_Wide_String -- 290 ----------------------------- 291 292 function Decode_Wide_Wide_String (S : String) return Wide_Wide_String is 293 Result : Wide_Wide_String (1 .. S'Length); 294 Length : Natural; 295 begin 296 Decode_Wide_Wide_String (S, Result, Length); 297 return Result (1 .. Length); 298 end Decode_Wide_Wide_String; 299 300 procedure Decode_Wide_Wide_String 301 (S : String; 302 Result : out Wide_Wide_String; 303 Length : out Natural) 304 is 305 Ptr : Natural; 306 307 begin 308 Ptr := S'First; 309 Length := 0; 310 while Ptr <= S'Last loop 311 if Length >= Result'Last then 312 Past_End; 313 end if; 314 315 Length := Length + 1; 316 Decode_Wide_Wide_Character (S, Ptr, Result (Length)); 317 end loop; 318 end Decode_Wide_Wide_String; 319 320 ------------------------- 321 -- Next_Wide_Character -- 322 ------------------------- 323 324 procedure Next_Wide_Character (Input : String; Ptr : in out Natural) is 325 Discard : Wide_Character; 326 begin 327 Decode_Wide_Character (Input, Ptr, Discard); 328 end Next_Wide_Character; 329 330 ------------------------------ 331 -- Next_Wide_Wide_Character -- 332 ------------------------------ 333 334 procedure Next_Wide_Wide_Character (Input : String; Ptr : in out Natural) is 335 Discard : Wide_Wide_Character; 336 begin 337 Decode_Wide_Wide_Character (Input, Ptr, Discard); 338 end Next_Wide_Wide_Character; 339 340 -------------- 341 -- Past_End -- 342 -------------- 343 344 procedure Past_End is 345 begin 346 raise Constraint_Error with "past end of string"; 347 end Past_End; 348 349 ------------------------- 350 -- Prev_Wide_Character -- 351 ------------------------- 352 353 procedure Prev_Wide_Character (Input : String; Ptr : in out Natural) is 354 begin 355 if Ptr > Input'Last + 1 then 356 Past_End; 357 end if; 358 359 -- Special efficient encoding for UTF-8 case 360 361 if Encoding_Method = WCEM_UTF8 then 362 UTF8 : declare 363 U : Unsigned_32; 364 365 procedure Getc; 366 pragma Inline (Getc); 367 -- Gets the character at Input (Ptr - 1) and returns code in U as 368 -- Unsigned_32 value. On return Ptr is decremented by one. 369 370 procedure Skip_UTF_Byte; 371 pragma Inline (Skip_UTF_Byte); 372 -- Checks that U is 2#10xxxxxx# and then calls Get 373 374 ---------- 375 -- Getc -- 376 ---------- 377 378 procedure Getc is 379 begin 380 if Ptr <= Input'First then 381 Past_End; 382 else 383 Ptr := Ptr - 1; 384 U := Unsigned_32 (Character'Pos (Input (Ptr))); 385 end if; 386 end Getc; 387 388 ------------------- 389 -- Skip_UTF_Byte -- 390 ------------------- 391 392 procedure Skip_UTF_Byte is 393 begin 394 if (U and 2#11000000#) = 2#10_000000# then 395 Getc; 396 else 397 Bad; 398 end if; 399 end Skip_UTF_Byte; 400 401 -- Start of processing for UTF-8 case 402 403 begin 404 -- 16#00_0000#-16#00_007F#: 0xxxxxxx 405 406 Getc; 407 408 if (U and 2#10000000#) = 2#00000000# then 409 return; 410 411 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx 412 413 else 414 Skip_UTF_Byte; 415 416 if (U and 2#11100000#) = 2#110_00000# then 417 return; 418 419 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx 420 421 else 422 Skip_UTF_Byte; 423 424 if (U and 2#11110000#) = 2#1110_0000# then 425 return; 426 427 -- Any other code is invalid, note that this includes: 428 429 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 430 -- 10xxxxxx 431 432 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 433 -- 10xxxxxx 10xxxxxx 434 -- 10xxxxxx 435 436 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 437 -- 10xxxxxx 10xxxxxx 438 -- 10xxxxxx 10xxxxxx 439 440 -- since Wide_Character does not allow codes > 16#FFFF# 441 442 else 443 Bad; 444 end if; 445 end if; 446 end if; 447 end UTF8; 448 449 -- Special efficient encoding for brackets case 450 451 elsif Encoding_Method = WCEM_Brackets then 452 Brackets : declare 453 P : Natural; 454 S : Natural; 455 456 begin 457 -- See if we have "] at end positions 458 459 if Ptr > Input'First + 1 460 and then Input (Ptr - 1) = ']' 461 and then Input (Ptr - 2) = '"' 462 then 463 P := Ptr - 2; 464 465 -- Loop back looking for [" at start 466 467 while P >= Ptr - 10 loop 468 if P <= Input'First + 1 then 469 Bad; 470 471 elsif Input (P - 1) = '"' 472 and then Input (P - 2) = '[' 473 then 474 -- Found ["..."], scan forward to check it 475 476 S := P - 2; 477 P := S; 478 Next_Wide_Character (Input, P); 479 480 -- OK if at original pointer, else error 481 482 if P = Ptr then 483 Ptr := S; 484 return; 485 else 486 Bad; 487 end if; 488 end if; 489 490 P := P - 1; 491 end loop; 492 493 -- Falling through loop means more than 8 chars between the 494 -- enclosing brackets (or simply a missing left bracket) 495 496 Bad; 497 498 -- Here if no bracket sequence present 499 500 else 501 if Ptr = Input'First then 502 Past_End; 503 else 504 Ptr := Ptr - 1; 505 end if; 506 end if; 507 end Brackets; 508 509 -- Non-UTF-8/Brackets. These are the inefficient cases where we have to 510 -- go to the start of the string and skip forwards till Ptr matches. 511 512 else 513 Non_UTF_Brackets : declare 514 Discard : Wide_Character; 515 PtrS : Natural; 516 PtrP : Natural; 517 518 begin 519 PtrS := Input'First; 520 521 if Ptr <= PtrS then 522 Past_End; 523 end if; 524 525 loop 526 PtrP := PtrS; 527 Decode_Wide_Character (Input, PtrS, Discard); 528 529 if PtrS = Ptr then 530 Ptr := PtrP; 531 return; 532 533 elsif PtrS > Ptr then 534 Bad; 535 end if; 536 end loop; 537 538 exception 539 when Constraint_Error => 540 Bad; 541 end Non_UTF_Brackets; 542 end if; 543 end Prev_Wide_Character; 544 545 ------------------------------ 546 -- Prev_Wide_Wide_Character -- 547 ------------------------------ 548 549 procedure Prev_Wide_Wide_Character (Input : String; Ptr : in out Natural) is 550 begin 551 if Ptr > Input'Last + 1 then 552 Past_End; 553 end if; 554 555 -- Special efficient encoding for UTF-8 case 556 557 if Encoding_Method = WCEM_UTF8 then 558 UTF8 : declare 559 U : Unsigned_32; 560 561 procedure Getc; 562 pragma Inline (Getc); 563 -- Gets the character at Input (Ptr - 1) and returns code in U as 564 -- Unsigned_32 value. On return Ptr is decremented by one. 565 566 procedure Skip_UTF_Byte; 567 pragma Inline (Skip_UTF_Byte); 568 -- Checks that U is 2#10xxxxxx# and then calls Get 569 570 ---------- 571 -- Getc -- 572 ---------- 573 574 procedure Getc is 575 begin 576 if Ptr <= Input'First then 577 Past_End; 578 else 579 Ptr := Ptr - 1; 580 U := Unsigned_32 (Character'Pos (Input (Ptr))); 581 end if; 582 end Getc; 583 584 ------------------- 585 -- Skip_UTF_Byte -- 586 ------------------- 587 588 procedure Skip_UTF_Byte is 589 begin 590 if (U and 2#11000000#) = 2#10_000000# then 591 Getc; 592 else 593 Bad; 594 end if; 595 end Skip_UTF_Byte; 596 597 -- Start of processing for UTF-8 case 598 599 begin 600 -- 16#00_0000#-16#00_007F#: 0xxxxxxx 601 602 Getc; 603 604 if (U and 2#10000000#) = 2#00000000# then 605 return; 606 607 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx 608 609 else 610 Skip_UTF_Byte; 611 612 if (U and 2#11100000#) = 2#110_00000# then 613 return; 614 615 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx 616 617 else 618 Skip_UTF_Byte; 619 620 if (U and 2#11110000#) = 2#1110_0000# then 621 return; 622 623 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 624 -- 10xxxxxx 625 626 else 627 Skip_UTF_Byte; 628 629 if (U and 2#11111000#) = 2#11110_000# then 630 return; 631 632 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 633 -- 10xxxxxx 10xxxxxx 634 -- 10xxxxxx 635 636 else 637 Skip_UTF_Byte; 638 639 if (U and 2#11111100#) = 2#111110_00# then 640 return; 641 642 -- Any other code is invalid, note that this includes: 643 644 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 645 -- 10xxxxxx 10xxxxxx 646 -- 10xxxxxx 10xxxxxx 647 648 -- since Wide_Wide_Character does not allow codes 649 -- greater than 16#03FF_FFFF# 650 651 else 652 Bad; 653 end if; 654 end if; 655 end if; 656 end if; 657 end if; 658 end UTF8; 659 660 -- Special efficient encoding for brackets case 661 662 elsif Encoding_Method = WCEM_Brackets then 663 Brackets : declare 664 P : Natural; 665 S : Natural; 666 667 begin 668 -- See if we have "] at end positions 669 670 if Ptr > Input'First + 1 671 and then Input (Ptr - 1) = ']' 672 and then Input (Ptr - 2) = '"' 673 then 674 P := Ptr - 2; 675 676 -- Loop back looking for [" at start 677 678 while P >= Ptr - 10 loop 679 if P <= Input'First + 1 then 680 Bad; 681 682 elsif Input (P - 1) = '"' 683 and then Input (P - 2) = '[' 684 then 685 -- Found ["..."], scan forward to check it 686 687 S := P - 2; 688 P := S; 689 Next_Wide_Wide_Character (Input, P); 690 691 -- OK if at original pointer, else error 692 693 if P = Ptr then 694 Ptr := S; 695 return; 696 else 697 Bad; 698 end if; 699 end if; 700 701 P := P - 1; 702 end loop; 703 704 -- Falling through loop means more than 8 chars between the 705 -- enclosing brackets (or simply a missing left bracket) 706 707 Bad; 708 709 -- Here if no bracket sequence present 710 711 else 712 if Ptr = Input'First then 713 Past_End; 714 else 715 Ptr := Ptr - 1; 716 end if; 717 end if; 718 end Brackets; 719 720 -- Non-UTF-8/Brackets. These are the inefficient cases where we have to 721 -- go to the start of the string and skip forwards till Ptr matches. 722 723 else 724 Non_UTF8_Brackets : declare 725 Discard : Wide_Wide_Character; 726 PtrS : Natural; 727 PtrP : Natural; 728 729 begin 730 PtrS := Input'First; 731 732 if Ptr <= PtrS then 733 Past_End; 734 end if; 735 736 loop 737 PtrP := PtrS; 738 Decode_Wide_Wide_Character (Input, PtrS, Discard); 739 740 if PtrS = Ptr then 741 Ptr := PtrP; 742 return; 743 744 elsif PtrS > Ptr then 745 Bad; 746 end if; 747 end loop; 748 749 exception 750 when Constraint_Error => 751 Bad; 752 end Non_UTF8_Brackets; 753 end if; 754 end Prev_Wide_Wide_Character; 755 756 -------------------------- 757 -- Validate_Wide_String -- 758 -------------------------- 759 760 function Validate_Wide_String (S : String) return Boolean is 761 Ptr : Natural; 762 763 begin 764 Ptr := S'First; 765 while Ptr <= S'Last loop 766 Next_Wide_Character (S, Ptr); 767 end loop; 768 769 return True; 770 771 exception 772 when Constraint_Error => 773 return False; 774 end Validate_Wide_String; 775 776 ------------------------------- 777 -- Validate_Wide_Wide_String -- 778 ------------------------------- 779 780 function Validate_Wide_Wide_String (S : String) return Boolean is 781 Ptr : Natural; 782 783 begin 784 Ptr := S'First; 785 while Ptr <= S'Last loop 786 Next_Wide_Wide_Character (S, Ptr); 787 end loop; 788 789 return True; 790 791 exception 792 when Constraint_Error => 793 return False; 794 end Validate_Wide_Wide_String; 795 796end GNAT.Decode_String; 797