1 // Copyright (C) 2003 Davis E. King (davis@dlib.net) 2 // License: Boost Software License See LICENSE.txt for the full license. 3 #ifndef DLIB_XML_PARSER_KERNEl_1_ 4 #define DLIB_XML_PARSER_KERNEl_1_ 5 6 7 #include "xml_parser_kernel_abstract.h" 8 9 #include <sstream> 10 #include <string> 11 #include <fstream> 12 #include <iostream> 13 #include "xml_parser_kernel_interfaces.h" 14 #include "../algs.h" 15 #include <cstdio> 16 #include "../map.h" 17 #include "../stack.h" 18 #include "../sequence.h" 19 #include "../memory_manager.h" 20 21 namespace dlib 22 { 23 24 class xml_parser 25 { 26 typedef dlib::map<std::string,std::string,memory_manager<char>::kernel_2a>::kernel_1b map; 27 typedef dlib::stack<std::string,memory_manager<char>::kernel_2a>::kernel_1a stack; 28 typedef sequence<document_handler*>::kernel_2a seq_dh; 29 typedef sequence<error_handler*>::kernel_2a seq_eh; 30 31 /*! 32 INITIAL VALUE 33 dh_list.size() == 0 34 eh_list.size() == 0 35 36 CONVENTION 37 dh_list == a sequence of pointers to all the document_handlers that 38 have been added to the xml_parser 39 eh_list == a sequence of pointers to all the error_handlers that 40 have been added to the xml_parser 41 42 map is used to implement the attribute_list interface 43 stack is used just inside the parse function 44 seq_dh is used to make the dh_list member variable 45 seq_eh is used to make the eh_list member variable 46 !*/ 47 48 49 50 public: 51 52 // These typedefs are here for backwards compatibly with previous versions of 53 // dlib. 54 typedef xml_parser kernel_1a; 55 typedef xml_parser kernel_1a_c; 56 xml_parser()57 xml_parser( 58 ) {} 59 ~xml_parser()60 virtual ~xml_parser( 61 ){} 62 63 inline void clear( 64 ); 65 66 inline void parse ( 67 std::istream& in 68 ); 69 70 inline void add_document_handler ( 71 document_handler& item 72 ); 73 74 inline void add_error_handler ( 75 error_handler& item 76 ); 77 78 79 inline void swap ( 80 xml_parser& item 81 ); 82 83 84 private: 85 86 // ----------------------------------- 87 88 // attribute_list interface implementation 89 class attrib_list : public attribute_list 90 { 91 public: 92 // the list of attribute name/value pairs 93 map list; 94 is_in_list(const std::string & key)95 bool is_in_list ( 96 const std::string& key 97 ) const 98 { 99 return list.is_in_domain(key); 100 } 101 102 const std::string& operator[] ( 103 const std::string& key 104 ) const 105 { 106 if (is_in_list(key)) 107 return list[key]; 108 else 109 throw xml_attribute_list_error("No XML attribute named " + key + " is present in tag."); 110 } 111 at_start()112 bool at_start ( 113 ) const { return list.at_start(); } 114 reset()115 void reset ( 116 ) const { return list.reset(); } 117 current_element_valid()118 bool current_element_valid ( 119 ) const { return list.current_element_valid(); } 120 element()121 const type& element ( 122 ) const { return list.element(); } 123 element()124 type& element ( 125 ) { return list.element(); } 126 move_next()127 bool move_next ( 128 ) const { return list.move_next(); } 129 size()130 size_t size ( 131 ) const { return list.size(); } 132 }; 133 134 135 // ----------------------------------- 136 137 enum token_type 138 { 139 element_start, // the first tag of an element 140 element_end, // the last tag of an element 141 empty_element, // the singular tag of an empty element 142 pi, // processing instruction 143 chars, // the non-markup data between tags 144 chars_cdata, // the data from a CDATA section 145 eof, // this token is returned when we reach the end of input 146 error, // this token indicates that the tokenizer couldn't 147 // determine which category the next token fits into 148 dtd, // this token is for an entire dtd 149 comment // this is a token for comments 150 }; 151 /* 152 notes about the tokens: 153 the tokenizer guarantees that the following tokens to not 154 contain the '<' character except as the first character of the token 155 element_start, element_end, empty_element, and pi. they also only 156 contain the '>' characer as their last character. 157 158 it is also guaranteed that pi is at least of the form <??>. that 159 is to say that it always always begins with <? and ends with ?>. 160 161 it is also guaranteed that all markup tokens will begin with the '<' 162 character and end with the '>'. there won't be any leading or 163 trailing whitespaces. this whitespace is considered a chars token. 164 */ 165 166 167 // private member functions 168 inline void get_next_token( 169 std::istream& in, 170 std::string& token_text, 171 int& token_kind, 172 unsigned long& line_number 173 ); 174 /*! 175 ensures 176 gets the next token from in and puts it in token_text and 177 token_kind == the kind of the token found and 178 line_number is incremented every time a '\n' is encountered and 179 entity references are translated into the characters they represent 180 only for chars tokens 181 !*/ 182 183 inline int parse_element ( 184 const std::string& token, 185 std::string& name, 186 attrib_list& atts 187 ); 188 /*! 189 requires 190 token is a token of kind start_element or empty_element 191 ensures 192 gets the element name and puts it into the string name and 193 parses out the attributes and puts them into the attribute_list atts 194 195 return 0 upon success or 196 returns -1 if it failed to parse token 197 !*/ 198 199 inline int parse_pi ( 200 const std::string& token, 201 std::string& target, 202 std::string& data 203 ); 204 /*! 205 requires 206 token is a token of kind pi 207 ensures 208 the target from the processing instruction is put into target and 209 the data from the processing instruction is put into data 210 211 return 0 upon success or 212 returns -1 if it failed to parse token 213 !*/ 214 215 inline int parse_element_end ( 216 const std::string& token, 217 std::string& name 218 ); 219 /*! 220 requires 221 token is a token of kind element_end 222 ensures 223 the name from the ending element tag is put into the string name 224 225 return 0 upon success or 226 returns -1 if it failed to parse token 227 !*/ 228 229 inline int change_entity ( 230 std::istream& in 231 ); 232 /*! 233 ensures 234 performs the following translations and returns the new character 235 amp; -> & 236 lt; -> < 237 gt; -> > 238 apos; -> ' 239 quot; -> " 240 241 or returns -1 if we hit an undefined entity reference or EOF. 242 (i.e. it was not one of the entities listed above) 243 244 !*/ 245 246 // ----------------------------------- 247 248 // private member data 249 seq_dh dh_list; 250 seq_eh eh_list; 251 252 // ----------------------------------- 253 254 // restricted functions: assignment and copy construction 255 xml_parser(xml_parser&); 256 xml_parser& operator= ( 257 xml_parser& 258 ); 259 260 }; 261 swap(xml_parser & a,xml_parser & b)262 inline void swap ( 263 xml_parser& a, 264 xml_parser& b 265 ) { a.swap(b); } 266 267 268 // ---------------------------------------------------------------------------------------- 269 // ---------------------------------------------------------------------------------------- 270 // member function definitions 271 // ---------------------------------------------------------------------------------------- 272 // ---------------------------------------------------------------------------------------- 273 274 void xml_parser:: clear()275 clear( 276 ) 277 { 278 // unregister all event handlers 279 eh_list.clear(); 280 dh_list.clear(); 281 } 282 283 // ---------------------------------------------------------------------------------------- 284 285 void xml_parser:: parse(std::istream & in)286 parse ( 287 std::istream& in 288 ) 289 { 290 DLIB_CASSERT ( in.fail() == false , 291 "\tvoid xml_parser::parse" 292 << "\n\tthe input stream must not be in the fail state" 293 << "\n\tthis: " << this 294 ); 295 296 297 // save which exceptions in will throw and make it so it won't throw any 298 // for the life of this function 299 std::ios::iostate old_exceptions = in.exceptions(); 300 // set it to not throw anything 301 in.exceptions(std::ios::goodbit); 302 303 304 try 305 { 306 unsigned long line_number = 1; 307 308 // skip any whitespace before the start of the document 309 while (in.peek() == ' ' || in.peek() == '\t' || in.peek() == '\n' || in.peek() == '\r' ) 310 { 311 if (in.peek() == '\n') 312 ++line_number; 313 in.get(); 314 } 315 316 317 318 stack tags; // this stack contains the last start tag seen 319 bool seen_fatal_error = false; 320 bool seen_root_tag = false; // this is true after we have seen the root tag 321 322 323 324 // notify all the document_handlers that we are about to being parsing 325 for (unsigned long i = 0; i < dh_list.size(); ++i) 326 { 327 dh_list[i]->start_document(); 328 } 329 330 331 std::string chars_buf; // used to collect chars data between consecutive 332 // chars and chars_cdata tokens so that 333 // document_handlers receive all chars data between 334 // tags in one call 335 336 // variables to be used with the parsing functions 337 attrib_list atts; 338 std::string name; 339 std::string target; 340 std::string data; 341 342 343 344 // variables to use with the get_next_token() function 345 std::string token_text; 346 int token_kind; 347 348 get_next_token(in,token_text,token_kind,line_number); 349 350 351 while (token_kind != eof) 352 { 353 bool is_empty = false; // this becomes true when this token is an empty_element 354 355 switch (token_kind) 356 { 357 358 359 case empty_element: is_empty = true; 360 // fall through 361 case element_start: 362 { 363 seen_root_tag = true; 364 365 int status = parse_element(token_text,name,atts); 366 // if there was no error parsing the element 367 if (status == 0) 368 { 369 // notify all the document_handlers 370 for (unsigned long i = 0; i < dh_list.size(); ++i) 371 { 372 dh_list[i]->start_element(line_number,name,atts); 373 if (is_empty) 374 dh_list[i]->end_element(line_number,name); 375 } 376 } 377 else 378 { 379 seen_fatal_error = true; 380 } 381 382 // if this is an element_start token then push the name of 383 // the element on to the stack 384 if (token_kind == element_start) 385 { 386 tags.push(name); 387 } 388 389 }break; 390 391 // ---------------------------------------- 392 393 case element_end: 394 { 395 396 int status = parse_element_end (token_text,name); 397 398 // if there was no error parsing the element 399 if (status == 0) 400 { 401 // make sure this ending element tag matches the last start 402 // element tag we saw 403 if ( tags.size() == 0 || name != tags.current()) 404 { 405 // they don't match so signal a fatal error 406 seen_fatal_error = true; 407 } 408 else 409 { 410 // notify all the document_handlers 411 for (unsigned long i = 0; i < dh_list.size(); ++i) 412 { 413 dh_list[i]->end_element(line_number,name); 414 } 415 416 // they match so throw away this element name 417 tags.pop(name); 418 } 419 } 420 else 421 { 422 seen_fatal_error = true; 423 } 424 425 426 }break; 427 428 // ---------------------------------------- 429 430 case pi: 431 { 432 433 int status = parse_pi (token_text,target,data); 434 // if there was no error parsing the element 435 if (status == 0) 436 { 437 // notify all the document_handlers 438 for (unsigned long i = 0; i < dh_list.size(); ++i) 439 { 440 dh_list[i]->processing_instruction(line_number,target,data); 441 } 442 } 443 else 444 { 445 // notify all the error_handlers 446 for (unsigned long i = 0; i < eh_list.size(); ++i) 447 { 448 eh_list[i]->error(line_number); 449 } 450 } 451 while (in.peek() == ' ' || in.peek() == '\t' || in.peek() == '\n' || in.peek() == '\r' ) 452 { 453 if (in.peek() == '\n') 454 ++line_number; 455 in.get(); 456 } 457 458 459 }break; 460 461 // ---------------------------------------- 462 463 case chars: 464 { 465 if (tags.size() != 0) 466 { 467 chars_buf += token_text; 468 } 469 else if (token_text.find_first_not_of(" \t\r\n") != std::string::npos) 470 { 471 // you can't have non whitespace chars data outside the root element 472 seen_fatal_error = true; 473 } 474 }break; 475 476 // ---------------------------------------- 477 478 case chars_cdata: 479 { 480 if (tags.size() != 0) 481 { 482 chars_buf += token_text; 483 } 484 else 485 { 486 // you can't have chars_data outside the root element 487 seen_fatal_error = true; 488 } 489 }break; 490 491 // ---------------------------------------- 492 493 case eof: 494 break; 495 496 // ---------------------------------------- 497 498 case error: 499 { 500 seen_fatal_error = true; 501 }break; 502 503 // ---------------------------------------- 504 505 case dtd: // fall though 506 case comment: // do nothing 507 break; 508 509 // ---------------------------------------- 510 511 512 } 513 514 // if there was a fatal error then quit loop 515 if (seen_fatal_error) 516 break; 517 518 // if we have seen the last tag then quit the loop 519 if (tags.size() == 0 && seen_root_tag) 520 break; 521 522 523 get_next_token(in,token_text,token_kind,line_number); 524 525 // if the next token is not a chars or chars_cdata token then flush 526 // the chars_buf to the document_handlers 527 if ( (token_kind != chars) && 528 (token_kind != chars_cdata) && 529 (token_kind != dtd) && 530 (token_kind != comment) && 531 (chars_buf.size() != 0) 532 ) 533 { 534 // notify all the document_handlers 535 for (unsigned long i = 0; i < dh_list.size(); ++i) 536 { 537 dh_list[i]->characters(chars_buf); 538 } 539 chars_buf.erase(); 540 } 541 542 543 } //while (token_kind != eof) 544 545 546 547 548 // you can't have any unmatched tags or any fatal erros 549 if (tags.size() != 0 || seen_fatal_error) 550 { 551 // notify all the error_handlers 552 for (unsigned long i = 0; i < eh_list.size(); ++i) 553 { 554 eh_list[i]->fatal_error(line_number); 555 } 556 557 } 558 559 560 // notify all the document_handlers that we have ended parsing 561 for (unsigned long i = 0; i < dh_list.size(); ++i) 562 { 563 dh_list[i]->end_document(); 564 } 565 566 } 567 catch (...) 568 { 569 // notify all the document_handlers that we have ended parsing 570 for (unsigned long i = 0; i < dh_list.size(); ++i) 571 { 572 dh_list[i]->end_document(); 573 } 574 575 // restore the old exception settings to in 576 in.exceptions(old_exceptions); 577 578 // don't forget to rethrow the exception 579 throw; 580 } 581 582 // restore the old exception settings to in 583 in.exceptions(old_exceptions); 584 585 } 586 587 // ---------------------------------------------------------------------------------------- 588 589 void xml_parser:: add_document_handler(document_handler & item)590 add_document_handler ( 591 document_handler& item 592 ) 593 { 594 document_handler* temp = &item; 595 dh_list.add(dh_list.size(),temp); 596 } 597 598 // ---------------------------------------------------------------------------------------- 599 600 void xml_parser:: add_error_handler(error_handler & item)601 add_error_handler ( 602 error_handler& item 603 ) 604 { 605 error_handler* temp = &item; 606 eh_list.add(eh_list.size(),temp); 607 } 608 609 // ---------------------------------------------------------------------------------------- 610 611 void xml_parser:: swap(xml_parser & item)612 swap ( 613 xml_parser& item 614 ) 615 { 616 dh_list.swap(item.dh_list); 617 eh_list.swap(item.eh_list); 618 } 619 620 // ---------------------------------------------------------------------------------------- 621 // ---------------------------------------------------------------------------------------- 622 // private member function definitions 623 // ---------------------------------------------------------------------------------------- 624 // ---------------------------------------------------------------------------------------- 625 626 void xml_parser:: get_next_token(std::istream & in,std::string & token_text,int & token_kind,unsigned long & line_number)627 get_next_token( 628 std::istream& in, 629 std::string& token_text, 630 int& token_kind, 631 unsigned long& line_number 632 ) 633 { 634 635 token_text.erase(); 636 637 std::istream::int_type ch1 = in.get(); 638 std::istream::int_type ch2; 639 640 641 switch (ch1) 642 { 643 644 // ----------------------------------------- 645 646 // this is the start of some kind of a tag 647 case '<': 648 { 649 ch2 = in.get(); 650 switch (ch2) 651 { 652 653 // --------------------------------- 654 655 // this is a dtd, comment, or chars_cdata token 656 case '!': 657 { 658 // if this is a CDATA section ******************************* 659 if ( in.peek() == '[') 660 { 661 token_kind = chars_cdata; 662 663 // throw away the '[' 664 in.get(); 665 666 // make sure the next chars are CDATA[ 667 std::istream::int_type ch = in.get(); 668 if (ch != 'C') 669 token_kind = error; 670 ch = in.get(); 671 if (ch != 'D') 672 token_kind = error; 673 ch = in.get(); 674 if (ch != 'A') 675 token_kind = error; 676 ch = in.get(); 677 if (ch != 'T') 678 token_kind = error; 679 ch = in.get(); 680 if (ch != 'A') 681 token_kind = error; 682 ch = in.get(); 683 if (ch != '[') 684 token_kind = error; 685 // if this is an error token then end 686 if (token_kind == error) 687 break; 688 689 690 // get the rest of the chars and put them into token_text 691 int brackets_seen = 0; // this is the number of ']' chars 692 // we have seen in a row 693 bool seen_closing = false; // true if we have seen ]]> 694 do 695 { 696 ch = in.get(); 697 698 if (ch == '\n') 699 ++line_number; 700 701 token_text += ch; 702 703 // if this is the closing 704 if (brackets_seen == 2 && ch == '>') 705 seen_closing = true; 706 // if we are seeing a bracket 707 else if (ch == ']') 708 ++brackets_seen; 709 // if we didn't see a bracket 710 else 711 brackets_seen = 0; 712 713 714 } while ( (!seen_closing) && (ch != EOF) ); 715 716 // check if this is an error token 717 if (ch == EOF) 718 { 719 token_kind = error; 720 } 721 else 722 { 723 token_text.erase(token_text.size()-3); 724 } 725 726 727 728 } 729 // this is a comment token **************************** 730 else if (in.peek() == '-') 731 { 732 733 token_text += ch1; 734 token_text += ch2; 735 token_text += '-'; 736 737 token_kind = comment; 738 739 // throw away the '-' char 740 in.get(); 741 742 // make sure the next char is another '-' 743 std::istream::int_type ch = in.get(); 744 if (ch != '-') 745 { 746 token_kind = error; 747 break; 748 } 749 750 token_text += '-'; 751 752 753 // get the rest of the chars and put them into token_text 754 int hyphens_seen = 0; // this is the number of '-' chars 755 // we have seen in a row 756 bool seen_closing = false; // true if we have seen ]]> 757 do 758 { 759 ch = in.get(); 760 761 if (ch == '\n') 762 ++line_number; 763 764 token_text += ch; 765 766 // if this should be a closing block 767 if (hyphens_seen == 2) 768 { 769 if (ch == '>') 770 seen_closing = true; 771 else // this isn't a closing so make it signal error 772 ch = EOF; 773 } 774 // if we are seeing a hyphen 775 else if (ch == '-') 776 ++hyphens_seen; 777 // if we didn't see a hyphen 778 else 779 hyphens_seen = 0; 780 781 782 } while ( (!seen_closing) && (ch != EOF) ); 783 784 // check if this is an error token 785 if (ch == EOF) 786 { 787 token_kind = error; 788 } 789 790 791 792 793 794 } 795 else // this is a dtd token ************************* 796 { 797 798 token_text += ch1; 799 token_text += ch2; 800 int bracket_depth = 1; // this is the number of '<' chars seen 801 // minus the number of '>' chars seen 802 803 std::istream::int_type ch; 804 do 805 { 806 ch = in.get(); 807 if (ch == '>') 808 --bracket_depth; 809 else if (ch == '<') 810 ++bracket_depth; 811 else if (ch == '\n') 812 ++line_number; 813 814 token_text += ch; 815 816 } while ( (bracket_depth > 0) && (ch != EOF) ); 817 818 // make sure we didn't just hit EOF 819 if (bracket_depth == 0) 820 { 821 token_kind = dtd; 822 } 823 else 824 { 825 token_kind = error; 826 } 827 } 828 } 829 break; 830 831 // --------------------------------- 832 833 // this is a pi token 834 case '?': 835 { 836 token_text += ch1; 837 token_text += ch2; 838 std::istream::int_type ch; 839 840 do 841 { 842 ch = in.get(); 843 token_text += ch; 844 if (ch == '\n') 845 ++line_number; 846 // else if we hit a < then thats an error 847 else if (ch == '<') 848 ch = EOF; 849 } while (ch != '>' && ch != EOF); 850 // if we hit the end of the pi 851 if (ch == '>') 852 { 853 // make sure there was a trailing '?' 854 if ( (token_text.size() > 3) && 855 (token_text[token_text.size()-2] != '?') 856 ) 857 { 858 token_kind = error; 859 } 860 else 861 { 862 token_kind = pi; 863 } 864 } 865 // if we hit EOF unexpectidely then error 866 else 867 { 868 token_kind = error; 869 } 870 } 871 break; 872 873 // --------------------------------- 874 875 // this is an error token 876 case EOF: 877 { 878 token_kind = error; 879 } 880 break; 881 882 // --------------------------------- 883 // this is an element_end token 884 case '/': 885 { 886 token_kind = element_end; 887 token_text += ch1; 888 token_text += ch2; 889 std::istream::int_type ch; 890 do 891 { 892 ch = in.get(); 893 if (ch == '\n') 894 ++line_number; 895 // else if we hit a < then thats an error 896 else if (ch == '<') 897 ch = EOF; 898 token_text += ch; 899 } while ( (ch != '>') && (ch != EOF)); 900 901 // check if this is an error token 902 if (ch == EOF) 903 { 904 token_kind = error; 905 } 906 } 907 break; 908 909 910 // --------------------------------- 911 912 // this is an element_start or empty_element token 913 default: 914 { 915 916 token_text += ch1; 917 token_text += ch2; 918 std::istream::int_type ch = '\0'; 919 std::istream::int_type last; 920 do 921 { 922 last = ch; 923 ch = in.get(); 924 if (ch == '\n') 925 ++line_number; 926 // else if we hit a < then thats an error 927 else if (ch == '<') 928 ch = EOF; 929 token_text += ch; 930 } while ( (ch != '>') && (ch != EOF)); 931 932 // check if this is an error token 933 if (ch == EOF) 934 { 935 token_kind = error; 936 } 937 // if this is an empty_element 938 else if (last == '/') 939 { 940 token_kind = empty_element; 941 } 942 else 943 { 944 token_kind = element_start; 945 } 946 947 948 } 949 break; 950 951 // --------------------------------- 952 953 } 954 955 } 956 break; 957 958 // ----------------------------------------- 959 960 // this is an eof token 961 case EOF: 962 { 963 token_kind = eof; 964 } 965 break; 966 967 // ----------------------------------------- 968 969 // this is a chars token 970 default: 971 { 972 if (ch1 == '\n') 973 { 974 ++line_number; 975 token_text += ch1; 976 } 977 // if the first thing in this chars token is an entity reference 978 else if (ch1 == '&') 979 { 980 981 int temp = change_entity(in); 982 if (temp == -1) 983 { 984 token_kind = error; 985 break; 986 } 987 else 988 { 989 token_text += temp; 990 } 991 } 992 else 993 { 994 token_text += ch1; 995 } 996 997 998 token_kind = chars; 999 1000 std::istream::int_type ch = 0; 1001 while (in.peek() != '<' && in.peek() != EOF) 1002 { 1003 ch = in.get(); 1004 1005 if (ch == '\n') 1006 ++line_number; 1007 1008 // if this is one of the predefined entity references then change it 1009 if (ch == '&') 1010 { 1011 int temp = change_entity(in); 1012 if (temp == -1) 1013 { 1014 ch = EOF; 1015 break; 1016 } 1017 else 1018 token_text += temp; 1019 } 1020 else 1021 { 1022 token_text += ch; 1023 } 1024 } 1025 1026 // if this is an error token 1027 if (ch == EOF) 1028 { 1029 token_kind = error; 1030 } 1031 1032 } 1033 break; 1034 1035 // ----------------------------------------- 1036 1037 } 1038 1039 1040 } 1041 1042 1043 1044 // ---------------------------------------------------------------------------------------- 1045 1046 int xml_parser:: parse_element(const std::string & token,std::string & name,attrib_list & atts)1047 parse_element ( 1048 const std::string& token, 1049 std::string& name, 1050 attrib_list& atts 1051 ) 1052 { 1053 name.erase(); 1054 atts.list.clear(); 1055 1056 // there must be at least one character between the <> 1057 if (token[1] == '>') 1058 return -1; 1059 1060 std::string::size_type i; 1061 std::istream::int_type ch = token[1]; 1062 i = 2; 1063 1064 // fill out name. the name can not contain any of the following characters 1065 while ( (ch != '>') && 1066 (ch != ' ') && 1067 (ch != '=') && 1068 (ch != '/') && 1069 (ch != '\t') && 1070 (ch != '\r') && 1071 (ch != '\n') 1072 ) 1073 { 1074 name += ch; 1075 ch = token[i]; 1076 ++i; 1077 } 1078 1079 // skip any whitespaces 1080 while ( ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' ) 1081 { 1082 ch = token[i]; 1083 ++i; 1084 } 1085 1086 // find any attributes 1087 while (ch != '>' && ch != '/') 1088 { 1089 std::string attribute_name; 1090 std::string attribute_value; 1091 1092 // fill out attribute_name 1093 while ( (ch != '=') && 1094 (ch != ' ') && 1095 (ch != '\t') && 1096 (ch != '\r') && 1097 (ch != '\n') && 1098 (ch != '>') 1099 ) 1100 { 1101 attribute_name += ch; 1102 ch = token[i]; 1103 ++i; 1104 } 1105 1106 // you can't have empty attribute names 1107 if (attribute_name.size() == 0) 1108 return -1; 1109 1110 // if we hit > too early then return error 1111 if (ch == '>') 1112 return -1; 1113 1114 // skip any whitespaces 1115 while (ch == ' ' || ch == '\t' || ch =='\n' || ch =='\r') 1116 { 1117 ch = token[i]; 1118 ++i; 1119 } 1120 1121 // the next char should be a '=', error if it's not 1122 if (ch != '=') 1123 return -1; 1124 1125 // get the next char 1126 ch = token[i]; 1127 ++i; 1128 1129 // skip any whitespaces 1130 while (ch == ' ' || ch == '\t' || ch =='\n' || ch =='\r') 1131 { 1132 ch = token[i]; 1133 ++i; 1134 } 1135 1136 1137 // get the delimiter for the attribute value 1138 std::istream::int_type delimiter = ch; // this should be either a ' or " character 1139 ch = token[i]; // get the next char 1140 ++i; 1141 if (delimiter != '\'' && delimiter!='"') 1142 return -1; 1143 1144 1145 // fill out attribute_value 1146 while ( (ch != delimiter) && 1147 (ch != '>') 1148 ) 1149 { 1150 attribute_value += ch; 1151 ch = token[i]; 1152 ++i; 1153 } 1154 1155 1156 // if there was no delimiter then this is an error 1157 if (ch == '>') 1158 { 1159 return -1; 1160 } 1161 1162 // go to the next char 1163 ch = token[i]; 1164 ++i; 1165 1166 // the next char must be either a '>' or '/' (denoting the end of the tag) 1167 // or a white space character 1168 if (ch != '>' && ch != ' ' && ch != '/' && ch != '\t' && ch !='\n' && ch !='\r') 1169 return -1; 1170 1171 // skip any whitespaces 1172 while (ch == ' ' || ch == '\t' || ch =='\n' || ch =='\r') 1173 { 1174 ch = token[i]; 1175 ++i; 1176 } 1177 1178 1179 // add attribute_value and attribute_name to atts 1180 if (atts.list.is_in_domain(attribute_name)) 1181 { 1182 // attributes may not be multiply defined 1183 return -1; 1184 } 1185 else 1186 { 1187 atts.list.add(attribute_name,attribute_value); 1188 } 1189 1190 1191 } 1192 1193 // you can't have an element with no name 1194 if (name.size() == 0) 1195 return -1; 1196 1197 return 0; 1198 1199 } 1200 1201 // ---------------------------------------------------------------------------------------- 1202 1203 int xml_parser:: parse_pi(const std::string & token,std::string & target,std::string & data)1204 parse_pi ( 1205 const std::string& token, 1206 std::string& target, 1207 std::string& data 1208 ) 1209 { 1210 target.erase(); 1211 data.erase(); 1212 1213 std::istream::int_type ch = token[2]; 1214 std::string::size_type i = 3; 1215 while (ch != ' ' && ch != '?' && ch != '\t' && ch != '\n' && ch!='\r') 1216 { 1217 target += ch; 1218 ch = token[i]; 1219 ++i; 1220 } 1221 if (target.size() == 0) 1222 return -1; 1223 1224 // if we aren't at a ? character then go to the next character 1225 if (ch != '?' ) 1226 { 1227 ch = token[i]; 1228 ++i; 1229 } 1230 1231 // if we still aren't at the end of the processing instruction then 1232 // set this stuff in the data section 1233 while (ch != '?') 1234 { 1235 data += ch; 1236 ch = token[i]; 1237 ++i; 1238 } 1239 1240 return 0; 1241 } 1242 1243 // ---------------------------------------------------------------------------------------- 1244 1245 int xml_parser:: parse_element_end(const std::string & token,std::string & name)1246 parse_element_end ( 1247 const std::string& token, 1248 std::string& name 1249 ) 1250 { 1251 name.erase(); 1252 std::string::size_type end = token.size()-1; 1253 for (std::string::size_type i = 2; i < end; ++i) 1254 { 1255 if (token[i] == ' ' || token[i] == '\t' || token[i] == '\n'|| token[i] == '\r') 1256 break; 1257 name += token[i]; 1258 } 1259 1260 if (name.size() == 0) 1261 return -1; 1262 1263 return 0; 1264 } 1265 1266 // ---------------------------------------------------------------------------------------- 1267 1268 int xml_parser:: change_entity(std::istream & in)1269 change_entity ( 1270 std::istream& in 1271 ) 1272 { 1273 1274 std::istream::int_type buf[6]; 1275 1276 1277 buf[1] = in.get(); 1278 1279 // if this is an undefined entity reference then return error 1280 if (buf[1] != 'a' && 1281 buf[1] != 'l' && 1282 buf[1] != 'g' && 1283 buf[1] != 'q' 1284 ) 1285 return -1; 1286 1287 1288 buf[2] = in.get(); 1289 // if this is an undefined entity reference then return error 1290 if (buf[2] != 'm' && 1291 buf[2] != 't' && 1292 buf[2] != 'p' && 1293 buf[2] != 'u' 1294 ) 1295 return -1; 1296 1297 1298 buf[3] = in.get(); 1299 // if this is an undefined entity reference then return error 1300 if (buf[3] != 'p' && 1301 buf[3] != ';' && 1302 buf[3] != 'o' 1303 ) 1304 return -1; 1305 1306 // check if this is < or > 1307 if (buf[3] == ';') 1308 { 1309 if (buf[2] != 't') 1310 return -1; 1311 1312 // if this is < then return '<' 1313 if (buf[1] == 'l') 1314 return '<'; 1315 // if this is > then return '>' 1316 if (buf[1] == 'g') 1317 return '>'; 1318 1319 // it is neither so it must be an undefined entity reference 1320 return -1; 1321 } 1322 1323 1324 buf[4] = in.get(); 1325 // if this should be & 1326 if (buf[4] == ';') 1327 { 1328 // if this is not & then return error 1329 if (buf[1] != 'a' || 1330 buf[2] != 'm' || 1331 buf[3] != 'p' 1332 ) 1333 return -1; 1334 1335 return '&'; 1336 } 1337 1338 buf[5] = in.get(); 1339 1340 // if this should be ' 1341 if (buf[1] == 'a' && 1342 buf[2] == 'p' && 1343 buf[3] == 'o' && 1344 buf[4] == 's' && 1345 buf[5] == ';' 1346 ) 1347 return '\''; 1348 1349 1350 // if this should be " 1351 if (buf[1] == 'q' && 1352 buf[2] == 'u' && 1353 buf[3] == 'o' && 1354 buf[4] == 't' && 1355 buf[5] == ';' 1356 ) 1357 return '"'; 1358 1359 1360 // it was an undefined entity reference 1361 return -1; 1362 1363 } 1364 1365 // ---------------------------------------------------------------------------------------- 1366 // ---------------------------------------------------------------------------------------- 1367 1368 class xml_parse_error : public error 1369 { 1370 public: xml_parse_error(const std::string & a)1371 xml_parse_error( 1372 const std::string& a 1373 ): error(a) {} 1374 }; 1375 1376 namespace impl 1377 { 1378 class default_xml_error_handler : public error_handler 1379 { 1380 std::string filename; 1381 1382 public: 1383 default_xml_error_handler()1384 default_xml_error_handler ( 1385 ) {} 1386 default_xml_error_handler(const std::string & filename_)1387 default_xml_error_handler ( 1388 const std::string& filename_ 1389 ) :filename(filename_) {} 1390 error(const unsigned long)1391 virtual void error ( 1392 const unsigned long 1393 ) 1394 { 1395 // just ignore non-fatal errors 1396 } 1397 fatal_error(const unsigned long line_number)1398 virtual void fatal_error ( 1399 const unsigned long line_number 1400 ) 1401 { 1402 std::ostringstream sout; 1403 if (filename.size() != 0) 1404 sout << "There is a fatal error on line " << line_number << " in the XML file '"<<filename<<"'."; 1405 else 1406 sout << "There is a fatal error on line " << line_number << " in the XML being processed."; 1407 1408 throw xml_parse_error(sout.str()); 1409 } 1410 }; 1411 } 1412 parse_xml(std::istream & in,document_handler & dh,error_handler & eh)1413 inline void parse_xml ( 1414 std::istream& in, 1415 document_handler& dh, 1416 error_handler& eh 1417 ) 1418 { 1419 if (!in) 1420 throw xml_parse_error("Unexpected end of file during xml parsing."); 1421 xml_parser parser; 1422 parser.add_document_handler(dh); 1423 parser.add_error_handler(eh); 1424 parser.parse(in); 1425 } 1426 parse_xml(std::istream & in,error_handler & eh,document_handler & dh)1427 inline void parse_xml ( 1428 std::istream& in, 1429 error_handler& eh, 1430 document_handler& dh 1431 ) 1432 { 1433 if (!in) 1434 throw xml_parse_error("Unexpected end of file during xml parsing."); 1435 xml_parser parser; 1436 parser.add_document_handler(dh); 1437 parser.add_error_handler(eh); 1438 parser.parse(in); 1439 } 1440 parse_xml(std::istream & in,error_handler & eh)1441 inline void parse_xml ( 1442 std::istream& in, 1443 error_handler& eh 1444 ) 1445 { 1446 if (!in) 1447 throw xml_parse_error("Unexpected end of file during xml parsing."); 1448 xml_parser parser; 1449 parser.add_error_handler(eh); 1450 parser.parse(in); 1451 } 1452 parse_xml(std::istream & in,document_handler & dh)1453 inline void parse_xml ( 1454 std::istream& in, 1455 document_handler& dh 1456 ) 1457 { 1458 if (!in) 1459 throw xml_parse_error("Unexpected end of file during xml parsing."); 1460 xml_parser parser; 1461 parser.add_document_handler(dh); 1462 impl::default_xml_error_handler eh; 1463 parser.add_error_handler(eh); 1464 parser.parse(in); 1465 } 1466 1467 // ---------------------------------------------------------------------------------------- 1468 parse_xml(const std::string & filename,document_handler & dh,error_handler & eh)1469 inline void parse_xml ( 1470 const std::string& filename, 1471 document_handler& dh, 1472 error_handler& eh 1473 ) 1474 { 1475 std::ifstream in(filename.c_str()); 1476 if (!in) 1477 throw xml_parse_error("Unable to open file '" + filename + "'."); 1478 xml_parser parser; 1479 parser.add_document_handler(dh); 1480 parser.add_error_handler(eh); 1481 parser.parse(in); 1482 } 1483 parse_xml(const std::string & filename,error_handler & eh,document_handler & dh)1484 inline void parse_xml ( 1485 const std::string& filename, 1486 error_handler& eh, 1487 document_handler& dh 1488 ) 1489 { 1490 std::ifstream in(filename.c_str()); 1491 if (!in) 1492 throw xml_parse_error("Unable to open file '" + filename + "'."); 1493 xml_parser parser; 1494 parser.add_document_handler(dh); 1495 parser.add_error_handler(eh); 1496 parser.parse(in); 1497 } 1498 parse_xml(const std::string & filename,error_handler & eh)1499 inline void parse_xml ( 1500 const std::string& filename, 1501 error_handler& eh 1502 ) 1503 { 1504 std::ifstream in(filename.c_str()); 1505 if (!in) 1506 throw xml_parse_error("Unable to open file '" + filename + "'."); 1507 xml_parser parser; 1508 parser.add_error_handler(eh); 1509 parser.parse(in); 1510 } 1511 parse_xml(const std::string & filename,document_handler & dh)1512 inline void parse_xml ( 1513 const std::string& filename, 1514 document_handler& dh 1515 ) 1516 { 1517 std::ifstream in(filename.c_str()); 1518 if (!in) 1519 throw xml_parse_error("Unable to open file '" + filename + "'."); 1520 xml_parser parser; 1521 parser.add_document_handler(dh); 1522 impl::default_xml_error_handler eh(filename); 1523 parser.add_error_handler(eh); 1524 parser.parse(in); 1525 } 1526 1527 // ---------------------------------------------------------------------------------------- 1528 1529 } 1530 1531 #endif // DLIB_XML_PARSER_KERNEl_1_ 1532 1533