1 /* 2 * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/> 3 * (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com> 4 * 5 * This file is part of lsp-plugins 6 * Created on: 24 окт. 2019 г. 7 * 8 * lsp-plugins is free software: you can redistribute it and/or modify 9 * it under the terms of the GNU Lesser General Public License as published by 10 * the Free Software Foundation, either version 3 of the License, or 11 * any later version. 12 * 13 * lsp-plugins is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU Lesser General Public License for more details. 17 * 18 * You should have received a copy of the GNU Lesser General Public License 19 * along with lsp-plugins. If not, see <https://www.gnu.org/licenses/>. 20 */ 21 22 #include <core/io/InStringSequence.h> 23 #include <core/io/InSequence.h> 24 #include <core/io/InFileStream.h> 25 #include <core/files/xml/PullParser.h> 26 #include <ctype.h> 27 #include <wctype.h> 28 29 namespace lsp 30 { 31 namespace xml 32 { 33 PullParser()34 PullParser::PullParser() 35 { 36 pIn = NULL; 37 nWFlags = 0; 38 nToken = -STATUS_NO_DATA; 39 nState = PS_READ_MISC; 40 enVersion = XML_VERSION_1_0; 41 nFlags = 0; 42 nStates = 0; 43 44 nUngetch = 0; 45 } 46 ~PullParser()47 PullParser::~PullParser() 48 { 49 close(); 50 } 51 open(const char * path,const char * charset)52 status_t PullParser::open(const char *path, const char *charset) 53 { 54 if (pIn != NULL) 55 return STATUS_BAD_STATE; 56 else if (path == NULL) 57 return STATUS_BAD_ARGUMENTS; 58 59 io::InFileStream *ifs = new io::InFileStream(); 60 if (ifs == NULL) 61 return STATUS_NO_MEM; 62 status_t res = ifs->open(path); 63 if (res == STATUS_OK) 64 { 65 res = wrap(ifs, WRAP_CLOSE | WRAP_DELETE, charset); 66 if (res == STATUS_OK) 67 return res; 68 ifs->close(); 69 } 70 delete ifs; 71 72 return res; 73 } 74 open(const LSPString * path,const char * charset)75 status_t PullParser::open(const LSPString *path, const char *charset) 76 { 77 if (pIn != NULL) 78 return STATUS_BAD_STATE; 79 else if (path == NULL) 80 return STATUS_BAD_ARGUMENTS; 81 82 io::InFileStream *ifs = new io::InFileStream(); 83 if (ifs == NULL) 84 return STATUS_NO_MEM; 85 status_t res = ifs->open(path); 86 if (res == STATUS_OK) 87 { 88 res = wrap(ifs, WRAP_CLOSE | WRAP_DELETE, charset); 89 if (res == STATUS_OK) 90 return res; 91 ifs->close(); 92 } 93 delete ifs; 94 95 return res; 96 } 97 open(const io::Path * path,const char * charset)98 status_t PullParser::open(const io::Path *path, const char *charset) 99 { 100 if (pIn != NULL) 101 return STATUS_BAD_STATE; 102 else if (path == NULL) 103 return STATUS_BAD_ARGUMENTS; 104 105 io::InFileStream *ifs = new io::InFileStream(); 106 if (ifs == NULL) 107 return STATUS_NO_MEM; 108 status_t res = ifs->open(path); 109 if (res == STATUS_OK) 110 { 111 res = wrap(ifs, WRAP_CLOSE | WRAP_DELETE, charset); 112 if (res == STATUS_OK) 113 return res; 114 ifs->close(); 115 } 116 delete ifs; 117 118 return res; 119 } 120 wrap(const char * str,const char * charset)121 status_t PullParser::wrap(const char *str, const char *charset) 122 { 123 if (pIn != NULL) 124 return STATUS_BAD_STATE; 125 else if (str == NULL) 126 return STATUS_BAD_ARGUMENTS; 127 128 io::InStringSequence *seq = new io::InStringSequence(); 129 if (seq == NULL) 130 return STATUS_NO_MEM; 131 132 status_t res = seq->wrap(str, charset); 133 if (res == STATUS_OK) 134 { 135 if ((res = wrap(seq, WRAP_CLOSE | WRAP_DELETE)) == STATUS_OK) 136 return res; 137 seq->close(); 138 } 139 140 delete seq; 141 return res; 142 } 143 wrap(const LSPString * str)144 status_t PullParser::wrap(const LSPString *str) 145 { 146 if (pIn != NULL) 147 return STATUS_BAD_STATE; 148 else if (str == NULL) 149 return STATUS_BAD_ARGUMENTS; 150 151 io::InStringSequence *seq = new io::InStringSequence(); 152 if (seq == NULL) 153 return STATUS_NO_MEM; 154 155 status_t res = seq->wrap(str); 156 if (res == STATUS_OK) 157 { 158 if ((res = wrap(seq, WRAP_CLOSE | WRAP_DELETE)) == STATUS_OK) 159 return res; 160 seq->close(); 161 } 162 163 delete seq; 164 return res; 165 } 166 wrap(io::IInStream * is,size_t flags,const char * charset)167 status_t PullParser::wrap(io::IInStream *is, size_t flags, const char *charset) 168 { 169 if (pIn != NULL) 170 return STATUS_BAD_STATE; 171 else if (is == NULL) 172 return STATUS_BAD_ARGUMENTS; 173 174 io::InSequence *seq = new io::InSequence(); 175 if (seq == NULL) 176 return STATUS_NO_MEM; 177 178 status_t res = seq->wrap(is, flags, charset); 179 if (res == STATUS_OK) 180 { 181 if ((res = wrap(seq, WRAP_CLOSE | WRAP_DELETE)) == STATUS_OK) 182 return res; 183 seq->close(); 184 } 185 186 delete seq; 187 return res; 188 } 189 wrap(io::IInSequence * seq,size_t flags)190 status_t PullParser::wrap(io::IInSequence *seq, size_t flags) 191 { 192 if (pIn != NULL) 193 return STATUS_BAD_STATE; 194 else if (seq == NULL) 195 return STATUS_BAD_ARGUMENTS; 196 197 pIn = seq; 198 nWFlags = flags; 199 nToken = -STATUS_NO_DATA; 200 nState = PS_READ_MISC; 201 nStates = 0; 202 enVersion = XML_VERSION_1_0; 203 sVersion.truncate(); 204 sEncoding.truncate(); 205 sDoctype.truncate(); 206 sPublic.truncate(); 207 sSystem.truncate(); 208 nFlags = 0; 209 nUngetch = 0; 210 211 return STATUS_OK; 212 } 213 close()214 status_t PullParser::close() 215 { 216 status_t res = STATUS_OK; 217 218 // Drop unnecessary resources 219 nUngetch = 0; 220 sVersion.truncate(); 221 sEncoding.truncate(); 222 sName.truncate(); 223 sValue.truncate(); 224 sDoctype.truncate(); 225 sPublic.truncate(); 226 sSystem.truncate(); 227 nFlags = 0; 228 229 // Remove all tag hierarchy 230 drop_list(&vTags); 231 drop_list(&vAtts); 232 233 // Release input sequence 234 if (pIn != NULL) 235 { 236 if (nWFlags & WRAP_CLOSE) 237 { 238 if (res == STATUS_OK) 239 res = pIn->close(); 240 else 241 pIn->close(); 242 } 243 244 if (nWFlags & WRAP_DELETE) 245 delete pIn; 246 247 pIn = NULL; 248 } 249 250 return res; 251 } 252 getch()253 lsp_swchar_t PullParser::getch() 254 { 255 return (nUngetch > 0) ? vUngetch[--nUngetch] : pIn->read(); 256 } 257 ungetch(lsp_swchar_t ch)258 void PullParser::ungetch(lsp_swchar_t ch) 259 { 260 vUngetch[nUngetch++] = ch; 261 } 262 push_state(parse_state_t override)263 void PullParser::push_state(parse_state_t override) 264 { 265 vStates[nStates++] = nState; 266 nState = override; 267 } 268 pop_state()269 void PullParser::pop_state() 270 { 271 nState = vStates[--nStates]; 272 } 273 drop_list(cvector<LSPString> * list)274 void PullParser::drop_list(cvector<LSPString> *list) 275 { 276 for (size_t i=0, n=list->size(); i<n; ++i) 277 { 278 LSPString *s = list->at(i); 279 if (s != NULL) 280 delete s; 281 } 282 list->flush(); 283 } 284 check_duplicate_attribute()285 status_t PullParser::check_duplicate_attribute() 286 { 287 // Is item present in list? 288 for (size_t i=0, n=vAtts.size(); i<n; ++i) 289 { 290 LSPString *s = vAtts.at(i); 291 if ((s != NULL) && (s->equals(&sName))) 292 return STATUS_CORRUPTED; 293 } 294 295 // Add to list 296 LSPString *copy = sName.clone(); 297 if (copy == NULL) 298 return STATUS_NO_MEM; 299 if (!vAtts.add(copy)) 300 { 301 delete copy; 302 return STATUS_NO_MEM; 303 } 304 305 return STATUS_OK; 306 } 307 skip_spaces()308 bool PullParser::skip_spaces() 309 { 310 bool skipped = false; 311 312 while (true) 313 { 314 // Read next character 315 lsp_swchar_t c = getch(); 316 if (!is_whitespace(c)) 317 { 318 ungetch(c); 319 break; 320 } 321 skipped = true; 322 } 323 324 return skipped; 325 } 326 read_text(const char * text)327 status_t PullParser::read_text(const char *text) 328 { 329 lsp_swchar_t c; 330 for ( ; *text != '\0'; ++text) 331 { 332 if ((c = getch()) != *text) 333 return (c < 0) ? -c : STATUS_CORRUPTED; 334 } 335 return STATUS_OK; 336 } 337 read_name(LSPString * name)338 status_t PullParser::read_name(LSPString *name) 339 { 340 // Get first character 341 lsp_swchar_t c = getch(); 342 if (!(is_name_first(c))) 343 return (c < 0) ? -c : STATUS_CORRUPTED; 344 345 // Read name 346 name->clear(); 347 do 348 { 349 // Append current character 350 if (!name->append(c)) 351 return STATUS_NO_MEM; 352 353 // Get next character 354 c = getch(); 355 } while (is_name_next(c)); 356 357 // Return back last character and return OK status 358 ungetch(c); 359 return STATUS_OK; 360 } 361 read_attribute_value(lsp_swchar_t qc)362 status_t PullParser::read_attribute_value(lsp_swchar_t qc) 363 { 364 lsp_swchar_t c; 365 status_t res; 366 367 while (true) 368 { 369 // Read character 370 if ((c = getch()) < 0) 371 { 372 pop_state(); 373 return -c; 374 } 375 else if (c == qc) 376 break; 377 378 // Reference? 379 if (c == '&') 380 { 381 // Read and append reference (if possible) to the string value 382 if ((res = read_entity_reference(&sValue)) != STATUS_OK) 383 { 384 pop_state(); 385 return res; 386 } 387 388 // Need to query reference? 389 if (nState != PS_READ_REFERENCE) 390 continue; 391 return STATUS_OK; // Query for reference, do not need to pop_state() 392 } 393 394 // Append current character 395 if (!sValue.append(c)) 396 { 397 pop_state(); 398 return STATUS_NO_MEM; 399 } 400 } 401 402 pop_state(); 403 nToken = XT_ATTRIBUTE; 404 return STATUS_OK; 405 } 406 read_version()407 status_t PullParser::read_version() 408 { 409 // Get quote character 410 lsp_swchar_t qc = getch(); 411 if ((qc != '\'') && (qc != '\"')) 412 return (qc < 0) ? -qc : STATUS_CORRUPTED; 413 414 // Version should be '1.x' 415 lsp_swchar_t c; 416 if ((c = getch()) != '1') 417 return (c < 0) ? -c : STATUS_CORRUPTED; 418 if ((c = getch()) != '.') 419 return (c < 0) ? -c : STATUS_CORRUPTED; 420 421 // Read integer value 422 size_t v=0, k=0; 423 while ((c = getch()) != qc) 424 { 425 if (v >= 0x1000000) // Prevent from integer overflow 426 return STATUS_CORRUPTED; 427 428 if ((c >= '0') && (c <= '9')) 429 v = v * 10 + (c - '0'); 430 else 431 return (c < 0) ? -c : STATUS_CORRUPTED; 432 ++k; 433 } 434 435 // Validate number of digits 436 if (k <= 0) 437 return STATUS_CORRUPTED; 438 439 // Update version text 440 if (!sVersion.fmt_ascii("1.%d", int(v))) 441 return STATUS_NO_MEM; 442 443 enVersion = (v >= 1) ? XML_VERSION_1_1 : XML_VERSION_1_0; 444 nFlags |= XF_VERSION; 445 446 return STATUS_OK; 447 } 448 read_encoding()449 status_t PullParser::read_encoding() 450 { 451 sEncoding.clear(); 452 453 // Get quote character 454 lsp_swchar_t qc = getch(); 455 if ((qc != '\'') && (qc != '\"')) 456 return (qc < 0) ? -qc : STATUS_CORRUPTED; 457 458 // Read encoding char 459 lsp_swchar_t c = getch(); 460 if (!is_encoding_first(c)) 461 return STATUS_BAD_FORMAT; 462 if (!sEncoding.append(c)) 463 return STATUS_NO_MEM; 464 465 // Check the remained characters 466 while ((c = getch()) != qc) 467 { 468 if (!is_encoding_next(c)) 469 return (c < 0) ? -c : STATUS_CORRUPTED; 470 if (!sEncoding.append(c)) 471 return STATUS_NO_MEM; 472 } 473 474 nFlags |= XF_ENCODING; 475 476 return STATUS_OK; 477 } 478 read_standalone()479 status_t PullParser::read_standalone() 480 { 481 LSPString tmp; 482 483 // Get quote character 484 lsp_swchar_t qc = getch(); 485 if ((qc != '\'') && (qc != '\"')) 486 return (qc < 0) ? -qc : STATUS_CORRUPTED; 487 488 // Read quoted string 489 lsp_swchar_t c; 490 while ((c = getch()) != qc) 491 { 492 if (tmp.length() >= 3) 493 return STATUS_CORRUPTED; 494 if (!tmp.append(c)) 495 return STATUS_NO_MEM; 496 } 497 498 // Compare string with possible value 499 if (tmp.equals_ascii("yes")) 500 nFlags |= XF_STANDALONE; 501 else if (tmp.equals_ascii("no")) 502 nFlags &= ~XF_STANDALONE; 503 else 504 return STATUS_CORRUPTED; 505 506 return STATUS_OK; 507 } 508 read_header()509 status_t PullParser::read_header() 510 { 511 status_t res; 512 lsp_swchar_t c; 513 514 // Fetch optional attributes 515 enum flags_t 516 { 517 F_VERSION = 1 << 0, 518 F_ENCODING = 1 << 1, 519 F_STANDALONE = 1 << 2 520 }; 521 522 size_t flags = 0; 523 LSPString name, value; 524 525 while (true) 526 { 527 // Skip spaces and read next character 528 bool skipped = skip_spaces(); 529 if ((c = getch()) < 0) 530 return -c; 531 532 if (c == '?') // end of header? 533 { 534 // Read next character 535 if ((c = getch()) != '>') 536 return (c < 0) ? -c : STATUS_CORRUPTED; 537 return (flags & F_VERSION) ? read_start_document() : STATUS_CORRUPTED; 538 } 539 540 // At least one space is mandatory 541 if (!skipped) 542 return STATUS_CORRUPTED; 543 544 // Read attribute name 545 ungetch(c); 546 if ((res = read_name(&name)) != STATUS_OK) 547 return res; 548 549 // Required '=' sign 550 skip_spaces(); 551 if ((c = getch()) != '=') 552 return (c < 0) ? -c : STATUS_CORRUPTED; 553 554 // Check attribute type 555 size_t flag = 0; 556 if (name.equals_ascii("version")) 557 { 558 flag = F_VERSION; 559 if ((res = read_version()) != STATUS_OK) 560 return res; 561 } 562 else if (name.equals_ascii("encoding")) 563 { 564 flag = F_ENCODING; 565 if ((res = read_encoding()) != STATUS_OK) 566 return res; 567 } 568 else if (name.equals_ascii("standalone")) 569 { 570 flag = F_STANDALONE; 571 if ((res = read_standalone()) != STATUS_OK) 572 return res; 573 } 574 575 // Check that attribute is at proper place 576 if (flag <= flags) 577 return STATUS_CORRUPTED; 578 flags |= flag; 579 } 580 } 581 read_comment()582 status_t PullParser::read_comment() 583 { 584 lsp_swchar_t c, xc; 585 sValue.clear(); 586 587 while (true) 588 { 589 // Fetch new character 590 if ((c = getch()) < 0) 591 return -c; 592 593 // Going to end of comment? 594 if (c == '-') 595 { 596 // End of comment? 597 if ((xc = getch()) == '-') 598 { 599 // Next character should be '>' 600 if ((xc = getch()) != '>') 601 return (xc < 0) ? -xc : STATUS_CORRUPTED; 602 603 nToken = XT_COMMENT; 604 return STATUS_OK; 605 } 606 607 // Return character back 608 ungetch(xc); 609 } 610 611 if (!sValue.append(c)) 612 return STATUS_NO_MEM; 613 } 614 615 return STATUS_OK; 616 } 617 read_processing_instruction()618 status_t PullParser::read_processing_instruction() 619 { 620 status_t res; 621 622 // Read processing instruction name 623 if ((res = read_name(&sName)) != STATUS_OK) 624 return res; 625 626 if (sName.equals_ascii_nocase("xml")) 627 { 628 if (nFlags & XF_HEADER) 629 return STATUS_CORRUPTED; // XML processing instruction is prohibited 630 return read_header(); 631 } 632 633 // Read processing instruction value 634 lsp_swchar_t c; 635 skip_spaces(); // Skip spaces 636 637 sValue.clear(); 638 while (true) 639 { 640 // Fetch new character 641 if ((c = getch()) < 0) 642 return -c; 643 644 // PI end? 645 if (c == '>') 646 { 647 ssize_t pos = sValue.length() - 1; 648 if ((pos >= 0) && (sValue.char_at(pos) == '?')) 649 break; 650 } 651 652 // No, simple character 653 if (!sValue.append(c)) 654 return STATUS_NO_MEM; 655 } 656 657 // Remove last character which is '?' 658 sValue.set_length(sValue.length() - 1); 659 660 nToken = XT_PROCESSING_INSTRUCTION; 661 return STATUS_OK; 662 } 663 read_system_literal(LSPString * dst)664 status_t PullParser::read_system_literal(LSPString *dst) 665 { 666 LSPString tmp; 667 668 // Get quote character 669 lsp_swchar_t qc = getch(); 670 if ((qc != '\'') && (qc != '\"')) 671 return (qc < 0) ? -qc : STATUS_CORRUPTED; 672 673 // Read quoted string 674 lsp_swchar_t c; 675 while ((c = getch()) != qc) 676 { 677 if (!tmp.append(c)) 678 return STATUS_NO_MEM; 679 } 680 681 dst->swap(&tmp); 682 return STATUS_OK; 683 } 684 read_pubid_literal(LSPString * dst)685 status_t PullParser::read_pubid_literal(LSPString *dst) 686 { 687 LSPString tmp; 688 689 // Get quote character 690 lsp_swchar_t qc = getch(); 691 if ((qc != '\'') && (qc != '\"')) 692 return (qc < 0) ? -qc : STATUS_CORRUPTED; 693 694 // Read quoted string 695 lsp_swchar_t c; 696 while ((c = getch()) != qc) 697 { 698 if ((!is_pubid_char(c)) || (c == qc)) 699 return STATUS_CORRUPTED; 700 if (!tmp.append(c)) 701 return STATUS_NO_MEM; 702 } 703 704 dst->swap(&tmp); 705 return STATUS_OK; 706 } 707 read_doctype()708 status_t PullParser::read_doctype() 709 { 710 status_t res; 711 lsp_swchar_t c; 712 LSPString x; 713 714 // Duplicate DOCTYPE? 715 if (nFlags & XF_DOCTYPE) 716 return STATUS_CORRUPTED; 717 718 // Space is required 719 if (!skip_spaces()) 720 return STATUS_CORRUPTED; 721 if ((res = read_name(&sDoctype)) != STATUS_OK) 722 return res; 723 724 // Watch next token 725 nFlags |= XF_DOCTYPE; 726 bool skip = skip_spaces(); 727 if ((c = getch()) < 0) 728 return -c; 729 730 // ExternalID is present? 731 if (c == 'P') 732 { 733 if (!skip) 734 return STATUS_CORRUPTED; 735 if ((res = read_text("UBLIC")) != STATUS_OK) 736 return res; 737 if (!skip_spaces()) 738 return STATUS_CORRUPTED; 739 if ((res = read_pubid_literal(&sPublic)) != STATUS_OK) 740 return res; 741 nFlags |= XF_DOCTYPE_PUB; 742 if (!skip_spaces()) 743 return STATUS_CORRUPTED; 744 if ((res = read_system_literal(&sSystem)) != STATUS_OK) 745 return res; 746 nFlags |= XF_DOCTYPE_SYS; 747 748 // Skip spaces and get next token 749 skip_spaces(); 750 if ((c = getch()) < 0) 751 return -c; 752 } 753 else if (c == 'S') 754 { 755 if (!skip) 756 return STATUS_CORRUPTED; 757 if ((res = read_text("YSTEM")) != STATUS_OK) 758 return res; 759 if (!skip_spaces()) 760 return STATUS_CORRUPTED; 761 if ((res = read_system_literal(&sSystem)) != STATUS_OK) 762 return res; 763 nFlags |= XF_DOCTYPE_SYS; 764 765 // Skip spaces and get next token 766 skip_spaces(); 767 if ((c = getch()) < 0) 768 return -c; 769 } 770 771 // intSubset? 772 if (c == '[') 773 { 774 // TODO: currently we don't support DOCTYPE definition with built-in doctypes 775 return STATUS_NOT_IMPLEMENTED; 776 } 777 778 // End of Doctype? 779 nToken = XT_DTD; 780 return (c == '>') ? STATUS_OK : STATUS_CORRUPTED; 781 } 782 read_start_document()783 status_t PullParser::read_start_document() 784 { 785 nToken = XT_START_DOCUMENT; 786 nFlags |= XF_HEADER; 787 return STATUS_OK; 788 } 789 read_end_document()790 status_t PullParser::read_end_document() 791 { 792 nToken = XT_END_DOCUMENT; 793 nState = PS_END_DOCUMENT; 794 return STATUS_OK; 795 } 796 read_misc()797 status_t PullParser::read_misc() 798 { 799 status_t res; 800 lsp_swchar_t c; 801 802 // Skip whitespace 803 if (!(nFlags & XF_HEADER)) 804 { 805 if (skip_spaces()) 806 return read_start_document(); 807 } 808 else 809 skip_spaces(); 810 811 // Next character should be '<' 812 if ((c = getch()) != '<') 813 { 814 if (c == -STATUS_EOF) 815 return (nFlags & XF_HEADER) ? read_end_document() : read_start_document(); 816 return (c < 0) ? -c : STATUS_CORRUPTED; 817 } 818 819 // Get the following character 820 if ((c = getch()) < 0) 821 return -c; 822 823 // Processing instruction? 824 if (c == '?') 825 return read_processing_instruction(); 826 else if (!(nFlags & XF_HEADER)) 827 { 828 ungetch(c); 829 ungetch('<'); 830 return read_start_document(); 831 } 832 833 // Comment or Doctype? 834 if (c == '!') 835 { 836 // Get next character 837 if ((c = getch()) < 0) 838 return -c; 839 840 if (c == '-') // Comment? 841 { 842 // '<!--' should be parsed 843 if ((c = getch()) != '-') 844 return (c < 0) ? -c : STATUS_CORRUPTED; 845 return read_comment(); 846 } 847 848 if (c == 'D') // Doctype? 849 { 850 // 'DOCTYPE' should be parsed 851 if ((res = read_text("OCTYPE")) != STATUS_OK) 852 return res; 853 return read_doctype(); 854 } 855 856 return STATUS_CORRUPTED; 857 } 858 859 // We already have root tag? 860 if (nFlags & XF_ROOT) 861 return STATUS_CORRUPTED; 862 nFlags |= XF_ROOT; // Now we already have root tag defined 863 864 // Return character and read root tag name 865 ungetch(c); 866 return read_tag_open(); 867 } 868 read_cdata()869 status_t PullParser::read_cdata() 870 { 871 lsp_swchar_t c; 872 873 sValue.clear(); 874 875 while (true) 876 { 877 // Get next character 878 if ((c = getch()) < 0) 879 return -c; 880 881 // CDATA end? 882 if (c == '>') 883 { 884 ssize_t pos = sValue.length() - 2; 885 if ( 886 (pos >= 0) && 887 (sValue.char_at(pos) == ']') && 888 (sValue.char_at(pos+1) == ']') 889 ) 890 break; 891 } 892 893 // No, simple character 894 if (!sValue.append(c)) 895 return STATUS_NO_MEM; 896 } 897 898 // Remove last two characters which are ']]' 899 sValue.set_length(sValue.length() - 2); 900 901 nToken = XT_CDATA; 902 return STATUS_OK; 903 } 904 read_tag_open()905 status_t PullParser::read_tag_open() 906 { 907 status_t res; 908 if ((res = read_name(&sName)) != STATUS_OK) 909 return res; 910 911 // Add tag to stack 912 LSPString *tag = sName.clone(); 913 if (tag == NULL) 914 return STATUS_NO_MEM; 915 else if (!vTags.push(tag)) 916 { 917 delete tag; 918 return STATUS_NO_MEM; 919 } 920 921 // Change state 922 drop_list(&vAtts); 923 nToken = XT_START_ELEMENT; 924 nState = PS_READ_ATTRIBUTES; 925 return STATUS_OK; 926 } 927 read_tag_close(bool copy)928 status_t PullParser::read_tag_close(bool copy) 929 { 930 // Get last tag name 931 LSPString *name = NULL; 932 if (!vTags.pop(&name)) 933 return STATUS_CORRUPTED; 934 935 if (copy) 936 sName.swap(name); 937 else if (!sName.equals(name)) 938 { 939 delete name; 940 return STATUS_CORRUPTED; 941 } 942 delete name; 943 944 // Update state 945 drop_list(&vAtts); 946 nToken = XT_END_ELEMENT; 947 nState = (vTags.size() > 0) ? PS_READ_ELEMENT_DATA : PS_READ_MISC; 948 return STATUS_OK; 949 } 950 read_entity_reference(LSPString * cdata)951 status_t PullParser::read_entity_reference(LSPString *cdata) 952 { 953 lsp_swchar_t c, code = 0; 954 status_t res; 955 956 // Get character 957 if ((c = getch()) < 0) 958 return -c; 959 960 // Entity reference ? 961 if (c != '#') 962 { 963 ungetch(c); 964 965 // Read entity name 966 if ((res = read_name(&sRefName)) != STATUS_OK) 967 return res; 968 969 if (sRefName.equals_ascii("amp")) 970 code = '&'; 971 else if (sRefName.equals_ascii("gt")) 972 code = '>'; 973 else if (sRefName.equals_ascii("lt")) 974 code = '<'; 975 else if (sRefName.equals_ascii("apos")) 976 code = '\''; 977 else if (sRefName.equals_ascii("quot")) 978 code = '\"'; 979 980 // Get next character which should be ';' 981 if ((c = getch()) < 0) 982 return -c; 983 } 984 else 985 { 986 // Get next character 987 if ((c = getch()) < 0) 988 return -c; 989 990 // Hexadecimal character? 991 if (c == 'x') 992 { 993 // Read hex digit 994 while ((c = getch()) >= 0) 995 { 996 // Protect from integer overflow 997 if (code >= 0x1000000) 998 return STATUS_CORRUPTED; 999 1000 // Decode hex character 1001 if ((c >= '0') && (c <= '9')) 1002 code = (code << 4) | (c - '0'); 1003 else if ((c >= 'a') && (c <= 'f')) 1004 code = (code << 4) | (c - 'a' + 10); 1005 else if ((c >= 'A') && (c <= 'F')) 1006 code = (code << 4) | (c - 'A' + 10); 1007 else 1008 break; 1009 } 1010 } 1011 else // Decimal character? 1012 { 1013 do 1014 { 1015 // Protect from integer overflow 1016 if (code >= 0x1000000) 1017 return STATUS_CORRUPTED; 1018 1019 // Decode decimal character 1020 if ((c >= '0') && (c <= '9')) 1021 code = (code * 10) + (c - '0'); 1022 else 1023 break; 1024 } while ((c = getch()) >= 0); 1025 } 1026 1027 // Validate character 1028 if (!is_valid_char(code, enVersion)) 1029 return STATUS_CORRUPTED; 1030 } 1031 1032 // Current character should be ';' 1033 if (c != ';') 1034 return STATUS_CORRUPTED; 1035 else if (code == 0) 1036 { 1037 push_state(PS_READ_REFERENCE); 1038 nToken = XT_ENTITY_RESOLVE; 1039 return STATUS_OK; 1040 } 1041 1042 // Append fetched character to the character data and exit 1043 return (cdata->append(code)) ? STATUS_OK : STATUS_NO_MEM; 1044 } 1045 read_characters()1046 status_t PullParser::read_characters() 1047 { 1048 lsp_swchar_t c; 1049 status_t res; 1050 1051 while (true) 1052 { 1053 // Get next character 1054 if ((c = getch()) < 0) 1055 { 1056 pop_state(); 1057 return -c; 1058 } 1059 1060 // Start of tag? 1061 if (c == '<') 1062 { 1063 ungetch(c); 1064 break; 1065 } 1066 1067 // Reference? 1068 if (c == '&') 1069 { 1070 // Read and append reference (if possible) to the string value 1071 if ((res = read_entity_reference(&sValue)) != STATUS_OK) 1072 { 1073 pop_state(); 1074 return res; 1075 } 1076 1077 // Need to query reference? 1078 if (nState != PS_READ_REFERENCE) 1079 continue; 1080 return STATUS_OK; // Query for reference, do not need to pop_state() 1081 } 1082 1083 // CDATA end? 1084 if (c == '>') 1085 { 1086 ssize_t pos = sValue.length() - 2; 1087 if ( 1088 (pos >= 0) && 1089 (sValue.char_at(pos) == ']') && 1090 (sValue.char_at(pos+1) == ']') 1091 ) 1092 { 1093 pop_state(); 1094 return STATUS_CORRUPTED; 1095 } 1096 } 1097 1098 // No, simple character 1099 if (!sValue.append(c)) 1100 { 1101 pop_state(); 1102 return STATUS_NO_MEM; 1103 } 1104 } 1105 1106 // Ensure that there is character data 1107 pop_state(); 1108 1109 if (sValue.length() <= 0) 1110 return STATUS_CORRUPTED; 1111 1112 nToken = XT_CHARACTERS; 1113 return STATUS_OK; 1114 } 1115 read_tag_content()1116 status_t PullParser::read_tag_content() 1117 { 1118 lsp_swchar_t c; 1119 status_t res; 1120 1121 // Read character 1122 if ((c = getch()) < 0) 1123 return -c; 1124 1125 // Tag? Processing instruction? End of tag? Comment? CDATA? 1126 if (c != '<') 1127 { 1128 ungetch(c); 1129 sValue.clear(); 1130 push_state(PS_READ_CHARACTERS); 1131 return read_characters(); 1132 } 1133 1134 // Get next character 1135 if ((c = getch()) < 0) 1136 return -c; 1137 1138 // Read tag name 1139 if (c == '/') // End of tag ? 1140 { 1141 // Read tag name 1142 if ((res = read_name(&sName)) != STATUS_OK) 1143 return res; 1144 1145 // '>' is required 1146 skip_spaces(); 1147 if ((c = getch()) != '>') 1148 return (c < 0) ? -c : STATUS_CORRUPTED; 1149 1150 return read_tag_close(false); 1151 } 1152 else if (c == '?') // Processing instruction ? 1153 return read_processing_instruction(); 1154 else if (c == '!') // Comment? CDATA? 1155 { 1156 // Get next character 1157 if ((c = getch()) < 0) 1158 return -c; 1159 1160 // CDATA? 1161 if (c == '[') 1162 { 1163 // Lookup CDATA start 1164 if ((res = read_text("CDATA[")) != STATUS_OK) 1165 return res; 1166 return read_cdata(); 1167 } 1168 1169 // Comment? 1170 if (c == '-') 1171 { 1172 // Next character is required to be '-' 1173 if ((c = getch()) != '-') 1174 return (c < 0) ? -c : STATUS_CORRUPTED; 1175 return read_comment(); 1176 } 1177 1178 // No match 1179 return STATUS_CORRUPTED; 1180 } 1181 1182 // Just open tag name? 1183 ungetch(c); 1184 return read_tag_open(); 1185 } 1186 read_tag_attribute()1187 status_t PullParser::read_tag_attribute() 1188 { 1189 lsp_swchar_t c; 1190 status_t res; 1191 1192 // Ignore set of spaces if they are present 1193 bool skipped = skip_spaces(); 1194 if ((c = getch()) < 0) 1195 return -c; 1196 1197 // End of tag header? 1198 if (c == '>') 1199 { 1200 nState = PS_READ_ELEMENT_DATA; 1201 return read_tag_content(); 1202 } 1203 1204 // End of tag? 1205 if (c == '/') 1206 { 1207 // Required character 1208 if ((c = getch()) != '>') 1209 return (c < 0) ? -c : STATUS_CORRUPTED; 1210 1211 return read_tag_close(true); 1212 } 1213 1214 // Try to read attribute and preprocess it's value 1215 if (!skipped) // At least one space is mandatory 1216 return STATUS_CORRUPTED; 1217 1218 // Read attribute name 1219 ungetch(c); 1220 if ((res = read_name(&sName)) != STATUS_OK) 1221 return res; 1222 else if (check_duplicate_attribute()) 1223 return STATUS_CORRUPTED; 1224 1225 skip_spaces(); // Spaces are optional 1226 if ((c = getch()) != '=') 1227 return STATUS_CORRUPTED; 1228 1229 skip_spaces(); // Spaces are optional 1230 c = getch(); // Get quote character 1231 if ((c != '\'') && (c != '\"')) 1232 return (c < 0) ? -c : STATUS_CORRUPTED; 1233 1234 // Read quoted value 1235 sValue.clear(); 1236 push_state((c == '\'') ? PS_READ_SQ_ATTRIBUTE : PS_READ_DQ_ATTRIBUTE); 1237 return read_attribute_value(c); 1238 } 1239 read_token()1240 status_t PullParser::read_token() 1241 { 1242 if (pIn == NULL) 1243 return STATUS_BAD_STATE; 1244 1245 switch (nState) 1246 { 1247 case PS_END_DOCUMENT: 1248 nToken = XT_END_DOCUMENT; 1249 return STATUS_EOF; 1250 1251 case PS_READ_MISC: 1252 return read_misc(); 1253 1254 case PS_READ_ATTRIBUTES: 1255 return read_tag_attribute(); 1256 1257 case PS_READ_ELEMENT_DATA: 1258 return read_tag_content(); 1259 1260 case PS_READ_REFERENCE: 1261 nToken = XT_ENTITY_RESOLVE; 1262 return STATUS_OK; 1263 1264 case PS_READ_CHARACTERS: 1265 return read_characters(); 1266 1267 case PS_READ_SQ_ATTRIBUTE: 1268 return read_attribute_value('\''); 1269 1270 case PS_READ_DQ_ATTRIBUTE: 1271 return read_attribute_value('\"'); 1272 1273 default: 1274 break; 1275 } 1276 return STATUS_CORRUPTED; 1277 } 1278 set_value(const LSPString * value)1279 status_t PullParser::set_value(const LSPString *value) 1280 { 1281 if (pIn == NULL) 1282 return STATUS_BAD_STATE; 1283 else if (value == NULL) 1284 return STATUS_BAD_ARGUMENTS; 1285 1286 if (nState != PS_READ_REFERENCE) 1287 return STATUS_BAD_STATE; 1288 1289 // Append value with entity content 1290 if (!sValue.append(value)) 1291 return STATUS_NO_MEM; 1292 1293 pop_state(); 1294 return STATUS_OK; 1295 } 1296 resolve_entity(const char * value,const char * charset)1297 status_t PullParser::resolve_entity(const char *value, const char *charset) 1298 { 1299 LSPString tmp; 1300 if (!tmp.set_native(value, charset)) 1301 return STATUS_NO_MEM; 1302 return resolve_entity(value); 1303 } 1304 read_next()1305 status_t PullParser::read_next() 1306 { 1307 status_t res = read_token(); 1308 return (res == STATUS_OK) ? nToken : -res; 1309 } 1310 get_current()1311 status_t PullParser::get_current() 1312 { 1313 return nToken; 1314 } 1315 name() const1316 const LSPString *PullParser::name() const 1317 { 1318 if (pIn == NULL) 1319 return NULL; 1320 1321 switch (nToken) 1322 { 1323 case XT_ATTRIBUTE: 1324 case XT_PROCESSING_INSTRUCTION: 1325 case XT_START_ELEMENT: 1326 case XT_END_ELEMENT: 1327 return &sName; 1328 case XT_ENTITY_RESOLVE: 1329 return &sRefName; 1330 default: 1331 break; 1332 } 1333 return NULL; 1334 } 1335 value() const1336 const LSPString *PullParser::value() const 1337 { 1338 if (pIn == NULL) 1339 return NULL; 1340 1341 switch (nToken) 1342 { 1343 case XT_ATTRIBUTE: 1344 case XT_CDATA: 1345 case XT_CHARACTERS: 1346 case XT_COMMENT: 1347 case XT_PROCESSING_INSTRUCTION: 1348 return &sValue; 1349 default: 1350 break; 1351 } 1352 return NULL; 1353 } 1354 1355 } /* namespace xml */ 1356 } /* namespace lsp */ 1357