1 /*
2 Copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com)
3
4 This software is provided 'as-is', without any express or implied
5 warranty. In no event will the authors be held liable for any
6 damages arising from the use of this software.
7
8 Permission is granted to anyone to use this software for any
9 purpose, including commercial applications, and to alter it and
10 redistribute it freely, subject to the following restrictions:
11
12 1. The origin of this software must not be misrepresented; you must
13 not claim that you wrote the original software. If you use this
14 software in a product, an acknowledgment in the product documentation
15 would be appreciated but is not required.
16
17 2. Altered source versions must be plainly marked as such, and
18 must not be misrepresented as being the original software.
19
20 3. This notice may not be removed or altered from any source
21 distribution.
22 */
23
24 #include "tinyxml.h"
25 #include <ctype.h>
26 #include <strstream>
27 using namespace std;
28
29 //#define DEBUG_PARSER
30
31 TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] =
32 {
33 { "&", 5, '&' },
34 { "<", 4, '<' },
35 { ">", 4, '>' },
36 { """, 6, '\"' },
37 { "'", 6, '\'' }
38 };
39
40
SkipWhiteSpace(const char * p)41 const char* TiXmlBase::SkipWhiteSpace( const char* p )
42 {
43 if ( !p || !*p )
44 {
45 return 0;
46 }
47 while ( p && *p )
48 {
49 if ( isspace( *p ) || *p == '\n' || *p =='\r' ) // Still using old rules for white space.
50 ++p;
51 else
52 break;
53 }
54
55 return p;
56 }
57
58
StreamWhiteSpace(std::istream * in,std::string * tag)59 /*static*/ bool TiXmlBase::StreamWhiteSpace( std::istream* in, std::string* tag )
60 {
61 for( ;; )
62 {
63 if ( !in->good() ) return false;
64
65 int c = in->peek();
66 if ( !IsWhiteSpace( c ) )
67 return true;
68 *tag += in->get();
69 }
70 }
71
72
StreamTo(std::istream * in,int character,std::string * tag)73 /*static*/ bool TiXmlBase::StreamTo( std::istream* in, int character, std::string* tag )
74 {
75 while ( in->good() )
76 {
77 int c = in->peek();
78 if ( c == character )
79 return true;
80
81 in->get();
82 *tag += c;
83 }
84 return false;
85 }
86
87
ReadName(const char * p,string * name)88 const char* TiXmlBase::ReadName( const char* p, string* name )
89 {
90 *name = "";
91 assert( p );
92
93 // Names start with letters or underscores.
94 // After that, they can be letters, underscores, numbers,
95 // hyphens, or colons. (Colons are valid ony for namespaces,
96 // but tinyxml can't tell namespaces from names.)
97 if ( p && *p
98 && ( isalpha( (unsigned char) *p ) || *p == '_' ) )
99 {
100 while( p && *p
101 && ( isalnum( (unsigned char ) *p )
102 || *p == '_'
103 || *p == '-'
104 || *p == ':' ) )
105 {
106 (*name) += *p;
107 ++p;
108 }
109 return p;
110 }
111 return 0;
112 }
113
114
GetEntity(const char * p,char * value)115 const char* TiXmlBase::GetEntity( const char* p, char* value )
116 {
117 // Presume an entity, and pull it out.
118 string ent;
119 int i;
120
121 // Ignore the &#x entities.
122 if ( strncmp( "&#x", p, 3 ) == 0 )
123 {
124 *value = *p;
125 return p+1;
126 }
127
128 // Now try to match it.
129 for( i=0; i<NUM_ENTITY; ++i )
130 {
131 if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 )
132 {
133 assert( strlen( entity[i].str ) == entity[i].strLength );
134 *value = entity[i].chr;
135 return ( p + entity[i].strLength );
136 }
137 }
138
139 // So it wasn't an entity, its unrecognized, or something like that.
140 *value = *p; // Don't put back the last one, since we return it!
141 return p+1;
142 }
143
144
StringEqual(const char * p,const char * tag,bool ignoreCase)145 bool TiXmlBase::StringEqual( const char* p,
146 const char* tag,
147 bool ignoreCase )
148 {
149 assert( p );
150 if ( !p || !*p )
151 {
152 assert( 0 );
153 return false;
154 }
155
156 if ( tolower( *p ) == tolower( *tag ) )
157 {
158 const char* q = p;
159
160 if (ignoreCase)
161 {
162 while ( *q && *tag && *q == *tag )
163 {
164 ++q;
165 ++tag;
166 }
167
168 if ( *tag == 0 ) // Have we found the end of the tag, and everything equal?
169 {
170 return true;
171 }
172 }
173 else
174 {
175 while ( *q && *tag && tolower( *q ) == tolower( *tag ) )
176 {
177 ++q;
178 ++tag;
179 }
180
181 if ( *tag == 0 )
182 {
183 return true;
184 }
185 }
186 }
187 return false;
188 }
189
190
ReadText(const char * p,string * text,bool trimWhiteSpace,const char * endTag,bool caseInsensitive)191 const char* TiXmlBase::ReadText( const char* p,
192 string* text,
193 bool trimWhiteSpace,
194 const char* endTag,
195 bool caseInsensitive )
196 {
197 *text = "";
198
199 if ( !trimWhiteSpace // certain tags always keep whitespace
200 || !condenseWhiteSpace ) // if true, whitespace is always kept
201 {
202 // Keep all the white space.
203 while ( p && *p
204 && !StringEqual( p, endTag, caseInsensitive )
205 )
206 {
207 char c;
208 p = GetChar( p, &c );
209 text->append( &c, 1 );
210 }
211 }
212 else
213 {
214 bool whitespace = false;
215
216 // Remove leading white space:
217 p = SkipWhiteSpace( p );
218 while ( p && *p
219 && !StringEqual( p, endTag, caseInsensitive ) )
220 {
221 if ( *p == '\r' || *p == '\n' )
222 {
223 whitespace = true;
224 ++p;
225 }
226 else if ( isspace( *p ) )
227 {
228 whitespace = true;
229 ++p;
230 }
231 else
232 {
233 // If we've found whitespace, add it before the
234 // new character. Any whitespace just becomes a space.
235 if ( whitespace )
236 {
237 text->append( " ", 1 );
238 whitespace = false;
239 }
240 char c;
241 p = GetChar( p, &c );
242 text->append( &c, 1 );
243 }
244 }
245 }
246 return p + strlen( endTag );
247 }
248
249
StreamIn(std::istream * in,std::string * tag)250 void TiXmlDocument::StreamIn( std::istream* in, std::string* tag )
251 {
252 // The basic issue with a document is that we don't know what we're
253 // streaming. Read something presumed to be a tag (and hope), then
254 // identify it, and call the appropriate stream method on the tag.
255 //
256 // This "pre-streaming" will never read the closing ">" so the
257 // sub-tag can orient itself.
258
259 if ( !StreamTo( in, '<', tag ) )
260 {
261 SetError( TIXML_ERROR_PARSING_EMPTY );
262 return;
263 }
264
265 while ( in->good() )
266 {
267 int tagIndex = tag->length();
268 while ( in->good() && in->peek() != '>' )
269 {
270 int c = in->get();
271 (*tag) += (char) c;
272 }
273
274 if ( in->good() )
275 {
276 // We now have something we presume to be a node of
277 // some sort. Identify it, and call the node to
278 // continue streaming.
279 TiXmlNode* node = Identify( tag->c_str() + tagIndex );
280
281 if ( node )
282 {
283 node->StreamIn( in, tag );
284 bool isElement = node->ToElement() != 0;
285 delete node;
286 node = 0;
287
288 // If this is the root element, we're done. Parsing will be
289 // done by the >> operator.
290 if ( isElement )
291 {
292 return;
293 }
294 }
295 else
296 {
297 SetError( TIXML_ERROR );
298 return;
299 }
300 }
301 }
302 // We should have returned sooner.
303 SetError( TIXML_ERROR );
304 }
305
306
Parse(const char * p)307 const char* TiXmlDocument::Parse( const char* p )
308 {
309 // Parse away, at the document level. Since a document
310 // contains nothing but other tags, most of what happens
311 // here is skipping white space.
312 //
313 // In this variant (as opposed to stream and Parse) we
314 // read everything we can.
315
316
317 if ( !p || !*p || !( p = SkipWhiteSpace( p ) ) )
318 {
319 SetError( TIXML_ERROR_DOCUMENT_EMPTY );
320 return false;
321 }
322
323 while ( p && *p )
324 {
325 TiXmlNode* node = Identify( p );
326 if ( node )
327 {
328 p = node->Parse( p );
329 LinkEndChild( node );
330 }
331 else
332 {
333 break;
334 }
335 p = SkipWhiteSpace( p );
336 }
337 // All is well.
338 return p;
339 }
340
341
Identify(const char * p)342 TiXmlNode* TiXmlNode::Identify( const char* p )
343 {
344 TiXmlNode* returnNode = 0;
345
346 p = SkipWhiteSpace( p );
347 if( !p || !*p || *p != '<' )
348 {
349 return 0;
350 }
351
352 TiXmlDocument* doc = GetDocument();
353 p = SkipWhiteSpace( p );
354
355 if ( !p || !*p )
356 {
357 return 0;
358 }
359
360 // What is this thing?
361 // - Elements start with a letter or underscore, but xml is reserved.
362 // - Comments: <!--
363 // - Decleration: <?xml
364 // - Everthing else is unknown to tinyxml.
365 //
366
367 const char* xmlHeader = { "<?xml" };
368 const char* commentHeader = { "<!--" };
369
370 if ( StringEqual( p, xmlHeader, true ) )
371 {
372 #ifdef DEBUG_PARSER
373 TIXML_LOG( "XML parsing Declaration\n" );
374 #endif
375 returnNode = new TiXmlDeclaration();
376 }
377 else if ( isalpha( *(p+1) )
378 || *(p+1) == '_' )
379 {
380 #ifdef DEBUG_PARSER
381 TIXML_LOG( "XML parsing Element\n" );
382 #endif
383 returnNode = new TiXmlElement( "" );
384 }
385 else if ( StringEqual( p, commentHeader, false ) )
386 {
387 #ifdef DEBUG_PARSER
388 TIXML_LOG( "XML parsing Comment\n" );
389 #endif
390 returnNode = new TiXmlComment();
391 }
392 else
393 {
394 #ifdef DEBUG_PARSER
395 TIXML_LOG( "XML parsing Unknown\n" );
396 #endif
397 returnNode = new TiXmlUnknown();
398 }
399
400 if ( returnNode )
401 {
402 // Set the parent, so it can report errors
403 returnNode->parent = this;
404 //p = returnNode->Parse( p );
405 }
406 else
407 {
408 if ( doc )
409 doc->SetError( TIXML_ERROR_OUT_OF_MEMORY );
410 }
411 return returnNode;
412 }
413
414
StreamIn(std::istream * in,std::string * tag)415 void TiXmlElement::StreamIn( std::istream* in, std::string* tag )
416 {
417 // We're called with some amount of pre-parsing. That is, some of "this"
418 // element is in "tag". Go ahead and stream to the closing ">"
419 while( in->good() )
420 {
421 int c = in->get();
422 (*tag) += (char) c ;
423
424 if ( c == '>' )
425 break;
426 }
427
428 if ( tag->length() < 3 ) return;
429
430 // Okay...if we are a "/>" tag, then we're done. We've read a complete tag.
431 // If not, identify and stream.
432
433 if ( tag->at( tag->length() - 1 ) == '>'
434 && tag->at( tag->length() - 2 ) == '/' )
435 {
436 // All good!
437 return;
438 }
439 else if ( tag->at( tag->length() - 1 ) == '>' )
440 {
441 // There is more. Could be:
442 // text
443 // closing tag
444 // another node.
445 for ( ;; )
446 {
447 StreamWhiteSpace( in, tag );
448
449 // Do we have text?
450 if ( in->peek() != '<' )
451 {
452 // Yep, text.
453 TiXmlText text( "" );
454 text.StreamIn( in, tag );
455
456 // What follows text is a closing tag or another node.
457 // Go around again and figure it out.
458 continue;
459 }
460
461 // We now have either a closing tag...or another node.
462 // We should be at a "<", regardless.
463 if ( !in->good() ) return;
464 assert( in->peek() == '<' );
465 int tagIndex = tag->length();
466
467 bool closingTag = false;
468 bool firstCharFound = false;
469
470 for( ;; )
471 {
472 if ( !in->good() )
473 return;
474
475 int c = in->peek();
476
477 if ( c == '>' )
478 break;
479
480 *tag += c;
481 in->get();
482
483 if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) )
484 {
485 firstCharFound = true;
486 if ( c == '/' )
487 closingTag = true;
488 }
489 }
490 // If it was a closing tag, then read in the closing '>' to clean up the input stream.
491 // If it was not, the streaming will be done by the tag.
492 if ( closingTag )
493 {
494 int c = in->get();
495 assert( c == '>' );
496 *tag += c;
497
498 // We are done, once we've found our closing tag.
499 return;
500 }
501 else
502 {
503 // If not a closing tag, id it, and stream.
504 const char* tagloc = tag->c_str() + tagIndex;
505 TiXmlNode* node = Identify( tagloc );
506 if ( !node )
507 return;
508 node->StreamIn( in, tag );
509 delete node;
510 node = 0;
511
512 // No return: go around from the beginning: text, closing tag, or node.
513 }
514 }
515 }
516 }
517
518
Parse(const char * p)519 const char* TiXmlElement::Parse( const char* p )
520 {
521 p = SkipWhiteSpace( p );
522 TiXmlDocument* document = GetDocument();
523
524 if ( !p || !*p || *p != '<' )
525 {
526 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT );
527 return false;
528 }
529
530 p = SkipWhiteSpace( p+1 );
531
532 // Read the name.
533 p = ReadName( p, &value );
534 if ( !p || !*p )
535 {
536 if ( document ) document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME );
537 return false;
538 }
539
540 string endTag = "</";
541 endTag += value;
542 endTag += ">";
543
544 // Check for and read attributes. Also look for an empty
545 // tag or an end tag.
546 while ( p && *p )
547 {
548 p = SkipWhiteSpace( p );
549 if ( !p || !*p )
550 {
551 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES );
552 return 0;
553 }
554 if ( *p == '/' )
555 {
556 ++p;
557 // Empty tag.
558 if ( *p != '>' )
559 {
560 if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY );
561 return 0;
562 }
563 return (p+1);
564 }
565 else if ( *p == '>' )
566 {
567 // Done with attributes (if there were any.)
568 // Read the value -- which can include other
569 // elements -- read the end tag, and return.
570 ++p;
571 p = ReadValue( p ); // Note this is an Element method, and will set the error if one happens.
572 if ( !p || !*p )
573 return 0;
574
575 // We should find the end tag now
576 if ( StringEqual( p, endTag.c_str(), false ) )
577 {
578 p += endTag.length();
579 return p;
580 }
581 else
582 {
583 if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG );
584 return 0;
585 }
586 }
587 else
588 {
589 // Try to read an element:
590 TiXmlAttribute attrib;
591 attrib.SetDocument( document );
592 p = attrib.Parse( p );
593
594 if ( !p || !*p )
595 {
596 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT );
597 return 0;
598 }
599 SetAttribute( attrib.Name(), attrib.Value() );
600 }
601 }
602 return p;
603 }
604
605
ReadValue(const char * p)606 const char* TiXmlElement::ReadValue( const char* p )
607 {
608 TiXmlDocument* document = GetDocument();
609
610 // Read in text and elements in any order.
611 p = SkipWhiteSpace( p );
612 while ( p && *p )
613 {
614 // string text;
615 // while ( p && *p && *p != '<' )
616 // {
617 // text += (*p);
618 // ++p;
619 // }
620 //
621 // p = SkipWhiteSpace( p );
622
623 if ( *p != '<' )
624 {
625 // Take what we have, make a text element.
626 TiXmlText* textNode = new TiXmlText( "" );
627
628 if ( !textNode )
629 {
630 if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY );
631 return 0;
632 }
633
634 p = textNode->Parse( p );
635
636 if ( !textNode->Blank() )
637 LinkEndChild( textNode );
638 else
639 delete textNode;
640 }
641 else
642 {
643 // We hit a '<'
644 // Have we hit a new element or an end tag?
645 if ( StringEqual( p, "</", false ) )
646 {
647 return p;
648 }
649 else
650 {
651 TiXmlNode* node = Identify( p );
652 if ( node )
653 {
654 p = node->Parse( p );
655 LinkEndChild( node );
656 }
657 else
658 {
659 return 0;
660 }
661 }
662 }
663 p = SkipWhiteSpace( p );
664 }
665
666 if ( !p )
667 {
668 if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE );
669 }
670 return p;
671 }
672
673
StreamIn(std::istream * in,std::string * tag)674 void TiXmlUnknown::StreamIn( std::istream* in, std::string* tag )
675 {
676 while ( in->good() )
677 {
678 int c = in->get();
679 (*tag) += c;
680
681 if ( c == '>' )
682 {
683 // All is well.
684 return;
685 }
686 }
687 }
688
689
Parse(const char * p)690 const char* TiXmlUnknown::Parse( const char* p )
691 {
692 TiXmlDocument* document = GetDocument();
693 p = SkipWhiteSpace( p );
694 if ( !p || !*p || *p != '<' )
695 {
696 if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN );
697 return 0;
698 }
699 ++p;
700 value = "";
701
702 while ( p && *p && *p != '>' )
703 {
704 value += *p;
705 ++p;
706 }
707
708 if ( !p )
709 {
710 if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN );
711 }
712 if ( *p == '>' )
713 return p+1;
714 return p;
715 }
716
717
StreamIn(std::istream * in,std::string * tag)718 void TiXmlComment::StreamIn( std::istream* in, std::string* tag )
719 {
720 while ( in->good() )
721 {
722 int c = in->get();
723 (*tag) += c;
724
725 if ( c == '>'
726 && tag->at( tag->length() - 2 ) == '-'
727 && tag->at( tag->length() - 3 ) == '-' )
728 {
729 // All is well.
730 return;
731 }
732 }
733 }
734
735
Parse(const char * p)736 const char* TiXmlComment::Parse( const char* p )
737 {
738 TiXmlDocument* document = GetDocument();
739 value = "";
740
741 p = SkipWhiteSpace( p );
742 const char* startTag = "<!--";
743 const char* endTag = "-->";
744
745 if ( !StringEqual( p, startTag, false ) )
746 {
747 document->SetError( TIXML_ERROR_PARSING_COMMENT );
748 return 0;
749 }
750 p += strlen( startTag );
751 p = ReadText( p, &value, false, endTag, false );
752 return p;
753 }
754
755
Parse(const char * p)756 const char* TiXmlAttribute::Parse( const char* p )
757 {
758 p = SkipWhiteSpace( p );
759 if ( !p || !*p ) return 0;
760
761 // Read the name, the '=' and the value.
762 p = ReadName( p, &name );
763 if ( !p || !*p )
764 {
765 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES );
766 return 0;
767 }
768 p = SkipWhiteSpace( p );
769 if ( !p || !*p || *p != '=' )
770 {
771 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES );
772 return 0;
773 }
774
775 ++p; // skip '='
776 p = SkipWhiteSpace( p );
777 if ( !p || !*p )
778 {
779 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES );
780 return 0;
781 }
782
783 const char* end;
784
785 if ( *p == '\'' )
786 {
787 ++p;
788 end = "\'";
789 p = ReadText( p, &value, false, end, false );
790 }
791 else if ( *p == '"' )
792 {
793 ++p;
794 end = "\"";
795 p = ReadText( p, &value, false, end, false );
796 }
797 else
798 {
799 // All attribute values should be in single or double quotes.
800 // But this is such a common error that the parser will try
801 // its best, even without them.
802 value = "";
803 while ( p && *p // existence
804 && !isspace( *p ) && *p != '\n' && *p != '\r' // whitespace
805 && *p != '/' && *p != '>' ) // tag end
806 {
807 value += *p;
808 ++p;
809 }
810 }
811 return p;
812 }
813
814
StreamIn(std::istream * in,std::string * tag)815 void TiXmlText::StreamIn( std::istream* in, std::string* tag )
816 {
817 while ( in->good() )
818 {
819 int c = in->peek();
820 if ( c == '<' )
821 return;
822
823 (*tag) += c;
824 in->get();
825 }
826 }
827
828
829
Parse(const char * p)830 const char* TiXmlText::Parse( const char* p )
831 {
832 value = "";
833
834 //TiXmlDocument* doc = GetDocument();
835 bool ignoreWhite = true;
836 // if ( doc && !doc->IgnoreWhiteSpace() ) ignoreWhite = false;
837
838 const char* end = "<";
839 p = ReadText( p, &value, ignoreWhite, end, false );
840 if ( p )
841 return p-1; // don't truncate the '<'
842 return 0;
843 }
844
845
StreamIn(std::istream * in,std::string * tag)846 void TiXmlDeclaration::StreamIn( std::istream* in, std::string* tag )
847 {
848 while ( in->good() )
849 {
850 int c = in->get();
851 (*tag) += c;
852
853 if ( c == '>' )
854 {
855 // All is well.
856 return;
857 }
858 }
859 }
860
Parse(const char * p)861 const char* TiXmlDeclaration::Parse( const char* p )
862 {
863 p = SkipWhiteSpace( p );
864 // Find the beginning, find the end, and look for
865 // the stuff in-between.
866 TiXmlDocument* document = GetDocument();
867 if ( !p || !*p || !StringEqual( p, "<?xml", true ) )
868 {
869 if ( document ) document->SetError( TIXML_ERROR_PARSING_DECLARATION );
870 return 0;
871 }
872
873 p += 5;
874 // const char* start = p+5;
875 // const char* end = strstr( start, "?>" );
876
877 version = "";
878 encoding = "";
879 standalone = "";
880
881 while ( p && *p )
882 {
883 if ( *p == '>' )
884 {
885 ++p;
886 return p;
887 }
888
889 p = SkipWhiteSpace( p );
890 if ( StringEqual( p, "version", true ) )
891 {
892 // p += 7;
893 TiXmlAttribute attrib;
894 p = attrib.Parse( p );
895 version = attrib.Value();
896 }
897 else if ( StringEqual( p, "encoding", true ) )
898 {
899 // p += 8;
900 TiXmlAttribute attrib;
901 p = attrib.Parse( p );
902 encoding = attrib.Value();
903 }
904 else if ( StringEqual( p, "standalone", true ) )
905 {
906 // p += 10;
907 TiXmlAttribute attrib;
908 p = attrib.Parse( p );
909 standalone = attrib.Value();
910 }
911 else
912 {
913 // Read over whatever it is.
914 while( p && *p && *p != '>' && !isspace( *p ) )
915 ++p;
916 }
917 }
918 return 0;
919 }
920
Blank() const921 bool TiXmlText::Blank() const
922 {
923 for ( unsigned i=0; i<value.size(); i++ )
924 if ( !isspace( value[i] ) )
925 return false;
926 return true;
927 }
928
929