1 /*
2 www.sourceforge.net/projects/tinyxml
3 Original code (2.0 and earlier )copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com)
4
5 This software is provided 'as-is', without any express or implied
6 warranty. In no event will the authors be held liable for any
7 damages arising from the use of this software.
8
9 Permission is granted to anyone to use this software for any
10 purpose, including commercial applications, and to alter it and
11 redistribute it freely, subject to the following restrictions:
12
13 1. The origin of this software must not be misrepresented; you must
14 not claim that you wrote the original software. If you use this
15 software in a product, an acknowledgment in the product documentation
16 would be appreciated but is not required.
17
18 2. Altered source versions must be plainly marked as such, and
19 must not be misrepresented as being the original software.
20
21 3. This notice may not be removed or altered from any source
22 distribution.
23 */
24
25 #include "tinyxml/tinyxml.h"
26
27 #include <ctype.h>
28
29 //#define DEBUG_PARSER
30
31 #if defined( _DEBUG ) && defined( _MSC_VER )
32 #include <windows.h>
33 #define TIXML_LOG OutputDebugString
34 #else
35 #define TIXML_LOG printf
36 #endif
37
38 // Note tha "PutString" hardcodes the same list. This
39 // is less flexible than it appears. Changing the entries
40 // or order will break putstring.
41 TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] =
42 {
43 { "&", 5, '&' },
44 { "<", 4, '<' },
45 { ">", 4, '>' },
46 { """, 6, '\"' },
47 { "'", 6, '\'' }
48 };
49
// Background reading on UTF-8 and byte order marks:
//     http://www.unicode.org/faq/utf_bom.html
// That page is also the basis of the utf8ByteTable defined below, which
// gives the number of bytes in a sequence from its lead byte. Invalid lead
// bytes are given a length of 1 -- the result will be junk, but the data is
// passed through as much as possible.
//
// Beware of these three-byte UTF-8 sequences:
//     ef bb bf    (Microsoft "lead bytes")
//     ef bf be    (non-character)
//     ef bf bf    (non-character)
// The constants below are the three bytes of the first of them.

const char TIXML_UTF_LEAD_0 = static_cast<char>( 0xef );
const char TIXML_UTF_LEAD_1 = static_cast<char>( 0xbb );
const char TIXML_UTF_LEAD_2 = static_cast<char>( 0xbf );
63
64 const int TiXmlBase::utf8ByteTable[256] =
65 {
66 // 0 1 2 3 4 5 6 7 8 9 a b c d e f
67 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00
68 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10
69 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20
70 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30
71 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40
72 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50
73 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60
74 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70 End of ASCII range
75 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x80 0x80 to 0xc1 invalid
76 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x90
77 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xa0
78 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xb0
79 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xc0 0xc2 to 0xdf 2 byte
80 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xd0
81 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xe0 0xe0 to 0xef 3 byte
82 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid
83 };
84
85
ConvertUTF32ToUTF8(unsigned long input,char * output,int * length)86 void TiXmlBase::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length )
87 {
88 const unsigned long BYTE_MASK = 0xBF;
89 const unsigned long BYTE_MARK = 0x80;
90 const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
91
92 if (input < 0x80)
93 *length = 1;
94 else if ( input < 0x800 )
95 *length = 2;
96 else if ( input < 0x10000 )
97 *length = 3;
98 else if ( input < 0x200000 )
99 *length = 4;
100 else
101 { *length = 0; return; } // This code won't covert this correctly anyway.
102
103 output += *length;
104
105 // Scary scary fall throughs.
106 switch (*length)
107 {
108 case 4:
109 --output;
110 *output = (char)((input | BYTE_MARK) & BYTE_MASK);
111 input >>= 6;
112 case 3:
113 --output;
114 *output = (char)((input | BYTE_MARK) & BYTE_MASK);
115 input >>= 6;
116 case 2:
117 --output;
118 *output = (char)((input | BYTE_MARK) & BYTE_MASK);
119 input >>= 6;
120 case 1:
121 --output;
122 *output = (char)(input | FIRST_BYTE_MARK[*length]);
123 }
124 }
125
126
IsAlpha(unsigned char anyByte,TiXmlEncoding)127 /*static*/ int TiXmlBase::IsAlpha( unsigned char anyByte, TiXmlEncoding )
128 {
129 // This will only work for low-ascii, everything else is assumed to be a valid
130 // letter. I'm not sure this is the best approach, but it is quite tricky trying
131 // to figure out alhabetical vs. not across encoding. So take a very
132 // conservative approach.
133
134 // if ( encoding == TIXML_ENCODING_UTF8 )
135 // {
136 if ( anyByte < 127 )
137 return isalpha( anyByte );
138 else
139 return 1; // What else to do? The unicode set is huge...get the english ones right.
140 // }
141 // else
142 // {
143 // return isalpha( anyByte );
144 // }
145 }
146
147
IsAlphaNum(unsigned char anyByte,TiXmlEncoding)148 /*static*/ int TiXmlBase::IsAlphaNum( unsigned char anyByte, TiXmlEncoding )
149 {
150 // This will only work for low-ascii, everything else is assumed to be a valid
151 // letter. I'm not sure this is the best approach, but it is quite tricky trying
152 // to figure out alhabetical vs. not across encoding. So take a very
153 // conservative approach.
154
155 // if ( encoding == TIXML_ENCODING_UTF8 )
156 // {
157 if ( anyByte < 127 )
158 return isalnum( anyByte );
159 else
160 return 1; // What else to do? The unicode set is huge...get the english ones right.
161 // }
162 // else
163 // {
164 // return isalnum( anyByte );
165 // }
166 }
167
168
169 class TiXmlParsingData
170 {
171 friend class TiXmlDocument;
172 public:
173 void Stamp( const char* now, TiXmlEncoding encoding );
174
Cursor()175 const TiXmlCursor& Cursor() { return cursor; }
176
177 private:
178 // Only used by the document!
TiXmlParsingData(const char * start,int _tabsize,int row,int col)179 TiXmlParsingData( const char* start, int _tabsize, int row, int col )
180 {
181 assert( start );
182 stamp = start;
183 tabsize = _tabsize;
184 cursor.row = row;
185 cursor.col = col;
186 }
187
188 TiXmlCursor cursor;
189 const char* stamp;
190 int tabsize;
191 };
192
193
Stamp(const char * now,TiXmlEncoding encoding)194 void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding )
195 {
196 assert( now );
197
198 // Do nothing if the tabsize is 0.
199 if ( tabsize < 1 )
200 {
201 return;
202 }
203
204 // Get the current row, column.
205 int row = cursor.row;
206 int col = cursor.col;
207 const char* p = stamp;
208 assert( p );
209
210 while ( p < now )
211 {
212 // Code contributed by Fletcher Dunn: (modified by lee)
213 switch (*p) {
214 case 0:
215 // We *should* never get here, but in case we do, don't
216 // advance past the terminating null character, ever
217 return;
218
219 case '\r':
220 // bump down to the next line
221 ++row;
222 col = 0;
223 // Eat the character
224 ++p;
225
226 // Check for \r\n sequence, and treat this as a single character
227 if (*p == '\n') {
228 ++p;
229 }
230 break;
231
232 case '\n':
233 // bump down to the next line
234 ++row;
235 col = 0;
236
237 // Eat the character
238 ++p;
239
240 // Check for \n\r sequence, and treat this as a single
241 // character. (Yes, this bizarre thing does occur still
242 // on some arcane platforms...)
243 if (*p == '\r') {
244 ++p;
245 }
246 break;
247
248 case '\t':
249 // Eat the character
250 ++p;
251
252 // Skip to next tab stop
253 col = (col / tabsize + 1) * tabsize;
254 break;
255
256 case TIXML_UTF_LEAD_0:
257 if ( encoding == TIXML_ENCODING_UTF8 )
258 {
259 if ( *(p+1) && *(p+2) )
260 {
261 // In these cases, don't advance the column. These are
262 // 0-width spaces.
263 if ( *(p+1)==TIXML_UTF_LEAD_1 && *(p+2)==TIXML_UTF_LEAD_2 )
264 p += 3;
265 else if ( *(p+1)==(char)(0xbf) && *(p+2)==(char)(0xbe) )
266 p += 3;
267 else if ( *(p+1)==(char)(0xbf) && *(p+2)==(char)(0xbf) )
268 p += 3;
269 else
270 { p +=3; ++col; } // A normal character.
271 }
272 }
273 else
274 {
275 ++p;
276 ++col;
277 }
278 break;
279
280 default:
281 if ( encoding == TIXML_ENCODING_UTF8 )
282 {
283 // Eat the 1 to 4 byte utf8 character.
284 int step = TiXmlBase::utf8ByteTable[*((unsigned char*)p)];
285 if ( step == 0 )
286 step = 1; // Error case from bad encoding, but handle gracefully.
287 p += step;
288
289 // Just advance one column, of course.
290 ++col;
291 }
292 else
293 {
294 ++p;
295 ++col;
296 }
297 break;
298 }
299 }
300 cursor.row = row;
301 cursor.col = col;
302 assert( cursor.row >= -1 );
303 assert( cursor.col >= -1 );
304 stamp = p;
305 assert( stamp );
306 }
307
308
SkipWhiteSpace(const char * p,TiXmlEncoding encoding)309 const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding )
310 {
311 if ( !p || !*p )
312 {
313 return 0;
314 }
315 if ( encoding == TIXML_ENCODING_UTF8 )
316 {
317 while ( *p )
318 {
319 // Skip the stupid Microsoft UTF-8 Byte order marks
320 if ( *(p+0)==TIXML_UTF_LEAD_0
321 && *(p+1)==TIXML_UTF_LEAD_1
322 && *(p+2)==TIXML_UTF_LEAD_2 )
323 {
324 p += 3;
325 continue;
326 }
327 else if(*(p+0)==TIXML_UTF_LEAD_0
328 && *(p+1)==(const char) 0xbf
329 && *(p+2)==(const char) 0xbe )
330 {
331 p += 3;
332 continue;
333 }
334 else if(*(p+0)==TIXML_UTF_LEAD_0
335 && *(p+1)==(const char) 0xbf
336 && *(p+2)==(const char) 0xbf )
337 {
338 p += 3;
339 continue;
340 }
341
342 if ( IsWhiteSpace( *p ) || *p == '\n' || *p =='\r' ) // Still using old rules for white space.
343 ++p;
344 else
345 break;
346 }
347 }
348 else
349 {
350 while ( *p && ( IsWhiteSpace( *p ) || *p == '\n' || *p =='\r' ) )
351 ++p;
352 }
353
354 return p;
355 }
356
357 #ifdef TIXML_USE_STL
StreamWhiteSpace(TIXML_ISTREAM * in,TIXML_STRING * tag)358 /*static*/ bool TiXmlBase::StreamWhiteSpace( TIXML_ISTREAM * in, TIXML_STRING * tag )
359 {
360 for( ;; )
361 {
362 if ( !in->good() ) return false;
363
364 int c = in->peek();
365 // At this scope, we can't get to a document. So fail silently.
366 if ( !IsWhiteSpace( c ) || c <= 0 )
367 return true;
368
369 *tag += (char) in->get();
370 }
371 }
372
StreamTo(TIXML_ISTREAM * in,int character,TIXML_STRING * tag)373 /*static*/ bool TiXmlBase::StreamTo( TIXML_ISTREAM * in, int character, TIXML_STRING * tag )
374 {
375 //assert( character > 0 && character < 128 ); // else it won't work in utf-8
376 while ( in->good() )
377 {
378 int c = in->peek();
379 if ( c == character )
380 return true;
381 if ( c <= 0 ) // Silent failure: can't get document at this scope
382 return false;
383
384 in->get();
385 *tag += (char) c;
386 }
387 return false;
388 }
389 #endif
390
ReadName(const char * p,TIXML_STRING * name,TiXmlEncoding encoding)391 const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding )
392 {
393 *name = "";
394 assert( p );
395
396 // Names start with letters or underscores.
397 // Of course, in unicode, tinyxml has no idea what a letter *is*. The
398 // algorithm is generous.
399 //
400 // After that, they can be letters, underscores, numbers,
401 // hyphens, or colons. (Colons are valid ony for namespaces,
402 // but tinyxml can't tell namespaces from names.)
403 if ( p && *p
404 && ( IsAlpha( (unsigned char) *p, encoding ) || *p == '_' ) )
405 {
406 while( p && *p
407 && ( IsAlphaNum( (unsigned char ) *p, encoding )
408 || *p == '_'
409 || *p == '-'
410 || *p == '.'
411 || *p == ':' ) )
412 {
413 (*name) += *p;
414 ++p;
415 }
416 return p;
417 }
418 return 0;
419 }
420
GetEntity(const char * p,char * value,int * length,TiXmlEncoding encoding)421 const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXmlEncoding encoding )
422 {
423 // Presume an entity, and pull it out.
424 TIXML_STRING ent;
425 int i;
426 *length = 0;
427
428 if ( *(p+1) && *(p+1) == '#' && *(p+2) )
429 {
430 unsigned long ucs = 0;
431 //*ME: warning C4244: convert '__w64 int' to 'unsigned'
432 //*ME: Use size_t instead of unsigned (pointer-arithmetic)
433 size_t delta = 0;
434 unsigned mult = 1;
435
436 if ( *(p+2) == 'x' )
437 {
438 // Hexadecimal.
439 if ( !*(p+3) ) return 0;
440
441 const char* q = p+3;
442 q = strchr( q, ';' );
443
444 if ( !q || !*q ) return 0;
445
446 delta = q-p;
447 --q;
448
449 while ( *q != 'x' )
450 {
451 if ( *q >= '0' && *q <= '9' )
452 ucs += mult * (*q - '0');
453 else if ( *q >= 'a' && *q <= 'f' )
454 ucs += mult * (*q - 'a' + 10);
455 else if ( *q >= 'A' && *q <= 'F' )
456 ucs += mult * (*q - 'A' + 10 );
457 else
458 return 0;
459 mult *= 16;
460 --q;
461 }
462 }
463 else
464 {
465 // Decimal.
466 if ( !*(p+2) ) return 0;
467
468 const char* q = p+2;
469 q = strchr( q, ';' );
470
471 if ( !q || !*q ) return 0;
472
473 delta = q-p;
474 --q;
475
476 while ( *q != '#' )
477 {
478 if ( *q >= '0' && *q <= '9' )
479 ucs += mult * (*q - '0');
480 else
481 return 0;
482 mult *= 10;
483 --q;
484 }
485 }
486 if ( encoding == TIXML_ENCODING_UTF8 )
487 {
488 // convert the UCS to UTF-8
489 ConvertUTF32ToUTF8( ucs, value, length );
490 }
491 else
492 {
493 *value = (char)ucs;
494 *length = 1;
495 }
496 return p + delta + 1;
497 }
498
499 // Now try to match it.
500 for( i=0; i<NUM_ENTITY; ++i )
501 {
502 if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 )
503 {
504 assert( strlen( entity[i].str ) == entity[i].strLength );
505 *value = entity[i].chr;
506 *length = 1;
507 return ( p + entity[i].strLength );
508 }
509 }
510
511 // So it wasn't an entity, its unrecognized, or something like that.
512 *value = *p; // Don't put back the last one, since we return it!
513 return p+1;
514 }
515
516
StringEqual(const char * p,const char * tag,bool ignoreCase,TiXmlEncoding encoding)517 bool TiXmlBase::StringEqual( const char* p,
518 const char* tag,
519 bool ignoreCase,
520 TiXmlEncoding encoding )
521 {
522 assert( p );
523 assert( tag );
524 if ( !p || !*p )
525 {
526 assert( 0 );
527 return false;
528 }
529
530 const char* q = p;
531
532 if ( ignoreCase )
533 {
534 while ( *q && *tag && ToLower( *q, encoding ) == ToLower( *tag, encoding ) )
535 {
536 ++q;
537 ++tag;
538 }
539
540 if ( *tag == 0 )
541 return true;
542 }
543 else
544 {
545 while ( *q && *tag && *q == *tag )
546 {
547 ++q;
548 ++tag;
549 }
550
551 if ( *tag == 0 ) // Have we found the end of the tag, and everything equal?
552 return true;
553 }
554 return false;
555 }
556
ReadText(const char * p,TIXML_STRING * text,bool trimWhiteSpace,const char * endTag,bool caseInsensitive,TiXmlEncoding encoding)557 const char* TiXmlBase::ReadText( const char* p,
558 TIXML_STRING * text,
559 bool trimWhiteSpace,
560 const char* endTag,
561 bool caseInsensitive,
562 TiXmlEncoding encoding )
563 {
564 *text = "";
565 if ( !trimWhiteSpace // certain tags always keep whitespace
566 || !condenseWhiteSpace ) // if true, whitespace is always kept
567 {
568 // Keep all the white space.
569 while ( p && *p
570 && !StringEqual( p, endTag, caseInsensitive, encoding )
571 )
572 {
573 int len;
574 char cArr[4] = { 0, 0, 0, 0 };
575 p = GetChar( p, cArr, &len, encoding );
576 text->append( cArr, len );
577 }
578 }
579 else
580 {
581 bool whitespace = false;
582
583 // Remove leading white space:
584 p = SkipWhiteSpace( p, encoding );
585 while ( p && *p
586 && !StringEqual( p, endTag, caseInsensitive, encoding ) )
587 {
588 if ( *p == '\r' || *p == '\n' )
589 {
590 whitespace = true;
591 ++p;
592 }
593 else if ( IsWhiteSpace( *p ) )
594 {
595 whitespace = true;
596 ++p;
597 }
598 else
599 {
600 // If we've found whitespace, add it before the
601 // new character. Any whitespace just becomes a space.
602 if ( whitespace )
603 {
604 (*text) += ' ';
605 whitespace = false;
606 }
607 int len;
608 char cArr[4] = { 0, 0, 0, 0 };
609 p = GetChar( p, cArr, &len, encoding );
610 if ( len == 1 )
611 (*text) += cArr[0]; // more efficient
612 else
613 text->append( cArr, len );
614 }
615 }
616 }
617 return p + strlen( endTag );
618 }
619
620 #ifdef TIXML_USE_STL
621
StreamIn(TIXML_ISTREAM * in,TIXML_STRING * tag)622 void TiXmlDocument::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
623 {
624 // The basic issue with a document is that we don't know what we're
625 // streaming. Read something presumed to be a tag (and hope), then
626 // identify it, and call the appropriate stream method on the tag.
627 //
628 // This "pre-streaming" will never read the closing ">" so the
629 // sub-tag can orient itself.
630
631 if ( !StreamTo( in, '<', tag ) )
632 {
633 SetError( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
634 return;
635 }
636
637 while ( in->good() )
638 {
639 int tagIndex = (int) tag->length();
640 while ( in->good() && in->peek() != '>' )
641 {
642 int c = in->get();
643 if ( c <= 0 )
644 {
645 SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
646 break;
647 }
648 (*tag) += (char) c;
649 }
650
651 if ( in->good() )
652 {
653 // We now have something we presume to be a node of
654 // some sort. Identify it, and call the node to
655 // continue streaming.
656 TiXmlNode* node = Identify( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING );
657
658 if ( node )
659 {
660 node->StreamIn( in, tag );
661 bool isElement = node->ToElement() != 0;
662 delete node;
663 node = 0;
664
665 // If this is the root element, we're done. Parsing will be
666 // done by the >> operator.
667 if ( isElement )
668 {
669 return;
670 }
671 }
672 else
673 {
674 SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
675 return;
676 }
677 }
678 }
679 // We should have returned sooner.
680 SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
681 }
682
683 #endif
684
Parse(const char * p,TiXmlParsingData * prevData,TiXmlEncoding encoding)685 const char* TiXmlDocument::Parse( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding )
686 {
687 ClearError();
688
689 // Parse away, at the document level. Since a document
690 // contains nothing but other tags, most of what happens
691 // here is skipping white space.
692 if ( !p || !*p )
693 {
694 SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
695 return 0;
696 }
697
698 // Note that, for a document, this needs to come
699 // before the while space skip, so that parsing
700 // starts from the pointer we are given.
701 location.Clear();
702 if ( prevData )
703 {
704 location.row = prevData->cursor.row;
705 location.col = prevData->cursor.col;
706 }
707 else
708 {
709 location.row = 0;
710 location.col = 0;
711 }
712 TiXmlParsingData data( p, TabSize(), location.row, location.col );
713 location = data.Cursor();
714
715 if ( encoding == TIXML_ENCODING_UNKNOWN )
716 {
717 // Check for the Microsoft UTF-8 lead bytes.
718 if ( *(p+0) && *(p+0) == TIXML_UTF_LEAD_0
719 && *(p+1) && *(p+1) == TIXML_UTF_LEAD_1
720 && *(p+2) && *(p+2) == TIXML_UTF_LEAD_2 )
721 {
722 encoding = TIXML_ENCODING_UTF8;
723 }
724 }
725
726 p = SkipWhiteSpace( p, encoding );
727 if ( !p )
728 {
729 SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
730 return 0;
731 }
732
733 while ( p && *p )
734 {
735 TiXmlNode* node = Identify( p, encoding );
736 if ( node )
737 {
738 p = node->Parse( p, &data, encoding );
739 LinkEndChild( node );
740 }
741 else
742 {
743 break;
744 }
745
746 // Did we get encoding info?
747 if ( encoding == TIXML_ENCODING_UNKNOWN
748 && node->ToDeclaration() )
749 {
750 TiXmlDeclaration* dec = node->ToDeclaration();
751 const char* enc = dec->Encoding();
752 assert( enc );
753
754 if ( *enc == 0 )
755 encoding = TIXML_ENCODING_UTF8;
756 else if ( StringEqual( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) )
757 encoding = TIXML_ENCODING_UTF8;
758 else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) )
759 encoding = TIXML_ENCODING_UTF8; // incorrect, but be nice
760 else
761 encoding = TIXML_ENCODING_LEGACY;
762 }
763
764 p = SkipWhiteSpace( p, encoding );
765 }
766
767 // Was this empty?
768 if ( !firstChild ) {
769 SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding );
770 return 0;
771 }
772
773 // All is well.
774 return p;
775 }
776
SetError(int err,const char * pError,TiXmlParsingData * data,TiXmlEncoding encoding)777 void TiXmlDocument::SetError( int err, const char* pError, TiXmlParsingData* data, TiXmlEncoding encoding )
778 {
779 // The first error in a chain is more accurate - don't set again!
780 if ( error )
781 return;
782
783 assert( err > 0 && err < TIXML_ERROR_STRING_COUNT );
784 error = true;
785 errorId = err;
786 errorDesc = errorString[ errorId ];
787
788 errorLocation.Clear();
789 if ( pError && data )
790 {
791 //TiXmlParsingData data( pError, prevData );
792 data->Stamp( pError, encoding );
793 errorLocation = data->Cursor();
794 }
795 }
796
797
Identify(const char * p,TiXmlEncoding encoding)798 TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding )
799 {
800 TiXmlNode* returnNode = 0;
801
802 p = SkipWhiteSpace( p, encoding );
803 if( !p || !*p || *p != '<' )
804 {
805 return 0;
806 }
807
808 TiXmlDocument* doc = GetDocument();
809 p = SkipWhiteSpace( p, encoding );
810
811 if ( !p || !*p )
812 {
813 return 0;
814 }
815
816 // What is this thing?
817 // - Elements start with a letter or underscore, but xml is reserved.
818 // - Comments: <!--
819 // - Decleration: <?xml
820 // - Everthing else is unknown to tinyxml.
821 //
822
823 const char* xmlHeader = { "<?xml" };
824 const char* commentHeader = { "<!--" };
825 const char* dtdHeader = { "<!" };
826
827 if ( StringEqual( p, xmlHeader, true, encoding ) )
828 {
829 #ifdef DEBUG_PARSER
830 TIXML_LOG( "XML parsing Declaration\n" );
831 #endif
832 returnNode = new TiXmlDeclaration();
833 }
834 else if ( StringEqual( p, commentHeader, false, encoding ) )
835 {
836 #ifdef DEBUG_PARSER
837 TIXML_LOG( "XML parsing Comment\n" );
838 #endif
839 returnNode = new TiXmlComment();
840 }
841 else if ( StringEqual( p, dtdHeader, false, encoding ) )
842 {
843 #ifdef DEBUG_PARSER
844 TIXML_LOG( "XML parsing Unknown(1)\n" );
845 #endif
846 returnNode = new TiXmlUnknown();
847 }
848 else if ( IsAlpha( *(p+1), encoding )
849 || *(p+1) == '_' )
850 {
851 #ifdef DEBUG_PARSER
852 TIXML_LOG( "XML parsing Element\n" );
853 #endif
854 returnNode = new TiXmlElement( "" );
855 }
856 else
857 {
858 #ifdef DEBUG_PARSER
859 TIXML_LOG( "XML parsing Unknown(2)\n" );
860 #endif
861 returnNode = new TiXmlUnknown();
862 }
863
864 if ( returnNode )
865 {
866 // Set the parent, so it can report errors
867 returnNode->parent = this;
868 }
869 else
870 {
871 if ( doc )
872 doc->SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, TIXML_ENCODING_UNKNOWN );
873 }
874 return returnNode;
875 }
876
877 #ifdef TIXML_USE_STL
878
StreamIn(TIXML_ISTREAM * in,TIXML_STRING * tag)879 void TiXmlElement::StreamIn (TIXML_ISTREAM * in, TIXML_STRING * tag)
880 {
881 // We're called with some amount of pre-parsing. That is, some of "this"
882 // element is in "tag". Go ahead and stream to the closing ">"
883 while( in->good() )
884 {
885 int c = in->get();
886 if ( c <= 0 )
887 {
888 TiXmlDocument* document = GetDocument();
889 if ( document )
890 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
891 return;
892 }
893 (*tag) += (char) c ;
894
895 if ( c == '>' )
896 break;
897 }
898
899 if ( tag->length() < 3 ) return;
900
901 // Okay...if we are a "/>" tag, then we're done. We've read a complete tag.
902 // If not, identify and stream.
903
904 if ( tag->at( tag->length() - 1 ) == '>'
905 && tag->at( tag->length() - 2 ) == '/' )
906 {
907 // All good!
908 return;
909 }
910 else if ( tag->at( tag->length() - 1 ) == '>' )
911 {
912 // There is more. Could be:
913 // text
914 // closing tag
915 // another node.
916 for ( ;; )
917 {
918 StreamWhiteSpace( in, tag );
919
920 // Do we have text?
921 if ( in->good() && in->peek() != '<' )
922 {
923 // Yep, text.
924 TiXmlText text( "" );
925 text.StreamIn( in, tag );
926
927 // What follows text is a closing tag or another node.
928 // Go around again and figure it out.
929 continue;
930 }
931
932 // We now have either a closing tag...or another node.
933 // We should be at a "<", regardless.
934 if ( !in->good() ) return;
935 assert( in->peek() == '<' );
936 size_t tagIndex = tag->length();
937
938 bool closingTag = false;
939 bool firstCharFound = false;
940
941 for( ;; )
942 {
943 if ( !in->good() )
944 return;
945
946 int c = in->peek();
947 if ( c <= 0 )
948 {
949 TiXmlDocument* document = GetDocument();
950 if ( document )
951 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
952 return;
953 }
954
955 if ( c == '>' )
956 break;
957
958 *tag += (char) c;
959 in->get();
960
961 if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) )
962 {
963 firstCharFound = true;
964 if ( c == '/' )
965 closingTag = true;
966 }
967 }
968 // If it was a closing tag, then read in the closing '>' to clean up the input stream.
969 // If it was not, the streaming will be done by the tag.
970 if ( closingTag )
971 {
972 if ( !in->good() )
973 return;
974
975 int c = in->get();
976 if ( c <= 0 )
977 {
978 TiXmlDocument* document = GetDocument();
979 if ( document )
980 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
981 return;
982 }
983 assert( c == '>' );
984 *tag += (char) c;
985
986 // We are done, once we've found our closing tag.
987 return;
988 }
989 else
990 {
991 // If not a closing tag, id it, and stream.
992 const char* tagloc = tag->c_str() + tagIndex;
993 TiXmlNode* node = Identify( tagloc, TIXML_DEFAULT_ENCODING );
994 if ( !node )
995 return;
996 node->StreamIn( in, tag );
997 delete node;
998 node = 0;
999
1000 // No return: go around from the beginning: text, closing tag, or node.
1001 }
1002 }
1003 }
1004 }
1005 #endif
1006
Parse(const char * p,TiXmlParsingData * data,TiXmlEncoding encoding)1007 const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1008 {
1009 p = SkipWhiteSpace( p, encoding );
1010 TiXmlDocument* document = GetDocument();
1011
1012 if ( !p || !*p )
1013 {
1014 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, 0, 0, encoding );
1015 return 0;
1016 }
1017
1018 // TiXmlParsingData data( p, prevData );
1019 if ( data )
1020 {
1021 data->Stamp( p, encoding );
1022 location = data->Cursor();
1023 }
1024
1025 if ( *p != '<' )
1026 {
1027 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, p, data, encoding );
1028 return 0;
1029 }
1030
1031 p = SkipWhiteSpace( p+1, encoding );
1032
1033 // Read the name.
1034 const char* pErr = p;
1035
1036 p = ReadName( p, &value, encoding );
1037 if ( !p || !*p )
1038 {
1039 if ( document ) document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME, pErr, data, encoding );
1040 return 0;
1041 }
1042
1043 TIXML_STRING endTag ("</");
1044 endTag += value;
1045 endTag += ">";
1046
1047 // Check for and read attributes. Also look for an empty
1048 // tag or an end tag.
1049 while ( p && *p )
1050 {
1051 pErr = p;
1052 p = SkipWhiteSpace( p, encoding );
1053 if ( !p || !*p )
1054 {
1055 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
1056 return 0;
1057 }
1058 if ( *p == '/' )
1059 {
1060 ++p;
1061 // Empty tag.
1062 if ( *p != '>' )
1063 {
1064 if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY, p, data, encoding );
1065 return 0;
1066 }
1067 return (p+1);
1068 }
1069 else if ( *p == '>' )
1070 {
1071 // Done with attributes (if there were any.)
1072 // Read the value -- which can include other
1073 // elements -- read the end tag, and return.
1074 ++p;
1075 p = ReadValue( p, data, encoding ); // Note this is an Element method, and will set the error if one happens.
1076 if ( !p || !*p )
1077 return 0;
1078
1079 // We should find the end tag now
1080 if ( StringEqual( p, endTag.c_str(), false, encoding ) )
1081 {
1082 p += endTag.length();
1083 return p;
1084 }
1085 else
1086 {
1087 if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
1088 return 0;
1089 }
1090 }
1091 else
1092 {
1093 // Try to read an attribute:
1094 TiXmlAttribute* attrib = new TiXmlAttribute();
1095 if ( !attrib )
1096 {
1097 if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY, pErr, data, encoding );
1098 return 0;
1099 }
1100
1101 attrib->SetDocument( document );
1102 const char* pErr = p;
1103 p = attrib->Parse( p, data, encoding );
1104
1105 if ( !p || !*p )
1106 {
1107 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
1108 delete attrib;
1109 return 0;
1110 }
1111
1112 // Handle the strange case of double attributes:
1113 TiXmlAttribute* node = attributeSet.Find( attrib->Name() );
1114 if ( node )
1115 {
1116 node->SetValue( attrib->Value() );
1117 delete attrib;
1118 return 0;
1119 }
1120
1121 attributeSet.Add( attrib );
1122 }
1123 }
1124 return p;
1125 }
1126
1127
ReadValue(const char * p,TiXmlParsingData * data,TiXmlEncoding encoding)1128 const char* TiXmlElement::ReadValue( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1129 {
1130 TiXmlDocument* document = GetDocument();
1131
1132 const char* pWithWhiteSpace = p;
1133 // Read in text and elements in any order.
1134 p = SkipWhiteSpace( p, encoding );
1135 while ( p && *p )
1136 {
1137 if ( *p != '<' )
1138 {
1139 // Take what we have, make a text element.
1140 TiXmlText* textNode = new TiXmlText( "" );
1141
1142 if ( !textNode )
1143 {
1144 if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, encoding );
1145 return 0;
1146 }
1147
1148 if ( TiXmlBase::IsWhiteSpaceCondensed() )
1149 {
1150 p = textNode->Parse( p, data, encoding );
1151 }
1152 else
1153 {
1154 // Special case: we want to keep the white space
1155 // so that leading spaces aren't removed.
1156 p = textNode->Parse( pWithWhiteSpace, data, encoding );
1157 }
1158
1159 if ( !textNode->Blank() )
1160 LinkEndChild( textNode );
1161 else
1162 delete textNode;
1163 }
1164 else
1165 {
1166 // We hit a '<'
1167 // Have we hit a new element or an end tag?
1168 if ( StringEqual( p, "</", false, encoding ) )
1169 {
1170 return p;
1171 }
1172 else
1173 {
1174 TiXmlNode* node = Identify( p, encoding );
1175 if ( node )
1176 {
1177 p = node->Parse( p, data, encoding );
1178 LinkEndChild( node );
1179 }
1180 else
1181 {
1182 return 0;
1183 }
1184 }
1185 }
1186 p = SkipWhiteSpace( p, encoding );
1187 }
1188
1189 if ( !p )
1190 {
1191 if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE, 0, 0, encoding );
1192 }
1193 return p;
1194 }
1195
1196
1197 #ifdef TIXML_USE_STL
StreamIn(TIXML_ISTREAM * in,TIXML_STRING * tag)1198 void TiXmlUnknown::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
1199 {
1200 while ( in->good() )
1201 {
1202 int c = in->get();
1203 if ( c <= 0 )
1204 {
1205 TiXmlDocument* document = GetDocument();
1206 if ( document )
1207 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1208 return;
1209 }
1210 (*tag) += (char) c;
1211
1212 if ( c == '>' )
1213 {
1214 // All is well.
1215 return;
1216 }
1217 }
1218 }
1219 #endif
1220
1221
Parse(const char * p,TiXmlParsingData * data,TiXmlEncoding encoding)1222 const char* TiXmlUnknown::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1223 {
1224 TiXmlDocument* document = GetDocument();
1225 p = SkipWhiteSpace( p, encoding );
1226
1227 // TiXmlParsingData data( p, prevData );
1228 if ( data )
1229 {
1230 data->Stamp( p, encoding );
1231 location = data->Cursor();
1232 }
1233 if ( !p || !*p || *p != '<' )
1234 {
1235 if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, p, data, encoding );
1236 return 0;
1237 }
1238 ++p;
1239 value = "";
1240
1241 while ( p && *p && *p != '>' )
1242 {
1243 value += *p;
1244 ++p;
1245 }
1246
1247 if ( !p )
1248 {
1249 if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding );
1250 }
1251 if ( *p == '>' )
1252 return p+1;
1253 return p;
1254 }
1255
1256 #ifdef TIXML_USE_STL
StreamIn(TIXML_ISTREAM * in,TIXML_STRING * tag)1257 void TiXmlComment::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
1258 {
1259 while ( in->good() )
1260 {
1261 int c = in->get();
1262 if ( c <= 0 )
1263 {
1264 TiXmlDocument* document = GetDocument();
1265 if ( document )
1266 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1267 return;
1268 }
1269
1270 (*tag) += (char) c;
1271
1272 if ( c == '>'
1273 && tag->at( tag->length() - 2 ) == '-'
1274 && tag->at( tag->length() - 3 ) == '-' )
1275 {
1276 // All is well.
1277 return;
1278 }
1279 }
1280 }
1281 #endif
1282
1283
Parse(const char * p,TiXmlParsingData * data,TiXmlEncoding encoding)1284 const char* TiXmlComment::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1285 {
1286 TiXmlDocument* document = GetDocument();
1287 value = "";
1288
1289 p = SkipWhiteSpace( p, encoding );
1290
1291 // TiXmlParsingData data( p, prevData );
1292 if ( data )
1293 {
1294 data->Stamp( p, encoding );
1295 location = data->Cursor();
1296 }
1297 const char* startTag = "<!--";
1298 const char* endTag = "-->";
1299
1300 if ( !StringEqual( p, startTag, false, encoding ) )
1301 {
1302 document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding );
1303 return 0;
1304 }
1305 p += strlen( startTag );
1306 p = ReadText( p, &value, false, endTag, false, encoding );
1307 return p;
1308 }
1309
1310
Parse(const char * p,TiXmlParsingData * data,TiXmlEncoding encoding)1311 const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1312 {
1313 p = SkipWhiteSpace( p, encoding );
1314 if ( !p || !*p ) return 0;
1315
1316 int tabsize = 4;
1317 if ( document )
1318 tabsize = document->TabSize();
1319
1320 // TiXmlParsingData data( p, prevData );
1321 if ( data )
1322 {
1323 data->Stamp( p, encoding );
1324 location = data->Cursor();
1325 }
1326 // Read the name, the '=' and the value.
1327 const char* pErr = p;
1328 p = ReadName( p, &name, encoding );
1329 if ( !p || !*p )
1330 {
1331 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
1332 return 0;
1333 }
1334 p = SkipWhiteSpace( p, encoding );
1335 if ( !p || !*p || *p != '=' )
1336 {
1337 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1338 return 0;
1339 }
1340
1341 ++p; // skip '='
1342 p = SkipWhiteSpace( p, encoding );
1343 if ( !p || !*p )
1344 {
1345 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1346 return 0;
1347 }
1348
1349 const char* end;
1350
1351 if ( *p == '\'' )
1352 {
1353 ++p;
1354 end = "\'";
1355 p = ReadText( p, &value, false, end, false, encoding );
1356 }
1357 else if ( *p == '"' )
1358 {
1359 ++p;
1360 end = "\"";
1361 p = ReadText( p, &value, false, end, false, encoding );
1362 }
1363 else
1364 {
1365 // All attribute values should be in single or double quotes.
1366 // But this is such a common error that the parser will try
1367 // its best, even without them.
1368 value = "";
1369 while ( p && *p // existence
1370 && !IsWhiteSpace( *p ) && *p != '\n' && *p != '\r' // whitespace
1371 && *p != '/' && *p != '>' ) // tag end
1372 {
1373 value += *p;
1374 ++p;
1375 }
1376 }
1377 return p;
1378 }
1379
1380 #ifdef TIXML_USE_STL
StreamIn(TIXML_ISTREAM * in,TIXML_STRING * tag)1381 void TiXmlText::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
1382 {
1383 while ( in->good() )
1384 {
1385 int c = in->peek();
1386 if ( c == '<' )
1387 return;
1388 if ( c <= 0 )
1389 {
1390 TiXmlDocument* document = GetDocument();
1391 if ( document )
1392 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1393 return;
1394 }
1395
1396 (*tag) += (char) c;
1397 in->get();
1398 }
1399 }
1400 #endif
1401
Parse(const char * p,TiXmlParsingData * data,TiXmlEncoding encoding)1402 const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1403 {
1404 value = "";
1405 // TiXmlParsingData data( p, prevData );
1406 if ( data )
1407 {
1408 data->Stamp( p, encoding );
1409 location = data->Cursor();
1410 }
1411 bool ignoreWhite = true;
1412
1413 const char* end = "<";
1414 p = ReadText( p, &value, ignoreWhite, end, false, encoding );
1415 if ( p )
1416 return p-1; // don't truncate the '<'
1417 return 0;
1418 }
1419
1420 #ifdef TIXML_USE_STL
StreamIn(TIXML_ISTREAM * in,TIXML_STRING * tag)1421 void TiXmlDeclaration::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
1422 {
1423 while ( in->good() )
1424 {
1425 int c = in->get();
1426 if ( c <= 0 )
1427 {
1428 TiXmlDocument* document = GetDocument();
1429 if ( document )
1430 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1431 return;
1432 }
1433 (*tag) += (char) c;
1434
1435 if ( c == '>' )
1436 {
1437 // All is well.
1438 return;
1439 }
1440 }
1441 }
1442 #endif
1443
Parse(const char * p,TiXmlParsingData * data,TiXmlEncoding _encoding)1444 const char* TiXmlDeclaration::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding _encoding )
1445 {
1446 p = SkipWhiteSpace( p, _encoding );
1447 // Find the beginning, find the end, and look for
1448 // the stuff in-between.
1449 TiXmlDocument* document = GetDocument();
1450 if ( !p || !*p || !StringEqual( p, "<?xml", true, _encoding ) )
1451 {
1452 if ( document ) document->SetError( TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding );
1453 return 0;
1454 }
1455 // TiXmlParsingData data( p, prevData );
1456 if ( data )
1457 {
1458 data->Stamp( p, _encoding );
1459 location = data->Cursor();
1460 }
1461 p += 5;
1462
1463 version = "";
1464 encoding = "";
1465 standalone = "";
1466
1467 while ( p && *p )
1468 {
1469 if ( *p == '>' )
1470 {
1471 ++p;
1472 return p;
1473 }
1474
1475 p = SkipWhiteSpace( p, _encoding );
1476 if ( StringEqual( p, "version", true, _encoding ) )
1477 {
1478 TiXmlAttribute attrib;
1479 p = attrib.Parse( p, data, _encoding );
1480 version = attrib.Value();
1481 }
1482 else if ( StringEqual( p, "encoding", true, _encoding ) )
1483 {
1484 TiXmlAttribute attrib;
1485 p = attrib.Parse( p, data, _encoding );
1486 encoding = attrib.Value();
1487 }
1488 else if ( StringEqual( p, "standalone", true, _encoding ) )
1489 {
1490 TiXmlAttribute attrib;
1491 p = attrib.Parse( p, data, _encoding );
1492 standalone = attrib.Value();
1493 }
1494 else
1495 {
1496 // Read over whatever it is.
1497 while( p && *p && *p != '>' && !IsWhiteSpace( *p ) )
1498 ++p;
1499 }
1500 }
1501 return 0;
1502 }
1503
Blank() const1504 bool TiXmlText::Blank() const
1505 {
1506 for ( unsigned i=0; i<value.length(); i++ )
1507 if ( !IsWhiteSpace( value[i] ) )
1508 return false;
1509 return true;
1510 }
1511
1512