1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  *   Licensed to the Apache Software Foundation (ASF) under one or more
12  *   contributor license agreements. See the NOTICE file distributed
13  *   with this work for additional information regarding copyright
14  *   ownership. The ASF licenses this file to you under the Apache
15  *   License, Version 2.0 (the "License"); you may not use this file
16  *   except in compliance with the License. You may obtain a copy of
17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include <sal/config.h>
21 
22 #include <cassert>
23 #include <climits>
24 
25 #include <com/sun/star/container/NoSuchElementException.hpp>
26 #include <com/sun/star/uno/RuntimeException.hpp>
27 #include <osl/file.h>
28 #include <rtl/character.hxx>
29 #include <rtl/string.h>
30 #include <rtl/ustring.hxx>
31 #include <sal/log.hxx>
32 #include <sal/types.h>
33 #include <xmlreader/pad.hxx>
34 #include <xmlreader/span.hxx>
35 #include <xmlreader/xmlreader.hxx>
36 
37 namespace xmlreader {
38 
39 namespace {
40 
// Returns true iff c is one of the four XML white space characters:
// tab (0x09), line feed (0x0A), carriage return (0x0D) or space (0x20).
bool isSpace(char c) {
    return c == ' ' || c == '\x09' || c == '\x0A' || c == '\x0D';
}
52 
53 }
54 
XmlReader(OUString const & fileUrl)55 XmlReader::XmlReader(OUString const & fileUrl)
56     : fileUrl_(fileUrl)
57     , fileHandle_(nullptr)
58 {
59     oslFileError e = osl_openFile(
60         fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read);
61     switch (e)
62     {
63     case osl_File_E_None:
64         break;
65     case osl_File_E_NOENT:
66         throw css::container::NoSuchElementException( fileUrl_ );
67     default:
68         throw css::uno::RuntimeException(
69             "cannot open " + fileUrl_ + ": " + OUString::number(e));
70     }
71     e = osl_getFileSize(fileHandle_, &fileSize_);
72     if (e == osl_File_E_None) {
73         e = osl_mapFile(
74             fileHandle_, &fileAddress_, fileSize_, 0,
75             osl_File_MapFlag_WillNeed);
76     }
77     if (e != osl_File_E_None) {
78         oslFileError e2 = osl_closeFile(fileHandle_);
79         if (e2 != osl_File_E_None) {
80             SAL_WARN(
81                 "xmlreader",
82                 "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +e2);
83         }
84         throw css::uno::RuntimeException(
85             "cannot mmap " + fileUrl_ + " (" + OUString::number(e) + ")" );
86     }
87     namespaceIris_.emplace_back("http://www.w3.org/XML/1998/namespace");
88     namespaces_.emplace_back(Span("xml"), NAMESPACE_XML);
89     pos_ = static_cast< char * >(fileAddress_);
90     end_ = pos_ + fileSize_;
91     state_ = State::Content;
92     firstAttribute_ = true;
93 }
94 
~XmlReader()95 XmlReader::~XmlReader() {
96     if (!fileHandle_)
97         return;
98     oslFileError e = osl_unmapMappedFile(fileHandle_, fileAddress_, fileSize_);
99     if (e != osl_File_E_None) {
100         SAL_WARN(
101             "xmlreader",
102             "osl_unmapMappedFile of \"" << fileUrl_ << "\" failed with " << +e);
103     }
104     e = osl_closeFile(fileHandle_);
105     if (e != osl_File_E_None) {
106         SAL_WARN(
107             "xmlreader",
108             "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +e);
109     }
110 }
111 
registerNamespaceIri(Span const & iri)112 int XmlReader::registerNamespaceIri(Span const & iri) {
113     int id = toNamespaceId(namespaceIris_.size());
114     namespaceIris_.push_back(iri);
115     if (iri == "http://www.w3.org/2001/XMLSchema-instance") {
116         // Old user layer .xcu files used the xsi namespace prefix without
117         // declaring a corresponding namespace binding, see issue 77174; reading
118         // those files during migration would fail without this hack that can be
119         // removed once migration is no longer relevant (see
120         // configmgr::Components::parseModificationLayer):
121         namespaces_.emplace_back(Span("xsi"), id);
122     }
123     return id;
124 }
125 
nextItem(Text reportText,Span * data,int * nsId)126 XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId)
127 {
128     switch (state_) {
129     case State::Content:
130         switch (reportText) {
131         case Text::NONE:
132             return handleSkippedText(data, nsId);
133         case Text::Raw:
134             return handleRawText(data);
135         default: // Text::Normalized
136             return handleNormalizedText(data);
137         }
138     case State::StartTag:
139         return handleStartTag(nsId, data);
140     case State::EndTag:
141         return handleEndTag();
142     case State::EmptyElementTag:
143         handleElementEnd();
144         return Result::End;
145     default: // State::Done
146         return Result::Done;
147     }
148 }
149 
nextAttribute(int * nsId,Span * localName)150 bool XmlReader::nextAttribute(int * nsId, Span * localName) {
151     assert(nsId != nullptr && localName != nullptr);
152     if (firstAttribute_) {
153         currentAttribute_ = attributes_.begin();
154         firstAttribute_ = false;
155     } else {
156         ++currentAttribute_;
157     }
158     if (currentAttribute_ == attributes_.end()) {
159         return false;
160     }
161     if (currentAttribute_->nameColon == nullptr) {
162         *nsId = NAMESPACE_NONE;
163         *localName = Span(
164             currentAttribute_->nameBegin,
165             currentAttribute_->nameEnd - currentAttribute_->nameBegin);
166     } else {
167         *nsId = getNamespaceId(
168             Span(
169                 currentAttribute_->nameBegin,
170                 currentAttribute_->nameColon - currentAttribute_->nameBegin));
171         *localName = Span(
172             currentAttribute_->nameColon + 1,
173             currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1));
174     }
175     return true;
176 }
177 
getAttributeValue(bool fullyNormalize)178 Span XmlReader::getAttributeValue(bool fullyNormalize) {
179     return handleAttributeValue(
180         currentAttribute_->valueBegin, currentAttribute_->valueEnd,
181         fullyNormalize);
182 }
183 
getNamespaceId(Span const & prefix) const184 int XmlReader::getNamespaceId(Span const & prefix) const {
185     auto i = std::find_if(namespaces_.crbegin(), namespaces_.crend(),
186         [&prefix](const NamespaceData& rNamespaceData) { return prefix == rNamespaceData.prefix; });
187 
188     if (i != namespaces_.rend())
189         return i->nsId;
190 
191     return NAMESPACE_UNKNOWN;
192 }
193 
194 
normalizeLineEnds(Span const & text)195 void XmlReader::normalizeLineEnds(Span const & text) {
196     char const * p = text.begin;
197     sal_Int32 n = text.length;
198     for (;;) {
199         sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D');
200         if (i < 0) {
201             break;
202         }
203         pad_.add(p, i);
204         p += i + 1;
205         n -= i + 1;
206         if (n == 0 || *p != '\x0A') {
207             pad_.add("\x0A");
208         }
209     }
210     pad_.add(p, n);
211 }
212 
skipSpace()213 void XmlReader::skipSpace() {
214     while (isSpace(peek())) {
215         ++pos_;
216     }
217 }
218 
skipComment()219 bool XmlReader::skipComment() {
220     if (rtl_str_shortenedCompare_WithLength(
221             pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"),
222             RTL_CONSTASCII_LENGTH("--")) !=
223         0)
224     {
225         return false;
226     }
227     pos_ += RTL_CONSTASCII_LENGTH("--");
228     sal_Int32 i = rtl_str_indexOfStr_WithLength(
229         pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"));
230     if (i < 0) {
231         throw css::uno::RuntimeException(
232             "premature end (within comment) of " + fileUrl_ );
233     }
234     pos_ += i + RTL_CONSTASCII_LENGTH("--");
235     if (read() != '>') {
236         throw css::uno::RuntimeException(
237             "illegal \"--\" within comment in " + fileUrl_ );
238     }
239     return true;
240 }
241 
skipProcessingInstruction()242 void XmlReader::skipProcessingInstruction() {
243     sal_Int32 i = rtl_str_indexOfStr_WithLength(
244         pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>"));
245     if (i < 0) {
246         throw css::uno::RuntimeException(
247             "bad '<?' in " + fileUrl_ );
248     }
249     pos_ += i + RTL_CONSTASCII_LENGTH("?>");
250 }
251 
skipDocumentTypeDeclaration()252 void XmlReader::skipDocumentTypeDeclaration() {
253     // Neither is it checked that the doctypedecl is at the correct position in
254     // the document, nor that it is well-formed:
255     for (;;) {
256         char c = read();
257         switch (c) {
258         case '\0': // i.e., EOF
259             throw css::uno::RuntimeException(
260                 "premature end (within DTD) of " + fileUrl_ );
261         case '"':
262         case '\'':
263             {
264                 sal_Int32 i = rtl_str_indexOfChar_WithLength(
265                     pos_, end_ - pos_, c);
266                 if (i < 0) {
267                     throw css::uno::RuntimeException(
268                         "premature end (within DTD) of " + fileUrl_ );
269                 }
270                 pos_ += i + 1;
271             }
272             break;
273         case '>':
274             return;
275         case '[':
276             for (;;) {
277                 c = read();
278                 switch (c) {
279                 case '\0': // i.e., EOF
280                     throw css::uno::RuntimeException(
281                         "premature end (within DTD) of " + fileUrl_ );
282                 case '"':
283                 case '\'':
284                     {
285                         sal_Int32 i = rtl_str_indexOfChar_WithLength(
286                             pos_, end_ - pos_, c);
287                         if (i < 0) {
288                             throw css::uno::RuntimeException(
289                                 "premature end (within DTD) of " + fileUrl_ );
290                         }
291                         pos_ += i + 1;
292                     }
293                     break;
294                 case '<':
295                     switch (read()) {
296                     case '\0': // i.e., EOF
297                         throw css::uno::RuntimeException(
298                             "premature end (within DTD) of " + fileUrl_ );
299                     case '!':
300                         skipComment();
301                         break;
302                     case '?':
303                         skipProcessingInstruction();
304                         break;
305                     default:
306                         break;
307                     }
308                     break;
309                 case ']':
310                     skipSpace();
311                     if (read() != '>') {
312                         throw css::uno::RuntimeException(
313                             "missing \">\" of DTD in " + fileUrl_ );
314                     }
315                     return;
316                 default:
317                     break;
318                 }
319             }
320         default:
321             break;
322         }
323     }
324 }
325 
scanCdataSection()326 Span XmlReader::scanCdataSection() {
327     if (rtl_str_shortenedCompare_WithLength(
328             pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["),
329             RTL_CONSTASCII_LENGTH("[CDATA[")) !=
330         0)
331     {
332         return Span();
333     }
334     pos_ += RTL_CONSTASCII_LENGTH("[CDATA[");
335     char const * begin = pos_;
336     sal_Int32 i = rtl_str_indexOfStr_WithLength(
337         pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>"));
338     if (i < 0) {
339         throw css::uno::RuntimeException(
340             "premature end (within CDATA section) of " + fileUrl_ );
341     }
342     pos_ += i + RTL_CONSTASCII_LENGTH("]]>");
343     return Span(begin, i);
344 }
345 
scanName(char const ** nameColon)346 bool XmlReader::scanName(char const ** nameColon) {
347     assert(nameColon != nullptr && *nameColon == nullptr);
348     for (char const * begin = pos_;; ++pos_) {
349         switch (peek()) {
350         case '\0': // i.e., EOF
351         case '\x09':
352         case '\x0A':
353         case '\x0D':
354         case ' ':
355         case '/':
356         case '=':
357         case '>':
358             return pos_ != begin;
359         case ':':
360             *nameColon = pos_;
361             break;
362         default:
363             break;
364         }
365     }
366 }
367 
scanNamespaceIri(char const * begin,char const * end)368 int XmlReader::scanNamespaceIri(char const * begin, char const * end) {
369     assert(begin != nullptr && begin <= end);
370     Span iri(handleAttributeValue(begin, end, false));
371     for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) {
372         if (namespaceIris_[i] == iri) {
373             return toNamespaceId(i);
374         }
375     }
376     return XmlReader::NAMESPACE_UNKNOWN;
377 }
378 
handleReference(char const * position,char const * end)379 char const * XmlReader::handleReference(char const * position, char const * end)
380 {
381     assert(position != nullptr && *position == '&' && position < end);
382     ++position;
383     if (*position == '#') {
384         ++position;
385         sal_uInt32 val = 0;
386         char const * p;
387         if (*position == 'x') {
388             ++position;
389             p = position;
390             for (;; ++position) {
391                 char c = *position;
392                 if (c >= '0' && c <= '9') {
393                     val = 16 * val + (c - '0');
394                 } else if (c >= 'A' && c <= 'F') {
395                     val = 16 * val + (c - 'A') + 10;
396                 } else if (c >= 'a' && c <= 'f') {
397                     val = 16 * val + (c - 'a') + 10;
398                 } else {
399                     break;
400                 }
401                 if (!rtl::isUnicodeCodePoint(val)) { // avoid overflow
402                     throw css::uno::RuntimeException(
403                         "'&#x...' too large in " + fileUrl_ );
404                 }
405             }
406         } else {
407             p = position;
408             for (;; ++position) {
409                 char c = *position;
410                 if (c >= '0' && c <= '9') {
411                     val = 10 * val + (c - '0');
412                 } else {
413                     break;
414                 }
415                 if (!rtl::isUnicodeCodePoint(val)) { // avoid overflow
416                     throw css::uno::RuntimeException(
417                         "'&#...' too large in " + fileUrl_ );
418                 }
419             }
420         }
421         if (position == p || *position++ != ';') {
422             throw css::uno::RuntimeException(
423                 "'&#...' missing ';' in " + fileUrl_ );
424         }
425         assert(rtl::isUnicodeCodePoint(val));
426         if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) ||
427             (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF)
428         {
429             throw css::uno::RuntimeException(
430                 "character reference denoting invalid character in " + fileUrl_ );
431         }
432         char buf[4];
433         sal_Int32 len;
434         if (val < 0x80) {
435             buf[0] = static_cast< char >(val);
436             len = 1;
437         } else if (val < 0x800) {
438             buf[0] = static_cast< char >((val >> 6) | 0xC0);
439             buf[1] = static_cast< char >((val & 0x3F) | 0x80);
440             len = 2;
441         } else if (val < 0x10000) {
442             buf[0] = static_cast< char >((val >> 12) | 0xE0);
443             buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
444             buf[2] = static_cast< char >((val & 0x3F) | 0x80);
445             len = 3;
446         } else {
447             buf[0] = static_cast< char >((val >> 18) | 0xF0);
448             buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80);
449             buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
450             buf[3] = static_cast< char >((val & 0x3F) | 0x80);
451             len = 4;
452         }
453         pad_.addEphemeral(buf, len);
454         return position;
455     } else {
456         struct EntityRef {
457             char const * inBegin;
458             sal_Int32 const inLength;
459             char const * outBegin;
460             sal_Int32 const outLength;
461         };
462         static EntityRef const refs[] = {
463             { RTL_CONSTASCII_STRINGPARAM("amp;"),
464               RTL_CONSTASCII_STRINGPARAM("&") },
465             { RTL_CONSTASCII_STRINGPARAM("lt;"),
466               RTL_CONSTASCII_STRINGPARAM("<") },
467             { RTL_CONSTASCII_STRINGPARAM("gt;"),
468               RTL_CONSTASCII_STRINGPARAM(">") },
469             { RTL_CONSTASCII_STRINGPARAM("apos;"),
470               RTL_CONSTASCII_STRINGPARAM("'") },
471             { RTL_CONSTASCII_STRINGPARAM("quot;"),
472               RTL_CONSTASCII_STRINGPARAM("\"") } };
473         for (const auto & ref : refs) {
474             if (rtl_str_shortenedCompare_WithLength(
475                     position, end - position, ref.inBegin, ref.inLength,
476                     ref.inLength) ==
477                 0)
478             {
479                 position += ref.inLength;
480                 pad_.add(ref.outBegin, ref.outLength);
481                 return position;
482             }
483         }
484         throw css::uno::RuntimeException(
485             "unknown entity reference in " + fileUrl_ );
486     }
487 }
488 
handleAttributeValue(char const * begin,char const * end,bool fullyNormalize)489 Span XmlReader::handleAttributeValue(
490     char const * begin, char const * end, bool fullyNormalize)
491 {
492     pad_.clear();
493     if (fullyNormalize) {
494         while (begin != end && isSpace(*begin)) {
495             ++begin;
496         }
497         while (end != begin && isSpace(end[-1])) {
498             --end;
499         }
500         char const * p = begin;
501         enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
502             // a single true space character can go into the current span,
503             // everything else breaks the span
504         Space space = SPACE_NONE;
505         while (p != end) {
506             switch (*p) {
507             case '\x09':
508             case '\x0A':
509             case '\x0D':
510                 switch (space) {
511                 case SPACE_NONE:
512                     pad_.add(begin, p - begin);
513                     pad_.add(" ");
514                     space = SPACE_BREAK;
515                     break;
516                 case SPACE_SPAN:
517                     pad_.add(begin, p - begin);
518                     space = SPACE_BREAK;
519                     break;
520                 case SPACE_BREAK:
521                     break;
522                 }
523                 begin = ++p;
524                 break;
525             case ' ':
526                 switch (space) {
527                 case SPACE_NONE:
528                     ++p;
529                     space = SPACE_SPAN;
530                     break;
531                 case SPACE_SPAN:
532                     pad_.add(begin, p - begin);
533                     begin = ++p;
534                     space = SPACE_BREAK;
535                     break;
536                 case SPACE_BREAK:
537                     begin = ++p;
538                     break;
539                 }
540                 break;
541             case '&':
542                 pad_.add(begin, p - begin);
543                 p = handleReference(p, end);
544                 begin = p;
545                 space = SPACE_NONE;
546                 break;
547             default:
548                 ++p;
549                 space = SPACE_NONE;
550                 break;
551             }
552         }
553         pad_.add(begin, p - begin);
554     } else {
555         char const * p = begin;
556         while (p != end) {
557             switch (*p) {
558             case '\x09':
559             case '\x0A':
560                 pad_.add(begin, p - begin);
561                 begin = ++p;
562                 pad_.add(" ");
563                 break;
564             case '\x0D':
565                 pad_.add(begin, p - begin);
566                 ++p;
567                 if (peek() == '\x0A') {
568                     ++p;
569                 }
570                 begin = p;
571                 pad_.add(" ");
572                 break;
573             case '&':
574                 pad_.add(begin, p - begin);
575                 p = handleReference(p, end);
576                 begin = p;
577                 break;
578             default:
579                 ++p;
580                 break;
581             }
582         }
583         pad_.add(begin, p - begin);
584     }
585     return pad_.get();
586 }
587 
// Parses a start tag or empty-element tag whose name begins at pos_ (i.e.,
// the leading '<' has already been consumed).
//
// The element name and all attributes are scanned.  Attributes named "xmlns"
// or "xmlns:<prefix>" are not recorded in attributes_; they set the element's
// default namespace or add a prefix binding to namespaces_, respectively.  All
// other attributes are recorded unparsed (name and raw value positions) for
// later retrieval via nextAttribute/getAttributeValue.
//
// On return, *nsId and *localName describe the element name, an ElementData
// entry has been pushed onto elements_, and state_ is State::EmptyElementTag
// for "<.../>" or State::Content otherwise.
//
// Always returns Result::Begin; throws css::uno::RuntimeException on any
// syntax error.
XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) {
    assert(nsId != nullptr && localName);
    char const * nameBegin = pos_;
    char const * nameColon = nullptr;
    if (!scanName(&nameColon)) {
        throw css::uno::RuntimeException(
            "bad tag name in " + fileUrl_ );
    }
    char const * nameEnd = pos_;
    // Remember how many bindings existed before this element, so that
    // handleElementEnd can drop the ones added below:
    NamespaceList::size_type inheritedNamespaces = namespaces_.size();
    bool hasDefaultNs = false;
    int defaultNsId = NAMESPACE_NONE;
    attributes_.clear();
    for (;;) {
        char const * p = pos_;
        skipSpace();
        if (peek() == '/' || peek() == '>') {
            break;
        }
        // skipSpace did not move, so no white space separates this attribute
        // from what precedes it:
        if (pos_ == p) {
            throw css::uno::RuntimeException(
                "missing whitespace before attribute in " + fileUrl_ );
        }
        char const * attrNameBegin = pos_;
        char const * attrNameColon = nullptr;
        if (!scanName(&attrNameColon)) {
            throw css::uno::RuntimeException(
                "bad attribute name in " + fileUrl_ );
        }
        char const * attrNameEnd = pos_;
        skipSpace();
        if (read() != '=') {
            throw css::uno::RuntimeException(
                "missing '=' in " + fileUrl_ );
        }
        skipSpace();
        // The value is delimited by a matching pair of single or double
        // quotes:
        char del = read();
        if (del != '\'' && del != '"') {
            throw css::uno::RuntimeException(
                "bad attribute value in " + fileUrl_ );
        }
        char const * valueBegin = pos_;
        sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del);
        if (i < 0) {
            throw css::uno::RuntimeException(
                "unterminated attribute value in " + fileUrl_ );
        }
        char const * valueEnd = pos_ + i;
        pos_ += i + 1;
        if (attrNameColon == nullptr &&
            Span(attrNameBegin, attrNameEnd - attrNameBegin) == "xmlns")
        {
            // Default namespace declaration:
            hasDefaultNs = true;
            defaultNsId = scanNamespaceIri(valueBegin, valueEnd);
        } else if (attrNameColon != nullptr &&
                   Span(attrNameBegin, attrNameColon - attrNameBegin) ==
                       "xmlns")
        {
            // Prefix binding; the part after the colon is the bound prefix:
            namespaces_.emplace_back(
                    Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)),
                    scanNamespaceIri(valueBegin, valueEnd));
        } else {
            attributes_.emplace_back(
                    attrNameBegin, attrNameEnd, attrNameColon, valueBegin,
                    valueEnd);
        }
    }
    // Without an own declaration, the default namespace is inherited from the
    // enclosing element (if any):
    if (!hasDefaultNs && !elements_.empty()) {
        defaultNsId = elements_.top().defaultNamespaceId;
    }
    firstAttribute_ = true;
    if (peek() == '/') {
        state_ = State::EmptyElementTag;
        ++pos_;
    } else {
        state_ = State::Content;
    }
    if (peek() != '>') {
        throw css::uno::RuntimeException(
            "missing '>' in " + fileUrl_ );
    }
    ++pos_;
    elements_.push(
        ElementData(
            Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces,
            defaultNsId));
    // The element name is resolved only now, after all namespace declarations
    // of this tag have been processed:
    if (nameColon == nullptr) {
        *nsId = defaultNsId;
        *localName = Span(nameBegin, nameEnd - nameBegin);
    } else {
        *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin));
        *localName = Span(nameColon + 1, nameEnd - (nameColon + 1));
    }
    return Result::Begin;
}
683 
handleEndTag()684 XmlReader::Result XmlReader::handleEndTag() {
685     if (elements_.empty()) {
686         throw css::uno::RuntimeException(
687             "spurious end tag in " + fileUrl_ );
688     }
689     char const * nameBegin = pos_;
690     char const * nameColon = nullptr;
691     if (!scanName(&nameColon) ||
692         !elements_.top().name.equals(nameBegin, pos_ - nameBegin))
693     {
694         throw css::uno::RuntimeException(
695             "tag mismatch in " + fileUrl_ );
696     }
697     handleElementEnd();
698     skipSpace();
699     if (peek() != '>') {
700         throw css::uno::RuntimeException(
701             "missing '>' in " + fileUrl_ );
702     }
703     ++pos_;
704     return Result::End;
705 }
706 
handleElementEnd()707 void XmlReader::handleElementEnd() {
708     assert(!elements_.empty());
709     auto end = elements_.top().inheritedNamespaces;
710     namespaces_.resize(end);
711     elements_.pop();
712     state_ = elements_.empty() ? State::Done : State::Content;
713 }
714 
handleSkippedText(Span * data,int * nsId)715 XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) {
716     for (;;) {
717         auto i = static_cast<const char*>(std::memchr(pos_, '<', end_ - pos_));
718         if (!i) {
719             throw css::uno::RuntimeException(
720                 "premature end of " + fileUrl_ );
721         }
722         pos_ = i + 1;
723         switch (peek()) {
724         case '!':
725             ++pos_;
726             if (!skipComment() && !scanCdataSection().is()) {
727                 skipDocumentTypeDeclaration();
728             }
729             break;
730         case '/':
731             ++pos_;
732             return handleEndTag();
733         case '?':
734             ++pos_;
735             skipProcessingInstruction();
736             break;
737         default:
738             return handleStartTag(nsId, data);
739         }
740     }
741 }
742 
// Collects the character data up to the next start or end tag into pad_,
// stores the result in *text, and returns Result::Text.
//
// The text is not white-space normalized, except that line ends are: a CR
// not followed by LF is replaced by LF, and the CR of a CR LF pair is dropped.
// References are expanded via handleReference, CDATA section content is
// appended (with line ends normalized), and comments, processing instructions
// and document type declarations are skipped.
//
// On return, state_ is State::EndTag (with pos_ past the "</") or
// State::StartTag (with pos_ past the '<').  Throws
// css::uno::RuntimeException if the input ends before a tag is found.
XmlReader::Result XmlReader::handleRawText(Span * text) {
    pad_.clear();
    // begin marks the start of the pending run of plain characters that has
    // not yet been copied to pad_:
    for (char const * begin = pos_;;) {
        switch (peek()) {
        case '\0': // i.e., EOF
            throw css::uno::RuntimeException(
                "premature end of " + fileUrl_ );
        case '\x0D':
            pad_.add(begin, pos_ - begin);
            ++pos_;
            // For CR LF, the LF stays part of the next run; a lone CR is
            // replaced by an explicit LF:
            if (peek() != '\x0A') {
                pad_.add("\x0A");
            }
            begin = pos_;
            break;
        case '&':
            pad_.add(begin, pos_ - begin);
            pos_ = handleReference(pos_, end_);
            begin = pos_;
            break;
        case '<':
            // Flush the pending run before looking at the markup:
            pad_.add(begin, pos_ - begin);
            ++pos_;
            switch (peek()) {
            case '!':
                ++pos_;
                // Comment, CDATA section, or else a document type declaration:
                if (!skipComment()) {
                    Span cdata(scanCdataSection());
                    if (cdata.is()) {
                        normalizeLineEnds(cdata);
                    } else {
                        skipDocumentTypeDeclaration();
                    }
                }
                begin = pos_;
                break;
            case '/':
                *text = pad_.get();
                ++pos_;
                state_ = State::EndTag;
                return Result::Text;
            case '?':
                ++pos_;
                skipProcessingInstruction();
                begin = pos_;
                break;
            default:
                *text = pad_.get();
                state_ = State::StartTag;
                return Result::Text;
            }
            break;
        default:
            ++pos_;
            break;
        }
    }
}
801 
// Collects the character data up to the next start or end tag into pad_ with
// white space normalized, stores the result in *text, and returns
// Result::Text.
//
// Leading and trailing white space is dropped and inner runs of white space
// are collapsed to a single space.  References are expanded via
// handleReference and CDATA section content is appended (with line ends
// normalized); neither is white-space normalized.  Comments and processing
// instructions are skipped and act as a break in the text.
//
// On return, state_ is State::EndTag (with pos_ past the "</") or
// State::StartTag (with pos_ past the '<').  Throws
// css::uno::RuntimeException if the input ends before a tag is found.
XmlReader::Result XmlReader::handleNormalizedText(Span * text) {
    pad_.clear();
    // [flowBegin, flowEnd) is the pending run of input that can be copied to
    // pad_ verbatim; flowEnd is just past the last non-space character seen:
    char const * flowBegin = pos_;
    char const * flowEnd = pos_;
    enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
        // a single true space character can go into the current flow,
        // everything else breaks the flow
    Space space = SPACE_START;
    for (;;) {
        switch (peek()) {
        case '\0': // i.e., EOF
            throw css::uno::RuntimeException(
                "premature end of " + fileUrl_ );
        case '\x09':
        case '\x0A':
        case '\x0D':
            switch (space) {
            case SPACE_START:
            case SPACE_BREAK:
                break;
            case SPACE_NONE:
            case SPACE_SPAN:
                space = SPACE_BREAK;
                break;
            }
            ++pos_;
            break;
        case ' ':
            switch (space) {
            case SPACE_START:
            case SPACE_BREAK:
                break;
            case SPACE_NONE:
                space = SPACE_SPAN;
                break;
            case SPACE_SPAN:
                space = SPACE_BREAK;
                break;
            }
            ++pos_;
            break;
        case '&':
            // Flush what is pending before the expanded reference is added:
            switch (space) {
            case SPACE_START:
                break;
            case SPACE_NONE:
            case SPACE_SPAN:
                // Up to pos_, i.e. including a single pending space:
                pad_.add(flowBegin, pos_ - flowBegin);
                break;
            case SPACE_BREAK:
                pad_.add(flowBegin, flowEnd - flowBegin);
                pad_.add(" ");
                break;
            }
            pos_ = handleReference(pos_, end_);
            flowBegin = pos_;
            flowEnd = pos_;
            space = SPACE_NONE;
            break;
        case '<':
            ++pos_;
            switch (peek()) {
            case '!':
                ++pos_;
                if (skipComment()) {
                    // NOTE(review): this also replaces SPACE_START, so text
                    // following a leading comment is preceded by a single
                    // space in the result -- confirm that this is intended.
                    space = SPACE_BREAK;
                } else {
                    Span cdata(scanCdataSection());
                    if (cdata.is()) {
                        // CDATA is not normalized (similar to character
                        // references; it keeps the code simple), but it might
                        // arguably be better to normalize it:
                        switch (space) {
                        case SPACE_START:
                            break;
                        case SPACE_NONE:
                        case SPACE_SPAN:
                            pad_.add(flowBegin, pos_ - flowBegin);
                            break;
                        case SPACE_BREAK:
                            pad_.add(flowBegin, flowEnd - flowBegin);
                            pad_.add(" ");
                            break;
                        }
                        normalizeLineEnds(cdata);
                        flowBegin = pos_;
                        flowEnd = pos_;
                        space = SPACE_NONE;
                    } else {
                        skipDocumentTypeDeclaration();
                    }
                }
                break;
            case '/':
                ++pos_;
                // Only up to flowEnd, which drops any trailing white space:
                pad_.add(flowBegin, flowEnd - flowBegin);
                *text = pad_.get();
                state_ = State::EndTag;
                return Result::Text;
            case '?':
                ++pos_;
                skipProcessingInstruction();
                // NOTE(review): as for comments above, this also replaces
                // SPACE_START -- confirm that this is intended.
                space = SPACE_BREAK;
                break;
            default:
                pad_.add(flowBegin, flowEnd - flowBegin);
                *text = pad_.get();
                state_ = State::StartTag;
                return Result::Text;
            }
            break;
        default:
            // Any other character is part of the text proper:
            switch (space) {
            case SPACE_START:
                flowBegin = pos_;
                break;
            case SPACE_NONE:
            case SPACE_SPAN:
                break;
            case SPACE_BREAK:
                // Emit the previous flow and one separating space, then start
                // a new flow at this character:
                pad_.add(flowBegin, flowEnd - flowBegin);
                pad_.add(" ");
                flowBegin = pos_;
                break;
            }
            flowEnd = ++pos_;
            space = SPACE_NONE;
            break;
        }
    }
}
933 
toNamespaceId(NamespaceIris::size_type pos)934 int XmlReader::toNamespaceId(NamespaceIris::size_type pos) {
935     assert(pos <= INT_MAX);
936     return static_cast< int >(pos);
937 }
938 
939 }
940 
941 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
942