1 /*
2  * Copyright (C) 2020 Linux Studio Plugins Project <https://lsp-plug.in/>
3  *           (C) 2020 Vladimir Sadovnikov <sadko4u@gmail.com>
4  *
5  * This file is part of lsp-plugins
6  * Created on: 24 окт. 2019 г.
7  *
8  * lsp-plugins is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU Lesser General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * any later version.
12  *
13  * lsp-plugins is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public License
19  * along with lsp-plugins. If not, see <https://www.gnu.org/licenses/>.
20  */
21 
22 #include <core/io/InStringSequence.h>
23 #include <core/io/InSequence.h>
24 #include <core/io/InFileStream.h>
25 #include <core/files/xml/PullParser.h>
26 #include <ctype.h>
27 #include <wctype.h>
28 
29 namespace lsp
30 {
31     namespace xml
32     {
33 
PullParser()34         PullParser::PullParser()
35         {
36             pIn         = NULL;
37             nWFlags     = 0;
38             nToken      = -STATUS_NO_DATA;
39             nState      = PS_READ_MISC;
40             enVersion   = XML_VERSION_1_0;
41             nFlags      = 0;
42             nStates     = 0;
43 
44             nUngetch    = 0;
45         }
46 
~PullParser()47         PullParser::~PullParser()
48         {
49             close();
50         }
51 
open(const char * path,const char * charset)52         status_t PullParser::open(const char *path, const char *charset)
53         {
54             if (pIn != NULL)
55                 return STATUS_BAD_STATE;
56             else if (path == NULL)
57                 return STATUS_BAD_ARGUMENTS;
58 
59             io::InFileStream *ifs = new io::InFileStream();
60             if (ifs == NULL)
61                 return STATUS_NO_MEM;
62             status_t res = ifs->open(path);
63             if (res == STATUS_OK)
64             {
65                 res     = wrap(ifs, WRAP_CLOSE | WRAP_DELETE, charset);
66                 if (res == STATUS_OK)
67                     return res;
68                 ifs->close();
69             }
70             delete ifs;
71 
72             return res;
73         }
74 
open(const LSPString * path,const char * charset)75         status_t PullParser::open(const LSPString *path, const char *charset)
76         {
77             if (pIn != NULL)
78                 return STATUS_BAD_STATE;
79             else if (path == NULL)
80                 return STATUS_BAD_ARGUMENTS;
81 
82             io::InFileStream *ifs = new io::InFileStream();
83             if (ifs == NULL)
84                 return STATUS_NO_MEM;
85             status_t res = ifs->open(path);
86             if (res == STATUS_OK)
87             {
88                 res     = wrap(ifs, WRAP_CLOSE | WRAP_DELETE, charset);
89                 if (res == STATUS_OK)
90                     return res;
91                 ifs->close();
92             }
93             delete ifs;
94 
95             return res;
96         }
97 
open(const io::Path * path,const char * charset)98         status_t PullParser::open(const io::Path *path, const char *charset)
99         {
100             if (pIn != NULL)
101                 return STATUS_BAD_STATE;
102             else if (path == NULL)
103                 return STATUS_BAD_ARGUMENTS;
104 
105             io::InFileStream *ifs = new io::InFileStream();
106             if (ifs == NULL)
107                 return STATUS_NO_MEM;
108             status_t res = ifs->open(path);
109             if (res == STATUS_OK)
110             {
111                 res     = wrap(ifs, WRAP_CLOSE | WRAP_DELETE, charset);
112                 if (res == STATUS_OK)
113                     return res;
114                 ifs->close();
115             }
116             delete ifs;
117 
118             return res;
119         }
120 
wrap(const char * str,const char * charset)121         status_t PullParser::wrap(const char *str, const char *charset)
122         {
123             if (pIn != NULL)
124                 return STATUS_BAD_STATE;
125             else if (str == NULL)
126                 return STATUS_BAD_ARGUMENTS;
127 
128             io::InStringSequence *seq = new io::InStringSequence();
129             if (seq == NULL)
130                 return STATUS_NO_MEM;
131 
132             status_t res = seq->wrap(str, charset);
133             if (res == STATUS_OK)
134             {
135                 if ((res = wrap(seq, WRAP_CLOSE | WRAP_DELETE)) == STATUS_OK)
136                     return res;
137                 seq->close();
138             }
139 
140             delete seq;
141             return res;
142         }
143 
wrap(const LSPString * str)144         status_t PullParser::wrap(const LSPString *str)
145         {
146             if (pIn != NULL)
147                 return STATUS_BAD_STATE;
148             else if (str == NULL)
149                 return STATUS_BAD_ARGUMENTS;
150 
151             io::InStringSequence *seq = new io::InStringSequence();
152             if (seq == NULL)
153                 return STATUS_NO_MEM;
154 
155             status_t res = seq->wrap(str);
156             if (res == STATUS_OK)
157             {
158                 if ((res = wrap(seq, WRAP_CLOSE | WRAP_DELETE)) == STATUS_OK)
159                     return res;
160                 seq->close();
161             }
162 
163             delete seq;
164             return res;
165         }
166 
wrap(io::IInStream * is,size_t flags,const char * charset)167         status_t PullParser::wrap(io::IInStream *is, size_t flags, const char *charset)
168         {
169             if (pIn != NULL)
170                 return STATUS_BAD_STATE;
171             else if (is == NULL)
172                 return STATUS_BAD_ARGUMENTS;
173 
174             io::InSequence *seq = new io::InSequence();
175             if (seq == NULL)
176                 return STATUS_NO_MEM;
177 
178             status_t res = seq->wrap(is, flags, charset);
179             if (res == STATUS_OK)
180             {
181                 if ((res = wrap(seq, WRAP_CLOSE | WRAP_DELETE)) == STATUS_OK)
182                     return res;
183                 seq->close();
184             }
185 
186             delete seq;
187             return res;
188         }
189 
wrap(io::IInSequence * seq,size_t flags)190         status_t PullParser::wrap(io::IInSequence *seq, size_t flags)
191         {
192             if (pIn != NULL)
193                 return STATUS_BAD_STATE;
194             else if (seq == NULL)
195                 return STATUS_BAD_ARGUMENTS;
196 
197             pIn             = seq;
198             nWFlags         = flags;
199             nToken          = -STATUS_NO_DATA;
200             nState          = PS_READ_MISC;
201             nStates         = 0;
202             enVersion       = XML_VERSION_1_0;
203             sVersion.truncate();
204             sEncoding.truncate();
205             sDoctype.truncate();
206             sPublic.truncate();
207             sSystem.truncate();
208             nFlags          = 0;
209             nUngetch        = 0;
210 
211             return STATUS_OK;
212         }
213 
close()214         status_t PullParser::close()
215         {
216             status_t res = STATUS_OK;
217 
218             // Drop unnecessary resources
219             nUngetch        = 0;
220             sVersion.truncate();
221             sEncoding.truncate();
222             sName.truncate();
223             sValue.truncate();
224             sDoctype.truncate();
225             sPublic.truncate();
226             sSystem.truncate();
227             nFlags          = 0;
228 
229             // Remove all tag hierarchy
230             drop_list(&vTags);
231             drop_list(&vAtts);
232 
233             // Release input sequence
234             if (pIn != NULL)
235             {
236                 if (nWFlags & WRAP_CLOSE)
237                 {
238                     if (res == STATUS_OK)
239                         res = pIn->close();
240                     else
241                         pIn->close();
242                 }
243 
244                 if (nWFlags & WRAP_DELETE)
245                     delete pIn;
246 
247                 pIn     = NULL;
248             }
249 
250             return res;
251         }
252 
getch()253         lsp_swchar_t PullParser::getch()
254         {
255             return (nUngetch > 0) ? vUngetch[--nUngetch] : pIn->read();
256         }
257 
ungetch(lsp_swchar_t ch)258         void PullParser::ungetch(lsp_swchar_t ch)
259         {
260             vUngetch[nUngetch++] = ch;
261         }
262 
push_state(parse_state_t override)263         void PullParser::push_state(parse_state_t override)
264         {
265             vStates[nStates++]  = nState;
266             nState              = override;
267         }
268 
pop_state()269         void PullParser::pop_state()
270         {
271             nState  = vStates[--nStates];
272         }
273 
drop_list(cvector<LSPString> * list)274         void PullParser::drop_list(cvector<LSPString> *list)
275         {
276             for (size_t i=0, n=list->size(); i<n; ++i)
277             {
278                 LSPString *s = list->at(i);
279                 if (s != NULL)
280                     delete s;
281             }
282             list->flush();
283         }
284 
check_duplicate_attribute()285         status_t PullParser::check_duplicate_attribute()
286         {
287             // Is item present in list?
288             for (size_t i=0, n=vAtts.size(); i<n; ++i)
289             {
290                 LSPString *s = vAtts.at(i);
291                 if ((s != NULL) && (s->equals(&sName)))
292                     return STATUS_CORRUPTED;
293             }
294 
295             // Add to list
296             LSPString *copy = sName.clone();
297             if (copy == NULL)
298                 return STATUS_NO_MEM;
299             if (!vAtts.add(copy))
300             {
301                 delete copy;
302                 return STATUS_NO_MEM;
303             }
304 
305             return STATUS_OK;
306         }
307 
skip_spaces()308         bool PullParser::skip_spaces()
309         {
310             bool skipped = false;
311 
312             while (true)
313             {
314                 // Read next character
315                 lsp_swchar_t c = getch();
316                 if (!is_whitespace(c))
317                 {
318                     ungetch(c);
319                     break;
320                 }
321                 skipped = true;
322             }
323 
324             return skipped;
325         }
326 
read_text(const char * text)327         status_t PullParser::read_text(const char *text)
328         {
329             lsp_swchar_t c;
330             for ( ; *text != '\0'; ++text)
331             {
332                 if ((c = getch()) != *text)
333                     return (c < 0) ? -c : STATUS_CORRUPTED;
334             }
335             return STATUS_OK;
336         }
337 
read_name(LSPString * name)338         status_t PullParser::read_name(LSPString *name)
339         {
340             // Get first character
341             lsp_swchar_t c = getch();
342             if (!(is_name_first(c)))
343                 return (c < 0) ? -c : STATUS_CORRUPTED;
344 
345             // Read name
346             name->clear();
347             do
348             {
349                 // Append current character
350                 if (!name->append(c))
351                     return STATUS_NO_MEM;
352 
353                 // Get next character
354                 c = getch();
355             } while (is_name_next(c));
356 
357             // Return back last character and return OK status
358             ungetch(c);
359             return STATUS_OK;
360         }
361 
read_attribute_value(lsp_swchar_t qc)362         status_t PullParser::read_attribute_value(lsp_swchar_t qc)
363         {
364             lsp_swchar_t c;
365             status_t res;
366 
367             while (true)
368             {
369                 // Read character
370                 if ((c = getch()) < 0)
371                 {
372                     pop_state();
373                     return -c;
374                 }
375                 else if (c == qc)
376                     break;
377 
378                 // Reference?
379                 if (c == '&')
380                 {
381                     // Read and append reference (if possible) to the string value
382                     if ((res = read_entity_reference(&sValue)) != STATUS_OK)
383                     {
384                         pop_state();
385                         return res;
386                     }
387 
388                     // Need to query reference?
389                     if (nState != PS_READ_REFERENCE)
390                         continue;
391                     return STATUS_OK; // Query for reference, do not need to pop_state()
392                 }
393 
394                 // Append current character
395                 if (!sValue.append(c))
396                 {
397                     pop_state();
398                     return STATUS_NO_MEM;
399                 }
400             }
401 
402             pop_state();
403             nToken = XT_ATTRIBUTE;
404             return STATUS_OK;
405         }
406 
read_version()407         status_t PullParser::read_version()
408         {
409             // Get quote character
410             lsp_swchar_t qc = getch();
411             if ((qc != '\'') && (qc != '\"'))
412                 return (qc < 0) ? -qc : STATUS_CORRUPTED;
413 
414             // Version should be '1.x'
415             lsp_swchar_t c;
416             if ((c = getch()) != '1')
417                 return (c < 0) ? -c : STATUS_CORRUPTED;
418             if ((c = getch()) != '.')
419                 return (c < 0) ? -c : STATUS_CORRUPTED;
420 
421             // Read integer value
422             size_t v=0, k=0;
423             while ((c = getch()) != qc)
424             {
425                 if (v >= 0x1000000) // Prevent from integer overflow
426                     return STATUS_CORRUPTED;
427 
428                 if ((c >= '0') && (c <= '9'))
429                     v = v * 10 + (c - '0');
430                 else
431                     return (c < 0) ? -c : STATUS_CORRUPTED;
432                 ++k;
433             }
434 
435             // Validate number of digits
436             if (k <= 0)
437                 return STATUS_CORRUPTED;
438 
439             // Update version text
440             if (!sVersion.fmt_ascii("1.%d", int(v)))
441                 return STATUS_NO_MEM;
442 
443             enVersion = (v >= 1) ? XML_VERSION_1_1 : XML_VERSION_1_0;
444             nFlags |= XF_VERSION;
445 
446             return STATUS_OK;
447         }
448 
read_encoding()449         status_t PullParser::read_encoding()
450         {
451             sEncoding.clear();
452 
453             // Get quote character
454             lsp_swchar_t qc = getch();
455             if ((qc != '\'') && (qc != '\"'))
456                 return (qc < 0) ? -qc : STATUS_CORRUPTED;
457 
458             // Read encoding char
459             lsp_swchar_t c = getch();
460             if (!is_encoding_first(c))
461                 return STATUS_BAD_FORMAT;
462             if (!sEncoding.append(c))
463                 return STATUS_NO_MEM;
464 
465             // Check the remained characters
466             while ((c = getch()) != qc)
467             {
468                 if (!is_encoding_next(c))
469                     return (c < 0) ? -c : STATUS_CORRUPTED;
470                 if (!sEncoding.append(c))
471                     return STATUS_NO_MEM;
472             }
473 
474             nFlags |= XF_ENCODING;
475 
476             return STATUS_OK;
477         }
478 
read_standalone()479         status_t PullParser::read_standalone()
480         {
481             LSPString tmp;
482 
483             // Get quote character
484             lsp_swchar_t qc = getch();
485             if ((qc != '\'') && (qc != '\"'))
486                 return (qc < 0) ? -qc : STATUS_CORRUPTED;
487 
488             // Read quoted string
489             lsp_swchar_t c;
490             while ((c = getch()) != qc)
491             {
492                 if (tmp.length() >= 3)
493                     return STATUS_CORRUPTED;
494                 if (!tmp.append(c))
495                     return STATUS_NO_MEM;
496             }
497 
498             // Compare string with possible value
499             if (tmp.equals_ascii("yes"))
500                 nFlags |= XF_STANDALONE;
501             else if (tmp.equals_ascii("no"))
502                 nFlags &= ~XF_STANDALONE;
503             else
504                 return STATUS_CORRUPTED;
505 
506             return STATUS_OK;
507         }
508 
read_header()509         status_t PullParser::read_header()
510         {
511             status_t res;
512             lsp_swchar_t c;
513 
514             // Fetch optional attributes
515             enum flags_t
516             {
517                 F_VERSION       = 1 << 0,
518                 F_ENCODING      = 1 << 1,
519                 F_STANDALONE    = 1 << 2
520             };
521 
522             size_t flags = 0;
523             LSPString name, value;
524 
525             while (true)
526             {
527                 // Skip spaces and read next character
528                 bool skipped = skip_spaces();
529                 if ((c = getch()) < 0)
530                     return -c;
531 
532                 if (c == '?') // end of header?
533                 {
534                     // Read next character
535                     if ((c = getch()) != '>')
536                         return (c < 0) ? -c : STATUS_CORRUPTED;
537                     return (flags & F_VERSION) ? read_start_document() : STATUS_CORRUPTED;
538                 }
539 
540                 // At least one space is mandatory
541                 if (!skipped)
542                     return STATUS_CORRUPTED;
543 
544                 // Read attribute name
545                 ungetch(c);
546                 if ((res = read_name(&name)) != STATUS_OK)
547                     return res;
548 
549                 // Required '=' sign
550                 skip_spaces();
551                 if ((c = getch()) != '=')
552                     return (c < 0) ? -c : STATUS_CORRUPTED;
553 
554                 // Check attribute type
555                 size_t flag = 0;
556                 if (name.equals_ascii("version"))
557                 {
558                     flag = F_VERSION;
559                     if ((res = read_version()) != STATUS_OK)
560                         return res;
561                 }
562                 else if (name.equals_ascii("encoding"))
563                 {
564                     flag = F_ENCODING;
565                     if ((res = read_encoding()) != STATUS_OK)
566                         return res;
567                 }
568                 else if (name.equals_ascii("standalone"))
569                 {
570                     flag = F_STANDALONE;
571                     if ((res = read_standalone()) != STATUS_OK)
572                         return res;
573                 }
574 
575                 // Check that attribute is at proper place
576                 if (flag <= flags)
577                     return STATUS_CORRUPTED;
578                 flags |= flag;
579             }
580         }
581 
read_comment()582         status_t PullParser::read_comment()
583         {
584             lsp_swchar_t c, xc;
585             sValue.clear();
586 
587             while (true)
588             {
589                 // Fetch new character
590                 if ((c = getch()) < 0)
591                     return -c;
592 
593                 // Going to end of comment?
594                 if (c == '-')
595                 {
596                     // End of comment?
597                     if ((xc = getch()) == '-')
598                     {
599                         // Next character should be '>'
600                         if ((xc = getch()) != '>')
601                             return (xc < 0) ? -xc : STATUS_CORRUPTED;
602 
603                         nToken      = XT_COMMENT;
604                         return STATUS_OK;
605                     }
606 
607                     // Return character back
608                     ungetch(xc);
609                 }
610 
611                 if (!sValue.append(c))
612                     return STATUS_NO_MEM;
613             }
614 
615             return STATUS_OK;
616         }
617 
read_processing_instruction()618         status_t PullParser::read_processing_instruction()
619         {
620             status_t res;
621 
622             // Read processing instruction name
623             if ((res = read_name(&sName)) != STATUS_OK)
624                 return res;
625 
626             if (sName.equals_ascii_nocase("xml"))
627             {
628                 if (nFlags & XF_HEADER)
629                     return STATUS_CORRUPTED; // XML processing instruction is prohibited
630                 return read_header();
631             }
632 
633             // Read processing instruction value
634             lsp_swchar_t c;
635             skip_spaces(); // Skip spaces
636 
637             sValue.clear();
638             while (true)
639             {
640                 // Fetch new character
641                 if ((c = getch()) < 0)
642                     return -c;
643 
644                 // PI end?
645                 if (c == '>')
646                 {
647                     ssize_t pos = sValue.length() - 1;
648                     if ((pos >= 0) && (sValue.char_at(pos) == '?'))
649                         break;
650                 }
651 
652                 // No, simple character
653                 if (!sValue.append(c))
654                     return STATUS_NO_MEM;
655             }
656 
657             // Remove last character which is '?'
658             sValue.set_length(sValue.length() - 1);
659 
660             nToken = XT_PROCESSING_INSTRUCTION;
661             return STATUS_OK;
662         }
663 
read_system_literal(LSPString * dst)664         status_t PullParser::read_system_literal(LSPString *dst)
665         {
666             LSPString tmp;
667 
668             // Get quote character
669             lsp_swchar_t qc = getch();
670             if ((qc != '\'') && (qc != '\"'))
671                 return (qc < 0) ? -qc : STATUS_CORRUPTED;
672 
673             // Read quoted string
674             lsp_swchar_t c;
675             while ((c = getch()) != qc)
676             {
677                 if (!tmp.append(c))
678                     return STATUS_NO_MEM;
679             }
680 
681             dst->swap(&tmp);
682             return STATUS_OK;
683         }
684 
read_pubid_literal(LSPString * dst)685         status_t PullParser::read_pubid_literal(LSPString *dst)
686         {
687             LSPString tmp;
688 
689             // Get quote character
690             lsp_swchar_t qc = getch();
691             if ((qc != '\'') && (qc != '\"'))
692                 return (qc < 0) ? -qc : STATUS_CORRUPTED;
693 
694             // Read quoted string
695             lsp_swchar_t c;
696             while ((c = getch()) != qc)
697             {
698                 if ((!is_pubid_char(c)) || (c == qc))
699                     return STATUS_CORRUPTED;
700                 if (!tmp.append(c))
701                     return STATUS_NO_MEM;
702             }
703 
704             dst->swap(&tmp);
705             return STATUS_OK;
706         }
707 
read_doctype()708         status_t PullParser::read_doctype()
709         {
710             status_t res;
711             lsp_swchar_t c;
712             LSPString x;
713 
714             // Duplicate DOCTYPE?
715             if (nFlags & XF_DOCTYPE)
716                 return STATUS_CORRUPTED;
717 
718             // Space is required
719             if (!skip_spaces())
720                 return STATUS_CORRUPTED;
721             if ((res = read_name(&sDoctype)) != STATUS_OK)
722                 return res;
723 
724             // Watch next token
725             nFlags |= XF_DOCTYPE;
726             bool skip = skip_spaces();
727             if ((c = getch()) < 0)
728                 return -c;
729 
730             // ExternalID is present?
731             if (c == 'P')
732             {
733                 if (!skip)
734                     return STATUS_CORRUPTED;
735                 if ((res = read_text("UBLIC")) != STATUS_OK)
736                     return res;
737                 if (!skip_spaces())
738                     return STATUS_CORRUPTED;
739                 if ((res = read_pubid_literal(&sPublic)) != STATUS_OK)
740                     return res;
741                 nFlags |= XF_DOCTYPE_PUB;
742                 if (!skip_spaces())
743                     return STATUS_CORRUPTED;
744                 if ((res = read_system_literal(&sSystem)) != STATUS_OK)
745                     return res;
746                 nFlags |= XF_DOCTYPE_SYS;
747 
748                 // Skip spaces and get next token
749                 skip_spaces();
750                 if ((c = getch()) < 0)
751                     return -c;
752             }
753             else if (c == 'S')
754             {
755                 if (!skip)
756                     return STATUS_CORRUPTED;
757                 if ((res = read_text("YSTEM")) != STATUS_OK)
758                     return res;
759                 if (!skip_spaces())
760                     return STATUS_CORRUPTED;
761                 if ((res = read_system_literal(&sSystem)) != STATUS_OK)
762                     return res;
763                 nFlags |= XF_DOCTYPE_SYS;
764 
765                 // Skip spaces and get next token
766                 skip_spaces();
767                 if ((c = getch()) < 0)
768                     return -c;
769             }
770 
771             // intSubset?
772             if (c == '[')
773             {
774                 // TODO: currently we don't support DOCTYPE definition with built-in doctypes
775                 return STATUS_NOT_IMPLEMENTED;
776             }
777 
778             // End of Doctype?
779             nToken      = XT_DTD;
780             return (c == '>') ? STATUS_OK : STATUS_CORRUPTED;
781         }
782 
read_start_document()783         status_t PullParser::read_start_document()
784         {
785             nToken  = XT_START_DOCUMENT;
786             nFlags |= XF_HEADER;
787             return STATUS_OK;
788         }
789 
read_end_document()790         status_t PullParser::read_end_document()
791         {
792             nToken  = XT_END_DOCUMENT;
793             nState  = PS_END_DOCUMENT;
794             return STATUS_OK;
795         }
796 
read_misc()797         status_t PullParser::read_misc()
798         {
799             status_t res;
800             lsp_swchar_t c;
801 
802             // Skip whitespace
803             if (!(nFlags & XF_HEADER))
804             {
805                 if (skip_spaces())
806                     return read_start_document();
807             }
808             else
809                 skip_spaces();
810 
811             // Next character should be '<'
812             if ((c = getch()) != '<')
813             {
814                 if (c == -STATUS_EOF)
815                     return (nFlags & XF_HEADER) ? read_end_document() : read_start_document();
816                 return (c < 0) ? -c : STATUS_CORRUPTED;
817             }
818 
819             // Get the following character
820             if ((c = getch()) < 0)
821                 return -c;
822 
823             // Processing instruction?
824             if (c == '?')
825                 return read_processing_instruction();
826             else if (!(nFlags & XF_HEADER))
827             {
828                 ungetch(c);
829                 ungetch('<');
830                 return read_start_document();
831             }
832 
833             // Comment or Doctype?
834             if (c == '!')
835             {
836                 // Get next character
837                 if ((c = getch()) < 0)
838                     return -c;
839 
840                 if (c == '-') // Comment?
841                 {
842                     // '<!--' should be parsed
843                     if ((c = getch()) != '-')
844                         return (c < 0) ? -c : STATUS_CORRUPTED;
845                     return read_comment();
846                 }
847 
848                 if (c == 'D') // Doctype?
849                 {
850                     // 'DOCTYPE' should be parsed
851                     if ((res = read_text("OCTYPE")) != STATUS_OK)
852                         return res;
853                     return read_doctype();
854                 }
855 
856                 return STATUS_CORRUPTED;
857             }
858 
859             // We already have root tag?
860             if (nFlags & XF_ROOT)
861                 return STATUS_CORRUPTED;
862             nFlags |= XF_ROOT;  // Now we already have root tag defined
863 
864             // Return character and read root tag name
865             ungetch(c);
866             return read_tag_open();
867         }
868 
read_cdata()869         status_t PullParser::read_cdata()
870         {
871             lsp_swchar_t c;
872 
873             sValue.clear();
874 
875             while (true)
876             {
877                 // Get next character
878                 if ((c = getch()) < 0)
879                     return -c;
880 
881                 // CDATA end?
882                 if (c == '>')
883                 {
884                     ssize_t pos = sValue.length() - 2;
885                     if (
886                         (pos >= 0) &&
887                         (sValue.char_at(pos) == ']') &&
888                         (sValue.char_at(pos+1) == ']')
889                     )
890                         break;
891                 }
892 
893                 // No, simple character
894                 if (!sValue.append(c))
895                     return STATUS_NO_MEM;
896             }
897 
898             // Remove last two characters which are ']]'
899             sValue.set_length(sValue.length() - 2);
900 
901             nToken      = XT_CDATA;
902             return STATUS_OK;
903         }
904 
read_tag_open()905         status_t PullParser::read_tag_open()
906         {
907             status_t res;
908             if ((res = read_name(&sName)) != STATUS_OK)
909                 return res;
910 
911             // Add tag to stack
912             LSPString *tag = sName.clone();
913             if (tag == NULL)
914                 return STATUS_NO_MEM;
915             else if (!vTags.push(tag))
916             {
917                 delete tag;
918                 return STATUS_NO_MEM;
919             }
920 
921             // Change state
922             drop_list(&vAtts);
923             nToken  = XT_START_ELEMENT;
924             nState  = PS_READ_ATTRIBUTES;
925             return STATUS_OK;
926         }
927 
read_tag_close(bool copy)928         status_t PullParser::read_tag_close(bool copy)
929         {
930             // Get last tag name
931             LSPString *name = NULL;
932             if (!vTags.pop(&name))
933                 return STATUS_CORRUPTED;
934 
935             if (copy)
936                 sName.swap(name);
937             else if (!sName.equals(name))
938             {
939                 delete name;
940                 return STATUS_CORRUPTED;
941             }
942             delete name;
943 
944             // Update state
945             drop_list(&vAtts);
946             nToken = XT_END_ELEMENT;
947             nState = (vTags.size() > 0) ? PS_READ_ELEMENT_DATA : PS_READ_MISC;
948             return STATUS_OK;
949         }
950 
read_entity_reference(LSPString * cdata)951         status_t PullParser::read_entity_reference(LSPString *cdata)
952         {
953             lsp_swchar_t c, code = 0;
954             status_t res;
955 
956             // Get character
957             if ((c = getch()) < 0)
958                 return -c;
959 
960             // Entity reference ?
961             if (c != '#')
962             {
963                 ungetch(c);
964 
965                 // Read entity name
966                 if ((res = read_name(&sRefName)) != STATUS_OK)
967                     return res;
968 
969                 if (sRefName.equals_ascii("amp"))
970                     code    = '&';
971                 else if (sRefName.equals_ascii("gt"))
972                     code    = '>';
973                 else if (sRefName.equals_ascii("lt"))
974                     code    = '<';
975                 else if (sRefName.equals_ascii("apos"))
976                     code    = '\'';
977                 else if (sRefName.equals_ascii("quot"))
978                     code    = '\"';
979 
980                 // Get next character which should be ';'
981                 if ((c = getch()) < 0)
982                     return -c;
983             }
984             else
985             {
986                 // Get next character
987                 if ((c = getch()) < 0)
988                     return -c;
989 
990                 // Hexadecimal character?
991                 if (c == 'x')
992                 {
993                     // Read hex digit
994                     while ((c = getch()) >= 0)
995                     {
996                         // Protect from integer overflow
997                         if (code >= 0x1000000)
998                             return STATUS_CORRUPTED;
999 
1000                         // Decode hex character
1001                         if ((c >= '0') && (c <= '9'))
1002                             code = (code << 4) | (c - '0');
1003                         else if ((c >= 'a') && (c <= 'f'))
1004                             code = (code << 4) | (c - 'a' + 10);
1005                         else if ((c >= 'A') && (c <= 'F'))
1006                             code = (code << 4) | (c - 'A' + 10);
1007                         else
1008                             break;
1009                     }
1010                 }
1011                 else // Decimal character?
1012                 {
1013                     do
1014                     {
1015                         // Protect from integer overflow
1016                         if (code >= 0x1000000)
1017                             return STATUS_CORRUPTED;
1018 
1019                         // Decode decimal character
1020                         if ((c >= '0') && (c <= '9'))
1021                             code = (code * 10) + (c - '0');
1022                         else
1023                             break;
1024                     } while ((c = getch()) >= 0);
1025                 }
1026 
1027                 // Validate character
1028                 if (!is_valid_char(code, enVersion))
1029                     return STATUS_CORRUPTED;
1030             }
1031 
1032             // Current character should be ';'
1033             if (c != ';')
1034                 return STATUS_CORRUPTED;
1035             else if (code == 0)
1036             {
1037                 push_state(PS_READ_REFERENCE);
1038                 nToken      = XT_ENTITY_RESOLVE;
1039                 return STATUS_OK;
1040             }
1041 
1042             // Append fetched character to the character data and exit
1043             return (cdata->append(code)) ? STATUS_OK : STATUS_NO_MEM;
1044         }
1045 
read_characters()1046         status_t PullParser::read_characters()
1047         {
1048             lsp_swchar_t c;
1049             status_t res;
1050 
1051             while (true)
1052             {
1053                 // Get next character
1054                 if ((c = getch()) < 0)
1055                 {
1056                     pop_state();
1057                     return -c;
1058                 }
1059 
1060                 // Start of tag?
1061                 if (c == '<')
1062                 {
1063                     ungetch(c);
1064                     break;
1065                 }
1066 
1067                 // Reference?
1068                 if (c == '&')
1069                 {
1070                     // Read and append reference (if possible) to the string value
1071                     if ((res = read_entity_reference(&sValue)) != STATUS_OK)
1072                     {
1073                         pop_state();
1074                         return res;
1075                     }
1076 
1077                     // Need to query reference?
1078                     if (nState != PS_READ_REFERENCE)
1079                         continue;
1080                     return STATUS_OK; // Query for reference, do not need to pop_state()
1081                 }
1082 
1083                 // CDATA end?
1084                 if (c == '>')
1085                 {
1086                     ssize_t pos = sValue.length() - 2;
1087                     if (
1088                         (pos >= 0) &&
1089                         (sValue.char_at(pos) == ']') &&
1090                         (sValue.char_at(pos+1) == ']')
1091                     )
1092                     {
1093                         pop_state();
1094                         return STATUS_CORRUPTED;
1095                     }
1096                 }
1097 
1098                 // No, simple character
1099                 if (!sValue.append(c))
1100                 {
1101                     pop_state();
1102                     return STATUS_NO_MEM;
1103                 }
1104             }
1105 
1106             // Ensure that there is character data
1107             pop_state();
1108 
1109             if (sValue.length() <= 0)
1110                 return STATUS_CORRUPTED;
1111 
1112             nToken      = XT_CHARACTERS;
1113             return STATUS_OK;
1114         }
1115 
read_tag_content()1116         status_t PullParser::read_tag_content()
1117         {
1118             lsp_swchar_t c;
1119             status_t res;
1120 
1121             // Read character
1122             if ((c = getch()) < 0)
1123                 return -c;
1124 
1125             // Tag? Processing instruction? End of tag? Comment? CDATA?
1126             if (c != '<')
1127             {
1128                 ungetch(c);
1129                 sValue.clear();
1130                 push_state(PS_READ_CHARACTERS);
1131                 return read_characters();
1132             }
1133 
1134             // Get next character
1135             if ((c = getch()) < 0)
1136                 return -c;
1137 
1138             // Read tag name
1139             if (c == '/') // End of tag ?
1140             {
1141                 // Read tag name
1142                 if ((res = read_name(&sName)) != STATUS_OK)
1143                     return res;
1144 
1145                 // '>' is required
1146                 skip_spaces();
1147                 if ((c = getch()) != '>')
1148                     return (c < 0) ? -c : STATUS_CORRUPTED;
1149 
1150                 return read_tag_close(false);
1151             }
1152             else if (c == '?') // Processing instruction ?
1153                 return read_processing_instruction();
1154             else if (c == '!') // Comment? CDATA?
1155             {
1156                 // Get next character
1157                 if ((c = getch()) < 0)
1158                     return -c;
1159 
1160                 // CDATA?
1161                 if (c == '[')
1162                 {
1163                     // Lookup CDATA start
1164                     if ((res = read_text("CDATA[")) != STATUS_OK)
1165                         return res;
1166                     return read_cdata();
1167                 }
1168 
1169                 // Comment?
1170                 if (c == '-')
1171                 {
1172                     // Next character is required to be '-'
1173                     if ((c = getch()) != '-')
1174                         return (c < 0) ? -c : STATUS_CORRUPTED;
1175                     return read_comment();
1176                 }
1177 
1178                 // No match
1179                 return STATUS_CORRUPTED;
1180             }
1181 
1182             // Just open tag name?
1183             ungetch(c);
1184             return read_tag_open();
1185         }
1186 
read_tag_attribute()1187         status_t PullParser::read_tag_attribute()
1188         {
1189             lsp_swchar_t c;
1190             status_t res;
1191 
1192             // Ignore set of spaces if they are present
1193             bool skipped = skip_spaces();
1194             if ((c = getch()) < 0)
1195                 return -c;
1196 
1197             // End of tag header?
1198             if (c == '>')
1199             {
1200                 nState = PS_READ_ELEMENT_DATA;
1201                 return read_tag_content();
1202             }
1203 
1204             // End of tag?
1205             if (c == '/')
1206             {
1207                 // Required character
1208                 if ((c = getch()) != '>')
1209                     return (c < 0) ? -c : STATUS_CORRUPTED;
1210 
1211                 return read_tag_close(true);
1212             }
1213 
1214             // Try to read attribute and preprocess it's value
1215             if (!skipped) // At least one space is mandatory
1216                 return STATUS_CORRUPTED;
1217 
1218             // Read attribute name
1219             ungetch(c);
1220             if ((res = read_name(&sName)) != STATUS_OK)
1221                 return res;
1222             else if (check_duplicate_attribute())
1223                 return STATUS_CORRUPTED;
1224 
1225             skip_spaces(); // Spaces are optional
1226             if ((c = getch()) != '=')
1227                 return STATUS_CORRUPTED;
1228 
1229             skip_spaces(); // Spaces are optional
1230             c = getch(); // Get quote character
1231             if ((c != '\'') && (c != '\"'))
1232                 return (c < 0) ? -c : STATUS_CORRUPTED;
1233 
1234             // Read quoted value
1235             sValue.clear();
1236             push_state((c == '\'') ? PS_READ_SQ_ATTRIBUTE : PS_READ_DQ_ATTRIBUTE);
1237             return read_attribute_value(c);
1238         }
1239 
read_token()1240         status_t PullParser::read_token()
1241         {
1242             if (pIn == NULL)
1243                 return STATUS_BAD_STATE;
1244 
1245             switch (nState)
1246             {
1247                 case PS_END_DOCUMENT:
1248                     nToken          = XT_END_DOCUMENT;
1249                     return STATUS_EOF;
1250 
1251                 case PS_READ_MISC:
1252                     return read_misc();
1253 
1254                 case PS_READ_ATTRIBUTES:
1255                     return read_tag_attribute();
1256 
1257                 case PS_READ_ELEMENT_DATA:
1258                     return read_tag_content();
1259 
1260                 case PS_READ_REFERENCE:
1261                     nToken          = XT_ENTITY_RESOLVE;
1262                     return STATUS_OK;
1263 
1264                 case PS_READ_CHARACTERS:
1265                     return read_characters();
1266 
1267                 case PS_READ_SQ_ATTRIBUTE:
1268                     return read_attribute_value('\'');
1269 
1270                 case PS_READ_DQ_ATTRIBUTE:
1271                     return read_attribute_value('\"');
1272 
1273                 default:
1274                     break;
1275             }
1276             return STATUS_CORRUPTED;
1277         }
1278 
set_value(const LSPString * value)1279         status_t PullParser::set_value(const LSPString *value)
1280         {
1281             if (pIn == NULL)
1282                 return STATUS_BAD_STATE;
1283             else if (value == NULL)
1284                 return STATUS_BAD_ARGUMENTS;
1285 
1286             if (nState != PS_READ_REFERENCE)
1287                 return STATUS_BAD_STATE;
1288 
1289             // Append value with entity content
1290             if (!sValue.append(value))
1291                 return STATUS_NO_MEM;
1292 
1293             pop_state();
1294             return STATUS_OK;
1295         }
1296 
resolve_entity(const char * value,const char * charset)1297         status_t PullParser::resolve_entity(const char *value, const char *charset)
1298         {
1299             LSPString tmp;
1300             if (!tmp.set_native(value, charset))
1301                 return STATUS_NO_MEM;
1302             return resolve_entity(value);
1303         }
1304 
read_next()1305         status_t PullParser::read_next()
1306         {
1307             status_t res = read_token();
1308             return (res == STATUS_OK) ? nToken : -res;
1309         }
1310 
get_current()1311         status_t PullParser::get_current()
1312         {
1313             return nToken;
1314         }
1315 
name() const1316         const LSPString *PullParser::name() const
1317         {
1318             if (pIn == NULL)
1319                 return NULL;
1320 
1321             switch (nToken)
1322             {
1323                 case XT_ATTRIBUTE:
1324                 case XT_PROCESSING_INSTRUCTION:
1325                 case XT_START_ELEMENT:
1326                 case XT_END_ELEMENT:
1327                     return &sName;
1328                 case XT_ENTITY_RESOLVE:
1329                     return &sRefName;
1330                 default:
1331                     break;
1332             }
1333             return NULL;
1334         }
1335 
value() const1336         const LSPString *PullParser::value() const
1337         {
1338             if (pIn == NULL)
1339                 return NULL;
1340 
1341             switch (nToken)
1342             {
1343                 case XT_ATTRIBUTE:
1344                 case XT_CDATA:
1345                 case XT_CHARACTERS:
1346                 case XT_COMMENT:
1347                 case XT_PROCESSING_INSTRUCTION:
1348                     return &sValue;
1349                 default:
1350                     break;
1351             }
1352             return NULL;
1353         }
1354 
1355     } /* namespace xml */
1356 } /* namespace lsp */
1357