1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  *   Licensed to the Apache Software Foundation (ASF) under one or more
12  *   contributor license agreements. See the NOTICE file distributed
13  *   with this work for additional information regarding copyright
14  *   ownership. The ASF licenses this file to you under the Apache
15  *   License, Version 2.0 (the "License"); you may not use this file
16  *   except in compliance with the License. You may obtain a copy of
17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #pragma once
21 
22 #include <svtools/svtdllapi.h>
23 #include <svtools/svparser.hxx>
24 #include <svtools/htmltokn.h>
25 
26 #include <string_view>
27 #include <vector>
28 
// Forward declarations of types that are declared elsewhere.
namespace com::sun::star::uno
{
template <class interface_type> class Reference;
}

namespace com::sun::star::document
{
class XDocumentProperties;
}

class Color;
enum class HtmlOptionId;
39 
// Default values for the seven HTML font size steps (1 = smallest,
// 7 = largest).
// NOTE(review): presumably these are point sizes -- confirm against the users
// of these macros.
#define HTMLFONTSZ1_DFLT    7
#define HTMLFONTSZ2_DFLT    10
#define HTMLFONTSZ3_DFLT    12
#define HTMLFONTSZ4_DFLT    14
#define HTMLFONTSZ5_DFLT    18
#define HTMLFONTSZ6_DFLT    24
#define HTMLFONTSZ7_DFLT    36
47 
/// Possible values of the FRAME option of a TABLE tag (see
/// HTMLOption::GetTableFrame()). Enumerators are numbered from 0 upwards.
enum class HTMLTableFrame
{
    Void,
    Above,
    Below,
    HSides,
    LHS,
    RHS,
    VSides,
    Box
};
49 
/// Possible values of the RULES option of a TABLE tag (see
/// HTMLOption::GetTableRules()). Enumerators are numbered from 0 upwards.
enum class HTMLTableRules
{
    NONE,
    Groups,
    Rows,
    Cols,
    All
};
51 
/// Possible values of the TYPE option of an INPUT tag (see
/// HTMLOption::GetInputType()). Numbering starts at 1; the value 0 is not
/// assigned to any enumerator.
enum class HTMLInputType
{
    Text     = 1,
    Password = 2,
    Checkbox = 3,
    Radio    = 4,
    Range    = 5,
    Scribble = 6,
    File     = 7,
    Hidden   = 8,
    Submit   = 9,
    Image    = 10,
    Reset    = 11,
    Button   = 12
};
67 
/// Script languages reported through HTMLParser::ParseScriptOptions().
enum class HTMLScriptLanguage
{
    StarBasic,   // = 0
    JavaScript,  // = 1
    Unknown      // = 2
};
74 
/** One entry of a lookup table that maps the textual value of an HTML option
 *  to an enumerator of EnumT.
 *
 *  HTMLOption::GetEnum() walks an array of these entries and stops at the
 *  first entry whose pName is a null pointer, so tables must end with such
 *  an entry.
 */
template <typename EnumT>
struct HTMLOptionEnum
{
    /// Textual value of the HTML option; null pointer in the terminating entry.
    const char* pName;
    /// Enumerator that corresponds to pName.
    EnumT nValue;
};
81 
82 /** Representation of an HTML option (=attribute in a start tag).
83  * The values of the options are always stored as strings.
84  * The methods GetNumber,... may only be called if the option
85  * is actually numerical,...
86  */
87 class SVT_DLLPUBLIC HTMLOption
88 {
89     OUString aValue;          // value of the option (always as string)
90     OUString aToken;          // name of the option as string
91     HtmlOptionId nToken;        // and respective token
92 
93 public:
94 
95     HTMLOption( HtmlOptionId nTyp, const OUString& rToken, const OUString& rValue );
96 
97     // name of the option...
GetToken() const98     HtmlOptionId GetToken() const { return nToken; }  // ... as enum
GetTokenString() const99     const OUString& GetTokenString() const { return aToken; } // ... as string
100 
101     // value of the option ...
GetString() const102     const OUString& GetString() const { return aValue; }  // ... as string
103 
104     sal_uInt32 GetNumber() const;                           // ... as number
105     sal_Int32 GetSNumber() const;                           // ... as number
106     void GetNumbers( std::vector<sal_uInt32> &rNumbers ) const; // ... as numbers
107     void GetColor( Color& ) const;                      // ... as color
108 
109     template<typename EnumT>
GetEnum(const HTMLOptionEnum<EnumT> * pOptEnums,EnumT nDflt=static_cast<EnumT> (0)) const110     EnumT GetEnum( const HTMLOptionEnum<EnumT> *pOptEnums,
111                         EnumT nDflt = static_cast<EnumT>(0) ) const
112     {
113         while( pOptEnums->pName )
114         {
115             if( aValue.equalsIgnoreAsciiCaseAscii( pOptEnums->pName ) )
116                 return pOptEnums->nValue;
117             pOptEnums++;
118         }
119         return nDflt;
120     }
121 
122     template<typename EnumT>
GetEnum(EnumT & rEnum,const HTMLOptionEnum<EnumT> * pOptEnums) const123     bool GetEnum( EnumT &rEnum, const HTMLOptionEnum<EnumT> *pOptEnums ) const
124     {
125         while( pOptEnums->pName )
126         {
127             if( aValue.equalsIgnoreAsciiCaseAscii( pOptEnums->pName ) )
128             {
129                 rEnum = pOptEnums->nValue;
130                 return true;
131             }
132             pOptEnums++;
133         }
134         return false;
135     }
136 
137     // ... and as a few special enums
138     HTMLInputType GetInputType() const;                 // <INPUT TYPE=...>
139     HTMLTableFrame GetTableFrame() const;               // <TABLE FRAME=...>
140     HTMLTableRules GetTableRules() const;               // <TABLE RULES=...>
141     //SvxAdjust GetAdjust() const;                      // <P,TH,TD ALIGN=>
142 };
143 
144 typedef ::std::vector<HTMLOption> HTMLOptions;
145 
146 class SVT_DLLPUBLIC HTMLParser : public SvParser<HtmlTokenId>
147 {
148 private:
149     mutable HTMLOptions maOptions; // options of the start tag
150 
151     bool bNewDoc        : 1;        // read new Doc?
152     bool bIsInHeader    : 1;        // scan header section
153     bool bReadListing   : 1;        // read listings
154     bool bReadXMP       : 1;        // read XMP
155     bool bReadPRE       : 1;        // read preformatted text
156     bool bReadTextArea  : 1;        // read TEXTAREA
157     bool bReadScript    : 1;        // read <SCRIPT>
158     bool bReadStyle     : 1;        // read <STYLE>
159     bool bEndTokenFound : 1;        // found </SCRIPT> or </STYLE>
160 
161     bool bPre_IgnoreNewPara : 1;    // flags for reading of PRE paragraphs
162     bool bReadNextChar : 1;         // true: read NextChar again(JavaScript!)
163     bool bReadComment : 1;          // true: read NextChar again (JavaScript!)
164 
165     sal_uInt32 nPre_LinePos;            // Pos in the line in the PRE-Tag
166 
167     HtmlTokenId mnPendingOffToken;          ///< OFF token pending for a <XX.../> ON/OFF ON token
168 
169     OUString aEndToken;
170 
171     /// XML namespace, in case of XHTML.
172     OUString maNamespace;
173 
174 protected:
175     OUString sSaveToken;             // the read tag as string
176 
177     HtmlTokenId ScanText( const sal_Unicode cBreak = 0U );
178 
179     HtmlTokenId GetNextRawToken();
180 
181     // scan next token
182     virtual HtmlTokenId GetNextToken_() override;
183 
184     virtual ~HTMLParser() override;
185 
FinishHeader()186     void FinishHeader() { bIsInHeader = false; }
187 
188     void SetNamespace(std::u16string_view rNamespace);
189 
190 public:
191     HTMLParser( SvStream& rIn, bool bReadNewDoc = true );
192 
193     virtual SvParserState CallParser() override;
194 
IsNewDoc() const195     bool IsNewDoc() const       { return bNewDoc; }
IsInHeader() const196     bool IsInHeader() const     { return bIsInHeader; }
IsReadListing() const197     bool IsReadListing() const  { return bReadListing; }
IsReadXMP() const198     bool IsReadXMP() const      { return bReadXMP; }
IsReadPRE() const199     bool IsReadPRE() const      { return bReadPRE; }
IsReadScript() const200     bool IsReadScript() const   { return bReadScript; }
IsReadStyle() const201     bool IsReadStyle() const    { return bReadStyle; }
202 
203     // start PRE-/LISTING or XMP mode or filter tags respectively
204     inline void StartPRE();
FinishPRE()205     void FinishPRE() { bReadPRE = false; }
206     HtmlTokenId FilterPRE( HtmlTokenId nToken );
207 
208     inline void StartListing();
FinishListing()209     void FinishListing() { bReadListing = false; }
210     HtmlTokenId FilterListing( HtmlTokenId nToken );
211 
212     inline void StartXMP();
FinishXMP()213     void FinishXMP() { bReadXMP = false; }
214     HtmlTokenId FilterXMP( HtmlTokenId nToken );
215 
FinishTextArea()216     void FinishTextArea() { bReadTextArea = false; }
217 
218     // finish PRE-/LISTING- and XMP mode
FinishPREListingXMP()219     void FinishPREListingXMP() { bReadPRE = bReadListing = bReadXMP = false; }
220 
221     // Filter the current token according to the current mode
222     // (PRE, XMP, ...) and set the flags. Is called by Continue before
223     // NextToken is called. If you implement own loops or call
224     // NextToken yourself, you should call this method beforehand.
225     HtmlTokenId FilterToken( HtmlTokenId nToken );
226 
ReadRawData(const OUString & rEndToken)227     void ReadRawData( const OUString &rEndToken ) { aEndToken = rEndToken; }
228 
229     // Token without \-sequences
230     void UnescapeToken();
231 
232     // Determine the options. pNoConvertToken is the optional token
233     // of an option, for which the CR/LFs are not deleted from the value
234     // of the option.
235     const HTMLOptions& GetOptions( HtmlOptionId const *pNoConvertToken=nullptr );
236 
237     // for asynchronous reading from the SvStream
238     virtual void Continue( HtmlTokenId nToken ) override;
239 
240 
241 protected:
242 
243     static rtl_TextEncoding GetEncodingByMIME( const OUString& rMime );
244 
245     /// template method: called when ParseMetaOptions adds a user-defined meta
246     virtual void AddMetaUserDefined( OUString const & i_rMetaName );
247 
248 private:
249     /// parse meta options into XDocumentProperties and encoding
250     bool ParseMetaOptionsImpl( const css::uno::Reference< css::document::XDocumentProperties>&,
251             SvKeyValueIterator*,
252             const HTMLOptions&,
253             rtl_TextEncoding& rEnc );
254 
255 public:
256     /// overriding method must call this implementation!
257     virtual bool ParseMetaOptions( const css::uno::Reference< css::document::XDocumentProperties>&,
258             SvKeyValueIterator* );
259 
260     void ParseScriptOptions( OUString& rLangString, const OUString&, HTMLScriptLanguage& rLang,
261                              OUString& rSrc, OUString& rLibrary, OUString& rModule );
262 
263     // Remove a comment around the content of <SCRIPT> or <STYLE>.
264     // The whole line behind a "<!--" might be deleted (for JavaScript).
265     static void RemoveSGMLComment( OUString &rString );
266 
267     static bool InternalImgToPrivateURL( OUString& rURL );
268     static rtl_TextEncoding GetEncodingByHttpHeader( SvKeyValueIterator *pHTTPHeader );
269     bool SetEncodingByHTTPHeader( SvKeyValueIterator *pHTTPHeader );
270 };
271 
StartPRE()272 inline void HTMLParser::StartPRE()
273 {
274     bReadPRE = true;
275     bPre_IgnoreNewPara = true;
276     nPre_LinePos = 0;
277 }
278 
StartListing()279 inline void HTMLParser::StartListing()
280 {
281     bReadListing = true;
282     bPre_IgnoreNewPara = true;
283     nPre_LinePos = 0;
284 }
285 
StartXMP()286 inline void HTMLParser::StartXMP()
287 {
288     bReadXMP = true;
289     bPre_IgnoreNewPara = true;
290     nPre_LinePos = 0;
291 }
292 
293 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
294