1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
20 #pragma once
21
22 #include <svtools/svtdllapi.h>
23 #include <svtools/svparser.hxx>
24 #include <svtools/htmltokn.h>
25
26 #include <string_view>
27 #include <vector>
28
// Forward declarations for types that this header only names in
// declarations; their full definitions live in other headers.
namespace com::sun::star::uno { template <class interface_type> class Reference; }

namespace com::sun::star::document { class XDocumentProperties; }

class Color;
enum class HtmlOptionId;
39
// Default values for the seven HTML font size steps 1..7.
// NOTE(review): the unit is not stated in this header (presumably points);
// confirm against the code that consumes these constants.
// Kept as macros so that existing preprocessor uses keep working.
#define HTMLFONTSZ1_DFLT 7
#define HTMLFONTSZ2_DFLT 10
#define HTMLFONTSZ3_DFLT 12
#define HTMLFONTSZ4_DFLT 14
#define HTMLFONTSZ5_DFLT 18
#define HTMLFONTSZ6_DFLT 24
#define HTMLFONTSZ7_DFLT 36
47
/** Values of the FRAME attribute of a TABLE tag
 * (result type of HTMLOption::GetTableFrame()).
 *
 * NOTE(review): the per-value remarks follow the HTML 4.01 FRAME keywords of
 * the same names; the actual keyword table is not visible in this header.
 */
enum class HTMLTableFrame
{
    Void,   // presumably "void": no outer frame
    Above,  // presumably "above": top side only
    Below,  // presumably "below": bottom side only
    HSides, // presumably "hsides": top and bottom sides
    LHS,    // presumably "lhs": left-hand side only
    RHS,    // presumably "rhs": right-hand side only
    VSides, // presumably "vsides": left and right sides
    Box     // presumably "box": all four sides
};
49
/** Values of the RULES attribute of a TABLE tag
 * (result type of HTMLOption::GetTableRules()).
 *
 * NOTE(review): the per-value remarks follow the HTML 4.01 RULES keywords of
 * the same names; the actual keyword table is not visible in this header.
 */
enum class HTMLTableRules
{
    NONE,   // presumably "none": no inner rules
    Groups, // presumably "groups": rules between row/column groups
    Rows,   // presumably "rows": rules between rows
    Cols,   // presumably "cols": rules between columns
    All     // presumably "all": rules between all rows and columns
};
51
/** Values of the TYPE attribute of an INPUT tag
 * (result type of HTMLOption::GetInputType()).
 *
 * Numbering starts at 1, so the value 0 does not correspond to any named
 * input type.
 */
enum class HTMLInputType
{
    Text = 1,
    Password,
    Checkbox,
    Radio,
    Range,
    Scribble,
    File,
    Hidden,
    Submit,
    Image,
    Reset,
    Button
};
67
/** Script language of a SCRIPT element, as reported through the rLang
 * out-parameter of HTMLParser::ParseScriptOptions().
 */
enum class HTMLScriptLanguage
{
    StarBasic,
    JavaScript,
    Unknown
};
74
/** One row of a lookup table that maps the textual value of an HTML option
 * to an enumerator of type EnumT.
 *
 * Tables of these rows are scanned by HTMLOption::GetEnum(), which stops at
 * the first row whose pName is null; every table must therefore end with
 * such a terminator row.
 */
template<typename EnumT>
struct HTMLOptionEnum
{
    const char *pName;  // value of an HTML option (null in the terminator row)
    EnumT       nValue; // and corresponding value of an enum
};
81
82 /** Representation of an HTML option (=attribute in a start tag).
83 * The values of the options are always stored as strings.
84 * The methods GetNumber,... may only be called if the option
85 * is actually numerical,...
86 */
87 class SVT_DLLPUBLIC HTMLOption
88 {
89 OUString aValue; // value of the option (always as string)
90 OUString aToken; // name of the option as string
91 HtmlOptionId nToken; // and respective token
92
93 public:
94
95 HTMLOption( HtmlOptionId nTyp, const OUString& rToken, const OUString& rValue );
96
97 // name of the option...
GetToken() const98 HtmlOptionId GetToken() const { return nToken; } // ... as enum
GetTokenString() const99 const OUString& GetTokenString() const { return aToken; } // ... as string
100
101 // value of the option ...
GetString() const102 const OUString& GetString() const { return aValue; } // ... as string
103
104 sal_uInt32 GetNumber() const; // ... as number
105 sal_Int32 GetSNumber() const; // ... as number
106 void GetNumbers( std::vector<sal_uInt32> &rNumbers ) const; // ... as numbers
107 void GetColor( Color& ) const; // ... as color
108
109 template<typename EnumT>
GetEnum(const HTMLOptionEnum<EnumT> * pOptEnums,EnumT nDflt=static_cast<EnumT> (0)) const110 EnumT GetEnum( const HTMLOptionEnum<EnumT> *pOptEnums,
111 EnumT nDflt = static_cast<EnumT>(0) ) const
112 {
113 while( pOptEnums->pName )
114 {
115 if( aValue.equalsIgnoreAsciiCaseAscii( pOptEnums->pName ) )
116 return pOptEnums->nValue;
117 pOptEnums++;
118 }
119 return nDflt;
120 }
121
122 template<typename EnumT>
GetEnum(EnumT & rEnum,const HTMLOptionEnum<EnumT> * pOptEnums) const123 bool GetEnum( EnumT &rEnum, const HTMLOptionEnum<EnumT> *pOptEnums ) const
124 {
125 while( pOptEnums->pName )
126 {
127 if( aValue.equalsIgnoreAsciiCaseAscii( pOptEnums->pName ) )
128 {
129 rEnum = pOptEnums->nValue;
130 return true;
131 }
132 pOptEnums++;
133 }
134 return false;
135 }
136
137 // ... and as a few special enums
138 HTMLInputType GetInputType() const; // <INPUT TYPE=...>
139 HTMLTableFrame GetTableFrame() const; // <TABLE FRAME=...>
140 HTMLTableRules GetTableRules() const; // <TABLE RULES=...>
141 //SvxAdjust GetAdjust() const; // <P,TH,TD ALIGN=>
142 };
143
144 typedef ::std::vector<HTMLOption> HTMLOptions;
145
146 class SVT_DLLPUBLIC HTMLParser : public SvParser<HtmlTokenId>
147 {
148 private:
149 mutable HTMLOptions maOptions; // options of the start tag
150
151 bool bNewDoc : 1; // read new Doc?
152 bool bIsInHeader : 1; // scan header section
153 bool bReadListing : 1; // read listings
154 bool bReadXMP : 1; // read XMP
155 bool bReadPRE : 1; // read preformatted text
156 bool bReadTextArea : 1; // read TEXTAREA
157 bool bReadScript : 1; // read <SCRIPT>
158 bool bReadStyle : 1; // read <STYLE>
159 bool bEndTokenFound : 1; // found </SCRIPT> or </STYLE>
160
161 bool bPre_IgnoreNewPara : 1; // flags for reading of PRE paragraphs
162 bool bReadNextChar : 1; // true: read NextChar again(JavaScript!)
163 bool bReadComment : 1; // true: read NextChar again (JavaScript!)
164
165 sal_uInt32 nPre_LinePos; // Pos in the line in the PRE-Tag
166
167 HtmlTokenId mnPendingOffToken; ///< OFF token pending for a <XX.../> ON/OFF ON token
168
169 OUString aEndToken;
170
171 /// XML namespace, in case of XHTML.
172 OUString maNamespace;
173
174 protected:
175 OUString sSaveToken; // the read tag as string
176
177 HtmlTokenId ScanText( const sal_Unicode cBreak = 0U );
178
179 HtmlTokenId GetNextRawToken();
180
181 // scan next token
182 virtual HtmlTokenId GetNextToken_() override;
183
184 virtual ~HTMLParser() override;
185
FinishHeader()186 void FinishHeader() { bIsInHeader = false; }
187
188 void SetNamespace(std::u16string_view rNamespace);
189
190 public:
191 HTMLParser( SvStream& rIn, bool bReadNewDoc = true );
192
193 virtual SvParserState CallParser() override;
194
IsNewDoc() const195 bool IsNewDoc() const { return bNewDoc; }
IsInHeader() const196 bool IsInHeader() const { return bIsInHeader; }
IsReadListing() const197 bool IsReadListing() const { return bReadListing; }
IsReadXMP() const198 bool IsReadXMP() const { return bReadXMP; }
IsReadPRE() const199 bool IsReadPRE() const { return bReadPRE; }
IsReadScript() const200 bool IsReadScript() const { return bReadScript; }
IsReadStyle() const201 bool IsReadStyle() const { return bReadStyle; }
202
203 // start PRE-/LISTING or XMP mode or filter tags respectively
204 inline void StartPRE();
FinishPRE()205 void FinishPRE() { bReadPRE = false; }
206 HtmlTokenId FilterPRE( HtmlTokenId nToken );
207
208 inline void StartListing();
FinishListing()209 void FinishListing() { bReadListing = false; }
210 HtmlTokenId FilterListing( HtmlTokenId nToken );
211
212 inline void StartXMP();
FinishXMP()213 void FinishXMP() { bReadXMP = false; }
214 HtmlTokenId FilterXMP( HtmlTokenId nToken );
215
FinishTextArea()216 void FinishTextArea() { bReadTextArea = false; }
217
218 // finish PRE-/LISTING- and XMP mode
FinishPREListingXMP()219 void FinishPREListingXMP() { bReadPRE = bReadListing = bReadXMP = false; }
220
221 // Filter the current token according to the current mode
222 // (PRE, XMP, ...) and set the flags. Is called by Continue before
223 // NextToken is called. If you implement own loops or call
224 // NextToken yourself, you should call this method beforehand.
225 HtmlTokenId FilterToken( HtmlTokenId nToken );
226
ReadRawData(const OUString & rEndToken)227 void ReadRawData( const OUString &rEndToken ) { aEndToken = rEndToken; }
228
229 // Token without \-sequences
230 void UnescapeToken();
231
232 // Determine the options. pNoConvertToken is the optional token
233 // of an option, for which the CR/LFs are not deleted from the value
234 // of the option.
235 const HTMLOptions& GetOptions( HtmlOptionId const *pNoConvertToken=nullptr );
236
237 // for asynchronous reading from the SvStream
238 virtual void Continue( HtmlTokenId nToken ) override;
239
240
241 protected:
242
243 static rtl_TextEncoding GetEncodingByMIME( const OUString& rMime );
244
245 /// template method: called when ParseMetaOptions adds a user-defined meta
246 virtual void AddMetaUserDefined( OUString const & i_rMetaName );
247
248 private:
249 /// parse meta options into XDocumentProperties and encoding
250 bool ParseMetaOptionsImpl( const css::uno::Reference< css::document::XDocumentProperties>&,
251 SvKeyValueIterator*,
252 const HTMLOptions&,
253 rtl_TextEncoding& rEnc );
254
255 public:
256 /// overriding method must call this implementation!
257 virtual bool ParseMetaOptions( const css::uno::Reference< css::document::XDocumentProperties>&,
258 SvKeyValueIterator* );
259
260 void ParseScriptOptions( OUString& rLangString, const OUString&, HTMLScriptLanguage& rLang,
261 OUString& rSrc, OUString& rLibrary, OUString& rModule );
262
263 // Remove a comment around the content of <SCRIPT> or <STYLE>.
264 // The whole line behind a "<!--" might be deleted (for JavaScript).
265 static void RemoveSGMLComment( OUString &rString );
266
267 static bool InternalImgToPrivateURL( OUString& rURL );
268 static rtl_TextEncoding GetEncodingByHttpHeader( SvKeyValueIterator *pHTTPHeader );
269 bool SetEncodingByHTTPHeader( SvKeyValueIterator *pHTTPHeader );
270 };
271
StartPRE()272 inline void HTMLParser::StartPRE()
273 {
274 bReadPRE = true;
275 bPre_IgnoreNewPara = true;
276 nPre_LinePos = 0;
277 }
278
StartListing()279 inline void HTMLParser::StartListing()
280 {
281 bReadListing = true;
282 bPre_IgnoreNewPara = true;
283 nPre_LinePos = 0;
284 }
285
StartXMP()286 inline void HTMLParser::StartXMP()
287 {
288 bReadXMP = true;
289 bPre_IgnoreNewPara = true;
290 nPre_LinePos = 0;
291 }
292
293 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
294