LibreOffice Module svtools (master) 1
parhtml.hxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
20#pragma once
21
22#include <svtools/svtdllapi.h>
23#include <svtools/svparser.hxx>
24#include <svtools/htmltokn.h>
25
26#include <string_view>
27#include <vector>
28
29namespace com :: sun :: star :: uno { template <class interface_type> class Reference; }
30
31namespace com::sun::star {
32 namespace document {
33 class XDocumentProperties;
34 }
35}
36
37class Color;
38enum class HtmlOptionId;
39
// Judging by the names, these are the default values for the seven HTML
// font size steps (1..7).
// NOTE(review): presumably point sizes - the unit is not stated here; confirm.
40#define HTMLFONTSZ1_DFLT 7
41#define HTMLFONTSZ2_DFLT 10
42#define HTMLFONTSZ3_DFLT 12
43#define HTMLFONTSZ4_DFLT 14
44#define HTMLFONTSZ5_DFLT 18
45#define HTMLFONTSZ6_DFLT 24
46#define HTMLFONTSZ7_DFLT 36
47
49
51
// Values of the TYPE option of an <INPUT> tag; HTMLOption::GetInputType()
// returns this type.
// NOTE(review): the listing's own line numbers jump from 54 to 57 and from
// 58 to 60, so up to three enumerators are not visible in this extract. The
// numeric value of every enumerator after Text therefore cannot be read off
// this listing - check the real header before relying on them.
52enum class HTMLInputType
53{
54 Text = 1,
57 Radio,
58 Range,
60 File,
61 Hidden,
62 Submit,
63 Image,
64 Reset,
65 Button
66};
67
69{
73};
74
/**
 * One row of a lookup table that maps the textual value of an HTML option
 * to the enum constant it stands for.
 *
 * HTMLOption::GetEnum() walks an array of these rows until it reaches a row
 * whose pName is null, so every table must end with such a sentinel row.
 *
 * NOTE(review): the `struct HTMLOptionEnum` line was missing from this
 * listing; it is restored from the way GetEnum() names the type and reads
 * pName / nValue directly. Confirm against the upstream header.
 */
template<typename EnumT>
struct HTMLOptionEnum
{
    const char *pName;  // value of an HTML option
    EnumT nValue;       // and corresponding value of an enum
};
81
88{
89 OUString aValue; // value of the option (always as string)
90 OUString aToken; // name of the option as string
91 HtmlOptionId nToken; // and respective token
92
93public:
94
95 HTMLOption( HtmlOptionId nTyp, OUString aToken, OUString aValue );
96
97 // name of the option...
98 HtmlOptionId GetToken() const { return nToken; } // ... as enum
99 const OUString& GetTokenString() const { return aToken; } // ... as string
100
101 // value of the option ...
102 const OUString& GetString() const { return aValue; } // ... as string
103
104 sal_uInt32 GetNumber() const; // ... as number
105 sal_Int32 GetSNumber() const; // ... as number
106 void GetNumbers( std::vector<sal_uInt32> &rNumbers ) const; // ... as numbers
107 void GetColor( Color& ) const; // ... as color
108
109 template<typename EnumT>
110 EnumT GetEnum( const HTMLOptionEnum<EnumT> *pOptEnums,
111 EnumT nDflt = static_cast<EnumT>(0) ) const
112 {
113 while( pOptEnums->pName )
114 {
115 if( aValue.equalsIgnoreAsciiCaseAscii( pOptEnums->pName ) )
116 return pOptEnums->nValue;
117 pOptEnums++;
118 }
119 return nDflt;
120 }
121
122 template<typename EnumT>
123 bool GetEnum( EnumT &rEnum, const HTMLOptionEnum<EnumT> *pOptEnums ) const
124 {
125 while( pOptEnums->pName )
126 {
127 if( aValue.equalsIgnoreAsciiCaseAscii( pOptEnums->pName ) )
128 {
129 rEnum = pOptEnums->nValue;
130 return true;
131 }
132 pOptEnums++;
133 }
134 return false;
135 }
136
137 // ... and as a few special enums
138 HTMLInputType GetInputType() const; // <INPUT TYPE=...>
139 HTMLTableFrame GetTableFrame() const; // <TABLE FRAME=...>
140 HTMLTableRules GetTableRules() const; // <TABLE RULES=...>
141 //SvxAdjust GetAdjust() const; // <P,TH,TD ALIGN=>
142};
143
144typedef ::std::vector<HTMLOption> HTMLOptions;
145
// HTML parser built on SvParser<HtmlTokenId>.
// One-bit flags record which special reading mode (PRE, LISTING, XMP,
// TEXTAREA, SCRIPT, STYLE) is active; maOptions holds the options of the
// current start tag.
// NOTE(review): this listing carries its own line numbers and skips 167,
// 171, 245, 249, 251, 256 and 258 inside this class, so some declarations,
// parameters or doc comments of the real header are not visible here.
146class SVT_DLLPUBLIC HTMLParser : public SvParser<HtmlTokenId>
147{
148private:
149 mutable HTMLOptions maOptions; // options of the start tag
150
151 bool bNewDoc : 1; // read new Doc?
152 bool bIsInHeader : 1; // scan header section
153 bool bReadListing : 1; // read listings
154 bool bReadXMP : 1; // read XMP
155 bool bReadPRE : 1; // read preformatted text
156 bool bReadTextArea : 1; // read TEXTAREA
157 bool bReadScript : 1; // read <SCRIPT>
158 bool bReadStyle : 1; // read <STYLE>
159 bool bEndTokenFound : 1; // found </SCRIPT> or </STYLE>
160
161 bool bPre_IgnoreNewPara : 1; // flags for reading of PRE paragraphs
162 bool bReadNextChar : 1; // true: read NextChar again(JavaScript!)
163 bool bReadComment : 1; // NOTE(review): original comment repeated bReadNextChar's text; presumably "reading a comment" - TODO confirm
164
165 sal_uInt32 nPre_LinePos; // Pos in the line in the PRE-Tag
166
// NOTE(review): listing line 167 is not shown. The symbol index of this page
// attributes 'HtmlTokenId mnPendingOffToken' ("OFF token pending for a
// <XX.../> ON/OFF ON token") to that line.
168
// Assigned by ReadRawData().
169 OUString aEndToken;
170
// XML namespace, in case of XHTML (text taken from this page's symbol index).
172 OUString maNamespace;
173
174protected:
175 OUString sSaveToken; // the read tag as string
176
177 HtmlTokenId ScanText( const sal_Unicode cBreak = 0U );
178
179 HtmlTokenId GetNextRawToken();
180
181 // scan next token
182 virtual HtmlTokenId GetNextToken_() override;
183
184 virtual ~HTMLParser() override;
185
// Clears the "in header" flag.
186 void FinishHeader() { bIsInHeader = false; }
187
188 void SetNamespace(std::u16string_view rNamespace);
189
190public:
191 HTMLParser( SvStream& rIn, bool bReadNewDoc = true );
192
193 virtual SvParserState CallParser() override;
194
// Read-only accessors for the mode flags.
195 bool IsNewDoc() const { return bNewDoc; }
196 bool IsInHeader() const { return bIsInHeader; }
197 bool IsReadListing() const { return bReadListing; }
198 bool IsReadXMP() const { return bReadXMP; }
199 bool IsReadPRE() const { return bReadPRE; }
200 bool IsReadScript() const { return bReadScript; }
201 bool IsReadStyle() const { return bReadStyle; }
202
203 // start PRE-/LISTING or XMP mode or filter tags respectively
// The Start*() members are only declared here; their inline bodies follow
// the class.
204 inline void StartPRE();
205 void FinishPRE() { bReadPRE = false; }
206 HtmlTokenId FilterPRE( HtmlTokenId nToken );
207
208 inline void StartListing();
209 void FinishListing() { bReadListing = false; }
210 HtmlTokenId FilterListing( HtmlTokenId nToken );
211
212 inline void StartXMP();
213 void FinishXMP() { bReadXMP = false; }
214 HtmlTokenId FilterXMP( HtmlTokenId nToken );
215
216 void FinishTextArea() { bReadTextArea = false; }
217
218 // finish PRE-/LISTING- and XMP mode
219 void FinishPREListingXMP() { bReadPRE = bReadListing = bReadXMP = false; }
220
221 // Filter the current token according to the current mode
222 // (PRE, XMP, ...) and set the flags. Is called by Continue before
223 // NextToken is called. If you implement own loops or call
224 // NextToken yourself, you should call this method beforehand.
225 HtmlTokenId FilterToken( HtmlTokenId nToken );
226
// Stores rEndToken in aEndToken; nothing else happens in this inline body.
227 void ReadRawData( const OUString &rEndToken ) { aEndToken = rEndToken; }
228
229 // Token without \-sequences
230 void UnescapeToken();
231
232 // Determine the options. pNoConvertToken is the optional token
233 // of an option, for which the CR/LFs are not deleted from the value
234 // of the option.
235 const HTMLOptions& GetOptions( HtmlOptionId const *pNoConvertToken=nullptr );
236
237 // for asynchronous reading from the SvStream
238 virtual void Continue( HtmlTokenId nToken ) override;
239
240
241protected:
242
243 static rtl_TextEncoding GetEncodingByMIME( const OUString& rMime );
244
246 virtual void AddMetaUserDefined( OUString const & i_rMetaName );
247
248private:
// NOTE(review): listing line 251 is not shown, so this parameter list may be
// missing an entry between the first and the second visible parameter.
250 bool ParseMetaOptionsImpl( const css::uno::Reference< css::document::XDocumentProperties>&,
252 const HTMLOptions&,
253 rtl_TextEncoding& rEnc );
254
255public:
// NOTE(review): listing line 258 is not shown; the declaration below stops
// after a comma, so its remaining parameter(s) and the closing ");" are
// missing from this extract.
257 virtual bool ParseMetaOptions( const css::uno::Reference< css::document::XDocumentProperties>&,
259
260 void ParseScriptOptions( OUString& rLangString, std::u16string_view rBaseURL, HTMLScriptLanguage& rLang,
261 OUString& rSrc, OUString& rLibrary, OUString& rModule );
262
263 // Remove a comment around the content of <SCRIPT> or <STYLE>.
264 // The whole line behind a "<!--" might be deleted (for JavaScript).
265 static void RemoveSGMLComment( OUString &rString );
266
267 static bool InternalImgToPrivateURL( OUString& rURL );
268 static rtl_TextEncoding GetEncodingByHttpHeader( SvKeyValueIterator *pHTTPHeader );
269 bool SetEncodingByHTTPHeader( SvKeyValueIterator *pHTTPHeader );
270};
271
273{
274 bReadPRE = true;
275 bPre_IgnoreNewPara = true;
276 nPre_LinePos = 0;
277}
278
280{
281 bReadListing = true;
282 bPre_IgnoreNewPara = true;
283 nPre_LinePos = 0;
284}
285
287{
288 bReadXMP = true;
289 bPre_IgnoreNewPara = true;
290 nPre_LinePos = 0;
291}
292
293/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
Representation of an HTML option (=attribute in a start tag).
Definition: parhtml.hxx:88
HtmlOptionId GetToken() const
Definition: parhtml.hxx:98
const OUString & GetTokenString() const
Definition: parhtml.hxx:99
const OUString & GetString() const
Definition: parhtml.hxx:102
bool GetEnum(EnumT &rEnum, const HTMLOptionEnum< EnumT > *pOptEnums) const
Definition: parhtml.hxx:123
HtmlOptionId nToken
Definition: parhtml.hxx:91
EnumT GetEnum(const HTMLOptionEnum< EnumT > *pOptEnums, EnumT nDflt=static_cast< EnumT >(0)) const
Definition: parhtml.hxx:110
OUString aValue
Definition: parhtml.hxx:89
OUString aToken
Definition: parhtml.hxx:90
HtmlTokenId mnPendingOffToken
OFF token pending for a <XX.../> ON/OFF ON token.
Definition: parhtml.hxx:167
bool bReadPRE
Definition: parhtml.hxx:155
bool IsReadStyle() const
Definition: parhtml.hxx:201
bool bReadXMP
Definition: parhtml.hxx:154
void FinishHeader()
Definition: parhtml.hxx:186
void StartPRE()
Definition: parhtml.hxx:272
bool IsReadScript() const
Definition: parhtml.hxx:200
bool bReadComment
Definition: parhtml.hxx:163
bool bIsInHeader
Definition: parhtml.hxx:152
bool IsInHeader() const
Definition: parhtml.hxx:196
bool bReadNextChar
Definition: parhtml.hxx:162
bool IsReadXMP() const
Definition: parhtml.hxx:198
void FinishPRE()
Definition: parhtml.hxx:205
void StartXMP()
Definition: parhtml.hxx:286
void FinishXMP()
Definition: parhtml.hxx:213
bool bReadStyle
Definition: parhtml.hxx:158
bool bReadTextArea
Definition: parhtml.hxx:156
bool bReadListing
Definition: parhtml.hxx:153
OUString maNamespace
XML namespace, in case of XHTML.
Definition: parhtml.hxx:172
void FinishListing()
Definition: parhtml.hxx:209
bool bNewDoc
Definition: parhtml.hxx:151
bool bReadScript
Definition: parhtml.hxx:157
void StartListing()
Definition: parhtml.hxx:279
OUString sSaveToken
Definition: parhtml.hxx:175
sal_uInt32 nPre_LinePos
Definition: parhtml.hxx:165
bool bEndTokenFound
Definition: parhtml.hxx:159
void FinishPREListingXMP()
Definition: parhtml.hxx:219
OUString aEndToken
Definition: parhtml.hxx:169
void ReadRawData(const OUString &rEndToken)
Definition: parhtml.hxx:227
bool IsNewDoc() const
Definition: parhtml.hxx:195
HTMLOptions maOptions
Definition: parhtml.hxx:149
bool IsReadListing() const
Definition: parhtml.hxx:197
bool IsReadPRE() const
Definition: parhtml.hxx:199
void FinishTextArea()
Definition: parhtml.hxx:216
bool bPre_IgnoreNewPara
Definition: parhtml.hxx:161
HtmlOptionId
Definition: htmltokn.h:301
HtmlTokenId
Definition: htmltokn.h:46
NONE
Unknown
Reference
HTMLInputType
Definition: parhtml.hxx:53
HTMLTableRules
Definition: parhtml.hxx:50
HTMLScriptLanguage
Definition: parhtml.hxx:69
HTMLTableFrame
Definition: parhtml.hxx:48
::std::vector< HTMLOption > HTMLOptions
Definition: parhtml.hxx:144
DefTokenId nToken
const char * pName
Definition: parhtml.hxx:78
EnumT nValue
Definition: parhtml.hxx:79
SvParserState
Definition: svparser.hxx:36
#define SVT_DLLPUBLIC
Definition: svtdllapi.h:27
sal_uInt16 sal_Unicode