LibreOffice Module svtools (master) 1
parhtml.cxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
20#include <comphelper/string.hxx>
21#include <o3tl/safeint.hxx>
22#include <o3tl/string_view.hxx>
23#include <tools/stream.hxx>
24#include <tools/debug.hxx>
25#include <tools/color.hxx>
26#include <rtl/ustrbuf.hxx>
27#include <rtl/character.hxx>
28#include <rtl/tencinfo.h>
29#include <sal/log.hxx>
30#include <tools/tenccvt.hxx>
31#include <tools/datetime.hxx>
32#include <unotools/datetime.hxx>
33#include <svl/inettype.hxx>
34#include <svl/lngmisc.hxx>
35#include <com/sun/star/beans/PropertyAttribute.hpp>
36#include <com/sun/star/document/XDocumentProperties.hpp>
37
38#include <svtools/parhtml.hxx>
39#include <svtools/htmltokn.h>
40#include <svtools/htmlkywd.hxx>
41
42#include <utility>
43
44using namespace ::com::sun::star;
45
46
47const sal_Int32 MAX_LEN( 1024 );
48
49const sal_Int32 MAX_ENTITY_LEN( 8 );
50
51
52// Tables to convert option values into strings
53
54// <INPUT TYPE=xxx>
56{
69 { nullptr, HTMLInputType(0) }
70};
71
72// <TABLE FRAME=xxx>
74{
84 { nullptr, HTMLTableFrame(0) }
85};
86
87// <TABLE RULES=xxx>
89{
95 { nullptr, HTMLTableRules(0) }
96};
97
98
// Builds an option from its id, its name (_aToken) and its value (_aValue).
// Both strings are taken by value and moved into the members, so callers may
// pass temporaries without an extra copy.
99HTMLOption::HTMLOption( HtmlOptionId nTok, OUString _aToken,
100 OUString _aValue )
101 : aValue(std::move(_aValue))
102 , aToken(std::move(_aToken))
103 , nToken( nTok )
104{
// NOTE(review): the line opening the assertion whose message follows is not
// present in this listing; only its message argument is visible.
106 "HTMLOption: unknown token" );
107}
108
// Returns the option value interpreted as an unsigned number.
// Leading blanks are stripped, the remainder is converted with toInt32(),
// and a negative result is reported as 0.
109sal_uInt32 HTMLOption::GetNumber() const
110{
// NOTE(review): the lines opening the assertion whose message follows are not
// present in this listing; only its message argument is visible.
116 "GetNumber: Option not numerical" );
117 OUString aTmp(comphelper::string::stripStart(aValue, ' '));
118 sal_Int32 nTmp = aTmp.toInt32();
// Clamp instead of casting directly: a negative sal_Int32 would otherwise
// turn into a very large unsigned value.
119 return nTmp >= 0 ? static_cast<sal_uInt32>(nTmp) : 0;
120}
121
// Returns the option value interpreted as a signed number.
// Leading blanks are stripped and the remainder is converted with toInt32();
// unlike GetNumber(), negative results are passed through unchanged.
122sal_Int32 HTMLOption::GetSNumber() const
123{
// NOTE(review): the lines opening the assertion whose message follows are not
// present in this listing; only its message argument is visible.
126 "GetSNumber: Option not numerical" );
127 OUString aTmp(comphelper::string::stripStart(aValue, ' '));
128 return aTmp.toInt32();
129}
130
131void HTMLOption::GetNumbers( std::vector<sal_uInt32> &rNumbers ) const
132{
133 rNumbers.clear();
134
135 // This is a very simplified scanner: it only searches all
136 // numerals in the string.
137 bool bInNum = false;
138 sal_uInt32 nNum = 0;
139 for( sal_Int32 i=0; i<aValue.getLength(); i++ )
140 {
141 sal_Unicode c = aValue[ i ];
142 if( c>='0' && c<='9' )
143 {
144 nNum *= 10;
145 nNum += (c - '0');
146 bInNum = true;
147 }
148 else if( bInNum )
149 {
150 rNumbers.push_back( nNum );
151 bInNum = false;
152 nNum = 0;
153 }
154 }
155 if( bInNum )
156 {
157 rNumbers.push_back( nNum );
158 }
159}
160
// Converts the option value into a colour and stores it in rColor.
//
// The value is lower-cased first. If it is non-empty and does not start with
// '#', it is passed to GetHTMLColor() (presumably a colour-name lookup --
// confirm); SAL_MAX_UINT32 is treated as "no result". In that case, or when
// the lookup was skipped, the value is read as up to six hexadecimal digits.
// Only the red, green and blue components of rColor are written.
161void HTMLOption::GetColor( Color& rColor ) const
162{
// NOTE(review): the line opening the assertion whose message follows is not
// present in this listing; only its message argument is visible.
164 "GetColor: Option is not a color." );
165
166 OUString aTmp(aValue.toAsciiLowerCase());
// SAL_MAX_UINT32 doubles as the "not resolved yet" marker.
167 sal_uInt32 nColor = SAL_MAX_UINT32;
168 if (!aTmp.isEmpty() && aTmp[0] != '#')
169 nColor = GetHTMLColor(aTmp);
170
171 if( SAL_MAX_UINT32 == nColor )
172 {
// Fallback: build the value from exactly six hex-digit positions.
173 nColor = 0;
174 sal_Int32 nPos = 0;
175 for (sal_uInt32 i=0; i<6; ++i)
176 {
177 // Whatever Netscape does to get color values,
178 // at maximum three characters < '0' are ignored.
// Reading past the end of the string yields '0', so short values are
// padded with zero digits. A leading '#' is below '0' and is therefore
// skipped by the same rule.
179 sal_Unicode c = nPos<aTmp.getLength() ? aTmp[ nPos++ ] : '0';
180 if( c < '0' )
181 {
182 c = nPos<aTmp.getLength() ? aTmp[nPos++] : '0';
183 if( c < '0' )
184 c = nPos<aTmp.getLength() ? aTmp[nPos++] : '0';
185 }
// Shift in one hex digit; a character that is not 0-9 or a-f contributes 0.
186 nColor *= 16;
187 if( c >= '0' && c <= '9' )
188 nColor += (c - '0');
189 else if( c >= 'a' && c <= 'f' )
190 nColor += (c + 0xa - 'a');
191 }
192 }
193
// Split the 0x00RRGGBB value into its three components.
194 rColor.SetRed( static_cast<sal_uInt8>((nColor & 0x00ff0000) >> 16) );
195 rColor.SetGreen( static_cast<sal_uInt8>((nColor & 0x0000ff00) >> 8));
196 rColor.SetBlue( static_cast<sal_uInt8>(nColor & 0x000000ff) );
197}
198
200{
201 DBG_ASSERT( nToken==HtmlOptionId::TYPE, "GetInputType: Option not TYPE" );
203}
204
206{
207 DBG_ASSERT( nToken==HtmlOptionId::FRAME, "GetTableFrame: Option not FRAME" );
209}
210
212{
213 DBG_ASSERT( nToken==HtmlOptionId::RULES, "GetTableRules: Option not RULES" );
215}
216
217HTMLParser::HTMLParser( SvStream& rIn, bool bReadNewDoc ) :
218 SvParser<HtmlTokenId>( rIn ),
219 bNewDoc(bReadNewDoc),
220 bIsInHeader(true),
221 bReadListing(false),
222 bReadXMP(false),
223 bReadPRE(false),
224 bReadTextArea(false),
225 bReadScript(false),
226 bReadStyle(false),
227 bEndTokenFound(false),
228 bPre_IgnoreNewPara(false),
229 bReadNextChar(false),
230 bReadComment(false),
231 nPre_LinePos(0),
232 mnPendingOffToken(HtmlTokenId::NONE)
233{
234 //#i76649, default to UTF-8 for HTML unless we know differently
235 SetSrcEncoding(RTL_TEXTENCODING_UTF8);
236}
237
239{
240}
241
242void HTMLParser::SetNamespace(std::u16string_view rNamespace)
243{
244 // Convert namespace alias to a prefix.
245 maNamespace = OUString::Concat(rNamespace) + ":";
246}
247
248namespace
249{
// Scope guard around a parser run: the constructor calls AddFirstRef() on the
// parser, and the destructor calls ReleaseRef() unless the parser status is
// SvParserState::Pending at that point.
250 class RefGuard
251 {
252 private:
// NOTE(review): the declaration of m_rParser is not present in this listing
// (presumably an HTMLParser& -- confirm).
254 public:
255 RefGuard(HTMLParser& rParser)
256 : m_rParser(rParser)
257 {
258 m_rParser.AddFirstRef();
259 }
260
261 ~RefGuard()
262 {
// A pending parser keeps its reference; it is released only once the status
// is something other than Pending.
263 if (m_rParser.GetStatus() != SvParserState::Pending)
264 m_rParser.ReleaseRef(); // Parser not needed anymore
265 }
266 };
267}
268
270{
271 eState = SvParserState::Working;
272 nNextCh = GetNextChar();
273 SaveState( HtmlTokenId::NONE );
274
275 nPre_LinePos = 0;
276 bPre_IgnoreNewPara = false;
277
278 RefGuard aRefGuard(*this);
279
281
282 return eState;
283}
284
286{
288 nToken = GetNextToken();
289
290 while( IsParserWorking() )
291 {
292 SaveState( nToken );
294
296 NextToken( nToken );
297
298 if( IsParserWorking() )
299 SaveState( HtmlTokenId::NONE ); // continue with new token
300
301 nToken = GetNextToken();
302 }
303}
304
306{
307 switch( nToken )
308 {
309 case HtmlTokenId(EOF):
311 break; // don't pass
312
314 bIsInHeader = false;
315 break;
316
318 bIsInHeader = true;
319 break;
320
322 bIsInHeader = false;
323 break;
324
326 bIsInHeader = false;
327 break;
328
330 bReadPRE = bReadListing = bReadXMP = false;
331 break;
332
335 bReadPRE = bReadListing = bReadXMP = false;
336 break; // HtmlTokenId::ON hasn't been passed either !
337
339 StartPRE();
340 break;
341
343 FinishPRE();
344 break;
345
347 StartListing();
348 break;
349
352 break;
353
355 StartXMP();
356 break;
357
359 FinishXMP();
360 break;
361
362 default:
363 if( bReadPRE )
365 else if( bReadListing )
367 else if( bReadXMP )
369
370 break;
371 }
372
373 return nToken;
374}
375
376namespace {
377
378constexpr bool HTML_ISPRINTABLE(sal_Unicode c) { return c >= 32 && c != 127; }
379
380}
381
383{
384 OUStringBuffer sTmpBuffer( MAX_LEN );
385 bool bContinue = true;
386 bool bEqSignFound = false;
387 sal_uInt32 cQuote = 0U;
388
389 while( bContinue && IsParserWorking() )
390 {
391 bool bNextCh = true;
392 switch( nNextCh )
393 {
394 case '&':
395 bEqSignFound = false;
396 if( bReadXMP )
397 sTmpBuffer.append( '&' );
398 else
399 {
400 sal_uInt64 nStreamPos = rInput.Tell();
401 sal_uInt32 nLinePos = GetLinePos();
402
403 sal_uInt32 cChar = 0U;
404 if( '#' == (nNextCh = GetNextChar()) )
405 {
406 nNextCh = GetNextChar();
407 const bool bIsHex( 'x' == nNextCh );
408 const bool bIsDecOrHex( bIsHex || rtl::isAsciiDigit(nNextCh) );
409 if ( bIsDecOrHex )
410 {
411 if ( bIsHex )
412 {
413 nNextCh = GetNextChar();
414 while ( rtl::isAsciiHexDigit(nNextCh) )
415 {
416 cChar = cChar * 16U +
417 ( nNextCh <= '9'
418 ? sal_uInt32( nNextCh - '0' )
419 : ( nNextCh <= 'F'
420 ? sal_uInt32( nNextCh - 'A' + 10 )
421 : sal_uInt32( nNextCh - 'a' + 10 ) ) );
422 nNextCh = GetNextChar();
423 }
424 }
425 else
426 {
427 do
428 {
429 cChar = cChar * 10U + sal_uInt32( nNextCh - '0');
430 nNextCh = GetNextChar();
431 }
432 while( rtl::isAsciiDigit(nNextCh) );
433 }
434
435 if( RTL_TEXTENCODING_DONTKNOW != eSrcEnc &&
436 RTL_TEXTENCODING_UCS2 != eSrcEnc &&
437 RTL_TEXTENCODING_UTF8 != eSrcEnc &&
438 cChar < 256 )
439 {
440 const sal_uInt32 convertFlags =
441 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT |
442 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
443 RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT;
444
445 char cEncodedChar = static_cast<char>(cChar);
446 cChar = OUString(&cEncodedChar, 1, eSrcEnc, convertFlags).toChar();
447 if( 0U == cChar )
448 {
449 // If the character could not be
450 // converted, because a conversion is not
451 // available, do no conversion at all.
452 cChar = cEncodedChar;
453 }
454 }
455 }
456 else
457 nNextCh = 0U;
458
459 if (!rtl::isUnicodeCodePoint(cChar)
461 && cChar != '\r' && cChar != '\n' && cChar != '\t'))
462 {
463 cChar = '?';
464 }
465 }
466 else if( rtl::isAsciiAlpha( nNextCh ) )
467 {
468 OUStringBuffer sEntityBuffer( MAX_ENTITY_LEN );
469 sal_Int32 nPos = 0;
470 do
471 {
472 sEntityBuffer.appendUtf32( nNextCh );
473 nPos++;
474 nNextCh = GetNextChar();
475 }
476 while( nPos < MAX_ENTITY_LEN && rtl::isAsciiAlphanumeric( nNextCh ) &&
477 !rInput.eof() );
478
479 if( IsParserWorking() && !rInput.eof() )
480 {
481 std::u16string_view sEntity(sEntityBuffer.subView(0, nPos));
482 cChar = GetHTMLCharName( sEntity );
483
484 // not found ( == 0 ): plain text
485 // or a character which is inserted as attribute
486 if( 0U == cChar && ';' != nNextCh )
487 {
488 DBG_ASSERT( rInput.Tell() - nStreamPos ==
489 static_cast<sal_uInt64>(nPos+1)*GetCharSize(),
490 "UTF-8 is failing here" );
491 for( sal_Int32 i = nPos-1; i>1; i-- )
492 {
493 nNextCh = sEntityBuffer[i];
494 sEntityBuffer.setLength( i );
495 sEntity = sEntityBuffer.subView(0, i);
496 cChar = GetHTMLCharName( sEntity );
497 if( cChar )
498 {
499 rInput.SeekRel( -static_cast<sal_Int64>
500 (nPos-i)*GetCharSize() );
501 nlLinePos -= sal_uInt32(nPos-i);
502 nPos = i;
503 ClearTxtConvContext();
504 break;
505 }
506 }
507 }
508
509 if( !cChar ) // unknown character?
510 {
511 // back in stream, insert '&'
512 // and restart with next character
513 sTmpBuffer.append( '&' );
514
515 DBG_ASSERT( rInput.Tell()-nStreamPos ==
516 static_cast<sal_uInt64>(nPos+1)*GetCharSize(),
517 "Wrong stream position" );
518 DBG_ASSERT( nlLinePos-nLinePos ==
519 static_cast<sal_uInt32>(nPos+1),
520 "Wrong line position" );
521 rInput.Seek( nStreamPos );
522 nlLinePos = nLinePos;
523 ClearTxtConvContext();
524 break;
525 }
526
527 assert(cChar != 0);
528
529 // 1 == Non Breaking Space
530 // 2 == SoftHyphen
531
532 if (cChar == 1 || cChar == 2)
533 {
534 if( '>' == cBreak )
535 {
536 // When reading the content of a tag we have
537 // to change it to ' ' or '-'
538 if( 1U == cChar )
539 cChar = ' ';
540 else //2U
541 cChar = '-';
542 }
543 else
544 {
545 // If not scanning a tag return token
546 aToken.append( sTmpBuffer );
547 sTmpBuffer.setLength(0);
548
549 if( !aToken.isEmpty() )
550 {
551 // restart with character
552 nNextCh = '&';
553 DBG_ASSERT( rInput.Tell()-nStreamPos ==
554 static_cast<sal_uInt64>(nPos+1)*GetCharSize(),
555 "Wrong stream position" );
556 DBG_ASSERT( nlLinePos-nLinePos ==
557 static_cast<sal_uInt32>(nPos+1),
558 "Wrong line position" );
559 rInput.Seek( nStreamPos );
560 nlLinePos = nLinePos;
561 ClearTxtConvContext();
563 }
564
565 // Hack: _GetNextChar shall not read the
566 // next character
567 if( ';' != nNextCh )
568 aToken.append( " " );
569 if( 1U == cChar )
571 else //2U
573 }
574 }
575 }
576 else
577 nNextCh = 0U;
578 }
579 // &{...};-JavaScript-Macros are not supported any longer.
580 else if( IsParserWorking() )
581 {
582 sTmpBuffer.append( '&' );
583 bNextCh = false;
584 break;
585 }
586
587 bNextCh = (';' == nNextCh);
588 if( cBreak=='>' && (cChar=='\\' || cChar=='\'' ||
589 cChar=='\"' || cChar==' ') )
590 {
591 // ' and " have to be escaped within tags to separate
592 // them from ' and " enclosing options.
593 // \ has to be escaped as well.
594 // Space is protected because it's not a delimiter between
595 // options.
596 sTmpBuffer.append( '\\' );
597 }
598 if( IsParserWorking() )
599 {
600 if( cChar )
601 sTmpBuffer.appendUtf32( cChar );
602 }
603 else if( SvParserState::Pending==eState && '>'!=cBreak )
604 {
605 // Restart with '&', the remainder is returned as
606 // text token.
607 if( !aToken.isEmpty() || !sTmpBuffer.isEmpty() )
608 {
609 // _GetNextChar() returns the previous text and
610 // during the next execution a new character is read.
611 // Thus we have to position in front of the '&'.
612 nNextCh = 0U;
613 rInput.Seek( nStreamPos - GetCharSize() );
614 nlLinePos = nLinePos-1;
615 ClearTxtConvContext();
616 bReadNextChar = true;
617 }
618 bNextCh = false;
619 }
620 }
621 break;
622 case '=':
623 if( '>'==cBreak && !cQuote )
624 bEqSignFound = true;
625 sTmpBuffer.appendUtf32( nNextCh );
626 break;
627
628 case '\\':
629 if( '>'==cBreak )
630 {
631 // mark within tags
632 sTmpBuffer.append( '\\' );
633 }
634 sTmpBuffer.append( '\\' );
635 break;
636
637 case '\"':
638 case '\'':
639 if( '>'==cBreak )
640 {
641 if( bEqSignFound )
642 cQuote = nNextCh;
643 else if( cQuote && (cQuote==nNextCh ) )
644 cQuote = 0U;
645 }
646 sTmpBuffer.appendUtf32( nNextCh );
647 bEqSignFound = false;
648 break;
649
650 case sal_Unicode(EOF):
651 if( rInput.eof() )
652 {
653 bContinue = false;
654 }
655 // else: ignore, not a valid code point
656 break;
657
658 case '<':
659 bEqSignFound = false;
660 if( '>'==cBreak )
661 sTmpBuffer.appendUtf32( nNextCh );
662 else
663 bContinue = false; // break, string is together
664 break;
665
666 case '\f':
667 if( '>' == cBreak )
668 {
669 // If scanning options treat it like a space, ...
670 sTmpBuffer.append( ' ' );
671 }
672 else
673 {
674 // otherwise it's a separate token.
675 bContinue = false;
676 }
677 break;
678
679 case '\r':
680 case '\n':
681 if( '>'==cBreak )
682 {
683 // cr/lf in tag is handled in GetNextToken_()
684 sTmpBuffer.appendUtf32( nNextCh );
685 break;
686 }
687 else if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
688 {
689 bContinue = false;
690 break;
691 }
692 // Reduce sequence of CR/LF/BLANK/TAB to a single blank
693 [[fallthrough]];
694 case '\t':
695 if( '\t'==nNextCh && bReadPRE && '>'!=cBreak )
696 {
697 // Pass Tabs up in <PRE>
698 bContinue = false;
699 break;
700 }
701 [[fallthrough]];
702 case '\x0b':
703 if( '\x0b'==nNextCh && (bReadPRE || bReadXMP ||bReadListing) &&
704 '>'!=cBreak )
705 {
706 break;
707 }
708 nNextCh = ' ';
709 [[fallthrough]];
710 case ' ':
711 sTmpBuffer.appendUtf32( nNextCh );
712 if( '>'!=cBreak && (!bReadListing && !bReadXMP &&
713 !bReadPRE && !bReadTextArea) )
714 {
715 // Reduce sequences of Blanks/Tabs/CR/LF to a single blank
716 do {
717 nNextCh = GetNextChar();
718 if( sal_Unicode(EOF) == nNextCh && rInput.eof() )
719 {
720 if( !aToken.isEmpty() || sTmpBuffer.getLength() > 1 )
721 {
722 // Have seen s.th. aside from blanks?
723 aToken.append( sTmpBuffer );
724 sTmpBuffer.setLength(0);
726 }
727 else
728 // Only read blanks: no text must be returned
729 // and GetNextToken_ has to read until EOF
730 return HtmlTokenId::NONE;
731 }
732 } while ( ' ' == nNextCh || '\t' == nNextCh ||
733 '\r' == nNextCh || '\n' == nNextCh ||
734 '\x0b' == nNextCh );
735 bNextCh = false;
736 }
737 break;
738
739 default:
740 bEqSignFound = false;
741 if (nNextCh == cBreak && !cQuote)
742 bContinue = false;
743 else
744 {
745 do {
746 if (!linguistic::IsControlChar(nNextCh))
747 {
748 // All remaining characters make their way into the text.
749 sTmpBuffer.appendUtf32( nNextCh );
750 }
751
752 nNextCh = GetNextChar();
753 if( ( sal_Unicode(EOF) == nNextCh && rInput.eof() ) ||
754 !IsParserWorking() )
755 {
756 if( !sTmpBuffer.isEmpty() )
757 aToken.append( sTmpBuffer );
759 }
760 } while( rtl::isAsciiAlpha( nNextCh ) || rtl::isAsciiDigit( nNextCh ) );
761 bNextCh = false;
762 }
763 }
764
765 if( bContinue && bNextCh )
766 nNextCh = GetNextChar();
767 }
768
769 if( !sTmpBuffer.isEmpty() )
770 aToken.append( sTmpBuffer );
771
773}
774
776{
777 OUStringBuffer sTmpBuffer( MAX_LEN );
778
779 if( bEndTokenFound )
780 {
781 // During the last execution we already found the end token,
782 // thus we don't have to search it again.
783 bReadScript = false;
784 bReadStyle = false;
785 aEndToken.clear();
786 bEndTokenFound = false;
787
788 return HtmlTokenId::NONE;
789 }
790
791 // Default return value: HtmlTokenId::RAWDATA
792 bool bContinue = true;
794 SaveState( HtmlTokenId::NONE );
795 while( bContinue && IsParserWorking() )
796 {
797 bool bNextCh = true;
798 switch( nNextCh )
799 {
800 case '<':
801 {
802 // Maybe we've reached the end.
803
804 // Save what we have read previously...
805 aToken.append( sTmpBuffer );
806 sTmpBuffer.setLength(0);
807
808 // and remember position in stream.
809 sal_uInt64 nStreamPos = rInput.Tell();
810 sal_uInt32 nLineNr = GetLineNr();
811 sal_uInt32 nLinePos = GetLinePos();
812
813 // Start of an end token?
814 bool bOffState = false;
815 if( '/' == (nNextCh = GetNextChar()) )
816 {
817 bOffState = true;
818 nNextCh = GetNextChar();
819 }
820 else if( '!' == nNextCh )
821 {
822 sTmpBuffer.appendUtf32( nNextCh );
823 nNextCh = GetNextChar();
824 }
825
826 // Read following letters
827 while( (rtl::isAsciiAlpha(nNextCh) || '-'==nNextCh) &&
828 IsParserWorking() && sTmpBuffer.getLength() < MAX_LEN )
829 {
830 sTmpBuffer.appendUtf32( nNextCh );
831 nNextCh = GetNextChar();
832 }
833
834 OUString aTok( sTmpBuffer.toString() );
835 aTok = aTok.toAsciiLowerCase();
836 bool bDone = false;
837 if( bReadScript || !aEndToken.isEmpty() )
838 {
839 if( !bReadComment )
840 {
841 if( aTok.startsWith( OOO_STRING_SVTOOLS_HTML_comment ) )
842 {
843 bReadComment = true;
844 }
845 else
846 {
847 // A script has to end with "</SCRIPT>". But
848 // ">" is optional for security reasons
849 bDone = bOffState &&
852 : aTok == aEndToken );
853 }
854 }
855 if( bReadComment && '>'==nNextCh && aTok.endsWith( "--" ) )
856 {
857 // End of comment of style <!----->
858 bReadComment = false;
859 }
860 }
861 else
862 {
863 // Style sheets can be closed by </STYLE>, </HEAD> or <BODY>
864 if( bOffState )
865 bDone = aTok == OOO_STRING_SVTOOLS_HTML_style ||
867 else
868 bDone = aTok == OOO_STRING_SVTOOLS_HTML_body;
869 }
870
871 if( bDone )
872 {
873 // Done! Return the previously read string (if requested)
874 // and continue.
875
876 bContinue = false;
877
878 // nToken==0 means, GetNextToken_ continues to read
879 if( aToken.isEmpty() && (bReadStyle || bReadScript) )
880 {
881 // Immediately close environment (or context?)
882 // and parse the end token
883 bReadScript = false;
884 bReadStyle = false;
885 aEndToken.clear();
887 }
888 else
889 {
890 // Keep bReadScript/bReadStyle alive
891 // and parse end token during next execution
892 bEndTokenFound = true;
893 }
894
895 // Move backwards in stream to '<'
896 rInput.Seek( nStreamPos );
897 SetLineNr( nLineNr );
898 SetLinePos( nLinePos );
899 ClearTxtConvContext();
900 nNextCh = '<';
901
902 // Don't append string to token.
903 sTmpBuffer.setLength( 0 );
904 }
905 else
906 {
907 // remember "</" , everything else we find in the buffer
908 aToken.append( "<" );
909 if( bOffState )
910 aToken.append( "/" );
911
912 bNextCh = false;
913 }
914 }
915 break;
916 case '-':
917 sTmpBuffer.appendUtf32( nNextCh );
918 if( bReadComment )
919 {
920 bool bTwoMinus = false;
921 nNextCh = GetNextChar();
922 while( '-' == nNextCh && IsParserWorking() )
923 {
924 bTwoMinus = true;
925 sTmpBuffer.appendUtf32( nNextCh );
926 nNextCh = GetNextChar();
927 }
928
929 if( '>' == nNextCh && IsParserWorking() && bTwoMinus )
930 bReadComment = false;
931
932 bNextCh = false;
933 }
934 break;
935
936 case '\r':
937 // \r\n? closes the current text token (even if it's empty)
938 nNextCh = GetNextChar();
939 if( nNextCh=='\n' )
940 nNextCh = GetNextChar();
941 bContinue = false;
942 break;
943 case '\n':
944 // \n closes the current text token (even if it's empty)
945 nNextCh = GetNextChar();
946 bContinue = false;
947 break;
948 case sal_Unicode(EOF):
949 // eof closes the current text token and behaves like having read
950 // an end token
951 if( rInput.eof() )
952 {
953 bContinue = false;
954 if( !aToken.isEmpty() || !sTmpBuffer.isEmpty() )
955 {
956 bEndTokenFound = true;
957 }
958 else
959 {
960 bReadScript = false;
961 bReadStyle = false;
962 aEndToken.clear();
964 }
965 }
966 break;
967 default:
968 if (!linguistic::IsControlChar(nNextCh) || nNextCh == '\t')
969 {
970 // all remaining characters are appended to the buffer
971 sTmpBuffer.appendUtf32( nNextCh );
972 }
973 break;
974 }
975
976 if( !bContinue && !sTmpBuffer.isEmpty() )
977 {
978 aToken.append( sTmpBuffer );
979 sTmpBuffer.setLength(0);
980 }
981
982 if( bContinue && bNextCh )
983 nNextCh = GetNextChar();
984 }
985
986 if( IsParserWorking() )
987 SaveState( HtmlTokenId::NONE );
988 else
990
991 return nToken;
992}
993
994// Scan next token
996{
998 sSaveToken.clear();
999
1001 {
1002 // HtmlTokenId::<TOKEN>_OFF generated for HtmlTokenId::<TOKEN>_ON
1003 nRet = mnPendingOffToken;
1005 aToken.setLength( 0 );
1006 return nRet;
1007 }
1008
1009 // Delete options
1010 maOptions.clear();
1011
1012 if( !IsParserWorking() ) // Don't continue if already an error occurred
1013 return HtmlTokenId::NONE;
1014
1015 bool bReadNextCharSave = bReadNextChar;
1016 if( bReadNextChar )
1017 {
1019 "Read a character despite </SCRIPT> was read?" );
1020 nNextCh = GetNextChar();
1021 if( !IsParserWorking() ) // Don't continue if already an error occurred
1022 return HtmlTokenId::NONE;
1023 bReadNextChar = false;
1024 }
1025
1026 if( bReadScript || bReadStyle || !aEndToken.isEmpty() )
1027 {
1028 nRet = GetNextRawToken();
1029 if( nRet != HtmlTokenId::NONE || !IsParserWorking() )
1030 return nRet;
1031 }
1032
1033 do {
1034 bool bNextCh = true;
1035 switch( nNextCh )
1036 {
1037 case '<':
1038 {
1039 sal_uInt64 nStreamPos = rInput.Tell();
1040 sal_uInt32 nLineNr = GetLineNr();
1041 sal_uInt32 nLinePos = GetLinePos();
1042
1043 bool bOffState = false;
1044 if( '/' == (nNextCh = GetNextChar()) )
1045 {
1046 bOffState = true;
1047 nNextCh = GetNextChar();
1048 }
1049 // Assume '<?' is a start of an XML declaration, ignore it.
1050 if (rtl::isAsciiAlpha(nNextCh) || nNextCh == '!' || nNextCh == '?')
1051 {
1052 OUStringBuffer sTmpBuffer;
1053 do {
1054 sTmpBuffer.appendUtf32( nNextCh );
1055 nNextCh = GetNextChar();
1056 if (std::u16string_view(sTmpBuffer) == u"![CDATA[")
1057 break;
1058 if (bFuzzing && sTmpBuffer.getLength() > 1024)
1059 {
1060 SAL_WARN("svtools", "abandoning import for performance reasons with long tokens");
1061 eState = SvParserState::Error;
1062 break;
1063 }
1064 } while( '>' != nNextCh && '/' != nNextCh && !rtl::isAsciiWhiteSpace( nNextCh ) &&
1065 !linguistic::IsControlChar(nNextCh) &&
1066 IsParserWorking() && !rInput.eof() );
1067
1068 if( !sTmpBuffer.isEmpty() )
1069 {
1070 aToken.append( sTmpBuffer );
1071 sTmpBuffer.setLength(0);
1072 }
1073
1074 // Skip blanks
1075 while( rtl::isAsciiWhiteSpace( nNextCh ) && IsParserWorking() )
1076 nNextCh = GetNextChar();
1077
1078 if( !IsParserWorking() )
1079 {
1080 if( SvParserState::Pending == eState )
1081 bReadNextChar = bReadNextCharSave;
1082 break;
1083 }
1084
1085 // Search token in table:
1086 sSaveToken = aToken;
1087 aToken = aToken.toString().toAsciiLowerCase();
1088
1089 if (!maNamespace.isEmpty() && o3tl::starts_with(aToken, maNamespace))
1090 aToken.remove( 0, maNamespace.getLength());
1091
1092 if( HtmlTokenId::NONE == (nRet = GetHTMLToken( aToken )) )
1093 // Unknown control
1095
1096 // If it's a token which can be switched off...
1097 if( bOffState )
1098 {
1099 if( nRet >= HtmlTokenId::ONOFF_START )
1100 {
1101 // and there is an off token, return off token instead
1102 nRet = static_cast<HtmlTokenId>(static_cast<int>(nRet) + 1);
1103 }
1104 else if( HtmlTokenId::LINEBREAK!=nRet || !maNamespace.isEmpty())
1105 {
1106 // and there is no off token, return unknown token.
1107 // (except for </BR>, that is treated like <BR>)
1108 // No exception for XHTML, though.
1110 }
1111 }
1112
1113 if( nRet == HtmlTokenId::COMMENT )
1114 {
1115 // fix: due to being case sensitive use sSaveToken as start of comment
1116 // and append a blank.
1117 aToken = sSaveToken;
1118 if( '>'!=nNextCh )
1119 aToken.append( " " );
1120 sal_uInt64 nCStreamPos = 0;
1121 sal_uInt32 nCLineNr = 0;
1122 sal_uInt32 nCLinePos = 0;
1123 sal_Int32 nCStrLen = 0;
1124
1125 bool bDone = false;
1126 // Read until closing -->. If not found restart at first >
1127 sTmpBuffer = aToken;
1128 while( !bDone && !rInput.eof() && IsParserWorking() )
1129 {
1130 if( '>'==nNextCh )
1131 {
1132 if( !nCStreamPos )
1133 {
1134 nCStreamPos = rInput.Tell();
1135 nCStrLen = sTmpBuffer.getLength();
1136 nCLineNr = GetLineNr();
1137 nCLinePos = GetLinePos();
1138 }
1139 bDone = sTmpBuffer.getLength() >= 2 && sTmpBuffer[sTmpBuffer.getLength() - 2] == '-' && sTmpBuffer[sTmpBuffer.getLength() - 1] == '-';
1140 if( !bDone )
1141 sTmpBuffer.appendUtf32(nNextCh);
1142 }
1143 else if (!linguistic::IsControlChar(nNextCh)
1144 || nNextCh == '\r' || nNextCh == '\n' || nNextCh == '\t')
1145 {
1146 sTmpBuffer.appendUtf32(nNextCh);
1147 }
1148 if( !bDone )
1149 nNextCh = GetNextChar();
1150 }
1151 aToken = sTmpBuffer;
1152 sTmpBuffer.setLength(0);
1153 if( !bDone && IsParserWorking() && nCStreamPos )
1154 {
1155 rInput.Seek( nCStreamPos );
1156 SetLineNr( nCLineNr );
1157 SetLinePos( nCLinePos );
1158 ClearTxtConvContext();
1159 aToken.truncate(nCStrLen);
1160 nNextCh = '>';
1161 }
1162 }
1163 else if (nRet == HtmlTokenId::CDATA)
1164 {
1165 // Read until the closing ]]>.
1166 bool bDone = false;
1167 while (!bDone && !rInput.eof() && IsParserWorking())
1168 {
1169 if (nNextCh == '>')
1170 {
1171 if (sTmpBuffer.getLength() >= 2)
1172 {
1173 bDone = sTmpBuffer[sTmpBuffer.getLength() - 2] == ']'
1174 && sTmpBuffer[sTmpBuffer.getLength() - 1] == ']';
1175 if (bDone)
1176 {
1177 // Ignore ]] at the end.
1178 sTmpBuffer.setLength(sTmpBuffer.getLength() - 2);
1179 }
1180 }
1181 if (!bDone)
1182 {
1183 sTmpBuffer.appendUtf32(nNextCh);
1184 }
1185 }
1186 else if (!linguistic::IsControlChar(nNextCh))
1187 {
1188 sTmpBuffer.appendUtf32(nNextCh);
1189 }
1190 if (!bDone)
1191 {
1192 nNextCh = GetNextChar();
1193 }
1194 }
1195 aToken = sTmpBuffer;
1196 sTmpBuffer.setLength(0);
1197 }
1198 else
1199 {
1200 // TokenString not needed anymore
1201 aToken.setLength( 0 );
1202 }
1203
1204 // Read until closing '>'
1205 if( '>' != nNextCh && IsParserWorking() )
1206 {
1207 ScanText( '>' );
1208
1209 // fdo#34666 fdo#36080 fdo#36390: closing "/>"?:
1210 // generate pending HtmlTokenId::<TOKEN>_OFF for HtmlTokenId::<TOKEN>_ON
1211 // Do not convert this to a single HtmlTokenId::<TOKEN>_OFF
1212 // which lead to fdo#56772.
1213 if ((nRet >= HtmlTokenId::ONOFF_START) && o3tl::ends_with(aToken, u"/"))
1214 {
1215 mnPendingOffToken = static_cast<HtmlTokenId>(static_cast<int>(nRet) + 1); // HtmlTokenId::<TOKEN>_ON -> HtmlTokenId::<TOKEN>_OFF
1216 aToken.setLength( aToken.getLength()-1 ); // remove trailing '/'
1217 }
1218 if( sal_Unicode(EOF) == nNextCh && rInput.eof() )
1219 {
1220 // Move back in front of < and restart there.
1221 // Return < as text.
1222 rInput.Seek( nStreamPos );
1223 SetLineNr( nLineNr );
1224 SetLinePos( nLinePos );
1225 ClearTxtConvContext();
1226
1227 aToken = "<";
1229 nNextCh = GetNextChar();
1230 bNextCh = false;
1231 break;
1232 }
1233 }
1234 if( SvParserState::Pending == eState )
1235 bReadNextChar = bReadNextCharSave;
1236 }
1237 else
1238 {
1239 if( bOffState )
1240 {
1241 // simply throw away everything
1242 ScanText( '>' );
1243 if( sal_Unicode(EOF) == nNextCh && rInput.eof() )
1244 {
1245 // Move back in front of < and restart there.
1246 // Return < as text.
1247 rInput.Seek( nStreamPos );
1248 SetLineNr( nLineNr );
1249 SetLinePos( nLinePos );
1250 ClearTxtConvContext();
1251
1252 aToken = "<";
1254 nNextCh = GetNextChar();
1255 bNextCh = false;
1256 break;
1257 }
1258 if( SvParserState::Pending == eState )
1259 bReadNextChar = bReadNextCharSave;
1260 aToken.setLength( 0 );
1261 }
1262 else if( '%' == nNextCh )
1263 {
1265
1266 sal_uInt64 nCStreamPos = rInput.Tell();
1267 sal_uInt32 nCLineNr = GetLineNr(), nCLinePos = GetLinePos();
1268
1269 bool bDone = false;
1270 // Read until closing %>. If not found restart at first >.
1271 sal_Unicode nLastTokenChar = !aToken.isEmpty() ? aToken[aToken.getLength() - 1] : 0;
1272 OUStringBuffer aTmpBuffer(aToken);
1273 while( !bDone && !rInput.eof() && IsParserWorking() )
1274 {
1275 bDone = '>'==nNextCh && nLastTokenChar == '%';
1276 if( !bDone )
1277 {
1278 aTmpBuffer.appendUtf32(nNextCh);
1279 nLastTokenChar = aTmpBuffer[aTmpBuffer.getLength() - 1];
1280 nNextCh = GetNextChar();
1281 }
1282 }
1283 if( !bDone && IsParserWorking() )
1284 {
1285 rInput.Seek( nCStreamPos );
1286 SetLineNr( nCLineNr );
1287 SetLinePos( nCLinePos );
1288 ClearTxtConvContext();
1289 aToken = "<%";
1291 break;
1292 }
1293 aToken = aTmpBuffer;
1294 aTmpBuffer.setLength(0);
1295 if( IsParserWorking() )
1296 {
1297 sSaveToken = aToken;
1298 aToken.setLength( 0 );
1299 }
1300 }
1301 else
1302 {
1303 aToken = "<";
1305 bNextCh = false;
1306 break;
1307 }
1308 }
1309
1310 if( IsParserWorking() )
1311 {
1312 bNextCh = '>' == nNextCh;
1313 switch( nRet )
1314 {
1316 bReadTextArea = true;
1317 break;
1319 bReadTextArea = false;
1320 break;
1322 if( !bReadTextArea )
1323 bReadScript = true;
1324 break;
1326 if( !bReadTextArea )
1327 {
1328 bReadScript = false;
1329 // JavaScript might modify the stream,
1330 // thus the last character has to be read again.
1331 bReadNextChar = true;
1332 bNextCh = false;
1333 }
1334 break;
1335
1337 bReadStyle = true;
1338 break;
1340 bReadStyle = false;
1341 break;
1342 default: break;
1343 }
1344 }
1345 }
1346 break;
1347
1348 case sal_Unicode(EOF):
1349 if( rInput.eof() )
1350 {
1351 eState = SvParserState::Accepted;
1352 nRet = HtmlTokenId(nNextCh);
1353 }
1354 else
1355 {
1356 // Read normal text.
1357 goto scan_text;
1358 }
1359 break;
1360
1361 case '\f':
1362 // form feeds are passed upwards separately
1363 nRet = HtmlTokenId::LINEFEEDCHAR; // !!! should be FORMFEEDCHAR
1364 break;
1365
1366 case '\n':
1367 case '\r':
1369 {
1370 sal_Unicode c = GetNextChar();
1371 if( ( '\n' != nNextCh || '\r' != c ) &&
1372 ( '\r' != nNextCh || '\n' != c ) )
1373 {
1374 bNextCh = false;
1375 nNextCh = c;
1376 }
1377 nRet = HtmlTokenId::NEWPARA;
1378 break;
1379 }
1380 [[fallthrough]];
1381 case '\t':
1382 if( bReadPRE )
1383 {
1384 nRet = HtmlTokenId::TABCHAR;
1385 break;
1386 }
1387 [[fallthrough]];
1388 case ' ':
1389 [[fallthrough]];
1390 default:
1391
1392scan_text:
1393 // "normal" text to come
1394 nRet = ScanText();
1395 bNextCh = 0 == aToken.getLength();
1396
1397 // the text should be processed
1398 if( !bNextCh && eState == SvParserState::Pending )
1399 {
1400 eState = SvParserState::Working;
1401 bReadNextChar = true;
1402 }
1403
1404 break;
1405 }
1406
1407 if( bNextCh && SvParserState::Working == eState )
1408 {
1409 nNextCh = GetNextChar();
1410 if( SvParserState::Pending == eState && nRet != HtmlTokenId::NONE && HtmlTokenId::TEXTTOKEN != nRet )
1411 {
1412 bReadNextChar = true;
1413 eState = SvParserState::Working;
1414 }
1415 }
1416
1417 } while( nRet == HtmlTokenId::NONE && SvParserState::Working == eState );
1418
1419 if( SvParserState::Pending == eState )
1420 nRet = HtmlTokenId::INVALID; // s.th. invalid
1421
1422 return nRet;
1423}
1424
// Body of HTMLParser::UnescapeToken(); the signature line (listing line 1425)
// is not present in this rendered listing.
//
// Removes backslash escapes from aToken in place. A backslash that is not
// itself escaped is deleted and marks the character that follows it as
// escaped; an escaped backslash is therefore kept as a literal character.
1426{
1427 sal_Int32 nPos=0;
1428
1429 bool bEscape = false;
1430 while( nPos < aToken.getLength() )
1431 {
     // bOldEscape: was the previous character an escaping backslash?
1432 bool bOldEscape = bEscape;
1433 bEscape = false;
1434 if( '\\'==aToken[nPos] && !bOldEscape )
1435 {
     // Delete the escaping backslash. nPos is not advanced because, after
     // the removal, it already addresses the character that followed.
1436 aToken.remove( nPos, 1 );
1437 bEscape = true;
1438 }
1439 else
1440 {
1441 nPos++;
1442 }
1443 }
1444}
1445
// Splits the text in aToken into options (name, optionally followed by
// "=value") and stores them in maOptions.
//
// pNoConvertToken: optional pointer to an option id. It is consulted when
//                  computing bStripCRLF, i.e. it takes part in deciding
//                  whether CR/LF are removed from quoted values.
// Returns a reference to maOptions. If maOptions is already non-empty it is
// returned unchanged without parsing aToken again.
//
// Note that parsing modifies aToken: escape backslashes and (depending on
// bStripCRLF) CR/LF characters inside values are removed from it.
1446const HTMLOptions& HTMLParser::GetOptions( HtmlOptionId const *pNoConvertToken )
1447{
1448 // If the options for the current token have already been returned,
1449 // return them once again.
1450 if (!maOptions.empty())
1451 return maOptions;
1452
1453 sal_Int32 nPos = 0;
1454 while( nPos < aToken.getLength() )
1455 {
1456 // A letter? Option beginning here.
1457 if( rtl::isAsciiAlpha( aToken[nPos] ) )
1458 {
     // NOTE(review): listing line 1459 is missing from this rendering;
     // presumably it declares the local nToken used below - confirm.
1460 OUString aValue;
1461 sal_Int32 nStt = nPos;
1462 sal_Unicode cChar = 0;
1463
1464 // Actually only certain characters allowed.
1465 // Netscape only looks for "=" and white space (c.f.
1466 // Mozilla: PA_FetchRequestedNameValues in libparse/pa_mdl.c)
1467 while( nPos < aToken.getLength() )
1468 {
1469 cChar = aToken[nPos];
1470 if ( '=' == cChar ||!HTML_ISPRINTABLE(cChar) || rtl::isAsciiWhiteSpace(cChar) )
1471 break;
1472 nPos++;
1473 }
1474
     // The option name in its original spelling.
1475 OUString sName( aToken.subView( nStt, nPos-nStt ) );
1476
1477 // PlugIns require original token name. Convert to lower case only for searching.
1478 nToken = GetHTMLOption( sName.toAsciiLowerCase() ); // Name is ready
     // NOTE(review): listing line 1479 (the start of the statement that
     // emits the message below) is missing from this rendering.
1480 "GetOption: unknown HTML option '" << sName << "'" );
     // NOTE(review): listing line 1482 (the middle of this condition) is
     // missing from this rendering. The visible last operand shows that
     // pNoConvertToken, when given, is compared against nToken.
1481 bool bStripCRLF = (nToken < HtmlOptionId::SCRIPT_START ||
1483 (!pNoConvertToken || nToken != *pNoConvertToken);
1484
     // Skip white space and non-printable characters after the name; cChar
     // ends up holding the first character that is neither (if any).
1485 while( nPos < aToken.getLength() )
1486 {
1487 cChar = aToken[nPos];
1488 if ( HTML_ISPRINTABLE(cChar) && !rtl::isAsciiWhiteSpace(cChar) )
1489 break;
1490 nPos++;
1491 }
1492
1493 // Option with value?
1494 if( nPos!=aToken.getLength() && '='==cChar )
1495 {
1496 nPos++;
1497
     // Skip blanks, tabs, CR, LF and non-printable characters after '='.
1498 while( nPos < aToken.getLength() )
1499 {
1500 cChar = aToken[nPos];
1501 if ( HTML_ISPRINTABLE(cChar) && ' ' != cChar && '\t' != cChar && '\r' != cChar && '\n' != cChar )
1502 break;
1503 nPos++;
1504 }
1505
1506 if( nPos != aToken.getLength() )
1507 {
     // nStt/nLen delimit the value inside aToken. nLen only counts the
     // characters that are kept; removed characters are deleted from
     // aToken so that the value stays contiguous.
1508 sal_Int32 nLen = 0;
1509 nStt = nPos;
1510 if( ('"'==cChar) || '\''==cChar )
1511 {
     // Quoted value: read up to the matching, unescaped quote character.
1512 sal_Unicode cEnd = cChar;
1513 nPos++; nStt++;
1514 bool bDone = false;
1515 bool bEscape = false;
1516 while( nPos < aToken.getLength() && !bDone )
1517 {
1518 bool bOldEscape = bEscape;
1519 bEscape = false;
1520 cChar = aToken[nPos];
1521 switch( cChar )
1522 {
1523 case '\r':
1524 case '\n':
     // Line ends inside the value are either dropped or kept.
1525 if( bStripCRLF )
1526 aToken.remove( nPos, 1 );
1527 else
1528 {
1529 nPos++;
1530 nLen++;
1531 }
1532 break;
1533 case '\\':
     // An escaped backslash is kept; an unescaped one is removed and
     // escapes the character that follows it.
1534 if( bOldEscape )
1535 {
1536 nPos++;
1537 nLen++;
1538 }
1539 else
1540 {
1541 aToken.remove( nPos, 1 );
1542 bEscape = true;
1543 }
1544 break;
1545 case '"':
1546 case '\'':
     // Only the unescaped quote character that opened the value ends it;
     // any other quote is part of the value.
1547 bDone = !bOldEscape && cChar==cEnd;
1548 if( !bDone )
1549 {
1550 nPos++;
1551 nLen++;
1552 }
1553 break;
1554 default:
1555 nPos++;
1556 nLen++;
1557 break;
1558 }
1559 }
     // Step over the closing quote unless the end of aToken was reached.
1560 if( nPos!=aToken.getLength() )
1561 nPos++;
1562 }
1563 else
1564 {
1565 // More liberal than the standard: allow all printable characters
1566 bool bEscape = false;
1567 bool bDone = false;
1568 while( nPos < aToken.getLength() && !bDone )
1569 {
1570 bool bOldEscape = bEscape;
1571 bEscape = false;
1572 sal_Unicode c = aToken[nPos];
1573 switch( c )
1574 {
1575 case ' ':
     // A blank ends the value unless it is escaped by a backslash.
1576 bDone = !bOldEscape;
1577 if( !bDone )
1578 {
1579 nPos++;
1580 nLen++;
1581 }
1582 break;
1583
1584 case '\t':
1585 case '\r':
1586 case '\n':
     // Tab and line ends always end an unquoted value.
1587 bDone = true;
1588 break;
1589
1590 case '\\':
1591 if( bOldEscape )
1592 {
1593 nPos++;
1594 nLen++;
1595 }
1596 else
1597 {
1598 aToken.remove( nPos, 1 );
1599 bEscape = true;
1600 }
1601 break;
1602
1603 default:
     // Any other printable character belongs to the value; a
     // non-printable one ends it.
1604 if( HTML_ISPRINTABLE( c ) )
1605 {
1606 nPos++;
1607 nLen++;
1608 }
1609 else
1610 bDone = true;
1611 break;
1612 }
1613 }
1614 }
1615
     // aValue stays empty if no value characters were collected.
1616 if( nLen )
1617 aValue = aToken.subView( nStt, nLen );
1618 }
1619 }
1620
1621 // Token is known and can be saved
1622 maOptions.emplace_back(nToken, sName, aValue);
1623
1624 }
1625 else
1626 // Ignore white space and unexpected characters
1627 nPos++;
1628 }
1629
1630 return maOptions;
1631}
1632
// Body of HTMLParser::FilterPRE( HtmlTokenId nToken ); the signature line
// (listing line 1633) and most of the `case` labels and some statements are
// not present in this rendered listing, so the notes below only describe the
// statements that are visible.
//
// Keeps track of the current column in nPre_LinePos, resets
// bPre_IgnoreNewPara to false before returning, and returns nToken.
1634{
1635 switch( nToken )
1636 {
1637 // in Netscape they only have impact in not empty paragraphs
1640 [[fallthrough]];
     // NOTE(review): the case labels belonging to this group (listing lines
     // 1638-1642) are missing. Whatever they are, the column counter is
     // restarted here.
1643 nPre_LinePos = 0;
1644 if( bPre_IgnoreNewPara )
1646 break;
1647
1649 {
     // Number of columns up to the next multiple of 8 (always 1..8).
1650 sal_Int32 nSpaces = 8 - (nPre_LinePos % 8);
1651 DBG_ASSERT( aToken.isEmpty(), "Why is the token not empty?" );
1652 if (aToken.getLength() < nSpaces)
1653 {
     // Pad the token with ' ' via padToLength, using nSpaces as length.
1655 OUStringBuffer aBuf(aToken);
1656 aToken = padToLength(aBuf, nSpaces, ' ');
1657 }
1658 nPre_LinePos += nSpaces;
1660 }
1661 break;
1662 // Keep those
     // The token text advances the column by its length.
1664 nPre_LinePos += aToken.getLength();
1665 break;
1666
     // NOTE(review): only a few labels of the following case list survived
     // in this rendering; all of them share the single `break` below and
     // thus leave nToken unchanged.
1672 case HtmlTokenId::INPUT:
1676
1677 case HtmlTokenId::IMAGE:
1680 case HtmlTokenId::PARAM:
1681 case HtmlTokenId::EMBED:
1682
1700
1705
1709
1730
1759
1794
1795 break;
1796
1797 // The remainder is treated as an unknown token.
1798 default:
1799 if( nToken != HtmlTokenId::NONE )
1800 {
     // NOTE(review): the right-hand side of this assignment (listing lines
     // 1802-1804) is missing from this rendering.
1801 nToken =
1805 }
1806 break;
1807 }
1808
1809 bPre_IgnoreNewPara = false;
1810
1811 return nToken;
1812}
1813
// Body of HTMLParser::FilterXMP( HtmlTokenId nToken ); the signature line
// (listing line 1814), the `case` labels and a few statements are not present
// in this rendered listing.
//
// For tokens handled by the default branch, the tag is turned back into its
// textual form: aToken becomes "<" or "</" + sSaveToken, followed by a blank
// and the unescaped former content of aToken (if any), followed by ">".
// bPre_IgnoreNewPara is reset to false before returning nToken.
1815{
1816 switch( nToken )
1817 {
1819 if( bPre_IgnoreNewPara )
1821 [[fallthrough]];
1825 break; // kept
1826
1827 default:
1828 if( nToken != HtmlTokenId::NONE )
1829 {
     // NOTE(review): the condition choosing between "</" and "<" (listing
     // line 1830) is missing from this rendering.
1831 {
1832 sSaveToken = "</" + sSaveToken;
1833 }
1834 else
1835 sSaveToken = "<" + sSaveToken;
1836 if( !aToken.isEmpty() )
1837 {
     // Put the tag name in front of the tag's remaining (unescaped) text.
1838 UnescapeToken();
1839 sSaveToken += " ";
1840 aToken.insert(0, sSaveToken);
1841 }
1842 else
1843 aToken = sSaveToken;
1844 aToken.append( ">" );
     // NOTE(review): listing line 1845 is missing from this rendering.
1846 }
1847 break;
1848 }
1849
1850 bPre_IgnoreNewPara = false;
1851
1852 return nToken;
1853}
1854
// Body of HTMLParser::FilterListing( HtmlTokenId nToken ); the signature line
// (listing line 1855), the `case` labels and a few statements are not present
// in this rendered listing.
//
// Tokens of the first case group are kept as they are; in the default branch
// every token other than HtmlTokenId::NONE is replaced (the replacement
// expression is not visible here). bPre_IgnoreNewPara is reset to false
// before returning nToken.
1856{
1857 switch( nToken )
1858 {
1860 if( bPre_IgnoreNewPara )
1862 [[fallthrough]];
1866 break; // kept
1867
1868 default:
1869 if( nToken != HtmlTokenId::NONE )
1870 {
     // NOTE(review): the right-hand side of this assignment (listing lines
     // 1872-1874) is missing from this rendering.
1871 nToken =
1875 }
1876 break;
1877 }
1878
1879 bPre_IgnoreNewPara = false;
1880
1881 return nToken;
1882}
1883
// Body of HTMLParser::InternalImgToPrivateURL( OUString& rURL ); the
// signature line (listing line 1884) and the statements inside the `case`
// branches are not present in this rendered listing.
//
// Only URLs starting with OOO_STRING_SVTOOLS_HTML_internal_icon are
// considered. If bFound ends up true, rURL is rewritten so that its former
// content is appended to a new prefix. Returns bFound; rURL is left
// unchanged when false is returned.
1885{
1886 bool bFound = false;
1887
1888 if( rURL.startsWith( OOO_STRING_SVTOOLS_HTML_internal_icon ) )
1889 {
     // 14 is presumably the length of OOO_STRING_SVTOOLS_HTML_internal_icon,
     // so that aName is the part after the prefix - TODO confirm.
1890 OUString aName( rURL.copy(14) );
     // NOTE(review): aName[0] is read without checking that aName is
     // non-empty; confirm what happens when rURL consists of the prefix only.
1891 switch( aName[0] )
1892 {
     // NOTE(review): the statement of each branch (listing lines 1894,
     // 1897, 1900, 1903, 1906) is missing; presumably each one sets bFound
     // by comparing aName with one known icon name - confirm.
1893 case 'b':
1895 break;
1896 case 'd':
1898 break;
1899 case 'e':
1901 break;
1902 case 'i':
1904 break;
1905 case 'n':
1907 break;
1908 }
1909 }
1910 if( bFound )
1911 {
     // Keep the old URL so that it can be appended to the new prefix.
     // NOTE(review): listing line 1913, presumably the assignment of the
     // new prefix to rURL, is missing from this rendering.
1912 OUString sTmp ( rURL );
1914 rURL += sTmp;
1915 }
1916
1917 return bFound;
1918}
1919
1920namespace {
1921
// Kinds of <META> entries distinguished by this file. NONE (0) is the value
// used when no known name was recognised.
1922enum class HtmlMeta {
1923 NONE = 0,
1924 Author,
1925 Description,
1926 Keywords,
1927 Refresh,
1928 Classification,
1929 Created,
1930 ChangedBy,
1931 Changed,
1932 Generator,
1933 SDFootnote,
1934 SDEndnote,
     // NOTE(review): listing line 1935 is missing from this rendering.
     // HtmlMeta::ContentType is used elsewhere in this file and is presumably
     // declared on that line - confirm.
1936};
1937
1938}
1939
1940// <META NAME=xxx>
// Initializer of aHTMLMetaNameTable; the declaration line (listing line 1941)
// is not present in this rendered listing.
// Each entry pairs a META name string with the HtmlMeta value it stands for;
// the table ends with a { nullptr, HtmlMeta(0) } entry.
1942{
1943 { OOO_STRING_SVTOOLS_HTML_META_author, HtmlMeta::Author },
1944 { OOO_STRING_SVTOOLS_HTML_META_changed, HtmlMeta::Changed },
1945 { OOO_STRING_SVTOOLS_HTML_META_changedby, HtmlMeta::ChangedBy },
1946 { OOO_STRING_SVTOOLS_HTML_META_classification,HtmlMeta::Classification},
1947 { OOO_STRING_SVTOOLS_HTML_META_content_type, HtmlMeta::ContentType },
1948 { OOO_STRING_SVTOOLS_HTML_META_created, HtmlMeta::Created },
1949 { OOO_STRING_SVTOOLS_HTML_META_description, HtmlMeta::Description },
1950 { OOO_STRING_SVTOOLS_HTML_META_keywords, HtmlMeta::Keywords },
1951 { OOO_STRING_SVTOOLS_HTML_META_generator, HtmlMeta::Generator },
1952 { OOO_STRING_SVTOOLS_HTML_META_refresh, HtmlMeta::Refresh },
1953 { OOO_STRING_SVTOOLS_HTML_META_sdendnote, HtmlMeta::SDEndnote },
1954 { OOO_STRING_SVTOOLS_HTML_META_sdfootnote, HtmlMeta::SDFootnote },
1955 { nullptr, HtmlMeta(0) }
1956};
1957
1958
// Hook for a user-defined META name. This implementation ignores its
// argument and does nothing.
1959void HTMLParser::AddMetaUserDefined( OUString const & )
1960{
1961}
1962
// Parameter list and body of HTMLParser::ParseMetaOptionsImpl(); the line
// carrying the function name (listing line 1963) and several `case` labels
// are not present in this rendered listing.
//
// Evaluates the options of one <META> tag.
//   i_xDocProps    document properties to update; nothing is written to them
//                  if the reference is empty
//   i_pHTTPHeader  may be null; otherwise entries flagged by bHTTPEquiv are
//                  appended to it as key/value pairs
//   aOptions       the options to evaluate
//   o_rEnc         assigned only if the options specify an encoding
// Returns true if a document property was set.
1964 const uno::Reference<document::XDocumentProperties> & i_xDocProps,
1965 SvKeyValueIterator *i_pHTTPHeader,
1966 const HTMLOptions& aOptions,
1967 rtl_TextEncoding& o_rEnc )
1968{
1969 OUString aName, aContent;
1970 HtmlMeta nAction = HtmlMeta::NONE;
1971 bool bHTTPEquiv = false, bChanged = false;
1972
     // The options are visited from the last one to the first one.
1973 for ( size_t i = aOptions.size(); i; )
1974 {
1975 const HTMLOption& aOption = aOptions[--i];
1976 switch ( aOption.GetToken() )
1977 {
1978 case HtmlOptionId::NAME:
     // NAME only determines the action if none has been determined yet.
1979 aName = aOption.GetString();
1980 if ( HtmlMeta::NONE==nAction )
1981 {
1982 aOption.GetEnum( nAction, aHTMLMetaNameTable );
1983 }
1984 break;
     // NOTE(review): the case label (listing line 1985) is missing; judging
     // by bHTTPEquiv this is presumably the HTTP-EQUIV option - confirm.
     // Unlike NAME, it asks GetEnum for the action unconditionally.
1986 aName = aOption.GetString();
1987 aOption.GetEnum( nAction, aHTMLMetaNameTable );
1988 bHTTPEquiv = true;
1989 break;
     // NOTE(review): the case label (listing line 1990) is missing.
1991 aContent = aOption.GetString();
1992 break;
     // NOTE(review): the case label (listing line 1993) is missing. The
     // option's string is taken as a MIME charset name and converted to a
     // text encoding.
1994 {
1995 OString sValue(OUStringToOString(aOption.GetString(), RTL_TEXTENCODING_ASCII_US));
1996 o_rEnc = GetExtendedCompatibilityTextEncoding(rtl_getTextEncodingFromMimeCharset(sValue.getStr()));
1997 break;
1998 }
1999 default: break;
2000 }
2001 }
2002
2003 if ( bHTTPEquiv || HtmlMeta::Description != nAction )
2004 {
2005 // if it is not a Description, remove CRs and LFs from CONTENT
2006 aContent = aContent.replaceAll("\r", "").replaceAll("\n", "");
2007 }
2008 else
2009 {
2010 // convert line endings for Description
2011 aContent = convertLineEnd(aContent, GetSystemLineEnd());
2012 }
2013
2014 if ( bHTTPEquiv && i_pHTTPHeader )
2015 {
2016 // Netscape seems to just ignore a closing ", so we do too
2017 if ( aContent.endsWith("\"") )
2018 {
2019 aContent = aContent.copy( 0, aContent.getLength() - 1 );
2020 }
2021 SvKeyValue aKeyValue( aName, aContent );
2022 i_pHTTPHeader->Append( aKeyValue );
2023 }
2024
     // Apply the content according to the recognised kind of META entry.
2025 switch ( nAction )
2026 {
2027 case HtmlMeta::Author:
2028 if (i_xDocProps.is()) {
2029 i_xDocProps->setAuthor( aContent );
2030 bChanged = true;
2031 }
2032 break;
2033 case HtmlMeta::Description:
2034 if (i_xDocProps.is()) {
2035 i_xDocProps->setDescription( aContent );
2036 bChanged = true;
2037 }
2038 break;
2039 case HtmlMeta::Keywords:
2040 if (i_xDocProps.is()) {
2041 i_xDocProps->setKeywords(
2042 ::comphelper::string::convertCommaSeparated(aContent));
2043 bChanged = true;
2044 }
2045 break;
2046 case HtmlMeta::Classification:
     // The classification is stored as the document's subject.
2047 if (i_xDocProps.is()) {
2048 i_xDocProps->setSubject( aContent );
2049 bChanged = true;
2050 }
2051 break;
2052
2053 case HtmlMeta::ChangedBy:
2054 if (i_xDocProps.is()) {
2055 i_xDocProps->setModifiedBy( aContent );
2056 bChanged = true;
2057 }
2058 break;
2059
2060 case HtmlMeta::Created:
2061 case HtmlMeta::Changed:
2062 if (i_xDocProps.is() && !aContent.isEmpty())
2063 {
2064 ::util::DateTime uDT;
2065 bool valid = false;
     // Two formats are accepted: two ';'-separated integers (the first is
     // passed to Date, the second to tools::Time), or otherwise whatever
     // utl::ISO8601parseDateTime accepts.
2066 if (comphelper::string::getTokenCount(aContent, ';') == 2)
2067 {
2068 sal_Int32 nIdx{ 0 };
2069 Date aDate(o3tl::toInt32(o3tl::getToken(aContent, 0, ';', nIdx)));
2070 auto nTime = o3tl::toInt64(o3tl::getToken(aContent, 0, ';', nIdx));
     // A negative time value is replaced by its sign-toggled value.
2071 if (nTime < 0)
2072 nTime = o3tl::saturating_toggle_sign(nTime);
2073 tools::Time aTime(nTime);
2074 DateTime aDateTime(aDate, aTime);
2075 uDT = aDateTime.GetUNODateTime();
2076 valid = true;
2077 }
2078 else if (utl::ISO8601parseDateTime(aContent, uDT))
2079 valid = true;
2080
2081 if (valid)
2082 {
2083 bChanged = true;
2084 if (HtmlMeta::Created == nAction)
2085 i_xDocProps->setCreationDate(uDT);
2086 else
2087 i_xDocProps->setModificationDate(uDT);
2088 }
2089 }
2090 break;
2091
2092 case HtmlMeta::Refresh:
     // Nothing is stored here; the assertion only flags the case in which
     // the entry could not be appended to i_pHTTPHeader above.
2093 DBG_ASSERT( !bHTTPEquiv || i_pHTTPHeader, "Lost Reload-URL because of omitted MUST change." );
2094 break;
2095
2096 case HtmlMeta::ContentType:
2097 if ( !aContent.isEmpty() )
2098 {
2099 o_rEnc = GetEncodingByMIME( aContent );
2100 }
2101 break;
2102
2103 case HtmlMeta::NONE:
     // No known name: unless flagged by bHTTPEquiv, the entry is added as a
     // removable user-defined document property named aName.
2104 if ( !bHTTPEquiv )
2105 {
2106 if (i_xDocProps.is())
2107 {
     // NOTE(review): xUDProps is used without an is() check - confirm that
     // getUserDefinedProperties() cannot return an empty reference.
2108 uno::Reference<beans::XPropertyContainer> xUDProps
2109 = i_xDocProps->getUserDefinedProperties();
2110 try {
2111 xUDProps->addProperty(aName,
2112 beans::PropertyAttribute::REMOVABLE,
2113 uno::Any(aContent));
     // NOTE(review): listing line 2114 is missing from this rendering.
2115 bChanged = true;
2116 } catch (uno::Exception &) {
2117 // ignore
2118 }
2119 }
2120 }
2121 break;
2122 default:
2123 break;
2124 }
2125
2126 return bChanged;
2127}
2128
// Parameter list and body of HTMLParser::ParseMetaOptions(); the line
// carrying the function name (listing line 2129) is not present in this
// rendered listing.
//
// Parses the options of the current token with GetOptions(), evaluates them
// with ParseMetaOptionsImpl() and, if they specify a usable encoding, makes
// it the source encoding. Returns the result of ParseMetaOptionsImpl().
2130 const uno::Reference<document::XDocumentProperties> & i_xDocProps,
2131 SvKeyValueIterator *i_pHeader )
2132{
     // CONTENT is handed to GetOptions() as its pNoConvertToken argument.
2133 HtmlOptionId nContentOption = HtmlOptionId::CONTENT;
2134 rtl_TextEncoding eEnc = RTL_TEXTENCODING_DONTKNOW;
2135
2136 bool bRet = ParseMetaOptionsImpl( i_xDocProps, i_pHeader,
2137 GetOptions(&nContentOption),
2138 eEnc );
2139
2140 // If the encoding is set by a META tag, it may only overwrite the
2141 // current encoding if both, the current and the new encoding, are octet
2142 // (single-byte) encodings. Everything else cannot lead to reasonable results.
2143 if (RTL_TEXTENCODING_DONTKNOW != eEnc &&
2144 rtl_isOctetTextEncoding( eEnc ) &&
2145 rtl_isOctetTextEncoding( GetSrcEncoding() ) )
2146 {
     // NOTE(review): listing line 2147 is missing from this rendering.
2148 SetSrcEncoding( eEnc );
2149 }
2150
2151 return bRet;
2152}
2153
// Determines a text encoding from the "charset" parameter of a MIME content
// type string.
//
// rMime: the content type string to parse.
// Returns the encoding obtained by passing the charset value through
// rtl_getTextEncodingFromMimeCharset() and
// GetExtendedCompatibilityTextEncoding(), or RTL_TEXTENCODING_DONTKNOW if
// rMime cannot be parsed or carries no "charset" parameter.
2154rtl_TextEncoding HTMLParser::GetEncodingByMIME( const OUString& rMime )
2155{
2156 OUString sType;
2157 OUString sSubType;
2158 INetContentTypeParameterList aParameters;
     // sType and sSubType are required by parse() but not used afterwards.
2159 if (INetContentTypes::parse(rMime, sType, sSubType, &aParameters))
2160 {
2161 auto const iter = aParameters.find("charset");
2162 if (iter != aParameters.end())
2163 {
2164 const INetContentTypeParameter * pCharset = &iter->second;
     // The charset name is converted to an ASCII byte string for the lookup.
2165 OString sValue(OUStringToOString(pCharset->m_sValue, RTL_TEXTENCODING_ASCII_US));
2166 return GetExtendedCompatibilityTextEncoding( rtl_getTextEncodingFromMimeCharset( sValue.getStr() ) );
2167 }
2168 }
2169 return RTL_TEXTENCODING_DONTKNOW;
2170}
2171
// Body of HTMLParser::GetEncodingByHttpHeader( SvKeyValueIterator *pHTTPHeader );
// the signature line (listing line 2172) is not present in this rendered
// listing.
//
// Walks all entries of pHTTPHeader (if it is non-null) and looks at those
// whose key equals OOO_STRING_SVTOOLS_HTML_META_content_type, ignoring ASCII
// case, and whose value is not empty. Returns eRet, which is
// RTL_TEXTENCODING_DONTKNOW unless such an entry changed it.
2173{
2174 rtl_TextEncoding eRet = RTL_TEXTENCODING_DONTKNOW;
2175 if( pHTTPHeader )
2176 {
2177 SvKeyValue aKV;
     // The loop does not stop at the first match; every entry is visited.
2178 for( bool bCont = pHTTPHeader->GetFirst( aKV ); bCont;
2179 bCont = pHTTPHeader->GetNext( aKV ) )
2180 {
2181 if( aKV.GetKey().equalsIgnoreAsciiCase( OOO_STRING_SVTOOLS_HTML_META_content_type ) )
2182 {
2183 if( !aKV.GetValue().isEmpty() )
2184 {
     // NOTE(review): listing line 2185 is missing from this rendering;
     // presumably it assigns eRet from the entry's value - confirm.
2186 }
2187 }
2188 }
2189 }
2190 return eRet;
2191}
2192
// Body of HTMLParser::SetEncodingByHTTPHeader( SvKeyValueIterator *pHTTPHeader );
// the signature line (listing line 2193) is not present in this rendered
// listing.
//
// Asks GetEncodingByHttpHeader() for an encoding and, if the answer is not
// RTL_TEXTENCODING_DONTKNOW, makes it the source encoding.
// Returns true if the source encoding was set, false otherwise.
2194{
2195 bool bRet = false;
2196 rtl_TextEncoding eEnc = HTMLParser::GetEncodingByHttpHeader( pHTTPHeader );
2197 if(RTL_TEXTENCODING_DONTKNOW != eEnc)
2198 {
2199 SetSrcEncoding( eEnc );
2200 bRet = true;
2201 }
2202 return bRet;
2203}
2204
2205
2206/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
OptionalString sType
void SetGreen(sal_uInt8 nGreen)
void SetRed(sal_uInt8 nRed)
void SetBlue(sal_uInt8 nBlue)
css::util::DateTime GetUNODateTime() const
Representation of an HTML option (=attribute in a start tag).
Definition: parhtml.hxx:88
HtmlOptionId GetToken() const
Definition: parhtml.hxx:98
HTMLTableFrame GetTableFrame() const
Definition: parhtml.cxx:205
void GetNumbers(std::vector< sal_uInt32 > &rNumbers) const
Definition: parhtml.cxx:131
const OUString & GetString() const
Definition: parhtml.hxx:102
HtmlOptionId nToken
Definition: parhtml.hxx:91
sal_Int32 GetSNumber() const
Definition: parhtml.cxx:122
EnumT GetEnum(const HTMLOptionEnum< EnumT > *pOptEnums, EnumT nDflt=static_cast< EnumT >(0)) const
Definition: parhtml.hxx:110
HTMLInputType GetInputType() const
Definition: parhtml.cxx:199
void GetColor(Color &) const
Definition: parhtml.cxx:161
OUString aValue
Definition: parhtml.hxx:89
HTMLOption(HtmlOptionId nTyp, OUString aToken, OUString aValue)
Definition: parhtml.cxx:99
sal_uInt32 GetNumber() const
Definition: parhtml.cxx:109
HTMLTableRules GetTableRules() const
Definition: parhtml.cxx:211
HtmlTokenId mnPendingOffToken
OFF token pending for a <XX.../> ON/OFF ON token.
Definition: parhtml.hxx:167
bool bReadPRE
Definition: parhtml.hxx:155
void SetNamespace(std::u16string_view rNamespace)
Definition: parhtml.cxx:242
HtmlTokenId FilterXMP(HtmlTokenId nToken)
Definition: parhtml.cxx:1814
virtual ~HTMLParser() override
Definition: parhtml.cxx:238
bool bReadXMP
Definition: parhtml.hxx:154
HtmlTokenId FilterListing(HtmlTokenId nToken)
Definition: parhtml.cxx:1855
void StartPRE()
Definition: parhtml.hxx:272
static rtl_TextEncoding GetEncodingByHttpHeader(SvKeyValueIterator *pHTTPHeader)
Definition: parhtml.cxx:2172
bool bReadComment
Definition: parhtml.hxx:163
bool bIsInHeader
Definition: parhtml.hxx:152
bool bReadNextChar
Definition: parhtml.hxx:162
virtual void AddMetaUserDefined(OUString const &i_rMetaName)
template method: called when ParseMetaOptions adds a user-defined meta
Definition: parhtml.cxx:1959
void FinishPRE()
Definition: parhtml.hxx:205
static rtl_TextEncoding GetEncodingByMIME(const OUString &rMime)
Definition: parhtml.cxx:2154
virtual SvParserState CallParser() override
Definition: parhtml.cxx:269
HtmlTokenId FilterPRE(HtmlTokenId nToken)
Definition: parhtml.cxx:1633
virtual bool ParseMetaOptions(const css::uno::Reference< css::document::XDocumentProperties > &, SvKeyValueIterator *)
overriding method must call this implementation!
Definition: parhtml.cxx:2129
void StartXMP()
Definition: parhtml.hxx:286
HtmlTokenId GetNextRawToken()
Definition: parhtml.cxx:775
bool SetEncodingByHTTPHeader(SvKeyValueIterator *pHTTPHeader)
Definition: parhtml.cxx:2193
void FinishXMP()
Definition: parhtml.hxx:213
bool bReadStyle
Definition: parhtml.hxx:158
virtual void Continue(HtmlTokenId nToken) override
Definition: parhtml.cxx:285
bool bReadTextArea
Definition: parhtml.hxx:156
bool bReadListing
Definition: parhtml.hxx:153
OUString maNamespace
XML namespace, in case of XHTML.
Definition: parhtml.hxx:172
void FinishListing()
Definition: parhtml.hxx:209
bool bReadScript
Definition: parhtml.hxx:157
void UnescapeToken()
Definition: parhtml.cxx:1425
void StartListing()
Definition: parhtml.hxx:279
HtmlTokenId ScanText(const sal_Unicode cBreak=0U)
Definition: parhtml.cxx:382
OUString sSaveToken
Definition: parhtml.hxx:175
sal_uInt32 nPre_LinePos
Definition: parhtml.hxx:165
bool bEndTokenFound
Definition: parhtml.hxx:159
HtmlTokenId FilterToken(HtmlTokenId nToken)
Definition: parhtml.cxx:305
OUString aEndToken
Definition: parhtml.hxx:169
virtual HtmlTokenId GetNextToken_() override
Definition: parhtml.cxx:995
bool ParseMetaOptionsImpl(const css::uno::Reference< css::document::XDocumentProperties > &, SvKeyValueIterator *, const HTMLOptions &, rtl_TextEncoding &rEnc)
parse meta options into XDocumentProperties and encoding
Definition: parhtml.cxx:1963
static bool InternalImgToPrivateURL(OUString &rURL)
Definition: parhtml.cxx:1884
const HTMLOptions & GetOptions(HtmlOptionId const *pNoConvertToken=nullptr)
Definition: parhtml.cxx:1446
HTMLOptions maOptions
Definition: parhtml.hxx:149
HTMLParser(SvStream &rIn, bool bReadNewDoc=true)
Definition: parhtml.cxx:217
bool bPre_IgnoreNewPara
Definition: parhtml.hxx:161
static bool parse(OUString const &rMediaType, OUString &rType, OUString &rSubType, INetContentTypeParameterList *pParameters=nullptr)
virtual void Append(const SvKeyValue &rKeyVal)
Definition: svparser.cxx:668
virtual bool GetFirst(SvKeyValue &rKeyVal)
Operation.
Definition: svparser.cxx:648
virtual bool GetNext(SvKeyValue &rKeyVal)
Definition: svparser.cxx:654
const OUString & GetKey() const
Operation.
Definition: svparser.hxx:184
const OUString & GetValue() const
Definition: svparser.hxx:185
sal_uInt64 Tell() const
bool eof() const
sal_uInt64 Seek(sal_uInt64 nPos)
sal_uInt64 SeekRel(sal_Int64 nPos)
#define DBG_ASSERT(sCon, aError)
float u
FastSaxParserImpl & m_rParser
OUString sName
HtmlTokenId GetHTMLToken(std::u16string_view rName)
Definition: htmlkywd.cxx:182
sal_Unicode GetHTMLCharName(std::u16string_view rName)
Definition: htmlkywd.cxx:468
HtmlOptionId GetHTMLOption(std::u16string_view rName)
Definition: htmlkywd.cxx:640
sal_uInt32 GetHTMLColor(const OUString &rName)
Definition: htmlkywd.cxx:803
#define OOO_STRING_SVTOOLS_HTML_IT_checkbox
Definition: htmlkywd.hxx:565
#define OOO_STRING_SVTOOLS_HTML_head
Definition: htmlkywd.hxx:90
#define OOO_STRING_SVTOOLS_HTML_META_changed
Definition: htmlkywd.hxx:640
#define OOO_STRING_SVTOOLS_HTML_IT_password
Definition: htmlkywd.hxx:564
#define OOO_STRING_SVTOOLS_HTML_body
Definition: htmlkywd.hxx:62
#define OOO_STRING_SVTOOLS_HTML_comment
Definition: htmlkywd.hxx:33
#define OOO_STRING_SVTOOLS_HTML_IT_button
Definition: htmlkywd.hxx:574
#define OOO_STRING_SVTOOLS_HTML_TR_none
Definition: htmlkywd.hxx:588
#define OOO_STRING_SVTOOLS_HTML_INT_ICON_notfound
Definition: htmlkywd.hxx:671
#define OOO_STRING_SVTOOLS_HTML_TR_groups
Definition: htmlkywd.hxx:589
#define OOO_STRING_SVTOOLS_HTML_TF_void
Definition: htmlkywd.hxx:577
#define OOO_STRING_SVTOOLS_HTML_META_sdfootnote
Definition: htmlkywd.hxx:646
#define OOO_STRING_SVTOOLS_HTML_IT_reset
Definition: htmlkywd.hxx:573
#define OOO_STRING_SVTOOLS_HTML_IT_submit
Definition: htmlkywd.hxx:571
#define OOO_STRING_SVTOOLS_HTML_IT_file
Definition: htmlkywd.hxx:569
#define OOO_STRING_SVTOOLS_HTML_style
Definition: htmlkywd.hxx:120
#define OOO_STRING_SVTOOLS_HTML_META_sdendnote
Definition: htmlkywd.hxx:645
#define OOO_STRING_SVTOOLS_HTML_TR_rows
Definition: htmlkywd.hxx:590
#define OOO_STRING_SVTOOLS_HTML_META_created
Definition: htmlkywd.hxx:642
#define OOO_STRING_SVTOOLS_HTML_IT_image
Definition: htmlkywd.hxx:572
#define OOO_STRING_SVTOOLS_HTML_TR_all
Definition: htmlkywd.hxx:592
#define OOO_STRING_SVTOOLS_HTML_IT_hidden
Definition: htmlkywd.hxx:570
#define OOO_STRING_SVTOOLS_HTML_TF_below
Definition: htmlkywd.hxx:579
#define OOO_STRING_SVTOOLS_HTML_META_refresh
Definition: htmlkywd.hxx:634
#define OOO_STRING_SVTOOLS_HTML_private_image
Definition: htmlkywd.hxx:665
#define OOO_STRING_SVTOOLS_HTML_TR_cols
Definition: htmlkywd.hxx:591
#define OOO_STRING_SVTOOLS_HTML_INT_ICON_embed
Definition: htmlkywd.hxx:669
#define OOO_STRING_SVTOOLS_HTML_TF_above
Definition: htmlkywd.hxx:578
#define OOO_STRING_SVTOOLS_HTML_TF_lhs
Definition: htmlkywd.hxx:581
#define OOO_STRING_SVTOOLS_HTML_INT_ICON_baddata
Definition: htmlkywd.hxx:667
#define OOO_STRING_SVTOOLS_HTML_TF_vsides
Definition: htmlkywd.hxx:583
#define OOO_STRING_SVTOOLS_HTML_INT_ICON_insecure
Definition: htmlkywd.hxx:670
#define OOO_STRING_SVTOOLS_HTML_TF_border
Definition: htmlkywd.hxx:585
#define OOO_STRING_SVTOOLS_HTML_IT_text
Definition: htmlkywd.hxx:563
#define OOO_STRING_SVTOOLS_HTML_internal_icon
Definition: htmlkywd.hxx:666
#define OOO_STRING_SVTOOLS_HTML_META_keywords
Definition: htmlkywd.hxx:639
#define OOO_STRING_SVTOOLS_HTML_IT_scribble
Definition: htmlkywd.hxx:568
#define OOO_STRING_SVTOOLS_HTML_META_description
Definition: htmlkywd.hxx:638
#define OOO_STRING_SVTOOLS_HTML_META_author
Definition: htmlkywd.hxx:636
#define OOO_STRING_SVTOOLS_HTML_script
Definition: htmlkywd.hxx:113
#define OOO_STRING_SVTOOLS_HTML_TF_hsides
Definition: htmlkywd.hxx:580
#define OOO_STRING_SVTOOLS_HTML_META_changedby
Definition: htmlkywd.hxx:641
#define OOO_STRING_SVTOOLS_HTML_INT_ICON_delayed
Definition: htmlkywd.hxx:668
#define OOO_STRING_SVTOOLS_HTML_META_content_type
Definition: htmlkywd.hxx:643
#define OOO_STRING_SVTOOLS_HTML_IT_radio
Definition: htmlkywd.hxx:566
#define OOO_STRING_SVTOOLS_HTML_IT_range
Definition: htmlkywd.hxx:567
#define OOO_STRING_SVTOOLS_HTML_META_classification
Definition: htmlkywd.hxx:637
#define OOO_STRING_SVTOOLS_HTML_META_generator
Definition: htmlkywd.hxx:635
#define OOO_STRING_SVTOOLS_HTML_TF_rhs
Definition: htmlkywd.hxx:582
#define OOO_STRING_SVTOOLS_HTML_TF_box
Definition: htmlkywd.hxx:584
HtmlOptionId
Definition: htmltokn.h:301
HtmlTokenId
Definition: htmltokn.h:46
constexpr bool isOffToken(HtmlTokenId nToken)
Definition: htmltokn.h:284
std::unordered_map< OString, INetContentTypeParameter > INetContentTypeParameterList
OUString aName
LineEnd GetSystemLineEnd()
TOOLS_DLLPUBLIC OString convertLineEnd(const OString &rIn, LineEnd eLineEnd)
sal_uInt16 nPos
#define SAL_WARN_IF(condition, area, stream)
#define SAL_WARN(area, stream)
aBuf
NONE
B & padToLength(B &rBuffer, sal_Int32 nLen, U cFill)
OStringBuffer & padToLength(OStringBuffer &rBuffer, sal_Int32 nLength, char cFill='\0')
OString stripStart(const OString &rIn, char c)
sal_Int32 getTokenCount(std::string_view rIn, char cTok)
int i
bool IsControlChar(sal_Unicode cChar)
std::enable_if< std::is_signed< T >::value, T >::type saturating_toggle_sign(T a)
constexpr bool ends_with(std::basic_string_view< charT, traits > sv, std::basic_string_view< charT, traits > x) noexcept
sal_Int32 toInt32(std::u16string_view str, sal_Int16 radix=10)
sal_Int64 toInt64(std::u16string_view str, sal_Int16 radix=10)
constexpr bool starts_with(std::basic_string_view< charT, traits > sv, std::basic_string_view< charT, traits > x) noexcept
std::basic_string_view< charT, traits > getToken(std::basic_string_view< charT, traits > sv, charT delimiter, std::size_t &position)
OString OUStringToOString(std::u16string_view str, ConnectionSettings const *settings)
ContentType
bool ISO8601parseDateTime(std::u16string_view rString, css::util::DateTime &rDateTime)
HTMLOptionEnum< HTMLInputType > const aInputTypeOptEnums[]
Definition: parhtml.cxx:55
HTMLOptionEnum< HTMLTableRules > const aTableRulesOptEnums[]
Definition: parhtml.cxx:88
HTMLOptionEnum< HTMLTableFrame > const aTableFrameOptEnums[]
Definition: parhtml.cxx:73
const sal_Int32 MAX_ENTITY_LEN(8)
HTMLOptionEnum< HtmlMeta > const aHTMLMetaNameTable[]
Definition: parhtml.cxx:1941
const sal_Int32 MAX_LEN(1024)
HTMLInputType
Definition: parhtml.hxx:53
HTMLTableRules
Definition: parhtml.hxx:50
HTMLTableFrame
Definition: parhtml.hxx:48
::std::vector< HTMLOption > HTMLOptions
Definition: parhtml.hxx:144
DefTokenId nToken
SvParserState
Definition: svparser.hxx:36
TOOLS_DLLPUBLIC rtl_TextEncoding GetExtendedCompatibilityTextEncoding(rtl_TextEncoding eEncoding)
unsigned char sal_uInt8
sal_uInt16 sal_Unicode
#define SAL_MAX_UINT32