LibreOffice Module svtools (master) 1
parrtf.cxx
Go to the documentation of this file.
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */
19
20#include <sal/config.h>
21#include <sal/log.hxx>
22
24
25#include <rtl/character.hxx>
26#include <rtl/strbuf.hxx>
27#include <rtl/tencinfo.h>
28#include <rtl/ustrbuf.hxx>
29#include <tools/stream.hxx>
30#include <tools/debug.hxx>
31#include <svtools/rtftoken.h>
32#include <svtools/parrtf.hxx>
33
// Upper bound on the length of the text buffer that ScanText() fills before
// it stops collecting characters for the current token.
const int MAX_STRING_LEN = 1024;

// ASCII-only character classification used while scanning control words and
// their numeric parameters.
#define RTF_ISDIGIT( c ) rtl::isAsciiDigit(c)
#define RTF_ISALPHA( c ) rtl::isAsciiAlpha(c)
38
40 : SvParser<int>( rIn, nStackSize )
41 , nOpenBrackets(0)
42 , eCodeSet(RTL_TEXTENCODING_MS_1252)
43 , nUCharOverread(1)
44{
45 // default is ANSI-CodeSet
46 SetSrcEncoding( RTL_TEXTENCODING_MS_1252 );
47 bRTF_InTextRead = false;
48}
49
51{
52}
53
54
56{
57 int nRet = 0;
58 do {
59 bool bNextCh = true;
60 switch( nNextCh )
61 {
62 case '\\':
63 {
64 // control characters
65 nNextCh = GetNextChar();
66 switch( nNextCh )
67 {
68 case '{':
69 case '}':
70 case '\\':
71 case '+': // I found it in a RTF-file
72 case '~': // nonbreaking space
73 case '-': // optional hyphen
74 case '_': // nonbreaking hyphen
75 case '\'': // HexValue
76 nNextCh = '\\';
77 rInput.SeekRel( -1 );
78 ScanText();
79 nRet = RTF_TEXTTOKEN;
80 bNextCh = 0 == nNextCh;
81 break;
82
83 case '*': // ignoreflag
84 nRet = RTF_IGNOREFLAG;
85 break;
86 case ':': // subentry in an index entry
87 nRet = RTF_SUBENTRYINDEX;
88 break;
89 case '|': // formula-character
90 nRet = RTF_FORMULA;
91 break;
92
93 case 0x0a:
94 case 0x0d:
95 nRet = RTF_PAR;
96 break;
97
98 default:
99 if( RTF_ISALPHA( nNextCh ) )
100 {
101 aToken = "\\";
102 {
103 do {
104 aToken.appendUtf32(nNextCh);
105 nNextCh = GetNextChar();
106 } while( RTF_ISALPHA( nNextCh ) );
107 }
108
109 // minus before numeric parameters
110 bool bNegValue = false;
111 if( '-' == nNextCh )
112 {
113 bNegValue = true;
114 nNextCh = GetNextChar();
115 }
116
117 // possible numeric parameter
118 if( RTF_ISDIGIT( nNextCh ) )
119 {
120 OUStringBuffer aNumber;
121 do {
122 aNumber.append(static_cast<sal_Unicode>(nNextCh));
123 nNextCh = GetNextChar();
124 } while( RTF_ISDIGIT( nNextCh ) );
125 nTokenValue = OUString::unacquired(aNumber).toInt32();
126 if( bNegValue )
127 nTokenValue = -nTokenValue;
128 bTokenHasValue=true;
129 }
130 else if( bNegValue ) // restore minus
131 {
132 nNextCh = '-';
133 rInput.SeekRel( -1 );
134 }
135 if( ' ' == nNextCh ) // blank is part of token!
136 nNextCh = GetNextChar();
137
138 // search for the token in the table:
139 if( 0 == (nRet = GetRTFToken( aToken )) )
140 // Unknown Control
141 nRet = RTF_UNKNOWNCONTROL;
142
143 // bug 76812 - unicode token handled as normal text
144 bNextCh = false;
145 switch( nRet )
146 {
147 case RTF_UC:
148 if( 0 <= nTokenValue )
149 {
150 nUCharOverread = static_cast<sal_uInt8>(nTokenValue);
151 if (!aParserStates.empty())
152 {
153 //cmc: other ifdef breaks #i3584
154 aParserStates.top().nUCharOverread = nUCharOverread;
155 }
156 }
157 aToken.setLength( 0 ); // #i47831# erase token to prevent the token from being treated as text
158 // read next token
159 nRet = 0;
160 break;
161
162 case RTF_UPR:
163 if (!_inSkipGroup) {
164 // UPR - overread the group with the ansi
165 // information
166 int nNextToken;
167 do
168 {
169 nNextToken = GetNextToken_();
170 }
171 while (nNextToken != '{' && nNextToken != sal_Unicode(EOF) && IsParserWorking());
172
173 SkipGroup();
174 GetNextToken_(); // overread the last bracket
175 nRet = 0;
176 }
177 break;
178
179 case RTF_U:
180 if( !bRTF_InTextRead )
181 {
182 nRet = RTF_TEXTTOKEN;
183 aToken = OUStringChar( static_cast<sal_Unicode>(nTokenValue) );
184
185 // overread the next n "RTF" characters. This
186 // can be also \{, \}, \'88
187 for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
188 {
189 sal_uInt32 cAnsi = nNextCh;
190 while( 0xD == cAnsi )
191 cAnsi = GetNextChar();
192 while( 0xA == cAnsi )
193 cAnsi = GetNextChar();
194
195 if( '\\' == cAnsi &&
196 '\'' == GetNextChar() )
197 // skip HexValue
198 GetHexValue();
199 nNextCh = GetNextChar();
200 }
201 ScanText();
202 bNextCh = 0 == nNextCh;
203 }
204 break;
205 }
206 }
207 else if( SvParserState::Pending != eState )
208 {
209 // Bug 34631 - "\ " read on - Blank as character
210 // eState = SvParserState::Error;
211 bNextCh = false;
212 }
213 break;
214 }
215 }
216 break;
217
218 case sal_Unicode(EOF):
220 nRet = nNextCh;
221 break;
222
223 case '{':
224 {
225 if( 0 <= nOpenBrackets )
226 {
227 RtfParserState_Impl aState( nUCharOverread, GetSrcEncoding() );
228 aParserStates.push( aState );
229 }
232 static_cast<size_t>(nOpenBrackets) == aParserStates.size(),
233 "ParserStateStack unequal to bracket count" );
234 nRet = nNextCh;
235 }
236 break;
237
238 case '}':
240 if( 0 <= nOpenBrackets )
241 {
242 aParserStates.pop();
243 if( !aParserStates.empty() )
244 {
245 const RtfParserState_Impl& rRPS =
246 aParserStates.top();
248 SetSrcEncoding( rRPS.eCodeSet );
249 }
250 else
251 {
252 nUCharOverread = 1;
253 SetSrcEncoding( GetCodeSet() );
254 }
255 }
257 static_cast<size_t>(nOpenBrackets) == aParserStates.size(),
258 "ParserStateStack unequal to bracket count" );
259 nRet = nNextCh;
260 break;
261
262 case 0x0d:
263 case 0x0a:
264 break;
265
266 default:
267 // now normal text follows
268 ScanText();
269 nRet = RTF_TEXTTOKEN;
270 bNextCh = 0 == nNextCh;
271 break;
272 }
273
274 if( bNextCh )
275 nNextCh = GetNextChar();
276
277 } while( !nRet && SvParserState::Working == eState );
278 return nRet;
279}
280
281
283{
284 // collect Hex values
285 int n;
286 sal_Unicode nHexVal = 0;
287
288 for( n = 0; n < 2; ++n )
289 {
290 nHexVal *= 16;
291 nNextCh = GetNextChar();
292 if( nNextCh >= '0' && nNextCh <= '9' )
293 nHexVal += (nNextCh - 48);
294 else if( nNextCh >= 'a' && nNextCh <= 'f' )
295 nHexVal += (nNextCh - 87);
296 else if( nNextCh >= 'A' && nNextCh <= 'F' )
297 nHexVal += (nNextCh - 55);
298 }
299 return nHexVal;
300}
301
303{
304 const sal_Unicode cBreak = 0;
305 OUStringBuffer aStrBuffer;
306 bool bContinue = true;
307 while( bContinue && IsParserWorking() && aStrBuffer.getLength() < MAX_STRING_LEN)
308 {
309 bool bNextCh = true;
310 switch( nNextCh )
311 {
312 case '\\':
313 {
314 nNextCh = GetNextChar();
315 switch (nNextCh)
316 {
317 case '\'':
318 {
319
320 OStringBuffer aByteString;
321 while (true)
322 {
323 char c = static_cast<char>(GetHexValue());
324 /*
325 * Note: \'00 is a valid internal character in a
326 * string in RTF. OStringBuffer supports
327 * appending nulls fine
328 */
329 aByteString.append(c);
330
331 bool bBreak = false;
332 bool bEOF = false;
333 char nSlash = '\\';
334 while (!bBreak)
335 {
336 auto next = GetNextChar();
337 if (sal_Unicode(EOF) == next)
338 {
339 bEOF = true;
340 break;
341 }
342 if (next>0xFF) // fix for #i43933# and #i35653#
343 {
344 if (!aByteString.isEmpty())
345 {
346 aStrBuffer.append( OStringToOUString(aByteString, GetSrcEncoding()) );
347 aByteString.setLength(0);
348 }
349 aStrBuffer.append(static_cast<sal_Unicode>(next));
350
351 continue;
352 }
353 nSlash = static_cast<char>(next);
354 while (nSlash == 0xD || nSlash == 0xA)
355 nSlash = static_cast<char>(GetNextChar());
356
357 switch (nSlash)
358 {
359 case '{':
360 case '}':
361 case '\\':
362 bBreak = true;
363 break;
364 default:
365 aByteString.append(nSlash);
366 break;
367 }
368 }
369
370 if (bEOF)
371 {
372 bContinue = false; // abort, string together
373 break;
374 }
375
376 nNextCh = GetNextChar();
377
378 if (nSlash != '\\' || nNextCh != '\'')
379 {
380 rInput.SeekRel(-1);
381 nNextCh = static_cast<unsigned char>(nSlash);
382 break;
383 }
384 }
385
386 bNextCh = false;
387
388 if (!aByteString.isEmpty())
389 {
390 aStrBuffer.append( OStringToOUString(aByteString, GetSrcEncoding()) );
391 aByteString.setLength(0);
392 }
393 }
394 break;
395 case '\\':
396 case '}':
397 case '{':
398 case '+': // I found in a RTF file
399 aStrBuffer.append(sal_Unicode(nNextCh));
400 break;
401 case '~': // nonbreaking space
402 aStrBuffer.append(u'\x00A0');
403 break;
404 case '-': // optional hyphen
405 aStrBuffer.append(u'\x00AD');
406 break;
407 case '_': // nonbreaking hyphen
408 aStrBuffer.append(u'\x2011');
409 break;
410
411 case 'u':
412 // read UNI-Code characters
413 {
414 nNextCh = GetNextChar();
415 rInput.SeekRel( -2 );
416
417 if( '-' == nNextCh || RTF_ISDIGIT( nNextCh ) )
418 {
419 bRTF_InTextRead = true;
420
421 OUString sSave( aToken ); // GetNextToken_() overwrites this
422 nNextCh = '\\';
423 int nToken = GetNextToken_();
424 DBG_ASSERT( RTF_U == nToken, "still not a UNI-Code character" );
425 // don't convert symbol chars
426 aStrBuffer.append(static_cast< sal_Unicode >(nTokenValue));
427
428 // overread the next n "RTF" characters. This
429 // can be also \{, \}, \'88
430 for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
431 {
432 sal_Unicode cAnsi = nNextCh;
433 while( 0xD == cAnsi )
434 cAnsi = GetNextChar();
435 while( 0xA == cAnsi )
436 cAnsi = GetNextChar();
437
438 if( '\\' == cAnsi &&
439 '\'' == GetNextChar() )
440 // skip HexValue
441 GetHexValue();
442 nNextCh = GetNextChar();
443 }
444 bNextCh = false;
445 aToken = sSave;
446 bRTF_InTextRead = false;
447 }
448 else if ( 'c' == nNextCh )
449 {
450 // Prevent text breaking into multiple tokens.
451 rInput.SeekRel( 2 );
452 nNextCh = GetNextChar();
453 if (RTF_ISDIGIT( nNextCh ))
454 {
455 sal_uInt8 nNewOverread = 0 ;
456 do {
457 nNewOverread *= 10;
458 nNewOverread += nNextCh - '0';
459 nNextCh = GetNextChar();
460 } while ( RTF_ISDIGIT( nNextCh ) );
461 nUCharOverread = nNewOverread;
462 if (!aParserStates.empty())
463 aParserStates.top().nUCharOverread = nNewOverread;
464 }
465 bNextCh = 0x20 == nNextCh;
466 }
467 else
468 {
469 nNextCh = '\\';
470 bContinue = false; // abort, string together
471 }
472 }
473 break;
474
475 default:
476 rInput.SeekRel( -1 );
477 nNextCh = '\\';
478 bContinue = false; // abort, string together
479 break;
480 }
481 }
482 break;
483
484 case sal_Unicode(EOF):
485 eState = SvParserState::Error;
486 [[fallthrough]];
487 case '{':
488 case '}':
489 bContinue = false;
490 break;
491
492 case 0x0a:
493 case 0x0d:
494 break;
495
496 default:
497 if( nNextCh == cBreak || aStrBuffer.getLength() >= MAX_STRING_LEN)
498 bContinue = false;
499 else
500 {
501 do {
502 // all other characters end up in the text
503 aStrBuffer.appendUtf32(nNextCh);
504
505 if (sal_Unicode(EOF) == (nNextCh = GetNextChar()))
506 {
507 if (!aStrBuffer.isEmpty())
508 aToken.append( aStrBuffer );
509 return;
510 }
511 } while
512 (
513 (RTF_ISALPHA(nNextCh) || RTF_ISDIGIT(nNextCh)) &&
514 (aStrBuffer.getLength() < MAX_STRING_LEN)
515 );
516 bNextCh = false;
517 }
518 }
519
520 if( bContinue && bNextCh )
521 nNextCh = GetNextChar();
522 }
523
524 if (!aStrBuffer.isEmpty())
525 aToken.append( aStrBuffer );
526}
527
528
530
532{
533 short nBrackets=1;
534 if (_inSkipGroup>0)
535 return;
536 _inSkipGroup++;
537//#i16185# faking \bin keyword
538 do
539 {
540 switch (nNextCh)
541 {
542 case '{':
543 ++nBrackets;
544 break;
545 case '}':
546 if (!--nBrackets) {
547 _inSkipGroup--;
548 return;
549 }
550 break;
551 }
552 int nToken = GetNextToken_();
553 if (nToken == RTF_BIN)
554 {
555 rInput.SeekRel(-1);
556 SAL_WARN_IF(nTokenValue < 0, "svtools", "negative value argument for rtf \\bin keyword");
557 if (nTokenValue > 0)
558 rInput.SeekRel(nTokenValue);
559 nNextCh = GetNextChar();
560 }
561 while (nNextCh==0xa || nNextCh==0xd)
562 {
563 nNextCh = GetNextChar();
564 }
565 } while (sal_Unicode(EOF) != nNextCh && IsParserWorking());
566
567 if( SvParserState::Pending != eState && '}' != nNextCh )
568 eState = SvParserState::Error;
569 _inSkipGroup--;
570}
571
574
575
577{
578 char cFirstCh(0);
579 nNextChPos = rInput.Tell();
580 rInput.ReadChar( cFirstCh );
581 nNextCh = static_cast<unsigned char>(cFirstCh);
582 eState = SvParserState::Working;
583 nOpenBrackets = 0;
584 eCodeSet = RTL_TEXTENCODING_MS_1252;
585 SetSrcEncoding( eCodeSet );
586
587 // the first two tokens should be '{' and \\rtf !!
588 if( '{' == GetNextToken() && RTF_RTF == GetNextToken() )
589 {
590 AddFirstRef();
591 // call ReleaseRef at end of this scope, even in the face of exceptions
592 comphelper::ScopeGuard g([this] {
593 if( SvParserState::Pending != eState )
594 ReleaseRef(); // now parser is not needed anymore
595 });
596 Continue( 0 );
597 }
598 else
599 eState = SvParserState::Error;
600
601 return eState;
602}
603
604void SvRTFParser::Continue( int nToken )
605{
606// DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(),
607// "Characterset was changed." );
608
609 if( !nToken )
610 nToken = GetNextToken();
611
612 bool bLooping = false;
613
614 while (IsParserWorking() && !bLooping)
615 {
616 auto nCurrentTokenIndex = m_nTokenIndex;
617 auto nCurrentToken = nToken;
618
619 SaveState( nToken );
620 switch( nToken )
621 {
622 case '}':
623 if( nOpenBrackets )
624 goto NEXTTOKEN;
626 break;
627
628 case '{':
629 // an unknown group ?
630 {
631 if( RTF_IGNOREFLAG != GetNextToken() )
632 nToken = SkipToken();
633 else if( RTF_UNKNOWNCONTROL != GetNextToken() )
634 nToken = SkipToken( -2 );
635 else
636 {
637 // filter immediately
639 nToken = GetNextToken();
640 if( '}' != nToken )
641 eState = SvParserState::Error;
642 break; // move to next token!!
643 }
644 }
645 goto NEXTTOKEN;
646
648 break; // skip unknown token
649 case RTF_NEXTTYPE:
650 case RTF_ANSITYPE:
651 eCodeSet = RTL_TEXTENCODING_MS_1252;
652 SetSrcEncoding( eCodeSet );
653 break;
654 case RTF_MACTYPE:
655 eCodeSet = RTL_TEXTENCODING_APPLE_ROMAN;
656 SetSrcEncoding( eCodeSet );
657 break;
658 case RTF_PCTYPE:
659 eCodeSet = RTL_TEXTENCODING_IBM_437;
660 SetSrcEncoding( eCodeSet );
661 break;
662 case RTF_PCATYPE:
663 eCodeSet = RTL_TEXTENCODING_IBM_850;
664 SetSrcEncoding( eCodeSet );
665 break;
666 case RTF_ANSICPG:
667 eCodeSet = rtl_getTextEncodingFromWindowsCodePage(nTokenValue);
668 SetSrcEncoding(eCodeSet);
669 break;
670 default:
671NEXTTOKEN:
672 NextToken( nToken );
673 break;
674 }
675 if( IsParserWorking() )
676 SaveState( 0 ); // processed till here,
677 // continue with new token!
678 nToken = GetNextToken();
679 bLooping = nCurrentTokenIndex == m_nTokenIndex && nToken == nCurrentToken;
680 }
681 if( SvParserState::Accepted == eState && 0 < nOpenBrackets )
682 eState = SvParserState::Error;
683}
684
685void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc )
686{
687 if (eEnc == RTL_TEXTENCODING_DONTKNOW)
688 eEnc = GetCodeSet();
689
690 if (!aParserStates.empty())
691 aParserStates.top().eCodeSet = eEnc;
692 SetSrcEncoding(eEnc);
693}
694
695/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
rtl_TextEncoding GetCodeSet() const
Definition: parrtf.hxx:59
void ReadBitmapData()
Definition: parrtf.cxx:573
sal_Unicode GetHexValue()
Definition: parrtf.cxx:282
virtual ~SvRTFParser() override
Definition: parrtf.cxx:50
void ReadUnknownData()
Definition: parrtf.cxx:572
sal_uInt8 nUCharOverread
Definition: parrtf.hxx:41
void ScanText()
Definition: parrtf.cxx:302
std::stack< RtfParserState_Impl > aParserStates
Definition: parrtf.hxx:38
virtual void Continue(int nToken) override
Definition: parrtf.cxx:604
void SkipGroup()
Definition: parrtf.cxx:531
rtl_TextEncoding eCodeSet
Definition: parrtf.hxx:40
int nOpenBrackets
Definition: parrtf.hxx:39
virtual SvParserState CallParser() override
Definition: parrtf.cxx:576
SvRTFParser(SvStream &rIn, sal_uInt8 nStackSize=3)
Definition: parrtf.cxx:39
static short _inSkipGroup
Definition: parrtf.hxx:44
virtual int GetNextToken_() override
Definition: parrtf.cxx:55
void SetEncoding(rtl_TextEncoding eEnc)
Definition: parrtf.cxx:685
void ReleaseRef()
void AddFirstRef()
sal_uInt64 Tell() const
SvStream & ReadChar(char &rChar)
sal_uInt64 SeekRel(sal_Int64 nPos)
#define DBG_ASSERT(sCon, aError)
float u
sal_Int64 n
#define SAL_WARN_IF(condition, area, stream)
m
#define RTF_ISALPHA(c)
Definition: parrtf.cxx:37
const int MAX_STRING_LEN
Definition: parrtf.cxx:34
#define RTF_ISDIGIT(c)
Definition: parrtf.cxx:36
DefTokenId nToken
const wchar_t *typedef int(__stdcall *DllNativeUnregProc)(int
int GetRTFToken(std::u16string_view rSearch)
Definition: rtfkeywd.cxx:1173
@ RTF_BIN
Definition: rtftoken.h:157
@ RTF_UNKNOWNCONTROL
Definition: rtftoken.h:58
@ RTF_U
Definition: rtftoken.h:241
@ RTF_PCTYPE
Definition: rtftoken.h:64
@ RTF_FORMULA
Definition: rtftoken.h:824
@ RTF_SUBENTRYINDEX
Definition: rtftoken.h:828
@ RTF_RTF
Definition: rtftoken.h:61
@ RTF_NEXTTYPE
Definition: rtftoken.h:66
@ RTF_UC
Definition: rtftoken.h:242
@ RTF_TEXTTOKEN
Definition: rtftoken.h:56
@ RTF_ANSITYPE
Definition: rtftoken.h:62
@ RTF_UPR
Definition: rtftoken.h:240
@ RTF_MACTYPE
Definition: rtftoken.h:63
@ RTF_ANSICPG
Definition: rtftoken.h:243
@ RTF_PCATYPE
Definition: rtftoken.h:65
@ RTF_IGNOREFLAG
Definition: rtftoken.h:829
@ RTF_PAR
Definition: rtftoken.h:811
sal_uInt8 nUCharOverread
Definition: parrtf.hxx:29
rtl_TextEncoding eCodeSet
Definition: parrtf.hxx:28
SvParserState
Definition: svparser.hxx:36
unsigned char sal_uInt8
sal_uInt16 sal_Unicode