LibreOffice Module sw (master) 1
parasc.cxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
20#include <memory>
21
22#include <tools/stream.hxx>
23#include <hintids.hxx>
24#include <sfx2/docfile.hxx>
25#include <sfx2/printer.hxx>
26#include <sfx2/sfxsids.hrc>
27#include <editeng/fontitem.hxx>
28#include <editeng/langitem.hxx>
31#include <shellio.hxx>
32#include <doc.hxx>
36#include <pam.hxx>
37#include <breakit.hxx>
38#include <swerror.h>
39#include <strings.hrc>
40#include <mdiexp.hxx>
41#include <poolfmt.hxx>
42#include <iodetect.hxx>
43
44#include <vcl/metric.hxx>
45#include <osl/diagnose.h>
46
47#define ASC_BUFFLEN 4096
48
49namespace {
50
51class SwASCIIParser
52{
54 std::optional<SwPaM> m_oPam;
55 SvStream& m_rInput;
56 std::unique_ptr<char[]> m_pArr;
57 const SwAsciiOptions& m_rOpt;
58 SwAsciiOptions m_usedAsciiOptions;
59 std::optional<SfxItemSet> m_oItemSet;
60 tools::Long m_nFileSize;
61 SvtScriptType m_nScript;
62 bool m_bNewDoc;
63
64 ErrCode ReadChars();
65 void InsertText( const OUString& rStr );
66
67 SwASCIIParser(const SwASCIIParser&) = delete;
68 SwASCIIParser& operator=(const SwASCIIParser&) = delete;
69
70public:
71 SwASCIIParser( SwDoc& rD, const SwPaM& rCursor, SvStream& rIn,
72 bool bReadNewDoc, const SwAsciiOptions& rOpts );
73
74 ErrCode CallParser();
75 const SwAsciiOptions& GetUsedAsciiOptions() const { return m_usedAsciiOptions; }
76};
77
78}
79
80// Call for the general reader interface
81ErrCode AsciiReader::Read( SwDoc& rDoc, const OUString&, SwPaM &rPam, const OUString & )
82{
83 if( !m_pStream )
84 {
85 OSL_ENSURE( false, "ASCII read without a stream" );
86 return ERR_SWG_READ_ERROR;
87 }
88
89 ErrCode nRet;
90 {
91 SwASCIIParser aParser( rDoc, rPam, *m_pStream,
93 nRet = aParser.CallParser();
94
95 OUString optionsString;
96 aParser.GetUsedAsciiOptions().WriteUserData(optionsString);
97
98 if(m_pMedium != nullptr && m_pMedium->GetItemSet() != nullptr)
99 m_pMedium->GetItemSet()->Put(SfxStringItem(SID_FILE_FILTEROPTIONS, optionsString));
100 }
101 // after Read reset the options
103 return nRet;
104}
105
106SwASCIIParser::SwASCIIParser(SwDoc& rD, const SwPaM& rCursor, SvStream& rIn, bool bReadNewDoc,
107 const SwAsciiOptions& rOpts)
108 : m_rDoc(rD)
109 , m_rInput(rIn)
110 , m_rOpt(rOpts)
111 , m_usedAsciiOptions(rOpts)
112 , m_nFileSize(0)
113 , m_nScript(SvtScriptType::NONE)
114 , m_bNewDoc(bReadNewDoc)
115{
116 m_oPam.emplace(*rCursor.GetPoint());
117 m_pArr.reset(new char[ASC_BUFFLEN + 2]);
118
119 m_oItemSet.emplace(
123
124 // set defaults from the options
125 if (m_rOpt.GetLanguage())
126 {
127 SvxLanguageItem aLang(m_rOpt.GetLanguage(), RES_CHRATR_LANGUAGE);
128 m_oItemSet->Put(aLang);
129 aLang.SetWhich(RES_CHRATR_CJK_LANGUAGE);
130 m_oItemSet->Put(aLang);
131 aLang.SetWhich(RES_CHRATR_CTL_LANGUAGE);
132 m_oItemSet->Put(aLang);
133 }
134 if (m_rOpt.GetFontName().isEmpty())
135 return;
136
137 vcl::Font aTextFont(m_rOpt.GetFontName(), Size(0, 10));
139 aTextFont = m_rDoc.getIDocumentDeviceAccess().getPrinter(false)->GetFontMetric(aTextFont);
140 SvxFontItem aFont( aTextFont.GetFamilyType(), aTextFont.GetFamilyName(),
141 OUString(), aTextFont.GetPitch(), aTextFont.GetCharSet(), RES_CHRATR_FONT );
142 m_oItemSet->Put(aFont);
143 aFont.SetWhich(RES_CHRATR_CJK_FONT);
144 m_oItemSet->Put(aFont);
145 aFont.SetWhich(RES_CHRATR_CTL_FONT);
146 m_oItemSet->Put(aFont);
147}
148
149// Calling the parser
150ErrCode SwASCIIParser::CallParser()
151{
152 m_rInput.ResetError();
153 m_nFileSize = m_rInput.TellEnd();
154 m_rInput.Seek(STREAM_SEEK_TO_BEGIN);
155 m_rInput.ResetError();
156
157 ::StartProgress(STR_STATSTR_W4WREAD, 0, m_nFileSize, m_rDoc.GetDocShell());
158
159 std::optional<SwPaM> pInsPam;
160 sal_Int32 nSttContent = 0;
161 if (!m_bNewDoc)
162 {
163 const SwNode& rTmp = m_oPam->GetPoint()->GetNode();
164 pInsPam.emplace( rTmp, rTmp, SwNodeOffset(0), SwNodeOffset(-1) );
165 nSttContent = m_oPam->GetPoint()->GetContentIndex();
166 }
167
168 SwTextFormatColl *pColl = nullptr;
169
170 if (m_bNewDoc)
171 {
173 false);
174 if (!pColl)
176 false);
177 if (pColl)
178 m_rDoc.SetTextFormatColl(*m_oPam, pColl);
179 }
180
181 ErrCode nError = ReadChars();
182
183 if (m_oItemSet)
184 {
185 // set only the attribute, for scanned scripts.
186 if (!(SvtScriptType::LATIN & m_nScript))
187 {
188 m_oItemSet->ClearItem(RES_CHRATR_FONT);
189 m_oItemSet->ClearItem(RES_CHRATR_LANGUAGE);
190 }
191 if (!(SvtScriptType::ASIAN & m_nScript))
192 {
193 m_oItemSet->ClearItem(RES_CHRATR_CJK_FONT);
194 m_oItemSet->ClearItem(RES_CHRATR_CJK_LANGUAGE);
195 }
196 if (!(SvtScriptType::COMPLEX & m_nScript))
197 {
198 m_oItemSet->ClearItem(RES_CHRATR_CTL_FONT);
199 m_oItemSet->ClearItem(RES_CHRATR_CTL_LANGUAGE);
200 }
201 if (m_oItemSet->Count())
202 {
203 if (m_bNewDoc)
204 {
205 if (pColl)
206 {
207 // Using the pool defaults for the font causes significant
208 // trouble for the HTML filter, because it is not able
209 // to export the pool defaults (or to be more precise:
210 // the HTML filter is not able to detect whether a pool
211 // default has changed or not. Even a comparison with the
212 // HTML template does not work, because the defaults are
213 // not copied when a new doc is created. The result of
214 // comparing pool defaults therefore would be that the
215 // defaults are exported always if the have changed for
216 // text documents in general. That's not sensible, as well
217 // as it is not sensible to export them always.
218 sal_uInt16 aWhichIds[4] =
219 {
222 };
223 sal_uInt16 *pWhichIds = aWhichIds;
224 while (*pWhichIds)
225 {
226 const SfxPoolItem *pItem;
227 if (SfxItemState::SET
228 == m_oItemSet->GetItemState(*pWhichIds, false, &pItem))
229 {
230 pColl->SetFormatAttr( *pItem );
231 m_oItemSet->ClearItem(*pWhichIds);
232 }
233 ++pWhichIds;
234 }
235 }
236 if (m_oItemSet->Count())
237 m_rDoc.SetDefault(*m_oItemSet);
238 }
239 else if( pInsPam )
240 {
241 // then set over the insert range the defined attributes
242 *pInsPam->GetMark() = *m_oPam->GetPoint();
243 pInsPam->GetPoint()->Assign(pInsPam->GetPoint()->GetNode(), SwNodeOffset(1),
244 nSttContent );
245
246 // !!!!!
247 OSL_ENSURE( false, "Have to change - hard attr. to para. style" );
248 m_rDoc.getIDocumentContentOperations().InsertItemSet(*pInsPam, *m_oItemSet);
249 }
250 }
251 m_oItemSet.reset();
252 }
253
254 pInsPam.reset();
255
257 return nError;
258}
259
260ErrCode SwASCIIParser::ReadChars()
261{
262 sal_Unicode *pStt = nullptr, *pEnd = nullptr, *pLastStt = nullptr;
263 tools::Long nReadCnt = 0, nLineLen = 0;
264 sal_Unicode cLastCR = 0;
265 bool bSwapUnicode = false;
266
267 const SwAsciiOptions* pUseMe = &m_rOpt;
268 SwAsciiOptions aEmpty;
269 if (m_nFileSize >= 2 && aEmpty.GetFontName() == m_rOpt.GetFontName()
270 && aEmpty.GetCharSet() == m_rOpt.GetCharSet()
271 && aEmpty.GetLanguage() == m_rOpt.GetLanguage()
272 && aEmpty.GetParaFlags() == m_rOpt.GetParaFlags())
273 {
274 sal_Size nLen, nOrig;
275 nOrig = nLen = m_rInput.ReadBytes(m_pArr.get(), ASC_BUFFLEN);
276 rtl_TextEncoding eCharSet;
277 LineEnd eLineEnd;
278 bool bHasBom;
279 const bool bRet
280 = SwIoSystem::IsDetectableText(m_pArr.get(), nLen, &eCharSet,
281 &bSwapUnicode, &eLineEnd, &bHasBom);
282 if (!bRet)
284
285 OSL_ENSURE(bRet, "Autodetect of text import without nag dialog must have failed");
286 if (bRet && eCharSet != RTL_TEXTENCODING_DONTKNOW)
287 {
288 aEmpty.SetCharSet(eCharSet);
289 aEmpty.SetParaFlags(eLineEnd);
290 aEmpty.SetIncludeBOM(bHasBom);
291 m_rInput.SeekRel(-(tools::Long(nLen)));
292 }
293 else
294 m_rInput.SeekRel(-(tools::Long(nOrig)));
295 pUseMe=&aEmpty;
296 }
297 m_usedAsciiOptions = *pUseMe;
298
299 rtl_TextToUnicodeConverter hConverter=nullptr;
300 rtl_TextToUnicodeContext hContext=nullptr;
301 rtl_TextEncoding currentCharSet = pUseMe->GetCharSet();
302 if (RTL_TEXTENCODING_UCS2 != currentCharSet)
303 {
304 if( currentCharSet == RTL_TEXTENCODING_DONTKNOW )
305 currentCharSet = RTL_TEXTENCODING_ASCII_US;
306 hConverter = rtl_createTextToUnicodeConverter( currentCharSet );
307 OSL_ENSURE( hConverter, "no string convert available" );
308 if (!hConverter)
309 return ErrCode(ErrCodeArea::Sw, ErrCodeClass::Read, 0);
310 bSwapUnicode = false;
311 hContext = rtl_createTextToUnicodeContext( hConverter );
312 }
313 else if (pUseMe != &aEmpty) //Already successfully figured out type
314 {
315 m_rInput.StartReadingUnicodeText(currentCharSet);
316 bSwapUnicode = m_rInput.IsEndianSwap();
317 }
318
319 std::unique_ptr<sal_Unicode[]> aWork;
320 sal_Size nArrOffset = 0;
321
322 do {
323 if( pStt >= pEnd )
324 {
325 if( pLastStt != pStt )
326 InsertText( OUString( pLastStt ));
327
328 // Read a new block
329 sal_Size lGCount;
330 if (ERRCODE_NONE != m_rInput.GetError()
331 || 0
332 == (lGCount = m_rInput.ReadBytes(m_pArr.get() + nArrOffset,
333 ASC_BUFFLEN - nArrOffset)))
334 break; // break from the while loop
335
336 /*
337 If there was some unconverted bytes on the last cycle then they
338 were put at the beginning of the array, so total bytes available
339 to convert this cycle includes them. If we found 0 following bytes
340 then we ignore the previous partial character.
341 */
342 lGCount += nArrOffset;
343
344 if( hConverter )
345 {
346 sal_uInt32 nInfo;
347 sal_Size nNewLen = lGCount, nCntBytes;
348 aWork.reset(new sal_Unicode[nNewLen + 1]); // add 1 for '\0'
349 sal_Unicode* pBuf = aWork.get();
350 pBuf[nNewLen] = 0; // ensure '\0'
351
352 nNewLen = rtl_convertTextToUnicode(hConverter, hContext, m_pArr.get(), lGCount,
353 pBuf, nNewLen,
354 (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT
355 | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT
356 | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
357 | RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE),
358 &nInfo, &nCntBytes);
359 nArrOffset = lGCount - nCntBytes;
360 if( 0 != nArrOffset )
361 memmove(m_pArr.get(), m_pArr.get() + nCntBytes, nArrOffset);
362
363 pStt = pLastStt = aWork.get();
364 pEnd = pStt + nNewLen;
365 }
366 else
367 {
368 pStt = pLastStt = reinterpret_cast<sal_Unicode*>(m_pArr.get());
369 auto nChars = lGCount / 2;
370 pEnd = pStt + nChars;
371
372 if( bSwapUnicode )
373 {
374 char *pF = m_pArr.get(), *pN = m_pArr.get() + 1;
375 for (sal_Size n = 0; n < nChars; ++n, pF += 2, pN += 2)
376 {
377 char c = *pF;
378 *pF = *pN;
379 *pN = c;
380 }
381 }
382 }
383
384 *pEnd = 0;
385 nReadCnt += lGCount;
386
388
389 if( cLastCR )
390 {
391 if( 0x0a == *pStt && 0x0d == cLastCR )
392 pLastStt = ++pStt;
393 cLastCR = 0;
394 nLineLen = 0;
395 // We skip the last one at the end
396 if (!m_rInput.eof() || !(pEnd == pStt || (!*pEnd && pEnd == pStt + 1)))
397 m_rDoc.getIDocumentContentOperations().SplitNode(*m_oPam->GetPoint(), false);
398 }
399 }
400
401 bool bIns = true, bSplitNode = false;
402 switch( *pStt )
403 {
404
405 case 0x0a: if( LINEEND_LF == pUseMe->GetParaFlags() )
406 {
407 bIns = false;
408 *pStt = 0;
409 ++pStt;
410
411 // We skip the last one at the end
412 if (!m_rInput.eof() || pEnd != pStt)
413 bSplitNode = true;
414 }
415 break;
416
417 case 0x0d: if( LINEEND_LF != pUseMe->GetParaFlags() )
418 {
419 bIns = false;
420 *pStt = 0;
421 ++pStt;
422
423 bool bChkSplit = true;
424 if( LINEEND_CRLF == pUseMe->GetParaFlags() )
425 {
426 if( pStt == pEnd )
427 {
428 cLastCR = 0x0d;
429 bChkSplit = false;
430 }
431 else if( 0x0a == *pStt )
432 ++pStt;
433 }
434
435 // We skip the last one at the end
436 if (bChkSplit && (!m_rInput.eof() || pEnd != pStt))
437 bSplitNode = true;
438 }
439 break;
440
441 case 0x0c:
442 {
443 // Insert a hard page break
444 *pStt++ = 0;
445 if( nLineLen )
446 {
447 InsertText( OUString( pLastStt ));
448 }
449 m_rDoc.getIDocumentContentOperations().SplitNode(*m_oPam->GetPoint(),
450 false);
452 *m_oPam, SvxFormatBreakItem(SvxBreak::PageBefore, RES_BREAK));
453 pLastStt = pStt;
454 nLineLen = 0;
455 bIns = false;
456 }
457 break;
458
459 case 0x1a:
460 if (nReadCnt == m_nFileSize && pStt + 1 == pEnd)
461 *pStt = 0;
462 else
463 *pStt = '#'; // Replacement visualisation
464 break;
465
466 case '\t': break;
467
468 default:
469 if( ' ' > *pStt )
470 // Found control char, replace with '#'
471 *pStt = '#';
472 break;
473 }
474
475 if( bIns )
476 {
477 ++pStt;
478 ++nLineLen;
479 }
480 else if( bSplitNode )
481 {
482 // We found a CR/LF, thus save the text
483 InsertText( OUString( pLastStt ));
484 if (m_bNewDoc)
486 else
487 m_rDoc.getIDocumentContentOperations().SplitNode(*m_oPam->GetPoint(), false);
488 pLastStt = pStt;
489 nLineLen = 0;
490 }
491 } while(true);
492
493 if( hConverter )
494 {
495 rtl_destroyTextToUnicodeContext( hConverter, hContext );
496 rtl_destroyTextToUnicodeConverter( hConverter );
497 }
498 return ERRCODE_NONE;
499}
500
501void SwASCIIParser::InsertText( const OUString& rStr )
502{
504
505 if (m_oItemSet && g_pBreakIt
506 && m_nScript != (SvtScriptType::LATIN | SvtScriptType::ASIAN | SvtScriptType::COMPLEX))
507 m_nScript |= g_pBreakIt->GetAllScriptsOfText(rStr);
508}
509
510/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
SwBreakIt * g_pBreakIt
Definition: breakit.cxx:34
virtual ErrCode Read(SwDoc &, const OUString &rBaseURL, SwPaM &, const OUString &) override
Definition: parasc.cxx:81
virtual bool AppendTextNode(SwPosition &rPos)=0
virtual bool SplitNode(const SwPosition &rPos, bool bChkTableStart)=0
Split a node at rPos (implemented only for TextNode).
virtual void InsertItemSet(const SwPaM &rRg, const SfxItemSet &, const SetAttrMode nFlags=SetAttrMode::DEFAULT, SwRootFrame const *pLayout=nullptr)=0
virtual bool InsertPoolItem(const SwPaM &rRg, const SfxPoolItem &, const SetAttrMode nFlags=SetAttrMode::DEFAULT, SwRootFrame const *pLayout=nullptr, SwTextAttr **ppNewTextAttr=nullptr)=0
Insert an attribute.
virtual bool InsertString(const SwPaM &rRg, const OUString &, const SwInsertFlags nInsertMode=SwInsertFlags::EMPTYEXPAND)=0
Insert string into existing text node at position rRg.Point().
virtual SfxPrinter * getPrinter(bool bCreate) const =0
Return the printer set at the document.
virtual SwTextFormatColl * GetTextCollFromPool(sal_uInt16 nId, bool bRegardLanguage=true)=0
Return "Auto-Collection" with ID.
bool m_bInsertMode
Definition: shellio.hxx:213
SvStream * m_pStream
Definition: shellio.hxx:207
SwgReaderOption m_aOption
Definition: shellio.hxx:212
SfxMedium * m_pMedium
Definition: shellio.hxx:210
const SfxPoolItem * Put(const SfxPoolItem &rItem, sal_uInt16 nWhich)
SfxItemSet * GetItemSet() const
rtl_TextEncoding GetCharSet() const
Definition: shellio.hxx:68
LanguageType GetLanguage() const
Definition: shellio.hxx:71
LineEnd GetParaFlags() const
Definition: shellio.hxx:74
void SetIncludeBOM(bool bVal)
Definition: shellio.hxx:78
void SetParaFlags(LineEnd eVal)
Definition: shellio.hxx:75
const OUString & GetFontName() const
Definition: shellio.hxx:65
void SetCharSet(rtl_TextEncoding nVal)
Definition: shellio.hxx:69
SvtScriptType GetAllScriptsOfText(const OUString &rText) const
Definition: breakit.cxx:128
Definition: doc.hxx:195
IDocumentDeviceAccess const & getIDocumentDeviceAccess() const
Definition: doc.cxx:246
void SetDefault(const SfxPoolItem &)
Set attribute as new default attribute in current document.
Definition: docfmt.cxx:540
IDocumentContentOperations const & getIDocumentContentOperations() const
Definition: doc.cxx:323
IDocumentStylePoolAccess const & getIDocumentStylePoolAccess() const
Definition: doc.cxx:434
const SwAttrPool & GetAttrPool() const
Definition: doc.hxx:1329
bool SetTextFormatColl(const SwPaM &rRg, SwTextFormatColl *pFormat, const bool bReset=true, const bool bResetListAttrs=false, SwRootFrame const *pLayout=nullptr)
Add 4th optional parameter <bResetListAttrs>.
Definition: docfmt.cxx:1081
SwDocShell * GetDocShell()
Definition: doc.hxx:1362
static bool IsDetectableText(const char *pBuf, sal_uLong &rLen, rtl_TextEncoding *pCharSet, bool *pSwap, LineEnd *pLineEnd, bool *pBom)
Definition: iodetect.cxx:257
Base class of the Writer document model elements.
Definition: node.hxx:98
PaM is Point and Mark: a selection of the document model.
Definition: pam.hxx:187
const SwPosition * GetPoint() const
Definition: pam.hxx:261
Represents the style of a paragraph.
Definition: fmtcol.hxx:61
virtual bool SetFormatAttr(const SfxPoolItem &rAttr) override
Override to recognize changes on the <SwNumRuleItem> and register/unregister the paragraph style at t...
Definition: fmtcol.cxx:345
const SwAsciiOptions & GetASCIIOpts() const
Definition: shellio.hxx:123
void ResetASCIIOpts()
Definition: shellio.hxx:125
SwDoc & m_rDoc
Definition: docbm.cxx:1215
#define ERRCODE_IO_BROKENPACKAGE
#define ERRCODE_NONE
constexpr TypedWhichId< SvxFormatBreakItem > RES_BREAK(94)
constexpr TypedWhichId< SvxFontItem > RES_CHRATR_CJK_FONT(22)
constexpr TypedWhichId< SvxLanguageItem > RES_CHRATR_LANGUAGE(10)
constexpr TypedWhichId< SvxLanguageItem > RES_CHRATR_CTL_LANGUAGE(29)
constexpr TypedWhichId< SvxFontItem > RES_CHRATR_CTL_FONT(27)
constexpr TypedWhichId< SvxLanguageItem > RES_CHRATR_CJK_LANGUAGE(24)
constexpr TypedWhichId< SvxFontItem > RES_CHRATR_FONT(7)
sal_Int64 n
SvtScriptType
LineEnd
void StartProgress(TranslateId pMessResId, tools::Long nStartValue, tools::Long nEndValue, SwDocShell *pDocShell)
Definition: mainwn.cxx:52
void EndProgress(SwDocShell const *pDocShell)
Definition: mainwn.cxx:92
void SetProgressState(tools::Long nPosition, SwDocShell const *pDocShell)
Definition: mainwn.cxx:82
static constexpr auto Items
long Long
o3tl::strong_int< sal_Int32, struct Tag_SwNodeOffset > SwNodeOffset
Definition: nodeoffset.hxx:16
#define ASC_BUFFLEN
Definition: parasc.cxx:47
@ RES_POOLCOLL_STANDARD
Standard.
Definition: poolfmt.hxx:250
@ RES_POOLCOLL_HTML_PRE
Definition: poolfmt.hxx:433
#define ERR_SWG_READ_ERROR
Definition: swerror.h:25
sal_uInt16 sal_Unicode