LibreOffice Module sw (master)  1
parasc.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include <memory>
21 
22 #include <tools/stream.hxx>
23 #include <hintids.hxx>
24 #include <rtl/tencinfo.h>
25 #include <sfx2/printer.hxx>
26 #include <editeng/fontitem.hxx>
27 #include <editeng/langitem.hxx>
30 #include <shellio.hxx>
31 #include <doc.hxx>
34 #include <swtypes.hxx>
35 #include <ndtxt.hxx>
36 #include <pam.hxx>
37 #include <frmatr.hxx>
38 #include <fltini.hxx>
39 #include <pagedesc.hxx>
40 #include <breakit.hxx>
41 #include <swerror.h>
42 #include <strings.hrc>
43 #include <mdiexp.hxx>
44 #include <poolfmt.hxx>
45 #include <iodetect.hxx>
46 
47 #include <vcl/metric.hxx>
48 #include <osl/diagnose.h>
49 
50 #define ASC_BUFFLEN 4096
51 
// Parser state for the plain-text import (class SwASCIIParser, per the
// constructor and deleted copy operations declared below).
// NOTE(review): the line that opens this class and several member
// declarations are absent from this listing (the embedded line numbers
// skip); only the members that are visible here are documented.
53 {
    // Working cursor; the constructor creates it from the caller's cursor point.
55  std::unique_ptr<SwPaM> pPam;
    // Raw read buffer of ASC_BUFFLEN + 2 chars, allocated in the constructor.
57  std::unique_ptr<sal_Char[]> pArr;
    // Language/font items built from the import options in the constructor;
    // released at the end of CallParser().
59  std::unique_ptr<SfxItemSet> pItemSet;
    // Initialised to 0 in the constructor; used as the progress end value and
    // in the size / end-of-input checks of ReadChars().
60  long nFileSize;
    // Copy of the constructor's bReadNewDoc argument.
62  bool const bNewDoc;
63 
    // Text insertion helper called from ReadChars().
65  void InsertText( const OUString& rStr );
66 
    // The parser is not copyable.
67  SwASCIIParser(const SwASCIIParser&) = delete;
68  SwASCIIParser& operator=(const SwASCIIParser&) = delete;
69 
70 public:
    // pD: target document, rCursor: cursor whose point is copied into pPam,
    // rIn: input stream, bReadNewDoc: stored in bNewDoc, rOpts: import options.
71  SwASCIIParser( SwDoc* pD, const SwPaM& rCursor, SvStream& rIn,
72  bool bReadNewDoc, const SwAsciiOptions& rOpts );
73 
75 };
76 
77 // Call for the general reader interface
// Creates a SwASCIIParser for rDoc / rPam on m_pStream, runs CallParser()
// and returns its error code. Returns ERR_SWG_READ_ERROR (after a debug
// assertion) when no stream is set. The two OUString parameters are unnamed
// and not used in the visible code.
78 ErrCode AsciiReader::Read( SwDoc &rDoc, const OUString&, SwPaM &rPam, const OUString & )
79 {
80  if( !m_pStream )
81  {
82  OSL_ENSURE( false, "ASCII read without a stream" );
83  return ERR_SWG_READ_ERROR;
84  }
85 
    // NOTE(review): the continuation of this constructor call (its remaining
    // arguments, embedded line 87) is absent from this listing.
86  std::unique_ptr<SwASCIIParser> pParser(new SwASCIIParser( &rDoc, rPam, *m_pStream,
88  ErrCode nRet = pParser->CallParser();
89 
    // Destroy the parser before the options are reset.
90  pParser.reset();
91  // after Read reset the options
    // NOTE(review): the statement this comment refers to (embedded line 92)
    // is absent from this listing.
93  return nRet;
94 }
95 
// SwASCIIParser constructor.
// Stores the document, input stream and options, copies the caller's cursor
// point into pPam, allocates the read buffer and fills pItemSet with the
// language and font defaults taken from the options.
// NOTE(review): the first line of the signature (embedded line 96) is absent
// from this listing.
97  bool bReadNewDoc, const SwAsciiOptions& rOpts)
98  : pDoc(pD), rInput(rIn), rOpt(rOpts), nFileSize(0), nScript(SvtScriptType::NONE)
99  , bNewDoc(bReadNewDoc)
100 {
101  pPam.reset( new SwPaM( *rCursor.GetPoint() ) );
    // Two spare chars beyond ASC_BUFFLEN; ReadChars() writes a terminating
    // sal_Unicode 0 directly behind the bytes it has read.
102  pArr.reset( new sal_Char [ ASC_BUFFLEN + 2 ] );
103 
    // NOTE(review): the remaining arguments of this call (embedded lines
    // 105-107) are absent from this listing.
104  pItemSet = std::make_unique<SfxItemSet>( pDoc->GetAttrPool(),
108 
109  // set defaults from the options
110  if( rOpt.GetLanguage() )
111  {
    // The same language item is put three times, re-tagged with the
    // RES_CHRATR_LANGUAGE / _CJK_LANGUAGE / _CTL_LANGUAGE which-ids.
    // NOTE(review): the declaration of aLang (embedded line 112) is absent
    // from this listing.
113  pItemSet->Put( aLang );
114  aLang.SetWhich(RES_CHRATR_CJK_LANGUAGE);
115  pItemSet->Put( aLang );
116  aLang.SetWhich(RES_CHRATR_CTL_LANGUAGE);
117  pItemSet->Put( aLang );
118  }
119  if( !rOpt.GetFontName().isEmpty() )
120  {
    // Start from a font with the requested name; if the document already has
    // a printer (getPrinter(false) returns non-null), replace it with that
    // printer's font metric for the font.
121  vcl::Font aTextFont( rOpt.GetFontName(), Size( 0, 10 ) );
122  if( pDoc->getIDocumentDeviceAccess().getPrinter( false ) )
123  aTextFont = pDoc->getIDocumentDeviceAccess().getPrinter( false )->GetFontMetric( aTextFont );
124  SvxFontItem aFont( aTextFont.GetFamilyType(), aTextFont.GetFamilyName(),
125  OUString(), aTextFont.GetPitch(), aTextFont.GetCharSet(), RES_CHRATR_FONT );
    // As with the language: one font item, put under the RES_CHRATR_FONT,
    // RES_CHRATR_CJK_FONT and RES_CHRATR_CTL_FONT which-ids.
126  pItemSet->Put( aFont );
127  aFont.SetWhich(RES_CHRATR_CJK_FONT);
128  pItemSet->Put( aFont );
129  aFont.SetWhich(RES_CHRATR_CTL_FONT);
130  pItemSet->Put( aFont );
131  }
132 }
133 
134 // Calling the parser
// Runs the import: resets the stream error state, starts the progress
// display, reads the text via ReadChars() and afterwards applies the
// language/font items collected in pItemSet. Returns the error code
// produced by ReadChars().
// NOTE(review): the signature line and several statements of this function
// are absent from this listing (the embedded line numbers skip); the gaps
// are marked below.
136 {
137  rInput.ResetError();
    // NOTE(review): embedded lines 138-139 are absent from this listing.
140  rInput.ResetError();
141 
142  ::StartProgress( STR_STATSTR_W4WREAD, 0, nFileSize, pDoc->GetDocShell() );
143 
    // When not reading a new document, keep a second PaM on the insert node
    // and remember the content index of the insert position, so the inserted
    // range can be rebuilt after reading.
144  std::unique_ptr<SwPaM> pInsPam;
145  sal_Int32 nSttContent = 0;
146  if (!bNewDoc)
147  {
148  const SwNodeIndex& rTmp = pPam->GetPoint()->nNode;
149  pInsPam.reset(new SwPaM( rTmp, rTmp, 0, -1 ));
150  nSttContent = pPam->GetPoint()->nContent.GetIndex();
151  }
152 
153  SwTextFormatColl *pColl = nullptr;
154 
155  if (bNewDoc)
156  {
    // NOTE(review): the statements that assign pColl (embedded lines 157 and
    // 159) are absent from this listing; only the null checks remain.
158  if (!pColl)
160  if (pColl)
161  pDoc->SetTextFormatColl(*pPam, pColl);
162  }
163 
164  ErrCode nError = ReadChars();
165 
166  if( pItemSet )
167  {
168  // set only the attribute, for scanned scripts.
    // Drop the font/language items of every script type whose bit is not set
    // in nScript.
169  if( !( SvtScriptType::LATIN & nScript ))
170  {
171  pItemSet->ClearItem( RES_CHRATR_FONT );
172  pItemSet->ClearItem( RES_CHRATR_LANGUAGE );
173  }
174  if( !( SvtScriptType::ASIAN & nScript ))
175  {
176  pItemSet->ClearItem( RES_CHRATR_CJK_FONT );
177  pItemSet->ClearItem( RES_CHRATR_CJK_LANGUAGE );
178  }
179  if( !( SvtScriptType::COMPLEX & nScript ))
180  {
181  pItemSet->ClearItem( RES_CHRATR_CTL_FONT );
182  pItemSet->ClearItem( RES_CHRATR_CTL_LANGUAGE );
183  }
184  if( pItemSet->Count() )
185  {
186  if( bNewDoc )
187  {
188  if (pColl)
189  {
190  // Using the pool defaults for the font causes significant
191  // trouble for the HTML filter, because it is not able
192  // to export the pool defaults (or to be more precise:
193  // the HTML filter is not able to detect whether a pool
194  // default has changed or not. Even a comparison with the
195  // HTML template does not work, because the defaults are
196  // not copied when a new doc is created. The result of
197  // comparing pool defaults therefore would be that the
198  // defaults are exported always if they have changed for
199  // text documents in general. That's not sensible, as well
200  // as it is not sensible to export them always.
    // NOTE(review): the initialiser entries of aWhichIds (embedded lines
    // 203-204) are absent from this listing. The loop below stops at the
    // first 0 entry, so the list is expected to be zero-terminated.
201  sal_uInt16 aWhichIds[4] =
202  {
205  };
    // Move every listed item that is set in pItemSet over to the paragraph
    // style pColl and remove it from the item set.
206  sal_uInt16 *pWhichIds = aWhichIds;
207  while (*pWhichIds)
208  {
209  const SfxPoolItem *pItem;
210  if (SfxItemState::SET == pItemSet->GetItemState(*pWhichIds,
211  false, &pItem))
212  {
213  pColl->SetFormatAttr( *pItem );
214  pItemSet->ClearItem( *pWhichIds );
215  }
216  ++pWhichIds;
217  }
218  }
    // NOTE(review): the statement controlled by this if (embedded line 220)
    // is absent from this listing.
219  if (pItemSet->Count())
221  }
222  else if( pInsPam )
223  {
224  // then set over the insert range the defined attributes
    // Mark = current read position; Point = node after the remembered insert
    // node, at the remembered content index.
225  *pInsPam->GetMark() = *pPam->GetPoint();
226  ++pInsPam->GetPoint()->nNode;
227  pInsPam->GetPoint()->nContent.Assign(
228  pInsPam->GetContentNode(), nSttContent );
229 
230  // !!!!!
    // This assertion fires unconditionally whenever this branch is reached.
    // NOTE(review): the statement following it (embedded line 232) is absent
    // from this listing.
231  OSL_ENSURE( false, "Have to change - hard attr. to para. style" );
233  }
234  }
235  pItemSet.reset();
236  }
237 
238  pInsPam.reset();
239 
    // NOTE(review): embedded line 240 is absent from this listing.
241  return nError;
242 }
243 
// SwASCIIParser::ReadChars: reads the input stream in blocks of up to
// ASC_BUFFLEN bytes, converts each block to sal_Unicode, and walks it
// character by character, handling line ends, form feeds and control
// characters. Text runs are handed to InsertText(); paragraphs are created
// with SplitNode(). Returns ERRCODE_NONE, or a read error code if no text
// converter can be created.
// NOTE(review): the signature line (embedded line 244) and a few statements
// are absent from this listing; the gaps are marked below.
245 {
    // pStt: current character, pEnd: end of the current block,
    // pLastStt: start of the text run not yet passed to InsertText().
246  sal_Unicode *pStt = nullptr, *pEnd = nullptr, *pLastStt = nullptr;
247  long nReadCnt = 0, nLineLen = 0;
    // Set to 0x0d when a CR was the last character of a block in CRLF mode.
248  sal_Unicode cLastCR = 0;
249  bool bSwapUnicode = false;
250 
    // If the caller's options equal default-constructed options (font name,
    // charset, language, paragraph flags) and nFileSize is at least 2, detect
    // charset, byte order and line end from the first block and use the
    // local aEmpty options instead of rOpt.
251  const SwAsciiOptions *pUseMe=&rOpt;
252  SwAsciiOptions aEmpty;
253  if (nFileSize >= 2 &&
254  aEmpty.GetFontName() == rOpt.GetFontName() &&
255  aEmpty.GetCharSet() == rOpt.GetCharSet() &&
256  aEmpty.GetLanguage() == rOpt.GetLanguage() &&
257  aEmpty.GetParaFlags() == rOpt.GetParaFlags())
258  {
259  sal_uLong nLen, nOrig;
260  nOrig = nLen = rInput.ReadBytes(pArr.get(), ASC_BUFFLEN);
261  rtl_TextEncoding eCharSet;
262  LineEnd eLineEnd;
    // nLen is passed by reference and may be changed by the detection;
    // nOrig keeps the number of bytes actually read.
263  bool bRet = SwIoSystem::IsDetectableText(pArr.get(), nLen, &eCharSet, &bSwapUnicode, &eLineEnd);
264  OSL_ENSURE(bRet, "Autodetect of text import without nag dialog must have failed");
265  if (bRet && eCharSet != RTL_TEXTENCODING_DONTKNOW)
266  {
267  aEmpty.SetCharSet(eCharSet);
268  aEmpty.SetParaFlags(eLineEnd);
    // Rewind by the (possibly adjusted) nLen - presumably so that leading
    // signature bytes stay skipped; TODO confirm against IsDetectableText.
269  rInput.SeekRel(-(long(nLen)));
270  }
271  else
272  rInput.SeekRel(-(long(nOrig)));
273  pUseMe=&aEmpty;
274  }
275 
    // Every charset other than UCS2 goes through an rtl text converter;
    // an unknown charset is treated as US-ASCII.
276  rtl_TextToUnicodeConverter hConverter=nullptr;
277  rtl_TextToUnicodeContext hContext=nullptr;
278  rtl_TextEncoding currentCharSet = pUseMe->GetCharSet();
279  if (RTL_TEXTENCODING_UCS2 != currentCharSet)
280  {
281  if( currentCharSet == RTL_TEXTENCODING_DONTKNOW )
282  currentCharSet = RTL_TEXTENCODING_ASCII_US;
283  hConverter = rtl_createTextToUnicodeConverter( currentCharSet );
284  OSL_ENSURE( hConverter, "no string convert available" );
285  if (!hConverter)
286  return ErrCode(ErrCodeArea::Sw, ErrCodeClass::Read, 0);
    // Byte swapping only applies to the raw UCS2 path.
287  bSwapUnicode = false;
288  hContext = rtl_createTextToUnicodeContext( hConverter );
289  }
    // UCS2 requested through the caller's options (no auto-detection ran):
    // let the stream set up Unicode reading and report the byte order.
290  else if (pUseMe != &aEmpty) //Already successfully figured out type
291  {
292  rInput.StartReadingUnicodeText( currentCharSet );
293  bSwapUnicode = rInput.IsEndianSwap();
294  }
295 
    // aWork: conversion output buffer. nArrOffset: number of unconverted
    // bytes carried over at the start of pArr from the previous block.
296  std::unique_ptr<sal_Unicode[]> aWork;
297  sal_uLong nArrOffset = 0;
298 
299  do {
    // Current block fully consumed (also true on the first pass, when both
    // pointers are null): flush pending text and fetch the next block.
300  if( pStt >= pEnd )
301  {
302  if( pLastStt != pStt )
303  InsertText( OUString( pLastStt ));
304 
305  // Read a new block
    // This break is the only exit from the loop: stream error or no more bytes.
306  sal_uLong lGCount;
307  if( ERRCODE_NONE != rInput.GetError() || 0 == (lGCount =
308  rInput.ReadBytes( pArr.get() + nArrOffset,
309  ASC_BUFFLEN - nArrOffset )))
310  break; // break from the while loop
311 
312  /*
313  If there were some unconverted bytes on the last cycle then they
314  were put at the beginning of the array, so total bytes available
315  to convert this cycle includes them. If we found 0 following bytes
316  then we ignore the previous partial character.
317  */
318  lGCount+=nArrOffset;
319 
320  if( hConverter )
321  {
322  sal_uInt32 nInfo;
323  sal_Size nNewLen = lGCount, nCntBytes;
324  aWork.reset(new sal_Unicode[nNewLen + 1]); // add 1 for '\0'
325  sal_Unicode* pBuf = aWork.get();
326  pBuf[nNewLen] = 0; // ensure '\0'
327 
328  nNewLen = rtl_convertTextToUnicode( hConverter, hContext,
329  pArr.get(), lGCount, pBuf, nNewLen,
330  (
331  RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT |
332  RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
333  RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT |
334  RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE
335  ),
336  &nInfo,
337  &nCntBytes );
    // Bytes the converter did not consume are moved to the front of pArr so
    // that the next read appends behind them.
338  if( 0 != ( nArrOffset = lGCount - nCntBytes ) )
339  memmove( pArr.get(), pArr.get() + nCntBytes, nArrOffset );
340 
341  pStt = pLastStt = aWork.get();
342  pEnd = pStt + nNewLen;
343  }
344  else
345  {
    // UCS2 input: the byte buffer itself is used as the sal_Unicode buffer.
346  pStt = pLastStt = reinterpret_cast<sal_Unicode*>(pArr.get());
347  pEnd = reinterpret_cast<sal_Unicode*>(pArr.get() + lGCount);
348 
349  if( bSwapUnicode )
350  {
    // Exchange the two bytes of every 16-bit unit in place.
351  sal_Char* pF = pArr.get(), *pN = pArr.get() + 1;
352  for( sal_uLong n = 0; n < lGCount; n += 2, pF += 2, pN += 2 )
353  {
354  sal_Char c = *pF;
355  *pF = *pN;
356  *pN = c;
357  }
358  }
359  }
360 
    // Terminate the block so that OUString( pLastStt ) stops at its end.
361  *pEnd = 0;
362  nReadCnt += lGCount;
363 
364  ::SetProgressState( nReadCnt, pDoc->GetDocShell() );
365 
    // The previous block ended on a CR (CRLF mode): swallow an LF that opens
    // this block, then split the node unless this is the end of the input.
366  if( cLastCR )
367  {
368  if( 0x0a == *pStt && 0x0d == cLastCR )
369  pLastStt = ++pStt;
370  cLastCR = 0;
371  nLineLen = 0;
372  // We skip the last one at the end
373  if( !rInput.eof() || !(pEnd == pStt ||
374  ( !*pEnd && pEnd == pStt+1 ) ) )
375  pDoc->getIDocumentContentOperations().SplitNode( *pPam->GetPoint(), false );
376  }
377  }
378 
    // bIns: the character stays part of the text run.
    // bSplitNode: a paragraph end was found, text must be flushed and the
    // node split.
379  bool bIns = true, bSplitNode = false;
380  switch( *pStt )
381  {
382 
    // LF ends a paragraph only when the line-end mode is LINEEND_LF;
    // otherwise it falls out of the switch unchanged with bIns still true.
383  case 0x0a: if( LINEEND_LF == pUseMe->GetParaFlags() )
384  {
385  bIns = false;
386  *pStt = 0;
387  ++pStt;
388 
389  // We skip the last one at the end
390  if( !rInput.eof() || pEnd != pStt )
391  bSplitNode = true;
392  }
393  break;
394 
    // CR is handled in every mode except LINEEND_LF. In LINEEND_CRLF mode a
    // split is only checked when an LF follows directly; a CR at the very end
    // of the block is remembered in cLastCR and resolved with the next block.
395  case 0x0d: if( LINEEND_LF != pUseMe->GetParaFlags() )
396  {
397  bIns = false;
398  *pStt = 0;
399  ++pStt;
400 
401  bool bChkSplit = false;
402  if( LINEEND_CRLF == pUseMe->GetParaFlags() )
403  {
404  if( pStt == pEnd )
405  cLastCR = 0x0d;
406  else if( 0x0a == *pStt )
407  {
408  ++pStt;
409  bChkSplit = true;
410  }
411  }
412  else
413  bChkSplit = true;
414 
415  // We skip the last one at the end
416  if( bChkSplit && ( !rInput.eof() || pEnd != pStt ))
417  bSplitNode = true;
418  }
419  break;
420 
421  case 0x0c:
422  {
423  // Insert a hard page break
424  *pStt++ = 0;
425  if( nLineLen )
426  {
427  InsertText( OUString( pLastStt ));
428  }
429  pDoc->getIDocumentContentOperations().SplitNode( *pPam->GetPoint(), false );
    // NOTE(review): the beginning of the following call (embedded line 430)
    // is absent from this listing; only its argument list remains.
431  *pPam, SvxFormatBreakItem( SvxBreak::PageBefore, RES_BREAK ) );
432  pLastStt = pStt;
433  nLineLen = 0;
434  bIns = false;
435  }
436  break;
437 
    // 0x1a is dropped when it is the very last character of the input
    // (all nFileSize bytes read and last character of the block);
    // anywhere else it is shown as '#'.
438  case 0x1a:
439  if( nReadCnt == nFileSize && pStt+1 == pEnd )
440  *pStt = 0;
441  else
442  *pStt = '#'; // Replacement visualisation
443  break;
444 
    // Tabs are kept as they are.
445  case '\t': break;
446 
447  default:
448  if( ' ' > *pStt )
449  // Found control char, replace with '#'
450  *pStt = '#';
451  break;
452  }
453 
454  if( bIns )
455  {
    // Overlong lines: once the line is within 100 characters of
    // MAX_ASCII_PARA, break at the next space; break unconditionally at
    // MAX_ASCII_PARA - 1. The current character is temporarily replaced by 0
    // to terminate the run, then restored as the first character of the new
    // run.
456  if( ( nLineLen >= MAX_ASCII_PARA - 100 ) &&
457  ( ( *pStt == ' ' ) || ( nLineLen >= MAX_ASCII_PARA - 1 ) ) )
458  {
459  sal_Unicode c = *pStt;
460  *pStt = 0;
461  InsertText( OUString( pLastStt ));
462  pDoc->getIDocumentContentOperations().SplitNode( *pPam->GetPoint(), false );
463  pLastStt = pStt;
464  nLineLen = 0;
465  *pStt = c;
466  }
467  ++pStt;
468  ++nLineLen;
469  }
470  else if( bSplitNode )
471  {
472  // We found a CR/LF, thus save the text
473  InsertText( OUString( pLastStt ));
    // NOTE(review): the statement of the bNewDoc branch (embedded line 475)
    // is absent from this listing.
474  if(bNewDoc)
476  else
477  pDoc->getIDocumentContentOperations().SplitNode( *pPam->GetPoint(), false );
478  pLastStt = pStt;
479  nLineLen = 0;
480  }
481  } while(true);
482 
    // Release the converter and its context, if they were created.
483  if( hConverter )
484  {
485  rtl_destroyTextToUnicodeContext( hConverter, hContext );
486  rtl_destroyTextToUnicodeConverter( hConverter );
487  }
488  return ERRCODE_NONE;
489 }
490 
// Text insertion helper called from ReadChars() with each finished text run.
// NOTE(review): most of the body is absent from this listing (embedded lines
// 493 and 498 are missing); only the guard below is visible.
491 void SwASCIIParser::InsertText( const OUString& rStr )
492 {
494 
    // Visible guard: true only while an item set exists, g_pBreakIt is set and
    // nScript does not yet contain all of LATIN, ASIAN and COMPLEX. The
    // statement it controls is missing; presumably it adds the script types
    // found in rStr to nScript - TODO confirm against the full source.
495  if( pItemSet && g_pBreakIt && nScript != ( SvtScriptType::LATIN |
496  SvtScriptType::ASIAN |
497  SvtScriptType::COMPLEX ) )
499 }
500 
501 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
rtl_TextEncoding GetCharSet() const
Definition: shellio.hxx:76
Represents the style of a paragraph.
Definition: fmtcol.hxx:55
void ResetASCIIOpts()
Definition: shellio.hxx:135
#define RES_CHRATR_CJK_LANGUAGE
Definition: hintids.hxx:92
#define RES_CHRATR_LANGUAGE
Definition: hintids.hxx:78
SwDocShell * GetDocShell()
Definition: doc.hxx:1340
IDocumentDeviceAccess const & getIDocumentDeviceAccess() const
Definition: doc.cxx:270
const OUString & GetFontName() const
Definition: shellio.hxx:73
long nFileSize
Definition: parasc.cxx:60
virtual void InsertItemSet(const SwPaM &rRg, const SfxItemSet &, const SetAttrMode nFlags=SetAttrMode::DEFAULT, SwRootFrame const *pLayout=nullptr)=0
virtual sal_uInt64 TellEnd()
virtual bool InsertPoolItem(const SwPaM &rRg, const SfxPoolItem &, const SetAttrMode nFlags=SetAttrMode::DEFAULT, SwRootFrame const *pLayout=nullptr, bool bExpandCharToPara=false)=0
Insert an attribute.
sal_uIntPtr sal_uLong
SwDoc * pDoc
Definition: parasc.cxx:54
Definition: doc.hxx:185
LanguageType GetLanguage() const
Definition: shellio.hxx:79
SvtScriptType GetAllScriptsOfText(const OUString &rText) const
Definition: breakit.cxx:128
sal_uInt64 Seek(sal_uInt64 nPos)
#define RES_CHRATR_FONT
Definition: hintids.hxx:75
void EndProgress(SwDocShell const *pDocShell)
Definition: mainwn.cxx:88
#define RES_CHRATR_CJK_FONT
Definition: hintids.hxx:90
void SetParaFlags(LineEnd eVal)
Definition: shellio.hxx:83
virtual bool SetFormatAttr(const SfxPoolItem &rAttr) override
Override to recognize changes on the <SwNumRuleItem> and register/unregister the paragraph style at t...
Definition: fmtcol.cxx:330
IDocumentContentOperations const & getIDocumentContentOperations() const
Definition: doc.cxx:347
sal_uInt64 SeekRel(sal_Int64 nPos)
const SwAsciiOptions & rOpt
Definition: parasc.cxx:58
void InsertText(const OUString &rStr)
Definition: parasc.cxx:491
sal_uInt16 sal_Unicode
ErrCode GetError() const
void StartProgress(const char *pMessResId, long nStartValue, long nEndValue, SwDocShell *pDocShell)
Definition: mainwn.cxx:48
bool eof() const
ErrCode CallParser()
Definition: parasc.cxx:135
void StartReadingUnicodeText(rtl_TextEncoding eReadBomCharSet)
bool SetTextFormatColl(const SwPaM &rRg, SwTextFormatColl *pFormat, const bool bReset=true, const bool bResetListAttrs=false, SwRootFrame const *pLayout=nullptr)
Add 4th optional parameter <bResetListAttrs>.
Definition: docfmt.cxx:1096
static bool IsDetectableText(const sal_Char *pBuf, sal_uLong &rLen, rtl_TextEncoding *pCharSet, bool *pSwap, LineEnd *pLineEnd)
Definition: iodetect.cxx:241
IDocumentStylePoolAccess const & getIDocumentStylePoolAccess() const
Definition: doc.cxx:458
char sal_Char
SwBreakIt * g_pBreakIt
Definition: breakit.cxx:34
const SwAsciiOptions & GetASCIIOpts() const
Definition: shellio.hxx:133
std::unique_ptr< SwPaM > pPam
Definition: parasc.cxx:55
SwASCIIParser & operator=(const SwASCIIParser &)=delete
void SetProgressState(long nPosition, SwDocShell const *pDocShell)
Definition: mainwn.cxx:78
std::unique_ptr< sal_Char[]> pArr
Definition: parasc.cxx:57
void SetCharSet(rtl_TextEncoding nVal)
Definition: shellio.hxx:77
PaM is Point and Mark: a selection of the document model.
Definition: pam.hxx:136
virtual bool InsertString(const SwPaM &rRg, const OUString &, const SwInsertFlags nInsertMode=SwInsertFlags::EMPTYEXPAND)=0
Insert string into existing text node at position rRg.Point().
SvtScriptType
LINEEND_LF
SwASCIIParser(const SwASCIIParser &)=delete
#define RES_CHRATR_CTL_FONT
Definition: hintids.hxx:95
const SwPosition * GetPoint() const
Definition: pam.hxx:207
#define STREAM_SEEK_TO_BEGIN
virtual ErrCode Read(SwDoc &, const OUString &rBaseURL, SwPaM &, const OUString &) override
Definition: parasc.cxx:78
SvStream & rInput
Definition: parasc.cxx:56
Marks a node in the document model.
Definition: ndindex.hxx:31
bool IsEndianSwap() const
virtual SfxPrinter * getPrinter(bool bCreate) const =0
Return the printer set at the document.
#define ASC_BUFFLEN
Definition: parasc.cxx:50
LineEnd
std::size_t ReadBytes(void *pData, std::size_t nSize)
SvtScriptType nScript
Definition: parasc.cxx:61
void SetDefault(const SfxPoolItem &)
Set attribute as new default attribute in current document.
Definition: docfmt.cxx:554
#define MAX_ASCII_PARA
Definition: shellio.hxx:61
SwgReaderOption m_aOption
Definition: shellio.hxx:215
#define ERRCODE_NONE
#define RES_CHRATR_CTL_LANGUAGE
Definition: hintids.hxx:97
bool const bNewDoc
Definition: parasc.cxx:62
virtual SwTextFormatColl * GetTextCollFromPool(sal_uInt16 nId, bool bRegardLanguage=true)=0
Return "Auto-Collection" with ID.
ErrCode ReadChars()
Definition: parasc.cxx:244
SvStream * m_pStream
Definition: shellio.hxx:210
virtual bool AppendTextNode(SwPosition &rPos)=0
LINEEND_CRLF
virtual bool SplitNode(const SwPosition &rPos, bool bChkTableStart)=0
Split a node at rPos (implemented only for TextNode).
#define ERR_SWG_READ_ERROR
Definition: swerror.h:25
std::unique_ptr< SfxItemSet > pItemSet
Definition: parasc.cxx:59
LineEnd GetParaFlags() const
Definition: shellio.hxx:82
virtual void ResetError()
#define RES_BREAK
Definition: hintids.hxx:199
const SwAttrPool & GetAttrPool() const
Definition: doc.hxx:1307
bool m_bInsertMode
Definition: shellio.hxx:216