LibreOffice Module sw (master)  1
parasc.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include <memory>
21 
22 #include <tools/stream.hxx>
23 #include <hintids.hxx>
24 #include <rtl/tencinfo.h>
25 #include <sfx2/printer.hxx>
26 #include <editeng/fontitem.hxx>
27 #include <editeng/langitem.hxx>
30 #include <shellio.hxx>
31 #include <doc.hxx>
34 #include <swtypes.hxx>
35 #include <ndtxt.hxx>
36 #include <pam.hxx>
37 #include <frmatr.hxx>
38 #include <fltini.hxx>
39 #include <pagedesc.hxx>
40 #include <breakit.hxx>
41 #include <swerror.h>
42 #include <strings.hrc>
43 #include <mdiexp.hxx>
44 #include <poolfmt.hxx>
45 #include <iodetect.hxx>
46 
47 #include <vcl/metric.hxx>
48 #include <osl/diagnose.h>
49 
50 #define ASC_BUFFLEN 4096
51 
52 namespace {
53 
// File-local helper that reads plain text from an SvStream into a SwDoc at a
// given cursor position, using the charset / line-end / font / language
// settings carried by a SwAsciiOptions object.
54 class SwASCIIParser
55 {
 // Target document that receives the text.
56  SwDoc* pDoc;
 // Own cursor, created in the constructor from the point of the caller's cursor.
57  std::unique_ptr<SwPaM> pPam;
 // Stream the text is read from.
58  SvStream& rInput;
 // Raw byte buffer (ASC_BUFFLEN + 2 chars) that ReadChars fills from rInput.
59  std::unique_ptr<sal_Char[]> pArr;
 // Import options supplied by the caller.
60  const SwAsciiOptions& rOpt;
 // Language/font items built from rOpt in the constructor; applied and
 // released in CallParser.
61  std::unique_ptr<SfxItemSet> pItemSet;
 // Size of the input stream; set from rInput.TellEnd() in CallParser.
62  long nFileSize;
 // Union of the script types found in the text inserted so far (see InsertText).
63  SvtScriptType nScript;
 // Selects the "new document" handling in CallParser/ReadChars (paragraph
 // style and document defaults) versus insertion into existing content.
64  bool const bNewDoc;
65 
 // Reads and inserts the stream content block by block.
66  ErrCode ReadChars();
 // Inserts one text run at pPam and records its script types.
67  void InsertText( const OUString& rStr );
68 
 // Non-copyable.
69  SwASCIIParser(const SwASCIIParser&) = delete;
70  SwASCIIParser& operator=(const SwASCIIParser&) = delete;
71 
72 public:
73  SwASCIIParser( SwDoc* pD, const SwPaM& rCursor, SvStream& rIn,
74  bool bReadNewDoc, const SwAsciiOptions& rOpts );
75 
 // Entry point: runs the whole import and returns the result of ReadChars.
76  ErrCode CallParser();
77 };
78 
79 }
80 
81 // Call for the general reader interface
// Creates a SwASCIIParser for the reader's stream and runs it.
// Returns ERR_SWG_READ_ERROR when no stream is set, otherwise the ErrCode
// returned by SwASCIIParser::CallParser(). The two OUString parameters are
// unnamed and unused here.
// NOTE(review): this listing jumps from line 90 to 92 and from 94 to 97, so
// the tail of the parser construction and the statement announced by the
// "reset the options" comment are not visible here - check against the
// upstream file before editing.
82 ErrCode AsciiReader::Read( SwDoc &rDoc, const OUString&, SwPaM &rPam, const OUString & )
83 {
 // Reading without a stream is a caller error: assert and bail out.
84  if( !m_pStream )
85  {
86  OSL_ENSURE( false, "ASCII read without a stream" );
87  return ERR_SWG_READ_ERROR;
88  }
89 
90  std::unique_ptr<SwASCIIParser> pParser(new SwASCIIParser( &rDoc, rPam, *m_pStream,
92  ErrCode nRet = pParser->CallParser();
93 
 // Destroy the parser before the options are touched again.
94  pParser.reset();
95  // after Read reset the options
97  return nRet;
98 }
99 
// Sets up the parser: copies the cursor position, allocates the read buffer
// and pre-builds the language/font item set from the import options.
//   pD          - document that receives the text
//   rCursor     - cursor whose point marks the insert position
//   rIn         - stream to read from
//   bReadNewDoc - stored in bNewDoc
//   rOpts       - import options (language, font name, ...)
100 SwASCIIParser::SwASCIIParser(SwDoc* pD, const SwPaM& rCursor, SvStream& rIn,
101  bool bReadNewDoc, const SwAsciiOptions& rOpts)
102  : pDoc(pD), rInput(rIn), rOpt(rOpts), nFileSize(0), nScript(SvtScriptType::NONE)
103  , bNewDoc(bReadNewDoc)
104 {
 // Work on an own PaM positioned at the caller's point.
105  pPam.reset( new SwPaM( *rCursor.GetPoint() ) );
 // Two spare bytes beyond ASC_BUFFLEN: ReadChars writes a 16-bit terminator
 // directly behind the bytes it read when the buffer is used as UTF-16.
106  pArr.reset( new sal_Char [ ASC_BUFFLEN + 2 ] );
107 
 // NOTE(review): this listing jumps from line 108 to 112, so the remaining
 // arguments of this call are not visible here.
108  pItemSet = std::make_unique<SfxItemSet>( pDoc->GetAttrPool(),
112 
113  // set defaults from the options
114  if( rOpt.GetLanguage() )
115  {
 // The same language is stored for the Western, CJK and CTL which-ids.
116  SvxLanguageItem aLang( rOpt.GetLanguage(), RES_CHRATR_LANGUAGE );
117  pItemSet->Put( aLang );
118  aLang.SetWhich(RES_CHRATR_CJK_LANGUAGE);
119  pItemSet->Put( aLang );
120  aLang.SetWhich(RES_CHRATR_CTL_LANGUAGE);
121  pItemSet->Put( aLang );
122  }
123  if( !rOpt.GetFontName().isEmpty() )
124  {
125  vcl::Font aTextFont( rOpt.GetFontName(), Size( 0, 10 ) );
 // If the document already has a printer (getPrinter( false )), take the
 // font data from the printer's font metric for this font.
126  if( pDoc->getIDocumentDeviceAccess().getPrinter( false ) )
127  aTextFont = pDoc->getIDocumentDeviceAccess().getPrinter( false )->GetFontMetric( aTextFont );
128  SvxFontItem aFont( aTextFont.GetFamilyType(), aTextFont.GetFamilyName(),
129  OUString(), aTextFont.GetPitch(), aTextFont.GetCharSet(), RES_CHRATR_FONT );
 // The same font is stored for the Western, CJK and CTL which-ids.
130  pItemSet->Put( aFont );
131  aFont.SetWhich(RES_CHRATR_CJK_FONT);
132  pItemSet->Put( aFont );
133  aFont.SetWhich(RES_CHRATR_CTL_FONT);
134  pItemSet->Put( aFont );
135  }
136 }
137 
138 // Calling the parser
// Runs the complete import: determines the stream size, shows progress,
// assigns a paragraph style when reading a new document, reads the text via
// ReadChars() and finally applies the font/language items, restricted to the
// script types that were actually found in the text.
// Returns the ErrCode produced by ReadChars().
139 ErrCode SwASCIIParser::CallParser()
140 {
 // Determine the size, then rewind; errors are cleared before and after.
141  rInput.ResetError();
142  nFileSize = rInput.TellEnd();
143  rInput.Seek(STREAM_SEEK_TO_BEGIN);
144  rInput.ResetError();
145 
146  ::StartProgress( STR_STATSTR_W4WREAD, 0, nFileSize, pDoc->GetDocShell() );
147 
 // When inserting into existing content, remember where the insertion starts
 // (node and content index) so the attributes can be set over that range later.
148  std::unique_ptr<SwPaM> pInsPam;
149  sal_Int32 nSttContent = 0;
150  if (!bNewDoc)
151  {
152  const SwNodeIndex& rTmp = pPam->GetPoint()->nNode;
 // NOTE(review): the 0 / -1 arguments presumably place the point one node
 // before the cursor node (it is advanced by ++nNode further down) - confirm
 // against the SwPaM constructor.
153  pInsPam.reset(new SwPaM( rTmp, rTmp, 0, -1 ));
154  nSttContent = pPam->GetPoint()->nContent.GetIndex();
155  }
156 
157  SwTextFormatColl *pColl = nullptr;
158 
159  if (bNewDoc)
160  {
 // Prefer the HTML_PRE pool style, fall back to the standard style.
161  pColl = pDoc->getIDocumentStylePoolAccess().GetTextCollFromPool(RES_POOLCOLL_HTML_PRE, false);
162  if (!pColl)
163  pColl = pDoc->getIDocumentStylePoolAccess().GetTextCollFromPool(RES_POOLCOLL_STANDARD,false);
164  if (pColl)
165  pDoc->SetTextFormatColl(*pPam, pColl);
166  }
167 
168  ErrCode nError = ReadChars();
169 
170  if( pItemSet )
171  {
172  // set only the attribute, for scanned scripts.
 // nScript was accumulated by InsertText(); items for scripts that never
 // occurred are dropped from the set.
173  if( !( SvtScriptType::LATIN & nScript ))
174  {
175  pItemSet->ClearItem( RES_CHRATR_FONT );
176  pItemSet->ClearItem( RES_CHRATR_LANGUAGE );
177  }
178  if( !( SvtScriptType::ASIAN & nScript ))
179  {
180  pItemSet->ClearItem( RES_CHRATR_CJK_FONT );
181  pItemSet->ClearItem( RES_CHRATR_CJK_LANGUAGE );
182  }
183  if( !( SvtScriptType::COMPLEX & nScript ))
184  {
185  pItemSet->ClearItem( RES_CHRATR_CTL_FONT );
186  pItemSet->ClearItem( RES_CHRATR_CTL_LANGUAGE );
187  }
188  if( pItemSet->Count() )
189  {
190  if( bNewDoc )
191  {
192  if (pColl)
193  {
194  // Using the pool defaults for the font causes significant
195  // trouble for the HTML filter, because it is not able
196  // to export the pool defaults (or to be more precise:
197  // the HTML filter is not able to detect whether a pool
198  // default has changed or not. Even a comparison with the
199  // HTML template does not work, because the defaults are
200  // not copied when a new doc is created. The result of
201  // comparing pool defaults therefore would be that the
202  // defaults are exported always if they have changed for
203  // text documents in general. That's not sensible, as well
204  // as it is not sensible to export them always.
 // NOTE(review): this listing jumps from line 206 to 209, so the
 // initializer values are not visible here. The loop below relies
 // on a 0 entry terminating the array.
205  sal_uInt16 aWhichIds[4] =
206  {
209  };
 // Move every listed item that is set from the item set into the
 // paragraph style instead of the document defaults.
210  sal_uInt16 *pWhichIds = aWhichIds;
211  while (*pWhichIds)
212  {
213  const SfxPoolItem *pItem;
214  if (SfxItemState::SET == pItemSet->GetItemState(*pWhichIds,
215  false, &pItem))
216  {
217  pColl->SetFormatAttr( *pItem );
218  pItemSet->ClearItem( *pWhichIds );
219  }
220  ++pWhichIds;
221  }
222  }
 // Whatever is left becomes a document default.
223  if (pItemSet->Count())
224  pDoc->SetDefault(*pItemSet);
225  }
226  else if( pInsPam )
227  {
228  // then set over the insert range the defined attributes
 // Mark = current end of the inserted text; point = remembered start.
229  *pInsPam->GetMark() = *pPam->GetPoint();
230  ++pInsPam->GetPoint()->nNode;
231  pInsPam->GetPoint()->nContent.Assign(
232  pInsPam->GetContentNode(), nSttContent );
233 
234  // !!!!!
 // Unconditional assertion: fires whenever this branch runs and marks
 // the hard-attribute approach as a known to-do.
235  OSL_ENSURE( false, "Have to change - hard attr. to para. style" );
236  pDoc->getIDocumentContentOperations().InsertItemSet( *pInsPam, *pItemSet );
237  }
238  }
 // The item set is needed only once; release it.
239  pItemSet.reset();
240  }
241 
242  pInsPam.reset();
243 
244  ::EndProgress( pDoc->GetDocShell() );
245  return nError;
246 }
247 
// Reads rInput in blocks of up to ASC_BUFFLEN bytes, turns each block into
// UTF-16 (via an rtl text converter, or by using the bytes directly for
// UCS2) and inserts the text into the document character run by character
// run. Line ends matching the configured paragraph flags start a new
// paragraph, a form feed (0x0c) inserts a page break, tabs are kept, and
// other characters below ' ' are replaced by '#'.
// Returns a Sw read error when no text converter can be created, otherwise
// ERRCODE_NONE - a stream error merely ends the read loop and is not
// reported through the return value.
248 ErrCode SwASCIIParser::ReadChars()
249 {
 // pStt: current scan position, pEnd: end of the converted block,
 // pLastStt: start of the text run that has not been inserted yet.
250  sal_Unicode *pStt = nullptr, *pEnd = nullptr, *pLastStt = nullptr;
251  long nReadCnt = 0, nLineLen = 0;
 // Set to 0x0d when a CR was the very last character of a block in CRLF mode.
252  sal_Unicode cLastCR = 0;
253  bool bSwapUnicode = false;
254 
 // If the caller's options are all still at their default values, sniff the
 // first block and use the detected charset / line end instead.
255  const SwAsciiOptions *pUseMe=&rOpt;
256  SwAsciiOptions aEmpty;
257  if (nFileSize >= 2 &&
258  aEmpty.GetFontName() == rOpt.GetFontName() &&
259  aEmpty.GetCharSet() == rOpt.GetCharSet() &&
260  aEmpty.GetLanguage() == rOpt.GetLanguage() &&
261  aEmpty.GetParaFlags() == rOpt.GetParaFlags())
262  {
263  sal_uLong nLen, nOrig;
264  nOrig = nLen = rInput.ReadBytes(pArr.get(), ASC_BUFFLEN);
265  rtl_TextEncoding eCharSet;
266  LineEnd eLineEnd;
 // nLen is passed by reference and may be changed by the detection.
267  bool bRet = SwIoSystem::IsDetectableText(pArr.get(), nLen, &eCharSet, &bSwapUnicode, &eLineEnd);
268  OSL_ENSURE(bRet, "Autodetect of text import without nag dialog must have failed");
269  if (bRet && eCharSet != RTL_TEXTENCODING_DONTKNOW)
270  {
271  aEmpty.SetCharSet(eCharSet);
272  aEmpty.SetParaFlags(eLineEnd);
 // Rewind by the (possibly adjusted) nLen, not by the bytes read.
 // NOTE(review): presumably this leaves the stream behind a detected
 // signature/BOM - confirm in IsDetectableText.
273  rInput.SeekRel(-(long(nLen)));
274  }
275  else
276  rInput.SeekRel(-(long(nOrig)));
277  pUseMe=&aEmpty;
278  }
279 
 // Everything except UCS2 goes through an rtl text-to-unicode converter.
280  rtl_TextToUnicodeConverter hConverter=nullptr;
281  rtl_TextToUnicodeContext hContext=nullptr;
282  rtl_TextEncoding currentCharSet = pUseMe->GetCharSet();
283  if (RTL_TEXTENCODING_UCS2 != currentCharSet)
284  {
285  if( currentCharSet == RTL_TEXTENCODING_DONTKNOW )
286  currentCharSet = RTL_TEXTENCODING_ASCII_US;
287  hConverter = rtl_createTextToUnicodeConverter( currentCharSet );
288  OSL_ENSURE( hConverter, "no string convert available" );
289  if (!hConverter)
290  return ErrCode(ErrCodeArea::Sw, ErrCodeClass::Read, 0);
291  bSwapUnicode = false;
292  hContext = rtl_createTextToUnicodeContext( hConverter );
293  }
294  else if (pUseMe != &aEmpty) //Already successfully figured out type
295  {
 // UCS2 was requested explicitly: let the stream determine the byte order.
296  rInput.StartReadingUnicodeText( currentCharSet );
297  bSwapUnicode = rInput.IsEndianSwap();
298  }
299 
 // aWork: converted UTF-16 block; nArrOffset: number of unconverted bytes
 // carried over at the start of pArr from the previous block.
300  std::unique_ptr<sal_Unicode[]> aWork;
301  sal_uLong nArrOffset = 0;
302 
303  do {
 // Current block exhausted (or nothing read yet): flush and refill.
304  if( pStt >= pEnd )
305  {
 // Insert the pending run; it is NUL-terminated by the *pEnd = 0 below.
306  if( pLastStt != pStt )
307  InsertText( OUString( pLastStt ));
308 
309  // Read a new block
310  sal_uLong lGCount;
311  if( ERRCODE_NONE != rInput.GetError() || 0 == (lGCount =
312  rInput.ReadBytes( pArr.get() + nArrOffset,
313  ASC_BUFFLEN - nArrOffset )))
314  break; // break from the while loop
315 
316  /*
317  If there was some unconverted bytes on the last cycle then they
318  were put at the beginning of the array, so total bytes available
319  to convert this cycle includes them. If we found 0 following bytes
320  then we ignore the previous partial character.
321  */
322  lGCount+=nArrOffset;
323 
324  if( hConverter )
325  {
326  sal_uInt32 nInfo;
327  sal_Size nNewLen = lGCount, nCntBytes;
328  aWork.reset(new sal_Unicode[nNewLen + 1]); // add 1 for '\0'
329  sal_Unicode* pBuf = aWork.get();
330  pBuf[nNewLen] = 0; // ensure '\0'
331 
332  nNewLen = rtl_convertTextToUnicode( hConverter, hContext,
333  pArr.get(), lGCount, pBuf, nNewLen,
334  (
335  RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT |
336  RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
337  RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT |
338  RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE
339  ),
340  &nInfo,
341  &nCntBytes );
 // Keep the bytes the converter did not consume for the next cycle.
342  if( 0 != ( nArrOffset = lGCount - nCntBytes ) )
343  memmove( pArr.get(), pArr.get() + nCntBytes, nArrOffset );
344 
345  pStt = pLastStt = aWork.get();
346  pEnd = pStt + nNewLen;
347  }
348  else
349  {
 // UCS2: the byte buffer itself is scanned as UTF-16 code units.
350  pStt = pLastStt = reinterpret_cast<sal_Unicode*>(pArr.get());
351  pEnd = reinterpret_cast<sal_Unicode*>(pArr.get() + lGCount);
352 
353  if( bSwapUnicode )
354  {
 // Swap the two bytes of every code unit in place.
355  sal_Char* pF = pArr.get(), *pN = pArr.get() + 1;
356  for( sal_uLong n = 0; n < lGCount; n += 2, pF += 2, pN += 2 )
357  {
358  sal_Char c = *pF;
359  *pF = *pN;
360  *pN = c;
361  }
362  }
363  }
364 
 // Terminate the block so that runs can be inserted as C strings.
365  *pEnd = 0;
 // NOTE(review): lGCount already includes the bytes carried over from the
 // previous cycle, which were added to nReadCnt back then - looks like they
 // are counted twice, which would also affect the nReadCnt == nFileSize
 // test in the 0x1a case below. Confirm.
366  nReadCnt += lGCount;
367 
368  ::SetProgressState( nReadCnt, pDoc->GetDocShell() );
369 
 // The previous block ended with a CR in CRLF mode: swallow a directly
 // following LF and perform the postponed paragraph split.
370  if( cLastCR )
371  {
372  if( 0x0a == *pStt && 0x0d == cLastCR )
373  pLastStt = ++pStt;
374  cLastCR = 0;
375  nLineLen = 0;
376  // We skip the last one at the end
377  if( !rInput.eof() || !(pEnd == pStt ||
378  ( !*pEnd && pEnd == pStt+1 ) ) )
379  pDoc->getIDocumentContentOperations().SplitNode( *pPam->GetPoint(), false );
380  }
381  }
382 
 // bIns: the character stays part of the current text run.
 // bSplitNode: a paragraph end was recognised at this position.
383  bool bIns = true, bSplitNode = false;
384  switch( *pStt )
385  {
386 
 // LF ends a paragraph only in LF mode; the character is overwritten
 // with 0 to terminate the pending run.
387  case 0x0a: if( LINEEND_LF == pUseMe->GetParaFlags() )
388  {
389  bIns = false;
390  *pStt = 0;
391  ++pStt;
392 
393  // We skip the last one at the end
394  if( !rInput.eof() || pEnd != pStt )
395  bSplitNode = true;
396  }
397  break;
398 
 // CR ends a paragraph in every mode except LF.
399  case 0x0d: if( LINEEND_LF != pUseMe->GetParaFlags() )
400  {
401  bIns = false;
402  *pStt = 0;
403  ++pStt;
404 
405  bool bChkSplit = false;
406  if( LINEEND_CRLF == pUseMe->GetParaFlags() )
407  {
 // CR is the last character of this block: postpone the decision
 // until the next block has been read (see cLastCR above).
408  if( pStt == pEnd )
409  cLastCR = 0x0d;
410  else if( 0x0a == *pStt )
411  {
412  ++pStt;
413  bChkSplit = true;
414  }
415  }
416  else
417  bChkSplit = true;
418 
419  // We skip the last one at the end
420  if( bChkSplit && ( !rInput.eof() || pEnd != pStt ))
421  bSplitNode = true;
422  }
423  break;
424 
425  case 0x0c:
426  {
427  // Insert a hard page break
428  *pStt++ = 0;
429  if( nLineLen )
430  {
431  InsertText( OUString( pLastStt ));
432  }
433  pDoc->getIDocumentContentOperations().SplitNode( *pPam->GetPoint(), false );
434  pDoc->getIDocumentContentOperations().InsertPoolItem(
435  *pPam, SvxFormatBreakItem( SvxBreak::PageBefore, RES_BREAK ) );
436  pLastStt = pStt;
437  nLineLen = 0;
438  bIns = false;
439  }
440  break;
441 
 // 0x1a is dropped when it is the very last character of the file,
 // otherwise it is shown as '#'.
442  case 0x1a:
443  if( nReadCnt == nFileSize && pStt+1 == pEnd )
444  *pStt = 0;
445  else
446  *pStt = '#'; // Replacement visualisation
447  break;
448 
449  case '\t': break;
450 
451  default:
452  if( ' ' > *pStt )
453  // Found control char, replace with '#'
454  *pStt = '#';
455  break;
456  }
457 
458  if( bIns )
459  {
 // Force a paragraph break in over-long lines: at a space within the
 // last 100 characters before MAX_ASCII_PARA, or unconditionally once
 // MAX_ASCII_PARA - 1 is reached.
460  if( ( nLineLen >= MAX_ASCII_PARA - 100 ) &&
461  ( ( *pStt == ' ' ) || ( nLineLen >= MAX_ASCII_PARA - 1 ) ) )
462  {
 // Temporarily terminate the run at the current character.
463  sal_Unicode c = *pStt;
464  *pStt = 0;
465  InsertText( OUString( pLastStt ));
466  pDoc->getIDocumentContentOperations().SplitNode( *pPam->GetPoint(), false );
467  pLastStt = pStt;
468  nLineLen = 0;
469  *pStt = c;
470  }
471  ++pStt;
472  ++nLineLen;
473  }
474  else if( bSplitNode )
475  {
476  // We found a CR/LF, thus save the text
477  InsertText( OUString( pLastStt ));
 // New document: append a fresh text node; otherwise split the
 // current node at the cursor.
478  if(bNewDoc)
479  pDoc->getIDocumentContentOperations().AppendTextNode( *pPam->GetPoint() );
480  else
481  pDoc->getIDocumentContentOperations().SplitNode( *pPam->GetPoint(), false );
482  pLastStt = pStt;
483  nLineLen = 0;
484  }
 // The loop is left only through the break in the refill step above.
485  } while(true);
486 
487  if( hConverter )
488  {
489  rtl_destroyTextToUnicodeContext( hConverter, hContext );
490  rtl_destroyTextToUnicodeConverter( hConverter );
491  }
492  return ERRCODE_NONE;
493 }
494 
// Inserts rStr into the document at pPam and records which script types
// (Latin / Asian / complex) occur in it, so that CallParser can later apply
// font and language items only for scripts that were actually seen.
495 void SwASCIIParser::InsertText( const OUString& rStr )
496 {
497  pDoc->getIDocumentContentOperations().InsertString( *pPam, rStr );
498 
 // Script detection is skipped when there is no item set to apply, when the
 // global break iterator is missing, or once all three script types have
 // already been found.
499  if( pItemSet && g_pBreakIt && nScript != ( SvtScriptType::LATIN |
500  SvtScriptType::ASIAN |
501  SvtScriptType::COMPLEX ) )
502  nScript |= g_pBreakIt->GetAllScriptsOfText( rStr );
503 }
504 
505 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
rtl_TextEncoding GetCharSet() const
Definition: shellio.hxx:77
Represents the style of a paragraph.
Definition: fmtcol.hxx:55
void ResetASCIIOpts()
Definition: shellio.hxx:136
#define RES_CHRATR_CJK_LANGUAGE
Definition: hintids.hxx:93
#define RES_CHRATR_LANGUAGE
Definition: hintids.hxx:79
const OUString & GetFontName() const
Definition: shellio.hxx:74
sal_uIntPtr sal_uLong
sal_Int64 n
Definition: doc.hxx:185
LanguageType GetLanguage() const
Definition: shellio.hxx:80
SvtScriptType GetAllScriptsOfText(const OUString &rText) const
Definition: breakit.cxx:127
#define RES_CHRATR_FONT
Definition: hintids.hxx:76
void EndProgress(SwDocShell const *pDocShell)
Definition: mainwn.cxx:92
#define RES_CHRATR_CJK_FONT
Definition: hintids.hxx:91
void SetParaFlags(LineEnd eVal)
Definition: shellio.hxx:84
virtual bool SetFormatAttr(const SfxPoolItem &rAttr) override
Override to recognize changes on the and register/unregister the paragraph style at t...
Definition: fmtcol.cxx:329
sal_uInt16 sal_Unicode
void StartProgress(const char *pMessResId, long nStartValue, long nEndValue, SwDocShell *pDocShell)
Definition: mainwn.cxx:52
static bool IsDetectableText(const sal_Char *pBuf, sal_uLong &rLen, rtl_TextEncoding *pCharSet, bool *pSwap, LineEnd *pLineEnd)
Definition: iodetect.cxx:241
char sal_Char
SwBreakIt * g_pBreakIt
Definition: breakit.cxx:33
sal_uLong GetIndex() const
Definition: ndindex.hxx:152
const SwAsciiOptions & GetASCIIOpts() const
Definition: shellio.hxx:134
void SetProgressState(long nPosition, SwDocShell const *pDocShell)
Definition: mainwn.cxx:82
void SetCharSet(rtl_TextEncoding nVal)
Definition: shellio.hxx:78
PaM is Point and Mark: a selection of the document model.
Definition: pam.hxx:136
SvtScriptType
#define RES_CHRATR_CTL_FONT
Definition: hintids.hxx:96
const SwPosition * GetPoint() const
Definition: pam.hxx:207
virtual ErrCode Read(SwDoc &, const OUString &rBaseURL, SwPaM &, const OUString &) override
Definition: parasc.cxx:82
Marks a node in the document model.
Definition: ndindex.hxx:31
#define ASC_BUFFLEN
Definition: parasc.cxx:50
LineEnd
#define MAX_ASCII_PARA
Definition: shellio.hxx:62
SwgReaderOption m_aOption
Definition: shellio.hxx:217
#define ERRCODE_NONE
#define RES_CHRATR_CTL_LANGUAGE
Definition: hintids.hxx:98
SvStream * m_pStream
Definition: shellio.hxx:212
#define ERR_SWG_READ_ERROR
Definition: swerror.h:25
LineEnd GetParaFlags() const
Definition: shellio.hxx:83
#define RES_BREAK
Definition: hintids.hxx:201
bool m_bInsertMode
Definition: shellio.hxx:218