LibreOffice Module sw (master)  1
parasc.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include <memory>
21 
22 #include <tools/stream.hxx>
23 #include <hintids.hxx>
24 #include <sfx2/printer.hxx>
25 #include <editeng/fontitem.hxx>
26 #include <editeng/langitem.hxx>
28 #include <svl/languageoptions.hxx>
29 #include <shellio.hxx>
30 #include <doc.hxx>
34 #include <pam.hxx>
35 #include <breakit.hxx>
36 #include <swerror.h>
37 #include <strings.hrc>
38 #include <mdiexp.hxx>
39 #include <poolfmt.hxx>
40 #include <iodetect.hxx>
41 
42 #include <vcl/metric.hxx>
43 #include <osl/diagnose.h>
44 
45 #define ASC_BUFFLEN 4096
46 
47 namespace {
48 
// Helper that reads a plain-text stream block by block and inserts its
// content into a document at a given cursor position. Construct it, then call
// CallParser() once. The class is not copyable.
49 class SwASCIIParser
50 {
51  SwDoc* pDoc; // document that receives the text
52  std::unique_ptr<SwPaM> pPam; // insert position; created from the point of the cursor passed to the ctor
53  SvStream& rInput; // stream the text is read from
54  std::unique_ptr<char[]> pArr; // raw read buffer of ASC_BUFFLEN + 2 chars
55  const SwAsciiOptions& rOpt; // import options (font name, char set, language, paragraph flags)
56  std::unique_ptr<SfxItemSet> pItemSet; // font/language items built from rOpt; released at the end of CallParser()
57  long nFileSize; // stream size, set from rInput.TellEnd() in CallParser()
58  SvtScriptType nScript; // script types collected from the inserted text (see InsertText)
59  bool bNewDoc; // true when reading into a new document instead of inserting into an existing one
60
// Reads and converts the stream content and inserts it paragraph by paragraph.
61  ErrCode ReadChars();
// Inserts rStr at pPam and records which script types it contains.
62  void InsertText( const OUString& rStr );
63
64  SwASCIIParser(const SwASCIIParser&) = delete;
65  SwASCIIParser& operator=(const SwASCIIParser&) = delete;
66
67 public:
68  SwASCIIParser( SwDoc* pD, const SwPaM& rCursor, SvStream& rIn,
69  bool bReadNewDoc, const SwAsciiOptions& rOpts );
70
// Runs the import: sets up progress, reads the text and applies the
// font/language attributes derived from the options.
71  ErrCode CallParser();
72 };
73 
74 }
75 
76 // Call for the general reader interface
// Reads plain text from m_pStream into rDoc at rPam by means of a temporary
// SwASCIIParser. Returns ERR_SWG_READ_ERROR when no stream is set, otherwise
// the result of SwASCIIParser::CallParser(). The two OUString parameters are
// unnamed and unused here.
// NOTE(review): this listing skips source lines 86 and 91 (the numbering jumps
// 85 -> 87 and 90 -> 92). The remaining arguments of the parser construction
// and the statement announced by the "reset the options" comment are therefore
// not visible here.
77 ErrCode AsciiReader::Read( SwDoc &rDoc, const OUString&, SwPaM &rPam, const OUString & )
78 {
79  if( !m_pStream )
80  {
81  OSL_ENSURE( false, "ASCII read without a stream" );
82  return ERR_SWG_READ_ERROR;
83  }
84
85  std::unique_ptr<SwASCIIParser> pParser(new SwASCIIParser( &rDoc, rPam, *m_pStream,
87  ErrCode nRet = pParser->CallParser();
88
// destroy the parser before the options are reset
89  pParser.reset();
90  // after Read reset the options
92  return nRet;
93 }
94 
// Sets up the parser: stores document, stream and options, creates the insert
// PaM from the point of rCursor, allocates the read buffer and builds the item
// set holding the language and font defaults taken from rOpts.
//   pD          - document to insert into
//   rCursor     - cursor whose point is the insert position
//   rIn         - input stream
//   bReadNewDoc - true if a new document is being read
//   rOpts       - ASCII import options; kept by reference
95 SwASCIIParser::SwASCIIParser(SwDoc* pD, const SwPaM& rCursor, SvStream& rIn,
96  bool bReadNewDoc, const SwAsciiOptions& rOpts)
97  : pDoc(pD), rInput(rIn), rOpt(rOpts), nFileSize(0), nScript(SvtScriptType::NONE)
98  , bNewDoc(bReadNewDoc)
99 {
100  pPam.reset( new SwPaM( *rCursor.GetPoint() ) );
// two extra chars beyond ASC_BUFFLEN: ReadChars() writes a terminating
// sal_Unicode 0 directly behind the bytes it has read
101  pArr.reset( new char [ ASC_BUFFLEN + 2 ] );
102
// NOTE(review): source lines 104-106 (the remaining make_unique arguments,
// presumably the which-ranges of the set) are missing from this listing.
103  pItemSet = std::make_unique<SfxItemSet>( pDoc->GetAttrPool(),
107
108  // set defaults from the options
109  if( rOpt.GetLanguage() )
110  {
// the same language is put for the Latin, CJK and CTL language attributes
111  SvxLanguageItem aLang( rOpt.GetLanguage(), RES_CHRATR_LANGUAGE );
112  pItemSet->Put( aLang );
113  aLang.SetWhich(RES_CHRATR_CJK_LANGUAGE);
114  pItemSet->Put( aLang );
115  aLang.SetWhich(RES_CHRATR_CTL_LANGUAGE);
116  pItemSet->Put( aLang );
117  }
118  if( !rOpt.GetFontName().isEmpty() )
119  {
120  vcl::Font aTextFont( rOpt.GetFontName(), Size( 0, 10 ) );
// if getPrinter( false ) returns a printer, use its font metric for the
// requested font to obtain family, pitch and char set
121  if( pDoc->getIDocumentDeviceAccess().getPrinter( false ) )
122  aTextFont = pDoc->getIDocumentDeviceAccess().getPrinter( false )->GetFontMetric( aTextFont );
123  SvxFontItem aFont( aTextFont.GetFamilyType(), aTextFont.GetFamilyName(),
124  OUString(), aTextFont.GetPitch(), aTextFont.GetCharSet(), RES_CHRATR_FONT );
// the same font is put for the Latin, CJK and CTL font attributes
125  pItemSet->Put( aFont );
126  aFont.SetWhich(RES_CHRATR_CJK_FONT);
127  pItemSet->Put( aFont );
128  aFont.SetWhich(RES_CHRATR_CTL_FONT);
129  pItemSet->Put( aFont );
130  }
131 }
132 
133 // Calling the parser
// Runs the complete import: determines the stream size, rewinds the stream,
// shows progress, reads the text via ReadChars() and afterwards applies the
// font/language items from pItemSet - but only for the script types that
// actually occurred in the text. Returns the error code of ReadChars().
134 ErrCode SwASCIIParser::CallParser()
135 {
136  rInput.ResetError();
137  nFileSize = rInput.TellEnd();
138  rInput.Seek(STREAM_SEEK_TO_BEGIN);
139  rInput.ResetError();
140
141  ::StartProgress( STR_STATSTR_W4WREAD, 0, nFileSize, pDoc->GetDocShell() );
142
// When inserting into an existing document, remember where the insertion
// starts so the attributes can be applied over the inserted range later.
// NOTE(review): the PaM is created with offsets 0 / -1 relative to the insert
// node - presumably so that it is anchored before the node and is not moved
// by the insertion; confirm against the SwPaM constructor.
143  std::unique_ptr<SwPaM> pInsPam;
144  sal_Int32 nSttContent = 0;
145  if (!bNewDoc)
146  {
147  const SwNodeIndex& rTmp = pPam->GetPoint()->nNode;
148  pInsPam.reset(new SwPaM( rTmp, rTmp, 0, -1 ));
149  nSttContent = pPam->GetPoint()->nContent.GetIndex();
150  }
151
152  SwTextFormatColl *pColl = nullptr;
153
// For a new document the text gets the RES_POOLCOLL_HTML_PRE paragraph style;
// RES_POOLCOLL_STANDARD is the fallback if that lookup returns nothing.
154  if (bNewDoc)
155  {
156  pColl = pDoc->getIDocumentStylePoolAccess().GetTextCollFromPool(RES_POOLCOLL_HTML_PRE, false);
157  if (!pColl)
158  pColl = pDoc->getIDocumentStylePoolAccess().GetTextCollFromPool(RES_POOLCOLL_STANDARD,false);
159  if (pColl)
160  pDoc->SetTextFormatColl(*pPam, pColl);
161  }
162
163  ErrCode nError = ReadChars();
164
165  if( pItemSet )
166  {
167  // set only the attributes for the script types that were found in the text.
168  if( !( SvtScriptType::LATIN & nScript ))
169  {
170  pItemSet->ClearItem( RES_CHRATR_FONT );
171  pItemSet->ClearItem( RES_CHRATR_LANGUAGE );
172  }
173  if( !( SvtScriptType::ASIAN & nScript ))
174  {
175  pItemSet->ClearItem( RES_CHRATR_CJK_FONT );
176  pItemSet->ClearItem( RES_CHRATR_CJK_LANGUAGE );
177  }
178  if( !( SvtScriptType::COMPLEX & nScript ))
179  {
180  pItemSet->ClearItem( RES_CHRATR_CTL_FONT );
181  pItemSet->ClearItem( RES_CHRATR_CTL_LANGUAGE );
182  }
183  if( pItemSet->Count() )
184  {
185  if( bNewDoc )
186  {
187  if (pColl)
188  {
189  // Using the pool defaults for the font causes significant
190  // trouble for the HTML filter, because it is not able
191  // to export the pool defaults (or, to be more precise,
192  // the HTML filter is not able to detect whether a pool
193  // default has changed or not). Even a comparison with the
194  // HTML template does not work, because the defaults are
195  // not copied when a new doc is created. The result of
196  // comparing pool defaults therefore would be that the
197  // defaults are always exported if they have changed for
198  // text documents in general. That's not sensible, just
199  // as it is not sensible to export them always.
// NOTE(review): source lines 202-203 (the array contents) are missing from
// this listing. The loop below stops at the first 0 entry, so the list must
// be 0-terminated.
200  sal_uInt16 aWhichIds[4] =
201  {
204  };
// move every listed item that is set from the item set into the paragraph
// style, so that it does not end up in the document defaults below
205  sal_uInt16 *pWhichIds = aWhichIds;
206  while (*pWhichIds)
207  {
208  const SfxPoolItem *pItem;
209  if (SfxItemState::SET == pItemSet->GetItemState(*pWhichIds,
210  false, &pItem))
211  {
212  pColl->SetFormatAttr( *pItem );
213  pItemSet->ClearItem( *pWhichIds );
214  }
215  ++pWhichIds;
216  }
217  }
// whatever is left becomes a document default
218  if (pItemSet->Count())
219  pDoc->SetDefault(*pItemSet);
220  }
221  else if( pInsPam )
222  {
223  // then set over the insert range the defined attributes
// mark = current end of the inserted text; point = start of the insertion
// (one node after the remembered node, at the original content index)
224  *pInsPam->GetMark() = *pPam->GetPoint();
225  ++pInsPam->GetPoint()->nNode;
226  pInsPam->GetPoint()->nContent.Assign(
227  pInsPam->GetContentNode(), nSttContent );
228
229  // !!!!!
// OSL_ENSURE is called with a constant false condition, i.e. it flags this
// path every time it is taken: the items are applied as hard attributes.
230  OSL_ENSURE( false, "Have to change - hard attr. to para. style" );
231  pDoc->getIDocumentContentOperations().InsertItemSet( *pInsPam, *pItemSet );
232  }
233  }
234  pItemSet.reset();
235  }
236
237  pInsPam.reset();
238
239  ::EndProgress( pDoc->GetDocShell() );
240  return nError;
241 }
242 
// Reads rInput in blocks of up to ASC_BUFFLEN bytes, turns each block into
// sal_Unicode text (through a text converter, or by using the bytes directly
// when the char set is RTL_TEXTENCODING_UCS2) and walks over it character by
// character: line ends start new paragraphs, 0x0c inserts a page break, other
// control characters are replaced by '#', and over-long lines are split.
// Returns ERRCODE_NONE after the read loop ends, or a read error code if no
// text converter could be created.
243 ErrCode SwASCIIParser::ReadChars()
244 {
// pStt: current character, pEnd: end of the current block,
// pLastStt: start of the text that has not been inserted yet
245  sal_Unicode *pStt = nullptr, *pEnd = nullptr, *pLastStt = nullptr;
246  long nReadCnt = 0, nLineLen = 0;
// set to 0x0d when a CR was the last character of a block in CRLF mode
247  sal_Unicode cLastCR = 0;
248  bool bSwapUnicode = false;
249
// If all options still have the values of a default-constructed
// SwAsciiOptions, sniff the first block to detect char set, byte order and
// line end, and use the detected values (aEmpty) instead of rOpt.
250  const SwAsciiOptions *pUseMe=&rOpt;
251  SwAsciiOptions aEmpty;
252  if (nFileSize >= 2 &&
253  aEmpty.GetFontName() == rOpt.GetFontName() &&
254  aEmpty.GetCharSet() == rOpt.GetCharSet() &&
255  aEmpty.GetLanguage() == rOpt.GetLanguage() &&
256  aEmpty.GetParaFlags() == rOpt.GetParaFlags())
257  {
// nLen is passed by reference to IsDetectableText and may be changed by it;
// nOrig keeps the number of bytes actually read
258  sal_uLong nLen, nOrig;
259  nOrig = nLen = rInput.ReadBytes(pArr.get(), ASC_BUFFLEN);
260  rtl_TextEncoding eCharSet;
261  LineEnd eLineEnd;
262  const bool bRet
263  = SwIoSystem::IsDetectableText(pArr.get(), nLen, &eCharSet, &bSwapUnicode, &eLineEnd);
// NOTE(review): source line 265, the statement controlled by this 'if', is
// missing from this listing (the numbering jumps 264 -> 266).
264  if (!bRet)
266
267  OSL_ENSURE(bRet, "Autodetect of text import without nag dialog must have failed");
// seek back so that the main loop reads the data again: by nLen when the
// detection succeeded, by the full amount read otherwise
268  if (bRet && eCharSet != RTL_TEXTENCODING_DONTKNOW)
269  {
270  aEmpty.SetCharSet(eCharSet);
271  aEmpty.SetParaFlags(eLineEnd);
272  rInput.SeekRel(-(long(nLen)));
273  }
274  else
275  rInput.SeekRel(-(long(nOrig)));
276  pUseMe=&aEmpty;
277  }
278
// Everything except UCS2 goes through a text-to-Unicode converter; an unknown
// char set is treated as US-ASCII. With a converter no manual byte swapping
// is done.
279  rtl_TextToUnicodeConverter hConverter=nullptr;
280  rtl_TextToUnicodeContext hContext=nullptr;
281  rtl_TextEncoding currentCharSet = pUseMe->GetCharSet();
282  if (RTL_TEXTENCODING_UCS2 != currentCharSet)
283  {
284  if( currentCharSet == RTL_TEXTENCODING_DONTKNOW )
285  currentCharSet = RTL_TEXTENCODING_ASCII_US;
286  hConverter = rtl_createTextToUnicodeConverter( currentCharSet );
287  OSL_ENSURE( hConverter, "no string convert available" );
288  if (!hConverter)
289  return ErrCode(ErrCodeArea::Sw, ErrCodeClass::Read, 0);
290  bSwapUnicode = false;
291  hContext = rtl_createTextToUnicodeContext( hConverter );
292  }
// UCS2 given by the caller's options (no autodetection ran above): let the
// stream prepare for Unicode text and ask it whether bytes must be swapped
293  else if (pUseMe != &aEmpty) //Already successfully figured out type
294  {
295  rInput.StartReadingUnicodeText( currentCharSet );
296  bSwapUnicode = rInput.IsEndianSwap();
297  }
298
// aWork: converted text of the current block (converter case only)
// nArrOffset: number of unconverted bytes kept at the start of pArr
299  std::unique_ptr<sal_Unicode[]> aWork;
300  sal_uLong nArrOffset = 0;
301
302  do {
// current block used up (also true on the first pass, when both are nullptr)
303  if( pStt >= pEnd )
304  {
// flush the text that is still pending from the previous block
305  if( pLastStt != pStt )
306  InsertText( OUString( pLastStt ));
307
308  // Read a new block
309  sal_uLong lGCount;
310  if( ERRCODE_NONE != rInput.GetError() || 0 == (lGCount =
311  rInput.ReadBytes( pArr.get() + nArrOffset,
312  ASC_BUFFLEN - nArrOffset )))
313  break; // break from the while loop
314
315  /*
316  If there were some unconverted bytes on the last cycle then they
317  were put at the beginning of the array, so total bytes available
318  to convert this cycle includes them. If we found 0 following bytes
319  then we ignore the previous partial character.
320  */
321  lGCount+=nArrOffset;
322
323  if( hConverter )
324  {
325  sal_uInt32 nInfo;
326  sal_Size nNewLen = lGCount, nCntBytes;
327  aWork.reset(new sal_Unicode[nNewLen + 1]); // add 1 for '\0'
328  sal_Unicode* pBuf = aWork.get();
329  pBuf[nNewLen] = 0; // ensure '\0'
330
331  nNewLen = rtl_convertTextToUnicode( hConverter, hContext,
332  pArr.get(), lGCount, pBuf, nNewLen,
333  (
334  RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT |
335  RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
336  RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT |
337  RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE
338  ),
339  &nInfo,
340  &nCntBytes );
// nCntBytes = bytes consumed by the converter; move the unconverted rest to
// the front of pArr so the next read appends to it
341  if( 0 != ( nArrOffset = lGCount - nCntBytes ) )
342  memmove( pArr.get(), pArr.get() + nCntBytes, nArrOffset );
343
344  pStt = pLastStt = aWork.get();
345  pEnd = pStt + nNewLen;
346  }
347  else
348  {
// no converter: the bytes in pArr are used directly as sal_Unicode
349  pStt = pLastStt = reinterpret_cast<sal_Unicode*>(pArr.get());
350  pEnd = reinterpret_cast<sal_Unicode*>(pArr.get() + lGCount);
351
352  if( bSwapUnicode )
353  {
// exchange the two bytes of every 16-bit unit in place
354  char* pF = pArr.get(), *pN = pArr.get() + 1;
355  for( sal_uLong n = 0; n < lGCount; n += 2, pF += 2, pN += 2 )
356  {
357  char c = *pF;
358  *pF = *pN;
359  *pN = c;
360  }
361  }
362  }
363
// terminate the block so that OUString( pLastStt ) stops at its end
364  *pEnd = 0;
365  nReadCnt += lGCount;
366
367  ::SetProgressState( nReadCnt, pDoc->GetDocShell() );
368
// the previous block ended with a CR in CRLF mode: skip a LF that starts this
// block, then create the paragraph break that was postponed
369  if( cLastCR )
370  {
371  if( 0x0a == *pStt && 0x0d == cLastCR )
372  pLastStt = ++pStt;
373  cLastCR = 0;
374  nLineLen = 0;
375  // We skip the last one at the end
376  if( !rInput.eof() || !(pEnd == pStt ||
377  ( !*pEnd && pEnd == pStt+1 ) ) )
378  pDoc->getIDocumentContentOperations().SplitNode( *pPam->GetPoint(), false );
379  }
380  }
381
// bIns: the current character is ordinary text and stays in the pending run
// bSplitNode: the pending run ends here and a new paragraph is started
382  bool bIns = true, bSplitNode = false;
383  switch( *pStt )
384  {
385
// LF ends a paragraph only in LINEEND_LF mode; otherwise it stays in the text
386  case 0x0a: if( LINEEND_LF == pUseMe->GetParaFlags() )
387  {
388  bIns = false;
389  *pStt = 0;
390  ++pStt;
391
392  // We skip the last one at the end
393  if( !rInput.eof() || pEnd != pStt )
394  bSplitNode = true;
395  }
396  break;
397
// CR ends a paragraph in every mode except LINEEND_LF
398  case 0x0d: if( LINEEND_LF != pUseMe->GetParaFlags() )
399  {
400  bIns = false;
401  *pStt = 0;
402  ++pStt;
403
404  bool bChkSplit = false;
405  if( LINEEND_CRLF == pUseMe->GetParaFlags() )
406  {
// CR is the last character of the block: remember it and decide after the
// next block has been read; a CR that is not followed by LF does not split
407  if( pStt == pEnd )
408  cLastCR = 0x0d;
409  else if( 0x0a == *pStt )
410  {
411  ++pStt;
412  bChkSplit = true;
413  }
414  }
415  else
416  bChkSplit = true;
417
418  // We skip the last one at the end
419  if( bChkSplit && ( !rInput.eof() || pEnd != pStt ))
420  bSplitNode = true;
421  }
422  break;
423
424  case 0x0c:
425  {
426  // Insert a hard page break
427  *pStt++ = 0;
428  if( nLineLen )
429  {
430  InsertText( OUString( pLastStt ));
431  }
432  pDoc->getIDocumentContentOperations().SplitNode( *pPam->GetPoint(), false );
433  pDoc->getIDocumentContentOperations().InsertPoolItem(
434  *pPam, SvxFormatBreakItem( SvxBreak::PageBefore, RES_BREAK ) );
435  pLastStt = pStt;
436  nLineLen = 0;
437  bIns = false;
438  }
439  break;
440
// 0x1a as the very last character of the file is dropped (it becomes the
// terminating 0); anywhere else it is shown as '#'
441  case 0x1a:
442  if( nReadCnt == nFileSize && pStt+1 == pEnd )
443  *pStt = 0;
444  else
445  *pStt = '#'; // Replacement visualisation
446  break;
447
448  case '\t': break;
449
450  default:
451  if( ' ' > *pStt )
452  // Found control char, replace with '#'
453  *pStt = '#';
454  break;
455  }
456
457  if( bIns )
458  {
// Limit the paragraph length: from MAX_ASCII_PARA - 100 characters on, break
// at the next space; at MAX_ASCII_PARA - 1 break regardless. The current
// character is temporarily replaced by 0 to terminate the run for insertion.
459  if( ( nLineLen >= MAX_ASCII_PARA - 100 ) &&
460  ( ( *pStt == ' ' ) || ( nLineLen >= MAX_ASCII_PARA - 1 ) ) )
461  {
462  sal_Unicode c = *pStt;
463  *pStt = 0;
464  InsertText( OUString( pLastStt ));
465  pDoc->getIDocumentContentOperations().SplitNode( *pPam->GetPoint(), false );
466  pLastStt = pStt;
467  nLineLen = 0;
468  *pStt = c;
469  }
470  ++pStt;
471  ++nLineLen;
472  }
473  else if( bSplitNode )
474  {
475  // We found a CR/LF, thus save the text
476  InsertText( OUString( pLastStt ));
// new document: append a text node; insertion into existing text: split the
// node at the current position
477  if(bNewDoc)
478  pDoc->getIDocumentContentOperations().AppendTextNode( *pPam->GetPoint() );
479  else
480  pDoc->getIDocumentContentOperations().SplitNode( *pPam->GetPoint(), false );
481  pLastStt = pStt;
482  nLineLen = 0;
483  }
484  } while(true);
485
486  if( hConverter )
487  {
488  rtl_destroyTextToUnicodeContext( hConverter, hContext );
489  rtl_destroyTextToUnicodeConverter( hConverter );
490  }
491  return ERRCODE_NONE;
492 }
493 
// Inserts rStr into the document at pPam and adds the script types found in
// rStr to nScript. CallParser() later uses nScript to decide which
// font/language items of pItemSet are kept.
494 void SwASCIIParser::InsertText( const OUString& rStr )
495 {
496  pDoc->getIDocumentContentOperations().InsertString( *pPam, rStr );
497
// The script scan is skipped when there is no item set or no break iterator,
// or once all three script types (Latin, Asian, complex) have been seen.
498  if( pItemSet && g_pBreakIt && nScript != ( SvtScriptType::LATIN |
499  SvtScriptType::ASIAN |
500  SvtScriptType::COMPLEX ) )
501  nScript |= g_pBreakIt->GetAllScriptsOfText( rStr );
502 }
503 
504 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
rtl_TextEncoding GetCharSet() const
Definition: shellio.hxx:77
Represents the style of a paragraph.
Definition: fmtcol.hxx:55
void ResetASCIIOpts()
Definition: shellio.hxx:137
#define RES_CHRATR_CJK_LANGUAGE
Definition: hintids.hxx:186
#define RES_CHRATR_LANGUAGE
Definition: hintids.hxx:172
const OUString & GetFontName() const
Definition: shellio.hxx:74
sal_uIntPtr sal_uLong
sal_Int64 n
Definition: doc.hxx:186
LanguageType GetLanguage() const
Definition: shellio.hxx:80
SvtScriptType GetAllScriptsOfText(const OUString &rText) const
Definition: breakit.cxx:127
#define RES_CHRATR_FONT
Definition: hintids.hxx:169
void EndProgress(SwDocShell const *pDocShell)
Definition: mainwn.cxx:92
#define RES_CHRATR_CJK_FONT
Definition: hintids.hxx:184
void SetParaFlags(LineEnd eVal)
Definition: shellio.hxx:84
virtual bool SetFormatAttr(const SfxPoolItem &rAttr) override
Override to recognize changes on the SwNumRuleItem and register/unregister the paragraph style at t...
Definition: fmtcol.cxx:329
sal_uInt16 sal_Unicode
void StartProgress(const char *pMessResId, long nStartValue, long nEndValue, SwDocShell *pDocShell)
Definition: mainwn.cxx:52
SwBreakIt * g_pBreakIt
Definition: breakit.cxx:33
sal_uLong GetIndex() const
Definition: ndindex.hxx:152
const SwAsciiOptions & GetASCIIOpts() const
Definition: shellio.hxx:135
void SetProgressState(long nPosition, SwDocShell const *pDocShell)
Definition: mainwn.cxx:82
void SetCharSet(rtl_TextEncoding nVal)
Definition: shellio.hxx:78
PaM is Point and Mark: a selection of the document model.
Definition: pam.hxx:136
SvtScriptType
#define RES_CHRATR_CTL_FONT
Definition: hintids.hxx:189
const SwPosition * GetPoint() const
Definition: pam.hxx:207
virtual ErrCode Read(SwDoc &, const OUString &rBaseURL, SwPaM &, const OUString &) override
Definition: parasc.cxx:77
static bool IsDetectableText(const char *pBuf, sal_uLong &rLen, rtl_TextEncoding *pCharSet, bool *pSwap, LineEnd *pLineEnd)
Definition: iodetect.cxx:240
Marks a node in the document model.
Definition: ndindex.hxx:31
#define ASC_BUFFLEN
Definition: parasc.cxx:45
LineEnd
#define ERRCODE_IO_BROKENPACKAGE
#define MAX_ASCII_PARA
Definition: shellio.hxx:62
SwgReaderOption m_aOption
Definition: shellio.hxx:224
#define ERRCODE_NONE
#define RES_CHRATR_CTL_LANGUAGE
Definition: hintids.hxx:191
SvStream * m_pStream
Definition: shellio.hxx:219
#define ERR_SWG_READ_ERROR
Definition: swerror.h:25
LineEnd GetParaFlags() const
Definition: shellio.hxx:83
#define RES_BREAK
Definition: hintids.hxx:294
bool m_bInsertMode
Definition: shellio.hxx:225