LibreOffice Module sw (master)  1
parasc.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include <memory>
21 
22 #include <tools/stream.hxx>
23 #include <hintids.hxx>
24 #include <sfx2/printer.hxx>
25 #include <editeng/fontitem.hxx>
26 #include <editeng/langitem.hxx>
28 #include <svl/languageoptions.hxx>
29 #include <shellio.hxx>
30 #include <doc.hxx>
34 #include <pam.hxx>
35 #include <breakit.hxx>
36 #include <swerror.h>
37 #include <strings.hrc>
38 #include <mdiexp.hxx>
39 #include <poolfmt.hxx>
40 #include <iodetect.hxx>
41 
42 #include <vcl/metric.hxx>
43 #include <osl/diagnose.h>
44 
// Size in bytes of one read block. The parser's byte buffer is allocated as
// ASC_BUFFLEN + 2 chars, and each ReadBytes() call requests at most
// ASC_BUFFLEN bytes.
45 #define ASC_BUFFLEN 4096
46 
47 namespace {
48 
49 class SwASCIIParser
50 {
51  SwDoc& rDoc;
52  std::unique_ptr<SwPaM> pPam;
53  SvStream& rInput;
54  std::unique_ptr<char[]> pArr;
55  const SwAsciiOptions& rOpt;
56  std::unique_ptr<SfxItemSet> pItemSet;
57  tools::Long nFileSize;
58  SvtScriptType nScript;
59  bool bNewDoc;
60 
61  ErrCode ReadChars();
62  void InsertText( const OUString& rStr );
63 
64  SwASCIIParser(const SwASCIIParser&) = delete;
65  SwASCIIParser& operator=(const SwASCIIParser&) = delete;
66 
67 public:
68  SwASCIIParser( SwDoc& rD, const SwPaM& rCursor, SvStream& rIn,
69  bool bReadNewDoc, const SwAsciiOptions& rOpts );
70 
71  ErrCode CallParser();
72 };
73 
74 }
75 
76 // Call for the general reader interface
// Entry point of the ASCII reader: runs a SwASCIIParser over m_pStream for
// rDoc / rPam and returns the error code from SwASCIIParser::CallParser().
// The two OUString parameters are unnamed and are not used in this function.
77 ErrCode AsciiReader::Read( SwDoc& rDoc, const OUString&, SwPaM &rPam, const OUString & )
78 {
    // Without an input stream there is nothing to parse: assert and report a read error.
79  if( !m_pStream )
80  {
81  OSL_ENSURE( false, "ASCII read without a stream" );
82  return ERR_SWG_READ_ERROR;
83  }
84 
    // NOTE(review): this listing jumps from source line 85 to 87, so the last
    // two constructor arguments (a bool and a SwAsciiOptions, per the
    // constructor signature) and the end of this statement are not visible
    // here. Restore them from the original file; do not guess.
85  std::unique_ptr<SwASCIIParser> xParser(new SwASCIIParser( rDoc, rPam, *m_pStream,
87  ErrCode nRet = xParser->CallParser();
88 
    // Release the parser explicitly before the options are reset.
89  xParser.reset();
90  // after Read reset the options
    // NOTE(review): source line 91 is also missing from this listing;
    // presumably it holds the statement that performs the reset described
    // above. Confirm against the original file.
92  return nRet;
93 }
94 
// Sets up the parser state: stores the document, stream and options, creates
// its own cursor at the point of rCursor, allocates the byte buffer, and fills
// the item set with the language and font requested in the options.
95 SwASCIIParser::SwASCIIParser(SwDoc& rD, const SwPaM& rCursor, SvStream& rIn,
96  bool bReadNewDoc, const SwAsciiOptions& rOpts)
97  : rDoc(rD), rInput(rIn), rOpt(rOpts), nFileSize(0), nScript(SvtScriptType::NONE)
98  , bNewDoc(bReadNewDoc)
99 {
    // Own cursor positioned at the caller's point; the caller's PaM is not modified here.
100  pPam.reset( new SwPaM( *rCursor.GetPoint() ) );
    // Two bytes more than one read block (ASC_BUFFLEN); ReadChars() writes a
    // 16-bit terminator directly behind the data it has read.
101  pArr.reset( new char [ ASC_BUFFLEN + 2 ] );
102 
    // NOTE(review): this listing jumps from source line 103 to 107, so the
    // remaining arguments of this make_unique call (the which-id ranges of the
    // item set) and the end of the statement are not visible here. Restore
    // them from the original file.
103  pItemSet = std::make_unique<SfxItemSet>( rDoc.GetAttrPool(),
107 
108  // set defaults from the options
109  if( rOpt.GetLanguage() )
110  {
    // The same language value is stored three times, under the
    // RES_CHRATR_LANGUAGE, RES_CHRATR_CJK_LANGUAGE and RES_CHRATR_CTL_LANGUAGE ids.
111  SvxLanguageItem aLang( rOpt.GetLanguage(), RES_CHRATR_LANGUAGE );
112  pItemSet->Put( aLang );
113  aLang.SetWhich(RES_CHRATR_CJK_LANGUAGE);
114  pItemSet->Put( aLang );
115  aLang.SetWhich(RES_CHRATR_CTL_LANGUAGE);
116  pItemSet->Put( aLang );
117  }
    // No font name in the options: nothing more to set up.
118  if( rOpt.GetFontName().isEmpty() )
119  return;
120 
    // Build a font from the requested name; if the document has a printer,
    // replace it with the font metric that printer reports for it.
121  vcl::Font aTextFont( rOpt.GetFontName(), Size( 0, 10 ) );
122  if( rDoc.getIDocumentDeviceAccess().getPrinter( false ) )
123  aTextFont = rDoc.getIDocumentDeviceAccess().getPrinter( false )->GetFontMetric( aTextFont );
    // As with the language, the same font item is stored under the
    // RES_CHRATR_FONT, RES_CHRATR_CJK_FONT and RES_CHRATR_CTL_FONT ids.
124  SvxFontItem aFont( aTextFont.GetFamilyType(), aTextFont.GetFamilyName(),
125  OUString(), aTextFont.GetPitch(), aTextFont.GetCharSet(), RES_CHRATR_FONT );
126  pItemSet->Put( aFont );
127  aFont.SetWhich(RES_CHRATR_CJK_FONT);
128  pItemSet->Put( aFont );
129  aFont.SetWhich(RES_CHRATR_CTL_FONT);
130  pItemSet->Put( aFont );
131 }
132 
133 // Calling the parser
// Runs the whole import: rewinds the stream, shows a progress bar, reads the
// text via ReadChars(), and then applies the font/language items collected in
// pItemSet. For a new document they go to the paragraph style or the document
// defaults; otherwise they are set as attributes over the inserted range.
// Returns the error code of ReadChars().
134 ErrCode SwASCIIParser::CallParser()
135 {
    // Determine the stream size, then start reading from the beginning with
    // a clean error state.
136  rInput.ResetError();
137  nFileSize = rInput.TellEnd();
138  rInput.Seek(STREAM_SEEK_TO_BEGIN);
139  rInput.ResetError();
140 
141  ::StartProgress( STR_STATSTR_W4WREAD, 0, nFileSize, rDoc.GetDocShell() );
142 
    // When inserting into an existing document, remember where the insertion
    // starts: a PaM built from the start node (offset -1) and the content
    // index of the cursor.
143  std::unique_ptr<SwPaM> pInsPam;
144  sal_Int32 nSttContent = 0;
145  if (!bNewDoc)
146  {
147  const SwNodeIndex& rTmp = pPam->GetPoint()->nNode;
148  pInsPam.reset(new SwPaM( rTmp, rTmp, 0, -1 ));
149  nSttContent = pPam->GetPoint()->nContent.GetIndex();
150  }
151 
152  SwTextFormatColl *pColl = nullptr;
153 
    // New document: use the RES_POOLCOLL_HTML_PRE paragraph style, falling
    // back to RES_POOLCOLL_STANDARD if that one is not available.
154  if (bNewDoc)
155  {
156  pColl = rDoc.getIDocumentStylePoolAccess().GetTextCollFromPool(RES_POOLCOLL_HTML_PRE, false);
157  if (!pColl)
158  pColl = rDoc.getIDocumentStylePoolAccess().GetTextCollFromPool(RES_POOLCOLL_STANDARD,false);
159  if (pColl)
160  rDoc.SetTextFormatColl(*pPam, pColl);
161  }
162 
    // Read and insert the text; nScript is filled as a side effect (see InsertText).
163  ErrCode nError = ReadChars();
164 
165  if( pItemSet )
166  {
167  // set only the attribute, for scanned scripts.
    // Drop the font/language pair of every script type that did not occur
    // in the inserted text.
168  if( !( SvtScriptType::LATIN & nScript ))
169  {
170  pItemSet->ClearItem( RES_CHRATR_FONT );
171  pItemSet->ClearItem( RES_CHRATR_LANGUAGE );
172  }
173  if( !( SvtScriptType::ASIAN & nScript ))
174  {
175  pItemSet->ClearItem( RES_CHRATR_CJK_FONT );
176  pItemSet->ClearItem( RES_CHRATR_CJK_LANGUAGE );
177  }
178  if( !( SvtScriptType::COMPLEX & nScript ))
179  {
180  pItemSet->ClearItem( RES_CHRATR_CTL_FONT );
181  pItemSet->ClearItem( RES_CHRATR_CTL_LANGUAGE );
182  }
183  if( pItemSet->Count() )
184  {
185  if( bNewDoc )
186  {
187  if (pColl)
188  {
189  // Using the pool defaults for the font causes significant
190  // trouble for the HTML filter, because it is not able
191  // to export the pool defaults (or to be more precise:
192  // the HTML filter is not able to detect whether a pool
193  // default has changed or not). Even a comparison with the
194  // HTML template does not work, because the defaults are
195  // not copied when a new doc is created. The result of
196  // comparing pool defaults therefore would be that the
197  // defaults are exported always if they have changed for
198  // text documents in general. That's not sensible, as well
199  // as it is not sensible to export them always.
    // NOTE(review): this listing jumps from source line 201 to 204,
    // so the contents of the initialiser are not visible here. The
    // loop below stops at the first zero entry, so the list is
    // presumably zero-terminated. Restore it from the original file.
200  sal_uInt16 aWhichIds[4] =
201  {
204  };
    // Move every listed item that is set from pItemSet to the
    // paragraph style, so it does not end up in the defaults below.
205  sal_uInt16 *pWhichIds = aWhichIds;
206  while (*pWhichIds)
207  {
208  const SfxPoolItem *pItem;
209  if (SfxItemState::SET == pItemSet->GetItemState(*pWhichIds,
210  false, &pItem))
211  {
212  pColl->SetFormatAttr( *pItem );
213  pItemSet->ClearItem( *pWhichIds );
214  }
215  ++pWhichIds;
216  }
217  }
    // Whatever is still left becomes a document default.
218  if (pItemSet->Count())
219  rDoc.SetDefault(*pItemSet);
220  }
221  else if( pInsPam )
222  {
223  // then set over the insert range the defined attributes
    // Mark = current end of the inserted text; Point = node after the
    // remembered start node, at the remembered content index.
224  *pInsPam->GetMark() = *pPam->GetPoint();
225  ++pInsPam->GetPoint()->nNode;
226  pInsPam->GetPoint()->nContent.Assign(
227  pInsPam->GetContentNode(), nSttContent );
228 
229  // !!!!!
    // NOTE(review): this assertion fires unconditionally whenever this
    // branch runs; its message reads as an open TODO rather than an
    // error condition.
230  OSL_ENSURE( false, "Have to change - hard attr. to para. style" );
231  rDoc.getIDocumentContentOperations().InsertItemSet( *pInsPam, *pItemSet );
232  }
233  }
234  pItemSet.reset();
235  }
236 
237  pInsPam.reset();
238 
239  ::EndProgress( rDoc.GetDocShell() );
240  return nError;
241 }
242 
// Reads the input stream block by block, turns each block into 16-bit
// characters (through a text converter, or by treating the bytes as UCS-2),
// and inserts the text into the document. Paragraphs are split at line ends,
// at form feeds, and when a paragraph gets too long.
// Returns ERRCODE_NONE when the loop ends, or a read error code if no text
// converter could be created.
243 ErrCode SwASCIIParser::ReadChars()
244 {
    // pStt: current scan position; pEnd: end of the current character block;
    // pLastStt: start of the text that has not been inserted yet.
245  sal_Unicode *pStt = nullptr, *pEnd = nullptr, *pLastStt = nullptr;
    // nReadCnt: byte count reported to the progress bar; nLineLen: characters
    // scanned since the last paragraph break.
246  tools::Long nReadCnt = 0, nLineLen = 0;
    // Set to 0x0d when a block ended directly after a CR in CRLF mode, so that
    // the next block can skip a leading LF.
247  sal_Unicode cLastCR = 0;
248  bool bSwapUnicode = false;
249 
    // If the caller's options equal a default-constructed SwAsciiOptions and
    // the file has at least two bytes, detect charset and line end from the
    // first block and use the local copy (aEmpty) instead of rOpt.
250  const SwAsciiOptions *pUseMe=&rOpt;
251  SwAsciiOptions aEmpty;
252  if (nFileSize >= 2 &&
253  aEmpty.GetFontName() == rOpt.GetFontName() &&
254  aEmpty.GetCharSet() == rOpt.GetCharSet() &&
255  aEmpty.GetLanguage() == rOpt.GetLanguage() &&
256  aEmpty.GetParaFlags() == rOpt.GetParaFlags())
257  {
258  sal_Size nLen, nOrig;
259  nOrig = nLen = rInput.ReadBytes(pArr.get(), ASC_BUFFLEN);
260  rtl_TextEncoding eCharSet;
261  LineEnd eLineEnd;
    // nLen is passed by reference and may be changed by the detection;
    // nOrig keeps the number of bytes actually read.
262  const bool bRet
263  = SwIoSystem::IsDetectableText(pArr.get(), nLen, &eCharSet, &bSwapUnicode, &eLineEnd);
    // NOTE(review): this listing jumps from source line 264 to 266, so the
    // statement controlled by this `if` is not visible here. Restore it
    // from the original file.
264  if (!bRet)
266 
267  OSL_ENSURE(bRet, "Autodetect of text import without nag dialog must have failed");
    // Seek back over the probe bytes so that the main loop reads them again:
    // by nLen after a successful detection, by nOrig otherwise.
268  if (bRet && eCharSet != RTL_TEXTENCODING_DONTKNOW)
269  {
270  aEmpty.SetCharSet(eCharSet);
271  aEmpty.SetParaFlags(eLineEnd);
272  rInput.SeekRel(-(tools::Long(nLen)));
273  }
274  else
275  rInput.SeekRel(-(tools::Long(nOrig)));
276  pUseMe=&aEmpty;
277  }
278 
    // Every charset except UCS-2 goes through a text-to-Unicode converter;
    // an unknown charset is treated as US-ASCII.
279  rtl_TextToUnicodeConverter hConverter=nullptr;
280  rtl_TextToUnicodeContext hContext=nullptr;
281  rtl_TextEncoding currentCharSet = pUseMe->GetCharSet();
282  if (RTL_TEXTENCODING_UCS2 != currentCharSet)
283  {
284  if( currentCharSet == RTL_TEXTENCODING_DONTKNOW )
285  currentCharSet = RTL_TEXTENCODING_ASCII_US;
286  hConverter = rtl_createTextToUnicodeConverter( currentCharSet );
287  OSL_ENSURE( hConverter, "no string convert available" );
288  if (!hConverter)
289  return ErrCode(ErrCodeArea::Sw, ErrCodeClass::Read, 0);
290  bSwapUnicode = false;
291  hContext = rtl_createTextToUnicodeContext( hConverter );
292  }
293  else if (pUseMe != &aEmpty) //Already successfully figured out type
294  {
    // UCS-2 was requested through the caller's options (no auto-detection
    // ran): let the stream prepare for Unicode text and take the
    // byte-swap flag from it.
295  rInput.StartReadingUnicodeText( currentCharSet );
296  bSwapUnicode = rInput.IsEndianSwap();
297  }
298 
    // aWork: converter output buffer, reallocated per block.
    // nArrOffset: number of unconverted bytes carried over at the start of pArr.
299  std::unique_ptr<sal_Unicode[]> aWork;
300  sal_Size nArrOffset = 0;
301 
302  do {
    // Current block used up: flush pending text, then read the next block.
303  if( pStt >= pEnd )
304  {
305  if( pLastStt != pStt )
306  InsertText( OUString( pLastStt ));
307 
308  // Read a new block
    // The loop ends here on a stream error or when no more bytes arrive.
309  sal_Size lGCount;
310  if( ERRCODE_NONE != rInput.GetError() || 0 == (lGCount =
311  rInput.ReadBytes( pArr.get() + nArrOffset,
312  ASC_BUFFLEN - nArrOffset )))
313  break; // break from the while loop
314 
315  /*
316  If there was some unconverted bytes on the last cycle then they
317  were put at the beginning of the array, so total bytes available
318  to convert this cycle includes them. If we found 0 following bytes
319  then we ignore the previous partial character.
320  */
321  lGCount += nArrOffset;
322 
323  if( hConverter )
324  {
325  sal_uInt32 nInfo;
326  sal_Size nNewLen = lGCount, nCntBytes;
327  aWork.reset(new sal_Unicode[nNewLen + 1]); // add 1 for '\0'
328  sal_Unicode* pBuf = aWork.get();
329  pBuf[nNewLen] = 0; // ensure '\0'
330 
331  nNewLen = rtl_convertTextToUnicode( hConverter, hContext,
332  pArr.get(), lGCount, pBuf, nNewLen,
333  (
334  RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT |
335  RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
336  RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT |
337  RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE
338  ),
339  &nInfo,
340  &nCntBytes );
    // Bytes the converter did not consume are moved to the front of
    // pArr, so the next read appends to them.
341  nArrOffset = lGCount - nCntBytes;
342  if( 0 != nArrOffset )
343  memmove( pArr.get(), pArr.get() + nCntBytes, nArrOffset );
344 
345  pStt = pLastStt = aWork.get();
346  pEnd = pStt + nNewLen;
347  }
348  else
349  {
    // UCS-2: the byte buffer itself is used as the character buffer.
    // NOTE(review): with an odd lGCount the last byte is dropped, because
    // nArrOffset is only updated in the converter branch. Confirm
    // whether that can happen in practice.
350  pStt = pLastStt = reinterpret_cast<sal_Unicode*>(pArr.get());
351  auto nChars = lGCount / 2;
352  pEnd = pStt + nChars;
353 
354  if( bSwapUnicode )
355  {
    // Swap the two bytes of every 16-bit unit in place.
356  char* pF = pArr.get(), *pN = pArr.get() + 1;
357  for (sal_Size n = 0; n < nChars; ++n, pF += 2, pN += 2)
358  {
359  char c = *pF;
360  *pF = *pN;
361  *pN = c;
362  }
363  }
364  }
365 
    // Terminate the block; the text is later passed on as OUString( pLastStt ).
366  *pEnd = 0;
    // NOTE(review): lGCount already includes the nArrOffset bytes carried
    // over from the previous block, so those bytes are counted a second
    // time here. nReadCnt is compared with nFileSize in the 0x1a case
    // below, so confirm whether this is intended.
367  nReadCnt += lGCount;
368 
369  ::SetProgressState( nReadCnt, rDoc.GetDocShell() );
370 
    // The previous block ended right after a CR (CRLF mode): skip the
    // LF that completes the pair, then do the paragraph split that was
    // postponed.
371  if( cLastCR )
372  {
373  if( 0x0a == *pStt && 0x0d == cLastCR )
374  pLastStt = ++pStt;
375  cLastCR = 0;
376  nLineLen = 0;
377  // We skip the last one at the end
378  if( !rInput.eof() || !(pEnd == pStt ||
379  ( !*pEnd && pEnd == pStt+1 ) ) )
380  rDoc.getIDocumentContentOperations().SplitNode( *pPam->GetPoint(), false );
381  }
382  }
383 
    // bIns: the character stays part of the text; bSplitNode: a line end
    // was found and a new paragraph must be started.
384  bool bIns = true, bSplitNode = false;
385  switch( *pStt )
386  {
387 
    // LF ends a paragraph only in LINEEND_LF mode; in the other modes it
    // falls out of the switch unchanged and is kept as text.
388  case 0x0a: if( LINEEND_LF == pUseMe->GetParaFlags() )
389  {
390  bIns = false;
391  *pStt = 0;
392  ++pStt;
393 
394  // We skip the last one at the end
395  if( !rInput.eof() || pEnd != pStt )
396  bSplitNode = true;
397  }
398  break;
399 
    // CR ends a paragraph in every mode except LINEEND_LF. In CRLF mode a
    // directly following LF is consumed as well; if the block ends right
    // after the CR, the decision is postponed via cLastCR.
400  case 0x0d: if( LINEEND_LF != pUseMe->GetParaFlags() )
401  {
402  bIns = false;
403  *pStt = 0;
404  ++pStt;
405 
406  bool bChkSplit = true;
407  if( LINEEND_CRLF == pUseMe->GetParaFlags() )
408  {
409  if( pStt == pEnd )
410  {
411  cLastCR = 0x0d;
412  bChkSplit = false;
413  }
414  else if( 0x0a == *pStt )
415  ++pStt;
416  }
417 
418  // We skip the last one at the end
419  if( bChkSplit && ( !rInput.eof() || pEnd != pStt ))
420  bSplitNode = true;
421  }
422  break;
423 
424  case 0x0c:
425  {
426  // Insert a hard page break
    // Pending text is flushed first; the new paragraph gets a
    // "page before" break item.
427  *pStt++ = 0;
428  if( nLineLen )
429  {
430  InsertText( OUString( pLastStt ));
431  }
432  rDoc.getIDocumentContentOperations().SplitNode( *pPam->GetPoint(), false );
433  rDoc.getIDocumentContentOperations().InsertPoolItem(
434  *pPam, SvxFormatBreakItem( SvxBreak::PageBefore, RES_BREAK ) );
435  pLastStt = pStt;
436  nLineLen = 0;
437  bIns = false;
438  }
439  break;
440 
    // 0x1a as the very last character of the file is replaced by a
    // terminator and so never inserted; anywhere else it becomes '#'.
441  case 0x1a:
442  if( nReadCnt == nFileSize && pStt+1 == pEnd )
443  *pStt = 0;
444  else
445  *pStt = '#'; // Replacement visualisation
446  break;
447 
    // Tabs are kept as they are.
448  case '\t': break;
449 
450  default:
451  if( ' ' > *pStt )
452  // Found control char, replace with '#'
453  *pStt = '#';
454  break;
455  }
456 
457  if( bIns )
458  {
    // Break over-long paragraphs: at a space once the paragraph is
    // within 100 characters of MAX_ASCII_PARA, or unconditionally at
    // MAX_ASCII_PARA - 1. The current character is temporarily replaced
    // by a terminator so that only the text before it is inserted.
459  if( ( nLineLen >= MAX_ASCII_PARA - 100 ) &&
460  ( ( *pStt == ' ' ) || ( nLineLen >= MAX_ASCII_PARA - 1 ) ) )
461  {
462  sal_Unicode c = *pStt;
463  *pStt = 0;
464  InsertText( OUString( pLastStt ));
465  rDoc.getIDocumentContentOperations().SplitNode( *pPam->GetPoint(), false );
466  pLastStt = pStt;
467  nLineLen = 0;
468  *pStt = c;
469  }
470  ++pStt;
471  ++nLineLen;
472  }
473  else if( bSplitNode )
474  {
475  // We found a CR/LF, thus save the text
    // New documents append a fresh text node; when inserting into an
    // existing document the current node is split instead.
476  InsertText( OUString( pLastStt ));
477  if(bNewDoc)
478  rDoc.getIDocumentContentOperations().AppendTextNode( *pPam->GetPoint() );
479  else
480  rDoc.getIDocumentContentOperations().SplitNode( *pPam->GetPoint(), false );
481  pLastStt = pStt;
482  nLineLen = 0;
483  }
484  } while(true);
485 
    // Release the converter and its context if one was created.
486  if( hConverter )
487  {
488  rtl_destroyTextToUnicodeContext( hConverter, hContext );
489  rtl_destroyTextToUnicodeConverter( hConverter );
490  }
491  return ERRCODE_NONE;
492 }
493 
494 void SwASCIIParser::InsertText( const OUString& rStr )
495 {
496  rDoc.getIDocumentContentOperations().InsertString( *pPam, rStr );
497 
498  if( pItemSet && g_pBreakIt && nScript != ( SvtScriptType::LATIN |
499  SvtScriptType::ASIAN |
500  SvtScriptType::COMPLEX ) )
501  nScript |= g_pBreakIt->GetAllScriptsOfText( rStr );
502 }
503 
504 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
rtl_TextEncoding GetCharSet() const
Definition: shellio.hxx:71
Represents the style of a paragraph.
Definition: fmtcol.hxx:55
void ResetASCIIOpts()
Definition: shellio.hxx:131
constexpr TypedWhichId< SvxFontItem > RES_CHRATR_CTL_FONT(27)
const OUString & GetFontName() const
Definition: shellio.hxx:68
constexpr TypedWhichId< SvxLanguageItem > RES_CHRATR_LANGUAGE(10)
long Long
sal_Int64 n
Definition: doc.hxx:186
LanguageType GetLanguage() const
Definition: shellio.hxx:74
SvtScriptType GetAllScriptsOfText(const OUString &rText) const
Definition: breakit.cxx:127
constexpr TypedWhichId< SvxFormatBreakItem > RES_BREAK(94)
void EndProgress(SwDocShell const *pDocShell)
Definition: mainwn.cxx:92
void SetParaFlags(LineEnd eVal)
Definition: shellio.hxx:78
virtual bool SetFormatAttr(const SfxPoolItem &rAttr) override
Override to recognize changes on the and register/unregister the paragraph style at t...
Definition: fmtcol.cxx:332
constexpr TypedWhichId< SvxFontItem > RES_CHRATR_FONT(7)
sal_uInt16 sal_Unicode
void StartProgress(const char *pMessResId, tools::Long nStartValue, tools::Long nEndValue, SwDocShell *pDocShell)
Definition: mainwn.cxx:52
constexpr TypedWhichId< SvxLanguageItem > RES_CHRATR_CTL_LANGUAGE(29)
SwBreakIt * g_pBreakIt
Definition: breakit.cxx:33
sal_uLong GetIndex() const
Definition: ndindex.hxx:152
const SwAsciiOptions & GetASCIIOpts() const
Definition: shellio.hxx:129
void SetCharSet(rtl_TextEncoding nVal)
Definition: shellio.hxx:72
PaM is Point and Mark: a selection of the document model.
Definition: pam.hxx:136
SvtScriptType
const SwPosition * GetPoint() const
Definition: pam.hxx:207
virtual ErrCode Read(SwDoc &, const OUString &rBaseURL, SwPaM &, const OUString &) override
Definition: parasc.cxx:77
static bool IsDetectableText(const char *pBuf, sal_uLong &rLen, rtl_TextEncoding *pCharSet, bool *pSwap, LineEnd *pLineEnd)
Definition: iodetect.cxx:242
Marks a node in the document model.
Definition: ndindex.hxx:31
#define ASC_BUFFLEN
Definition: parasc.cxx:45
LineEnd
#define ERRCODE_IO_BROKENPACKAGE
#define MAX_ASCII_PARA
Definition: shellio.hxx:56
SwgReaderOption m_aOption
Definition: shellio.hxx:218
#define ERRCODE_NONE
void SetProgressState(tools::Long nPosition, SwDocShell const *pDocShell)
Definition: mainwn.cxx:82
constexpr TypedWhichId< SvxFontItem > RES_CHRATR_CJK_FONT(22)
SvStream * m_pStream
Definition: shellio.hxx:213
#define ERR_SWG_READ_ERROR
Definition: swerror.h:25
LineEnd GetParaFlags() const
Definition: shellio.hxx:77
constexpr TypedWhichId< SvxLanguageItem > RES_CHRATR_CJK_LANGUAGE(24)
bool m_bInsertMode
Definition: shellio.hxx:219