LibreOffice Module sw (master) 1
iodetect.cxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
20#include <iodetect.hxx>
21#include <memory>
22#include <osl/endian.h>
23#include <sot/storage.hxx>
24#include <tools/urlobj.hxx>
26#include <sfx2/docfilt.hxx>
27#include <sfx2/fcontnr.hxx>
28#include <sfx2/docfile.hxx>
29#include <com/sun/star/ucb/ContentCreationException.hpp>
30#include <com/sun/star/embed/XStorage.hpp>
31#include <unicode/ucsdet.h>
32
33using namespace ::com::sun::star;
34
// Body of the file-local helper IsDocShellRegistered(); its signature line is
// not part of this extract. It simply forwards the result of
// SvtModuleOptions().IsWriter() (presumably "is the Writer module available"
// -- confirm against SvtModuleOptions).
36{
37 return SvtModuleOptions().IsWriter();
38}
39
// Initializer of the SwIoDetect aFilterDetect[] table. Both the declaration
// line and the table entries are missing from this extract, so only the
// opening and closing braces are visible here.
41{
53};
54
// Body of SwIoSystem::GetSubStorageName(const SfxFilter& rFltr); the signature
// line is not part of this extract.
// Maps the filter's user data string to the name of the element that must
// exist inside a storage for that filter:
//   - FILTER_XML / FILTER_XMLV / FILTER_XMLVW -> "content.xml"
//   - sWW6 / FILTER_WW8                       -> "WordDocument"
//   - anything else                           -> empty string
56{
57 // for StorageFilters also set the SubStorageName
58 const OUString& rUserData = rFltr.GetUserData();
59 if (rUserData == FILTER_XML ||
60 rUserData == FILTER_XMLV ||
61 rUserData == FILTER_XMLVW)
62 return "content.xml";
63 if (rUserData == sWW6 || rUserData == FILTER_WW8)
64 return "WordDocument";
// No known storage based format: no sub storage name.
65 return OUString();
66}
67
// Find the filter whose user data string equals the internal format name.
//
// rFormatNm  internal format name, compared against SfxFilter::GetUserData().
// pCnt       optional container to search. When given, only that container is
//            searched. When null, the search starts in aCntSw or aCntSwWeb
//            (chosen by IsDocShellRegistered()) and, if it started in aCntSw,
//            continues in aCntSwWeb.
// Returns the first matching filter, or nullptr when none matches.
//
// NOTE(review): the declarations of aCntSw / aCntSwWeb are on lines that are
// missing from this extract (between the opening brace and the first
// statement) -- presumably local SfxFilterContainer objects; confirm.
68std::shared_ptr<const SfxFilter> SwIoSystem::GetFilterOfFormat(std::u16string_view rFormatNm,
69 const SfxFilterContainer* pCnt)
70{
73 const SfxFilterContainer* pFltCnt = pCnt ? pCnt : ( IsDocShellRegistered() ? &aCntSw : &aCntSwWeb );
74
75 do {
76 if( pFltCnt )
77 {
// Walk all filters registered under the container's name.
78 SfxFilterMatcher aMatcher( pFltCnt->GetName() );
79 SfxFilterMatcherIter aIter( aMatcher );
80 std::shared_ptr<const SfxFilter> pFilter = aIter.First();
81 while ( pFilter )
82 {
83 if( pFilter->GetUserData() == rFormatNm )
84 return pFilter;
85 pFilter = aIter.Next();
86 }
87 }
// Stop after an explicitly requested container, or once the web container
// has been searched; otherwise retry with the web container.
88 if( pCnt || pFltCnt == &aCntSwWeb )
89 break;
90 pFltCnt = &aCntSwWeb;
91 } while( true );
92 return nullptr;
93}
94
95bool SwIoSystem::IsValidStgFilter( const css::uno::Reference < css::embed::XStorage >& rStg, const SfxFilter& rFilter)
96{
97 bool bRet = false;
98 try
99 {
100 SotClipboardFormatId nStgFormatId = SotStorage::GetFormatID( rStg );
101 bRet = rStg->isStreamElement( "content.xml" );
102 if ( bRet )
103 bRet = ( nStgFormatId != SotClipboardFormatId::NONE && ( rFilter.GetFormat() == nStgFormatId ) );
104 }
105 catch (const css::uno::Exception& )
106 {
107 }
108
109 return bRet;
110}
111
// Body of SwIoSystem::IsValidStgFilter(SotStorage& rStg, const SfxFilter& rFilter);
// the signature line is not part of this extract.
// Decides whether an OLE storage can be handled by the given filter:
//   1. the storage must be error free,
//   2. its format id must match the filter's format (the id is ignored for the
//      WinWord filters FILTER_WW8 / sWW6),
//   3. it must contain the sub storage/stream named by GetSubStorageName().
// For the WinWord filters two further checks distinguish the variants.
113{
114 SotClipboardFormatId nStgFormatId = rStg.GetFormat();
115 /*#i8409# We cannot trust the clipboard id anymore :-(*/
116 if (rFilter.GetUserData() == FILTER_WW8 || rFilter.GetUserData() == sWW6)
117 nStgFormatId = SotClipboardFormatId::NONE;
118
119 bool bRet = ERRCODE_NONE == rStg.GetError() &&
120 ( nStgFormatId == SotClipboardFormatId::NONE || rFilter.GetFormat() == nStgFormatId ) &&
121 ( rStg.IsContained( SwIoSystem::GetSubStorageName( rFilter )) );
122 if( bRet )
123 {
124 /* Bug 53445 - there are Excel Docs w/o ClipBoardId! */
125 /* Bug 62703 - and also WinWord Docs w/o ClipBoardId! */
126 if (rFilter.GetUserData() == FILTER_WW8 || rFilter.GetUserData() == sWW6)
127 {
// A "0Table"/"1Table" element must be present exactly when the filter is
// FILTER_WW8, and absent for sWW6.
128 bRet = (rStg.IsContained("0Table")
129 || rStg.IsContained("1Table"))
130 == (rFilter.GetUserData() == FILTER_WW8);
131 if (bRet && !rFilter.IsAllowedAsTemplate())
132 {
// NOTE(review): the declaration of xRef (the left-hand side of this
// OpenSotStream call) is on a line missing from this extract.
134 rStg.OpenSotStream("WordDocument",
135 StreamMode::STD_READ );
// Bit 0 of the byte at offset 10 of the "WordDocument" stream must be clear
// for a filter that is not allowed as template (presumably the "is template"
// flag of the Word file header -- confirm against the format spec).
// NOTE(review): nByte is uninitialized and neither the stream reference nor
// the result of Seek/ReadUChar is checked before nByte is evaluated.
136 xRef->Seek(10);
137 sal_uInt8 nByte;
138 xRef->ReadUChar( nByte );
139 bRet = !(nByte & 1);
140 }
141 }
142 }
143 return bRet;
144}
145
146// Check the type of the stream (file) by searching for corresponding set of bytes.
147// If no known type is found, return ASCII for now!
148// Returns the internal FilterName.
//
// rFileName  name/URL of the file to inspect.
// Returns nullptr when no filter is registered at all. For a storage based
// file the first matching non-template filter is returned, falling back to a
// matching template filter; if neither matches, the result is the exhausted
// (null) iterator value.
//
// NOTE(review): three lines are missing from this extract: the declaration of
// aCntSw, the declaration of xStg, and the final statement of the function
// (per the comment above, presumably the text/ASCII fallback -- confirm).
149std::shared_ptr<const SfxFilter> SwIoSystem::GetFileFilter(const OUString& rFileName)
150{
152 SfxFilterContainer aCntSwWeb( sSWRITERWEB );
153 const SfxFilterContainer* pFCntnr = IsDocShellRegistered() ? &aCntSw : &aCntSwWeb;
154
155 SfxFilterMatcher aMatcher( pFCntnr->GetName() );
156 SfxFilterMatcherIter aIter( aMatcher );
157 std::shared_ptr<const SfxFilter> pFilter = aIter.First();
158 if ( !pFilter )
159 return nullptr;
160
161 if (SotStorage::IsStorageFile(rFileName))
162 {
163 // package storage or OLEStorage based format
// Normalise the file name into a file URL and open it read-only.
165 INetURLObject aObj;
166 aObj.SetSmartProtocol( INetProtocol::File );
167 aObj.SetSmartURL( rFileName );
168 SfxMedium aMedium(aObj.GetMainURL(INetURLObject::DecodeMechanism::NONE), StreamMode::STD_READ);
169
170 // templates should not get precedence over "normal" filters (#i35508, #i33168)
171 std::shared_ptr<const SfxFilter> pTemplateFilter;
172 if (aMedium.IsStorage())
173 {
// Package (XStorage) based format.
174 uno::Reference<embed::XStorage> const xStor = aMedium.GetStorage();
175 if ( xStor.is() )
176 {
177 while ( pFilter )
178 {
// Only filters whose user data starts with "C" are considered here.
179 if (pFilter->GetUserData().startsWith("C") && IsValidStgFilter(xStor, *pFilter ))
180 {
181 if (pFilter->IsOwnTemplateFormat())
182 {
183 // found template filter; maybe there's a "normal" one also
184 pTemplateFilter = pFilter;
185 }
186 else
187 return pFilter;
188 }
189
190 pFilter = aIter.Next();
191 }
192
193 // there's only a template filter that could be found
194 if ( pTemplateFilter )
195 pFilter = pTemplateFilter;
196 }
197 }
198 else
199 {
// OLE storage based format: wrap the input stream in a SotStorage.
200 try
201 {
202 SvStream *const pStream = aMedium.GetInStream();
203 if ( pStream && SotStorage::IsStorageFile(pStream) )
204 xStg = new SotStorage( pStream, false );
205 }
206 catch (const css::ucb::ContentCreationException &)
207 {
// Deliberately ignored: xStg is left unset and the check below is skipped.
208 }
209
210 if( xStg.is() && ( xStg->GetError() == ERRCODE_NONE ) )
211 {
212 while ( pFilter )
213 {
214 if (pFilter->GetUserData().startsWith("C") && IsValidStgFilter(*xStg, *pFilter))
215 {
216 if (pFilter->IsOwnTemplateFormat())
217 {
218 // found template filter; maybe there's a "normal" one also
219 pTemplateFilter = pFilter;
220 }
221 else
222 return pFilter;
223 }
224
225 pFilter = aIter.Next();
226 }
227
228 // there's only a template filter that could be found
229 if ( pTemplateFilter )
230 pFilter = pTemplateFilter;
231
232 }
233 }
234
// NOTE(review): if the storage could not be opened, pFilter still holds the
// first registered filter at this point and is returned as is.
235 return pFilter;
236 }
237
239}
240
241rtl_TextEncoding SwIoSystem::GetTextEncoding(SvStream& rStrm)
242{
243 sal_Size nLen, nOrig;
244 char aBuf[4096];
245 nOrig = nLen = rStrm.ReadBytes(aBuf, sizeof(aBuf));
246
247 rtl_TextEncoding eCharSet;
248 const bool bRet = SwIoSystem::IsDetectableText(aBuf, nLen, &eCharSet, nullptr, nullptr, nullptr);
249 if (bRet && eCharSet != RTL_TEXTENCODING_DONTKNOW)
250 rStrm.SeekRel(-(tools::Long(nLen)));
251 else
252 rStrm.SeekRel(-(tools::Long(nOrig)));
253
254 return eCharSet;
255}
256
257bool SwIoSystem::IsDetectableText(const char* pBuf, sal_uLong &rLen,
258 rtl_TextEncoding *pCharSet, bool *pSwap, LineEnd *pLineEnd, bool *pBom)
259{
260 bool bSwap = false;
261 rtl_TextEncoding eCharSet = RTL_TEXTENCODING_DONTKNOW;
262 bool bLE = true;
263 bool bBom = false;
264 /*See if it's a known unicode type*/
265 if (rLen >= 2)
266 {
267 sal_uLong nHead=0;
268 if (rLen > 2 && sal_uInt8(pBuf[0]) == 0xEF && sal_uInt8(pBuf[1]) == 0xBB &&
269 sal_uInt8(pBuf[2]) == 0xBF)
270 {
271 eCharSet = RTL_TEXTENCODING_UTF8;
272 nHead = 3;
273 bBom = true;
274 }
275 else if (sal_uInt8(pBuf[0]) == 0xFE && sal_uInt8(pBuf[1]) == 0xFF)
276 {
277 eCharSet = RTL_TEXTENCODING_UCS2;
278 bLE = false;
279 nHead = 2;
280 bBom = true;
281 }
282 else if (sal_uInt8(pBuf[1]) == 0xFE && sal_uInt8(pBuf[0]) == 0xFF)
283 {
284 eCharSet = RTL_TEXTENCODING_UCS2;
285 nHead = 2;
286 bBom = true;
287 }
288 pBuf+=nHead;
289 rLen-=nHead;
290 }
291 /*See unicode type again without BOM*/
292 if (rLen >= 1 && eCharSet == RTL_TEXTENCODING_DONTKNOW)
293 {
294 UErrorCode uerr = U_ZERO_ERROR;
295 UCharsetDetector* ucd = ucsdet_open(&uerr);
296 ucsdet_setText(ucd, pBuf, rLen, &uerr);
297 if (const UCharsetMatch* match = ucsdet_detect(ucd, &uerr))
298 {
299 const char* pEncodingName = ucsdet_getName(match, &uerr);
300
301 if (U_SUCCESS(uerr) && !strcmp("UTF-8", pEncodingName))
302 {
303 eCharSet = RTL_TEXTENCODING_UTF8; // UTF-8
304 }
305 else if (U_SUCCESS(uerr) && !strcmp("UTF-16BE", pEncodingName))
306 {
307 eCharSet = RTL_TEXTENCODING_UCS2; // UTF-16BE
308 bLE = false;
309 }
310 else if (U_SUCCESS(uerr) && !strcmp("UTF-16LE", pEncodingName))
311 {
312 eCharSet = RTL_TEXTENCODING_UCS2; // UTF-16LE
313 }
314 else if (U_SUCCESS(uerr) && !strcmp("GB18030", pEncodingName))
315 {
316 eCharSet = RTL_TEXTENCODING_GB_18030;
317 }
318 }
319
320 ucsdet_close(ucd);
321 }
322
323 bool bCR = false, bLF = false, bIsBareUnicode = false;
324
325 if (eCharSet != RTL_TEXTENCODING_DONTKNOW)
326 {
327 std::unique_ptr<sal_Unicode[]> aWork(new sal_Unicode[rLen+1]);
328 sal_Unicode *pNewBuf = aWork.get();
329 std::size_t nNewLen;
330 if (eCharSet != RTL_TEXTENCODING_UCS2)
331 {
332 nNewLen = rLen;
333 rtl_TextToUnicodeConverter hConverter =
334 rtl_createTextToUnicodeConverter(eCharSet);
335 rtl_TextToUnicodeContext hContext =
336 rtl_createTextToUnicodeContext(hConverter);
337
338 sal_Size nCntBytes;
339 sal_uInt32 nInfo;
340 nNewLen = rtl_convertTextToUnicode( hConverter, hContext, pBuf,
341 rLen, pNewBuf, nNewLen,
342 (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT |
343 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
344 RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT), &nInfo, &nCntBytes);
345
346 rtl_destroyTextToUnicodeContext(hConverter, hContext);
347 rtl_destroyTextToUnicodeConverter(hConverter);
348 }
349 else
350 {
351 nNewLen = rLen/2;
352 memcpy(pNewBuf, pBuf, rLen);
353#ifdef OSL_LITENDIAN
354 bool const bNativeLE = true;
355#else
356 bool const bNativeLE = false;
357#endif
358 if (bLE != bNativeLE)
359 {
360 bSwap = true;
361 char* pF = reinterpret_cast<char*>(pNewBuf);
362 char* pN = pF+1;
363 for(sal_uLong n = 0; n < nNewLen; ++n, pF+=2, pN+=2 )
364 {
365 char c = *pF;
366 *pF = *pN;
367 *pN = c;
368 }
369 }
370 }
371
372 for (sal_uLong nCnt = 0; nCnt < nNewLen; ++nCnt, ++pNewBuf)
373 {
374 switch (*pNewBuf)
375 {
376 case 0xA:
377 bLF = true;
378 break;
379 case 0xD:
380 bCR = true;
381 break;
382 default:
383 break;
384 }
385 }
386 }
387 else
388 {
389 for( sal_uLong nCnt = 0; nCnt < rLen; ++nCnt, ++pBuf )
390 {
391 switch (*pBuf)
392 {
393 case 0x0:
394 if( nCnt + 1 < rLen && !*(pBuf+1) )
395 return false;
396 bIsBareUnicode = true;
397 break;
398 case 0xA:
399 bLF = true;
400 break;
401 case 0xD:
402 bCR = true;
403 break;
404 case 0xC:
405 case 0x1A:
406 case 0x9:
407 break;
408 default:
409 break;
410 }
411 }
412 }
413
414 LineEnd eSysLE = GetSystemLineEnd();
415 LineEnd eLineEnd;
416 if (!bCR && !bLF)
417 eLineEnd = eSysLE;
418 else
419 eLineEnd = bCR ? ( bLF ? LINEEND_CRLF : LINEEND_CR ) : LINEEND_LF;
420
421 if (pCharSet)
422 *pCharSet = eCharSet;
423 if (pSwap)
424 *pSwap = bSwap;
425 if (pLineEnd)
426 *pLineEnd = eLineEnd;
427 if (pBom)
428 *pBom = bBom;
429
430 return !bIsBareUnicode;
431}
432
433/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
void SetSmartProtocol(INetProtocol eTheSmartScheme)
OUString GetMainURL(DecodeMechanism eMechanism, rtl_TextEncoding eCharset=RTL_TEXTENCODING_UTF8) const
bool SetSmartURL(std::u16string_view rTheAbsURIRef, EncodeMechanism eMechanism=EncodeMechanism::WasEncoded, rtl_TextEncoding eCharset=RTL_TEXTENCODING_UTF8, FSysStyle eStyle=FSysStyle::Detect)
OUString const & GetName() const
std::shared_ptr< const SfxFilter > First()
std::shared_ptr< const SfxFilter > Next()
const OUString & GetUserData() const
bool IsAllowedAsTemplate() const
SotClipboardFormatId GetFormat() const
css::uno::Reference< css::embed::XStorage > GetStorage(bool bCreateTempFile=true)
bool IsStorage()
SvStream * GetInStream()
bool IsContained(const OUString &rEleName) const
static SotClipboardFormatId GetFormatID(css::uno::Reference< css::embed::XStorage > const &xStorage)
tools::SvRef< SotStorageStream > OpenSotStream(const OUString &rEleName, StreamMode=StreamMode::STD_READWRITE)
static bool IsStorageFile(OUString const &rFileName)
SotClipboardFormatId GetFormat()
ErrCode GetError() const
std::size_t ReadBytes(void *pData, std::size_t nSize)
sal_uInt64 SeekRel(sal_Int64 nPos)
bool IsWriter() const
static std::shared_ptr< const SfxFilter > GetFileFilter(const OUString &rFileName)
Detect for the given file which filter should be used.
Definition: iodetect.cxx:149
static bool IsDetectableText(const char *pBuf, sal_uLong &rLen, rtl_TextEncoding *pCharSet, bool *pSwap, LineEnd *pLineEnd, bool *pBom)
Definition: iodetect.cxx:257
static OUString GetSubStorageName(const SfxFilter &rFltr)
Definition: iodetect.cxx:55
static SW_DLLPUBLIC std::shared_ptr< const SfxFilter > GetFilterOfFormat(std::u16string_view rFormat, const SfxFilterContainer *pCnt=nullptr)
find for an internal format name the corresponding filter entry
Definition: iodetect.cxx:68
static bool IsValidStgFilter(SotStorage &, const SfxFilter &)
Definition: iodetect.cxx:112
static SW_DLLPUBLIC rtl_TextEncoding GetTextEncoding(SvStream &)
Definition: iodetect.cxx:241
bool is() const
#define ERRCODE_NONE
SotClipboardFormatId
SwIoDetect aFilterDetect[]
Definition: iodetect.cxx:40
static bool IsDocShellRegistered()
Definition: iodetect.cxx:35
constexpr OUStringLiteral FILTER_WW8
WinWord 97 filter.
Definition: iodetect.hxx:35
constexpr OUStringLiteral FILTER_TEXT_DLG
text filter with encoding dialog
Definition: iodetect.hxx:36
constexpr OUStringLiteral sRtfWH
Definition: iodetect.hxx:32
constexpr OUStringLiteral FILTER_RTF
RTF filter.
Definition: iodetect.hxx:31
constexpr OUStringLiteral FILTER_XML
XML filter.
Definition: iodetect.hxx:37
constexpr OUStringLiteral FILTER_BAS
StarBasic (identical to ANSI)
Definition: iodetect.hxx:34
#define FILTER_XMLVW
XML filter.
Definition: iodetect.hxx:39
constexpr OUStringLiteral sWW5
Definition: iodetect.hxx:42
constexpr OUStringLiteral FILTER_DOCX
Definition: iodetect.hxx:40
constexpr OUStringLiteral sSWRITERWEB
Definition: iodetect.hxx:46
#define FILTER_XMLV
XML filter.
Definition: iodetect.hxx:38
constexpr OUStringLiteral sWW6
Definition: iodetect.hxx:43
constexpr OUStringLiteral sSWRITER
Definition: iodetect.hxx:45
constexpr OUStringLiteral sHTML
Definition: iodetect.hxx:41
constexpr OUStringLiteral FILTER_TEXT
text filter with default codeset
Definition: iodetect.hxx:33
sal_Int64 n
LineEnd
LINEEND_LF
LINEEND_CRLF
LINEEND_CR
LineEnd GetSystemLineEnd()
aBuf
bool match(const sal_Unicode *pWild, const sal_Unicode *pStr, const sal_Unicode cEscape)
void SvStream & rStrm
long Long
sal_uIntPtr sal_uLong
unsigned char sal_uInt8
sal_uInt16 sal_Unicode