LibreOffice Module filter (master) 1
textfilterdetect/filterdetect.cxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 */
9
10#include "filterdetect.hxx"
11
12#include <svtools/htmltokn.h>
13#include <tools/urlobj.hxx>
14#include <tools/zcodec.hxx>
15#include <ucbhelper/content.hxx>
19
20#include <com/sun/star/io/XInputStream.hpp>
22#include <memory>
23
// Filter names that detect() stores in the media descriptor's filter-name
// property when the type is "generic_Text".
24constexpr OUStringLiteral WRITER_TEXT_FILTER = u"Text";
25constexpr OUStringLiteral CALC_TEXT_FILTER = u"Text - txt - csv (StarCalc)";
26
// Filter names that detect() stores for the "generic_HTML" / "calc_HTML" types.
27constexpr OUStringLiteral WEB_HTML_FILTER = u"HTML";
28constexpr OUStringLiteral WRITER_HTML_FILTER = u"HTML (StarWriter)";
29constexpr OUStringLiteral CALC_HTML_FILTER = u"calc_HTML_WebQuery";
30
// Document service names that detect() compares against the descriptor's
// document-service property to choose between the Writer and Calc filters.
31constexpr OUStringLiteral WRITER_DOCSERVICE = u"com.sun.star.text.TextDocument";
32constexpr OUStringLiteral CALC_DOCSERVICE = u"com.sun.star.sheet.SpreadsheetDocument";
33
34using namespace ::com::sun::star;
36
37namespace {
38
// Decides whether the given input stream looks like HTML by scanning the
// start of its content for the first tag.
//
// The stream is wrapped in an SvStream, a header of nSize units is requested
// from it, and a small character-level state machine walks that header:
//   - leading whitespace is skipped,
//   - "<?...>" declarations are skipped,
//   - "<!" is accepted immediately,
//   - otherwise the name following '<' is extracted and looked up with
//     GetHTMLToken().
//
// @param xInStream  input stream to inspect.
// @return true if "<!" is found in tag-opening position, or if the extracted
//         tag name maps to a token other than HtmlTokenId::NONE; false if the
//         stream cannot be wrapped / reports an error, or the header does not
//         start with a recognised tag.
39bool IsHTMLStream( const uno::Reference<io::XInputStream>& xInStream )
40{
41 std::unique_ptr<SvStream> pInStream( utl::UcbStreamHelper::CreateStream( xInStream ) );
42 if ( !pInStream || pInStream->GetError() )
43 // No stream
44 return false;
45
46 // Read the stream header
    // The stream position after StartReadingUnicodeText() selects how the
    // header is read below (presumably it reflects the length of a consumed
    // byte-order mark - TODO confirm against SvStream documentation).
47 pInStream->StartReadingUnicodeText( RTL_TEXTENCODING_DONTKNOW );
48 const sal_uInt64 nUniPos = pInStream->Tell();
49 const sal_uInt16 nSize = 4096;
50
    // Either way the header ends up as a byte string; the 16-bit variant is
    // converted with RTL_TEXTENCODING_ASCII_US before scanning.
51 OString sHeader;
52 if ( nUniPos == 3 || nUniPos == 0 ) // UTF-8 or non-Unicode
53 sHeader = read_uInt8s_ToOString( *pInStream, nSize );
54 else // UTF-16 (nUniPos = 2)
55 sHeader = OUStringToOString( read_uInt16s_ToOUString( *pInStream, nSize ), RTL_TEXTENCODING_ASCII_US );
56
57 // Now check whether the stream begins with a known HTML tag.
    // dp tracks the position relative to the tag: nothing seen yet, '<' seen,
    // or inside the tag name.
58 enum DetectPhase { BeforeTag, TagOpened, InTagName };
59 DetectPhase dp = BeforeTag;
    // eDeclaration tracks whether a "<?" declaration is currently open.
61 enum DeclarationPhase
62 {
63 BeforeDeclaration,
64 DeclarationOpened
65 };
66 DeclarationPhase eDeclaration = BeforeDeclaration;
67
68 const char* pHeader = sHeader.getStr();
69 const int nLength = sHeader.getLength();
    // i is declared outside the loop because its final value marks the end of
    // the tag name; nStartOfTagIndex marks its beginning.
70 int i = 0, nStartOfTagIndex = 0;
71
72 for ( i = 0; i < nLength; ++i, ++pHeader )
73 {
74 char c = *pHeader;
        // Whitespace is only treated specially outside a "<?" declaration;
        // inside one it falls through to the branches below.
75 if ((c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\f')
76 && eDeclaration == BeforeDeclaration)
77 {
78 if ( dp == TagOpened )
79 return false; // Invalid: Should start with a tag name
80 else if ( dp == InTagName )
81 break; // End of tag name reached
            // In BeforeTag the whitespace is simply skipped.
82 }
83 else if ( c == '<' )
84 {
85 if ( dp == BeforeTag )
86 dp = TagOpened;
87 else
88 return false; // Invalid: Nested '<'
89 }
90 else if ( c == '>' )
91 {
92 if ( dp == InTagName )
93 break; // End of tag name reached
94 else if (eDeclaration == DeclarationOpened)
95 {
                // End of a "<?...>" declaration: reset both state variables
                // so that the next '<' starts a fresh tag.
96 dp = BeforeTag;
97 eDeclaration = BeforeDeclaration;
98 }
99 else
100 return false; // Invalid: Empty tag or before '<'
101 }
102 else if ( c == '!' )
103 {
            // NOTE(review): this branch does not look at eDeclaration, so a
            // '!' inside an open "<?" declaration also returns true here.
104 if ( dp == TagOpened )
105 return true; // "<!" - DOCTYPE or comments block
106 else
107 return false; // Invalid: '!' before '<' or inside tag name
108 }
109 else
110 {
111 if ( dp == BeforeTag )
112 return false; // Invalid: Should start with a tag
113 else if ( dp == TagOpened )
114 {
                // '?' directly after '<' opens a declaration; any other
                // character starts the tag name, unless a declaration is
                // open, in which case the character is ignored.
115 if (c == '?' && eDeclaration == BeforeDeclaration)
116 eDeclaration = DeclarationOpened;
117 else if (eDeclaration == BeforeDeclaration)
118 {
119 nStartOfTagIndex = i;
120 dp = InTagName;
121 }
122 }
            // In InTagName nothing is done: the character belongs to the name.
123 }
124 }
125
126 // The string following '<' has to be a known HTML token.
    // If the loop ended without ever entering InTagName, nStartOfTagIndex is
    // still 0 and i equals nLength, so the whole scanned header is looked up.
    // The candidate is lower-cased before the lookup.
127 OString aToken = sHeader.copy( nStartOfTagIndex, i - nStartOfTagIndex );
128 return GetHTMLToken( OStringToOUString( aToken.toAsciiLowerCase(), RTL_TEXTENCODING_ASCII_US ) ) != HtmlTokenId::NONE;
129}
130}
131
133
135
// Chooses an import filter for HTML and plain-text types.
//
// Reads the type name and document service from the descriptor, then:
//   - for "generic_HTML" / "calc_HTML": verifies the input stream with
//     IsHTMLStream() and picks the Calc, Writer or Web HTML filter;
//   - for "generic_Text": attempts gzip decompression of the stream, then
//     picks the Calc or Writer text filter from the document service or,
//     failing that, from the file name extension;
//   - for any other type: does nothing.
//
// @param lDescriptor  media descriptor; on success it is overwritten with the
//                     updated descriptor (filter name and, after successful
//                     decompression, stream / input stream / URL).
// @return the type name read from the descriptor on success; an empty string
//         if the type is not handled here, or if an HTML type has no input
//         stream or fails the IsHTMLStream() check.
136OUString SAL_CALL PlainTextFilterDetect::detect(uno::Sequence<beans::PropertyValue>& lDescriptor)
137{
138 MediaDescriptor aMediaDesc(lDescriptor);
139
    // Both values default to an empty string when absent from the descriptor.
140 OUString aType = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_TYPENAME, OUString() );
141 OUString aDocService = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_DOCUMENTSERVICE, OUString() );
142
143 if ((aType == "generic_HTML") || (aType == "calc_HTML"))
144 {
145 uno::Reference<io::XInputStream> xInStream(aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM], uno::UNO_QUERY);
        // Reject the type when there is no stream or its content does not
        // start like HTML.
146 if (!xInStream.is() || !IsHTMLStream(xInStream))
147 return OUString();
148
        // The Calc filter wins if either the document service or the type
        // asks for it; the Web filter is the fallback when no known document
        // service is given.
149 if ((aDocService == CALC_DOCSERVICE) || (aType == "calc_HTML"))
150 aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= OUString(CALC_HTML_FILTER);
151 else if (aDocService == WRITER_DOCSERVICE)
152 aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= OUString(WRITER_HTML_FILTER);
153 else
154 aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= OUString(WEB_HTML_FILTER);
155 }
156
157 else if (aType == "generic_Text")
158 {
159 uno::Reference<io::XStream> xStream(aMediaDesc[MediaDescriptor::PROP_STREAM], uno::UNO_QUERY);
160 uno::Reference<io::XInputStream> xInStream(aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM], uno::UNO_QUERY);
161 if (xStream.is() || xInStream.is())
162 {
163 ZCodec aCodecGZ;
164 std::unique_ptr<SvStream> pInStream;
165 if (xStream.is())
            // NOTE(review): the statement for this branch is missing from this
            // listing (its numbering skips from 165 to 167); presumably it
            // creates pInStream from xStream - confirm against the real file.
167 else
168 pInStream = utl::UcbStreamHelper::CreateStream(xInStream);
169 std::unique_ptr<SvMemoryStream> pDecompressedStream(new SvMemoryStream());
            // NOTE(review): pInStream is dereferenced without a null check
            // here, whereas IsHTMLStream() checks the result of CreateStream().
170 if (aCodecGZ.AttemptDecompression(*pInStream, *pDecompressedStream))
171 {
                // Decompression succeeded: hand ownership of the memory stream
                // to a wrapper and publish it as both the stream and the input
                // stream of the descriptor.
172 uno::Reference<io::XStream> xStreamDecompressed(new utl::OStreamWrapper(std::move(pDecompressedStream)));
173 aMediaDesc[MediaDescriptor::PROP_STREAM] <<= xStreamDecompressed;
174 aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM] <<= xStreamDecompressed->getInputStream();
175 OUString aURL = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_URL, OUString() );
                // Truncate the URL at the last ".gz" so that the extension
                // check below sees the inner file name.
                // NOTE(review): lastIndexOf matches the last occurrence
                // anywhere in the URL, not only a trailing suffix.
176 sal_Int32 nIdx = aURL.lastIndexOf(".gz");
177 if (nIdx != -1)
178 aMediaDesc[MediaDescriptor::PROP_URL] <<= aURL.copy(0, nIdx);
179 }
180 }
181 // Get the file name extension.
182 INetURLObject aParser(aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_URL, OUString() ) );
        // NOTE(review): the declaration of aExt is missing from this listing
        // (its numbering skips from 182 to 184); presumably it is obtained
        // from aParser - confirm against the real file.
184 aExt = aExt.toAsciiLowerCase();
185 OUString aName = aParser.getName().toAsciiLowerCase();
186
187 // Decide which filter to use based on the document service first,
188 // then on extension if that's not available.
189
190 if (aDocService == CALC_DOCSERVICE)
191 aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= OUString(CALC_TEXT_FILTER);
192 else if (aDocService == WRITER_DOCSERVICE)
193 aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= OUString(WRITER_TEXT_FILTER);
        // Spreadsheet-like extensions (and names ending in ".csv.gz") go to
        // the Calc text filter; everything else goes to the Writer one.
194 else if (aExt == "csv" || aExt == "tsv" || aExt == "tab" || aExt == "xls" || aName.endsWith(".csv.gz"))
195 aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= OUString(CALC_TEXT_FILTER);
196 else
197 aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= OUString(WRITER_TEXT_FILTER);
198 }
199
200 else
201 // Nothing to detect.
202 return OUString();
203
    // Write the modified descriptor back to the caller and confirm the type.
204 aMediaDesc >> lDescriptor;
205 return aType;
206}
207
208// XInitialization
209
// Empty implementation: the argument sequence is left unnamed and nothing is
// done with it.
210void SAL_CALL PlainTextFilterDetect::initialize(const uno::Sequence<uno::Any>& /*aArguments*/)
211{
212}
213
215{
216 return "com.sun.star.comp.filters.PlainTextFilterDetect";
217}
218
220{
221 return { "com.sun.star.document.ExtendedTypeDetection", "com.sun.star.comp.filters.PlainTextFilterDetect" };
222}
223
224// XServiceInfo
226{
228}
229
// Reports whether this object supports the named service.
//
// @param rServiceName  service name to test.
// @return the result of cppu::supportsService() called with this object and
//         rServiceName.
230sal_Bool SAL_CALL PlainTextFilterDetect::supportsService(const OUString& rServiceName)
231{
232 return cppu::supportsService(this, rServiceName);
233}
234
235uno::Sequence<OUString> SAL_CALL PlainTextFilterDetect::getSupportedServiceNames()
236{
238}
239
240extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface*
242 css::uno::Sequence<css::uno::Any> const &)
243{
244 return cppu::acquire(new PlainTextFilterDetect);
245}
246
247/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
Reference< XInputStream > xStream
OUString getName(sal_Int32 nIndex=LAST_SEGMENT, bool bIgnoreFinalSlash=true, DecodeMechanism eMechanism=DecodeMechanism::ToIUri, rtl_TextEncoding eCharset=RTL_TEXTENCODING_UTF8) const
OUString getExtension(sal_Int32 nIndex=LAST_SEGMENT, bool bIgnoreFinalSlash=true, DecodeMechanism eMechanism=DecodeMechanism::ToIUri, rtl_TextEncoding eCharset=RTL_TEXTENCODING_UTF8) const
virtual css::uno::Sequence< OUString > SAL_CALL getSupportedServiceNames() override
virtual sal_Bool SAL_CALL supportsService(const OUString &ServiceName) override
virtual OUString SAL_CALL getImplementationName() override
virtual OUString SAL_CALL detect(css::uno::Sequence< css::beans::PropertyValue > &lDescriptor) override
virtual void SAL_CALL initialize(const css::uno::Sequence< css::uno::Any > &aArguments) override
bool AttemptDecompression(SvStream &rIStm, SvStream &rOStm)
static std::unique_ptr< SvStream > CreateStream(const OUString &rFileName, StreamMode eOpenMode, css::uno::Reference< css::awt::XWindow > xParentWin=nullptr)
URL aURL
float u
SVT_DLLPUBLIC HtmlTokenId GetHTMLToken(std::u16string_view rName)
OUString aName
Shape IDs per cluster in DGG atom.
bool CPPUHELPER_DLLPUBLIC supportsService(css::lang::XServiceInfo *implementation, rtl::OUString const &name)
int i
Definition: gentoken.py:48
OString OUStringToOString(std::u16string_view str, ConnectionSettings const *settings)
TOOLS_DLLPUBLIC OString read_uInt8s_ToOString(SvStream &rStrm, std::size_t nUnits)
TOOLS_DLLPUBLIC OUString read_uInt16s_ToOUString(SvStream &rStrm, std::size_t nUnits)
constexpr OUStringLiteral CALC_TEXT_FILTER
uno::Sequence< OUString > PlainTextFilterDetect_getSupportedServiceNames()
constexpr OUStringLiteral WEB_HTML_FILTER
SAL_DLLPUBLIC_EXPORT css::uno::XInterface * com_sun_star_comp_filters_PlainTextFilterDetect_get_implementation(css::uno::XComponentContext *, css::uno::Sequence< css::uno::Any > const &)
constexpr OUStringLiteral WRITER_DOCSERVICE
OUString PlainTextFilterDetect_getImplementationName()
constexpr OUStringLiteral WRITER_TEXT_FILTER
constexpr OUStringLiteral CALC_DOCSERVICE
constexpr OUStringLiteral WRITER_HTML_FILTER
constexpr OUStringLiteral CALC_HTML_FILTER
unsigned char sal_Bool
sal_Int32 nLength