// LibreOffice module ucb (master): regexp.cxx
// (Source listing taken from the generated documentation of this file.)
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
20#include <regexp.hxx>
21
22#include <cstddef>
23
24#include <osl/diagnose.h>
25#include <com/sun/star/lang/IllegalArgumentException.hpp>
26#include <rtl/character.hxx>
27#include <rtl/ustrbuf.hxx>
28#include <rtl/ustring.hxx>
29#include <utility>
30
31using namespace com::sun::star;
32using namespace ucb_impl;
33
34
35// Regexp
36
37
38inline Regexp::Regexp(Kind eTheKind, OUString aThePrefix,
39 bool bTheEmptyDomain, OUString aTheInfix,
40 bool bTheTranslation,
41 OUString aTheReversePrefix):
42 m_eKind(eTheKind),
43 m_aPrefix(std::move(aThePrefix)),
44 m_aInfix(std::move(aTheInfix)),
45 m_aReversePrefix(std::move(aTheReversePrefix)),
46 m_bEmptyDomain(bTheEmptyDomain),
47 m_bTranslation(bTheTranslation)
48{
49 OSL_ASSERT(m_eKind == KIND_DOMAIN
50 || (!m_bEmptyDomain && m_aInfix.isEmpty()));
51 OSL_ASSERT(m_bTranslation || m_aReversePrefix.isEmpty());
52}
53
54
55namespace {
56
57bool matchStringIgnoreCase(sal_Unicode const ** pBegin,
58 sal_Unicode const * pEnd,
59 OUString const & rString)
60{
61 sal_Unicode const * p = *pBegin;
62
63 sal_Unicode const * q = rString.getStr();
64 sal_Unicode const * qEnd = q + rString.getLength();
65
66 if (pEnd - p < qEnd - q)
67 return false;
68
69 while (q != qEnd)
70 {
71 if (rtl::compareIgnoreAsciiCase(*p++, *q++) != 0)
72 return false;
73 }
74
75 *pBegin = p;
76 return true;
77}
78
79}
80
81bool Regexp::matches(OUString const & rString) const
82{
83 sal_Unicode const * pBegin = rString.getStr();
84 sal_Unicode const * pEnd = pBegin + rString.getLength();
85
86 bool bMatches = false;
87
88 sal_Unicode const * p = pBegin;
89 if (matchStringIgnoreCase(&p, pEnd, m_aPrefix))
90 {
91 switch (m_eKind)
92 {
93 case KIND_PREFIX:
94 bMatches = true;
95 break;
96
97 case KIND_AUTHORITY:
98 bMatches = p == pEnd || *p == '/' || *p == '?' || *p == '#';
99 break;
100
101 case KIND_DOMAIN:
102 if (!m_bEmptyDomain)
103 {
104 if (p == pEnd || *p == '/' || *p == '?' || *p == '#')
105 break;
106 ++p;
107 }
108 for (;;)
109 {
110 sal_Unicode const * q = p;
111 if (matchStringIgnoreCase(&q, pEnd, m_aInfix)
112 && (q == pEnd || *q == '/' || *q == '?' || *q == '#'))
113 {
114 bMatches = true;
115 break;
116 }
117
118 if (p == pEnd)
119 break;
120
121 sal_Unicode c = *p++;
122 if (c == '/' || c == '?' || c == '#')
123 break;
124 }
125 break;
126 }
127 }
128
129 return bMatches;
130}
131
132
133namespace {
134
135bool isScheme(OUString const & rString, bool bColon)
136{
137 // Return true if rString matches <scheme> (plus a trailing ":" if bColon
138 // is true) from RFC 2396:
139 sal_Unicode const * p = rString.getStr();
140 sal_Unicode const * pEnd = p + rString.getLength();
141 if (p != pEnd && rtl::isAsciiAlpha(*p))
142 for (++p;;)
143 {
144 if (p == pEnd)
145 return !bColon;
146 sal_Unicode c = *p++;
147 if (!(rtl::isAsciiAlphanumeric(c)
148 || c == '+' || c == '-' || c == '.'))
149 return bColon && c == ':' && p == pEnd;
150 }
151 return false;
152}
153
154void appendStringLiteral(OUStringBuffer * pBuffer,
155 OUString const & rString)
156{
157 OSL_ASSERT(pBuffer);
158
159 pBuffer->append('"');
160 sal_Unicode const * p = rString.getStr();
161 sal_Unicode const * pEnd = p + rString.getLength();
162 while (p != pEnd)
163 {
164 sal_Unicode c = *p++;
165 if (c == '"' || c == '\\')
166 pBuffer->append('\\');
167 pBuffer->append(c);
168 }
169 pBuffer->append('"');
170}
171
172}
173
174OUString Regexp::getRegexp() const
175{
176 if (m_bTranslation)
177 {
178 OUStringBuffer aBuffer;
179 if (!m_aPrefix.isEmpty())
180 appendStringLiteral(&aBuffer, m_aPrefix);
181 switch (m_eKind)
182 {
183 case KIND_PREFIX:
184 aBuffer.append("(.*)");
185 break;
186
187 case KIND_AUTHORITY:
188 aBuffer.append("(([/?#].*)?)");
189 break;
190
191 case KIND_DOMAIN:
192 aBuffer.append("([^/?#]");
193 aBuffer.append(sal_Unicode(m_bEmptyDomain ? '*' : '+'));
194 if (!m_aInfix.isEmpty())
195 appendStringLiteral(&aBuffer, m_aInfix);
196 aBuffer.append("([/?#].*)?)");
197 break;
198 }
199 aBuffer.append("->");
200 if (!m_aReversePrefix.isEmpty())
201 appendStringLiteral(&aBuffer, m_aReversePrefix);
202 aBuffer.append("\\1");
203 return aBuffer.makeStringAndClear();
204 }
205 else if (m_eKind == KIND_PREFIX && isScheme(m_aPrefix, true))
206 return m_aPrefix.copy(0, m_aPrefix.getLength() - 1);
207 else
208 {
209 OUStringBuffer aBuffer;
210 if (!m_aPrefix.isEmpty())
211 appendStringLiteral(&aBuffer, m_aPrefix);
212 switch (m_eKind)
213 {
214 case KIND_PREFIX:
215 aBuffer.append(".*");
216 break;
217
218 case KIND_AUTHORITY:
219 aBuffer.append("([/?#].*)?");
220 break;
221
222 case KIND_DOMAIN:
223 aBuffer.append("[^/?#]");
224 aBuffer.append( m_bEmptyDomain ? '*' : '+' );
225 if (!m_aInfix.isEmpty())
226 appendStringLiteral(&aBuffer, m_aInfix);
227 aBuffer.append("([/?#].*)?");
228 break;
229 }
230 return aBuffer.makeStringAndClear();
231 }
232}
233
234
235namespace {
236
237bool matchString(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
238 char const * pString, size_t nStringLength)
239{
240 sal_Unicode const * p = *pBegin;
241
242 unsigned char const * q = reinterpret_cast< unsigned char const * >(pString);
243 unsigned char const * qEnd = q + nStringLength;
244
245 if (pEnd - p < qEnd - q)
246 return false;
247
248 while (q != qEnd)
249 {
250 sal_Unicode c1 = *p++;
251 sal_Unicode c2 = *q++;
252 if (c1 != c2)
253 return false;
254 }
255
256 *pBegin = p;
257 return true;
258}
259
260bool scanStringLiteral(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
261 OUString * pString)
262{
263 sal_Unicode const * p = *pBegin;
264
265 if (p == pEnd || *p++ != '"')
266 return false;
267
268 OUStringBuffer aBuffer;
269 for (;;)
270 {
271 if (p == pEnd)
272 return false;
273 sal_Unicode c = *p++;
274 if (c == '"')
275 break;
276 if (c == '\\')
277 {
278 if (p == pEnd)
279 return false;
280 c = *p++;
281 if (c != '"' && c != '\\')
282 return false;
283 }
284 aBuffer.append(c);
285 }
286
287 *pBegin = p;
288 *pString = aBuffer.makeStringAndClear();
289 return true;
290}
291
292}
293
294Regexp Regexp::parse(OUString const & rRegexp)
295{
296 // Detect an input of '<scheme>' as an abbreviation of '"<scheme>:".*'
297 // where <scheme> is as defined in RFC 2396:
298 if (isScheme(rRegexp, false))
300 rRegexp + ":",
301 false,
302 OUString(),
303 false,
304 OUString());
305
306 sal_Unicode const * p = rRegexp.getStr();
307 sal_Unicode const * pEnd = p + rRegexp.getLength();
308
309 OUString aPrefix;
310 scanStringLiteral(&p, pEnd, &aPrefix);
311
312 if (p == pEnd)
313 throw lang::IllegalArgumentException();
314
315 // This and the matchString() calls below are some of the few places where
316 // RTL_CONSTASCII_STRINGPARAM() should NOT be removed.
317 // (c.f. https://gerrit.libreoffice.org/3117)
318 if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(".*")))
319 {
320 if (p != pEnd)
321 throw lang::IllegalArgumentException();
322
323 return Regexp(Regexp::KIND_PREFIX, aPrefix, false, OUString(),
324 false, OUString());
325 }
326 else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("(.*)->")))
327 {
328 OUString aReversePrefix;
329 scanStringLiteral(&p, pEnd, &aReversePrefix);
330
331 if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
332 || p != pEnd)
333 throw lang::IllegalArgumentException();
334
335 return Regexp(Regexp::KIND_PREFIX, aPrefix, false, OUString(),
336 true, aReversePrefix);
337 }
338 else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
339 {
340 if (p != pEnd)
341 throw lang::IllegalArgumentException();
342
343 return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, OUString(),
344 false, OUString());
345 }
346 else if (matchString(&p, pEnd,
347 RTL_CONSTASCII_STRINGPARAM("(([/?#].*)?)->")))
348 {
349 OUString aReversePrefix;
350 if (!(scanStringLiteral(&p, pEnd, &aReversePrefix)
351 && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
352 && p == pEnd))
353 throw lang::IllegalArgumentException();
354
355 return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, OUString(),
356 true, aReversePrefix);
357 }
358 else
359 {
360 bool bOpen = false;
361 if (p != pEnd && *p == '(')
362 {
363 ++p;
364 bOpen = true;
365 }
366
367 if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("[^/?#]")))
368 throw lang::IllegalArgumentException();
369
370 if (p == pEnd || (*p != '*' && *p != '+'))
371 throw lang::IllegalArgumentException();
372 bool bEmptyDomain = *p++ == '*';
373
374 OUString aInfix;
375 scanStringLiteral(&p, pEnd, &aInfix);
376
377 if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
378 throw lang::IllegalArgumentException();
379
380 OUString aReversePrefix;
381 if (bOpen
382 && !(matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(")->"))
383 && scanStringLiteral(&p, pEnd, &aReversePrefix)
384 && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))))
385 throw lang::IllegalArgumentException();
386
387 if (p != pEnd)
388 throw lang::IllegalArgumentException();
389
390 return Regexp(Regexp::KIND_DOMAIN, aPrefix, bEmptyDomain, aInfix,
391 bOpen, aReversePrefix);
392 }
393}
394
395/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
// Cross-references from the generated documentation (not part of the source):
//   bool m_bEmptyDomain                          - regexp.hxx:55
//   OUString m_aReversePrefix                    - regexp.hxx:54
//   OUString m_aInfix                            - regexp.hxx:53
//   OUString getRegexp() const                   - regexp.cxx:174
//   OUString m_aPrefix                           - regexp.hxx:52
//   Regexp(Kind eTheKind, OUString aThePrefix, bool bTheEmptyDomain,
//          OUString aTheInfix, bool bTheTranslation,
//          OUString aTheReversePrefix)           - regexp.cxx:38
//   static Regexp parse(OUString const &rRegexp) - regexp.cxx:294
//   bool matches(OUString const &rString) const  - regexp.cxx:81
//   bool m_bTranslation                          - regexp.hxx:56
// Further symbols listed without a definition location:
//   void * p
//   sal_uInt16 sal_Unicode
//   std::unique_ptr< char[]> aBuffer