LibreOffice Module ucb (master) 1
regexp.cxx
Go to the documentation of this file.
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This file is part of the LibreOffice project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 *
9 * This file incorporates work covered by the following license notice:
10 *
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 */
19
20#include <regexp.hxx>
21
22#include <cstddef>
23
24#include <osl/diagnose.h>
25#include <com/sun/star/lang/IllegalArgumentException.hpp>
26#include <rtl/character.hxx>
27#include <rtl/ustrbuf.hxx>
28#include <rtl/ustring.hxx>
29#include <utility>
30
31using namespace com::sun::star;
32using namespace ucb_impl;
33
34
35// Regexp
36
37
38inline Regexp::Regexp(Kind eTheKind, OUString aThePrefix,
39 bool bTheEmptyDomain, OUString aTheInfix,
40 bool bTheTranslation,
41 OUString aTheReversePrefix):
42 m_eKind(eTheKind),
43 m_aPrefix(std::move(aThePrefix)),
44 m_aInfix(std::move(aTheInfix)),
45 m_aReversePrefix(std::move(aTheReversePrefix)),
46 m_bEmptyDomain(bTheEmptyDomain),
47 m_bTranslation(bTheTranslation)
48{
49 OSL_ASSERT(m_eKind == KIND_DOMAIN
50 || (!m_bEmptyDomain && m_aInfix.isEmpty()));
51 OSL_ASSERT(m_bTranslation || m_aReversePrefix.isEmpty());
52}
53
54
55namespace {
56
57bool matchStringIgnoreCase(sal_Unicode const ** pBegin,
58 sal_Unicode const * pEnd,
59 OUString const & rString)
60{
61 sal_Unicode const * p = *pBegin;
62
63 sal_Unicode const * q = rString.getStr();
64 sal_Unicode const * qEnd = q + rString.getLength();
65
66 if (pEnd - p < qEnd - q)
67 return false;
68
69 while (q != qEnd)
70 {
71 if (rtl::compareIgnoreAsciiCase(*p++, *q++) != 0)
72 return false;
73 }
74
75 *pBegin = p;
76 return true;
77}
78
79}
80
81bool Regexp::matches(OUString const & rString) const
82{
83 sal_Unicode const * pBegin = rString.getStr();
84 sal_Unicode const * pEnd = pBegin + rString.getLength();
85
86 bool bMatches = false;
87
88 sal_Unicode const * p = pBegin;
89 if (matchStringIgnoreCase(&p, pEnd, m_aPrefix))
90 {
91 switch (m_eKind)
92 {
93 case KIND_PREFIX:
94 bMatches = true;
95 break;
96
97 case KIND_AUTHORITY:
98 bMatches = p == pEnd || *p == '/' || *p == '?' || *p == '#';
99 break;
100
101 case KIND_DOMAIN:
102 if (!m_bEmptyDomain)
103 {
104 if (p == pEnd || *p == '/' || *p == '?' || *p == '#')
105 break;
106 ++p;
107 }
108 for (;;)
109 {
110 sal_Unicode const * q = p;
111 if (matchStringIgnoreCase(&q, pEnd, m_aInfix)
112 && (q == pEnd || *q == '/' || *q == '?' || *q == '#'))
113 {
114 bMatches = true;
115 break;
116 }
117
118 if (p == pEnd)
119 break;
120
121 sal_Unicode c = *p++;
122 if (c == '/' || c == '?' || c == '#')
123 break;
124 }
125 break;
126 }
127 }
128
129 return bMatches;
130}
131
132
133namespace {
134
135bool isScheme(OUString const & rString, bool bColon)
136{
137 // Return true if rString matches <scheme> (plus a trailing ":" if bColon
138 // is true) from RFC 2396:
139 sal_Unicode const * p = rString.getStr();
140 sal_Unicode const * pEnd = p + rString.getLength();
141 if (p != pEnd && rtl::isAsciiAlpha(*p))
142 for (++p;;)
143 {
144 if (p == pEnd)
145 return !bColon;
146 sal_Unicode c = *p++;
147 if (!(rtl::isAsciiAlphanumeric(c)
148 || c == '+' || c == '-' || c == '.'))
149 return bColon && c == ':' && p == pEnd;
150 }
151 return false;
152}
153
154void appendStringLiteral(OUStringBuffer * pBuffer,
155 OUString const & rString)
156{
157 OSL_ASSERT(pBuffer);
158
159 pBuffer->append('"');
160 sal_Unicode const * p = rString.getStr();
161 sal_Unicode const * pEnd = p + rString.getLength();
162 while (p != pEnd)
163 {
164 sal_Unicode c = *p++;
165 if (c == '"' || c == '\\')
166 pBuffer->append('\\');
167 pBuffer->append(c);
168 }
169 pBuffer->append('"');
170}
171
172}
173
174OUString Regexp::getRegexp() const
175{
176 if (m_bTranslation)
177 {
178 OUStringBuffer aBuffer;
179 if (!m_aPrefix.isEmpty())
180 appendStringLiteral(&aBuffer, m_aPrefix);
181 switch (m_eKind)
182 {
183 case KIND_PREFIX:
184 aBuffer.append("(.*)");
185 break;
186
187 case KIND_AUTHORITY:
188 aBuffer.append("(([/?#].*)?)");
189 break;
190
191 case KIND_DOMAIN:
192 aBuffer.append("([^/?#]" + OUStringChar(sal_Unicode(m_bEmptyDomain ? '*' : '+')));
193 if (!m_aInfix.isEmpty())
194 appendStringLiteral(&aBuffer, m_aInfix);
195 aBuffer.append("([/?#].*)?)");
196 break;
197 }
198 aBuffer.append("->");
199 if (!m_aReversePrefix.isEmpty())
200 appendStringLiteral(&aBuffer, m_aReversePrefix);
201 aBuffer.append("\\1");
202 return aBuffer.makeStringAndClear();
203 }
204 else if (m_eKind == KIND_PREFIX && isScheme(m_aPrefix, true))
205 return m_aPrefix.copy(0, m_aPrefix.getLength() - 1);
206 else
207 {
208 OUStringBuffer aBuffer;
209 if (!m_aPrefix.isEmpty())
210 appendStringLiteral(&aBuffer, m_aPrefix);
211 switch (m_eKind)
212 {
213 case KIND_PREFIX:
214 aBuffer.append(".*");
215 break;
216
217 case KIND_AUTHORITY:
218 aBuffer.append("([/?#].*)?");
219 break;
220
221 case KIND_DOMAIN:
222 aBuffer.append("[^/?#]" + OUStringChar( m_bEmptyDomain ? '*' : '+' ));
223 if (!m_aInfix.isEmpty())
224 appendStringLiteral(&aBuffer, m_aInfix);
225 aBuffer.append("([/?#].*)?");
226 break;
227 }
228 return aBuffer.makeStringAndClear();
229 }
230}
231
232
233namespace {
234
235bool matchString(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
236 char const * pString, size_t nStringLength)
237{
238 sal_Unicode const * p = *pBegin;
239
240 unsigned char const * q = reinterpret_cast< unsigned char const * >(pString);
241 unsigned char const * qEnd = q + nStringLength;
242
243 if (pEnd - p < qEnd - q)
244 return false;
245
246 while (q != qEnd)
247 {
248 sal_Unicode c1 = *p++;
249 sal_Unicode c2 = *q++;
250 if (c1 != c2)
251 return false;
252 }
253
254 *pBegin = p;
255 return true;
256}
257
258bool scanStringLiteral(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
259 OUString * pString)
260{
261 sal_Unicode const * p = *pBegin;
262
263 if (p == pEnd || *p++ != '"')
264 return false;
265
266 OUStringBuffer aBuffer;
267 for (;;)
268 {
269 if (p == pEnd)
270 return false;
271 sal_Unicode c = *p++;
272 if (c == '"')
273 break;
274 if (c == '\\')
275 {
276 if (p == pEnd)
277 return false;
278 c = *p++;
279 if (c != '"' && c != '\\')
280 return false;
281 }
282 aBuffer.append(c);
283 }
284
285 *pBegin = p;
286 *pString = aBuffer.makeStringAndClear();
287 return true;
288}
289
290}
291
292Regexp Regexp::parse(OUString const & rRegexp)
293{
294 // Detect an input of '<scheme>' as an abbreviation of '"<scheme>:".*'
295 // where <scheme> is as defined in RFC 2396:
296 if (isScheme(rRegexp, false))
298 rRegexp + ":",
299 false,
300 OUString(),
301 false,
302 OUString());
303
304 sal_Unicode const * p = rRegexp.getStr();
305 sal_Unicode const * pEnd = p + rRegexp.getLength();
306
307 OUString aPrefix;
308 scanStringLiteral(&p, pEnd, &aPrefix);
309
310 if (p == pEnd)
311 throw lang::IllegalArgumentException();
312
313 // This and the matchString() calls below are some of the few places where
314 // RTL_CONSTASCII_STRINGPARAM() should NOT be removed.
315 // (c.f. https://gerrit.libreoffice.org/3117)
316 if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(".*")))
317 {
318 if (p != pEnd)
319 throw lang::IllegalArgumentException();
320
321 return Regexp(Regexp::KIND_PREFIX, aPrefix, false, OUString(),
322 false, OUString());
323 }
324 else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("(.*)->")))
325 {
326 OUString aReversePrefix;
327 scanStringLiteral(&p, pEnd, &aReversePrefix);
328
329 if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
330 || p != pEnd)
331 throw lang::IllegalArgumentException();
332
333 return Regexp(Regexp::KIND_PREFIX, aPrefix, false, OUString(),
334 true, aReversePrefix);
335 }
336 else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
337 {
338 if (p != pEnd)
339 throw lang::IllegalArgumentException();
340
341 return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, OUString(),
342 false, OUString());
343 }
344 else if (matchString(&p, pEnd,
345 RTL_CONSTASCII_STRINGPARAM("(([/?#].*)?)->")))
346 {
347 OUString aReversePrefix;
348 if (!(scanStringLiteral(&p, pEnd, &aReversePrefix)
349 && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
350 && p == pEnd))
351 throw lang::IllegalArgumentException();
352
353 return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, OUString(),
354 true, aReversePrefix);
355 }
356 else
357 {
358 bool bOpen = false;
359 if (p != pEnd && *p == '(')
360 {
361 ++p;
362 bOpen = true;
363 }
364
365 if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("[^/?#]")))
366 throw lang::IllegalArgumentException();
367
368 if (p == pEnd || (*p != '*' && *p != '+'))
369 throw lang::IllegalArgumentException();
370 bool bEmptyDomain = *p++ == '*';
371
372 OUString aInfix;
373 scanStringLiteral(&p, pEnd, &aInfix);
374
375 if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
376 throw lang::IllegalArgumentException();
377
378 OUString aReversePrefix;
379 if (bOpen
380 && !(matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(")->"))
381 && scanStringLiteral(&p, pEnd, &aReversePrefix)
382 && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))))
383 throw lang::IllegalArgumentException();
384
385 if (p != pEnd)
386 throw lang::IllegalArgumentException();
387
388 return Regexp(Regexp::KIND_DOMAIN, aPrefix, bEmptyDomain, aInfix,
389 bOpen, aReversePrefix);
390 }
391}
392
393/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
bool m_bEmptyDomain
Definition: regexp.hxx:55
OUString m_aReversePrefix
Definition: regexp.hxx:54
OUString m_aInfix
Definition: regexp.hxx:53
OUString getRegexp() const
Definition: regexp.cxx:174
OUString m_aPrefix
Definition: regexp.hxx:52
Regexp(Kind eTheKind, OUString aThePrefix, bool bTheEmptyDomain, OUString aTheInfix, bool bTheTranslation, OUString aTheReversePrefix)
Definition: regexp.cxx:38
static Regexp parse(OUString const &rRegexp)
Definition: regexp.cxx:292
bool matches(OUString const &rString) const
Definition: regexp.cxx:81
bool m_bTranslation
Definition: regexp.hxx:56
void * p
sal_uInt16 sal_Unicode
std::unique_ptr< char[]> aBuffer