LibreOffice Module ucb (master)  1
regexp.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include <regexp.hxx>
21 
22 #include <cstddef>
23 
24 #include <osl/diagnose.h>
25 #include <com/sun/star/lang/IllegalArgumentException.hpp>
26 #include <rtl/character.hxx>
27 #include <rtl/ustrbuf.hxx>
28 #include <rtl/ustring.hxx>
29 
30 using namespace com::sun::star;
31 using namespace ucb_impl;
32 
33 
34 // Regexp
35 
36 
37 inline Regexp::Regexp(Kind eTheKind, OUString const & rThePrefix,
38  bool bTheEmptyDomain, OUString const & rTheInfix,
39  bool bTheTranslation,
40  OUString const & rTheReversePrefix):
41  m_eKind(eTheKind),
42  m_aPrefix(rThePrefix),
43  m_aInfix(rTheInfix),
44  m_aReversePrefix(rTheReversePrefix),
45  m_bEmptyDomain(bTheEmptyDomain),
46  m_bTranslation(bTheTranslation)
47 {
48  OSL_ASSERT(m_eKind == KIND_DOMAIN
49  || (!m_bEmptyDomain && m_aInfix.isEmpty()));
50  OSL_ASSERT(m_bTranslation || m_aReversePrefix.isEmpty());
51 }
52 
53 
54 namespace {
55 
56 bool matchStringIgnoreCase(sal_Unicode const ** pBegin,
57  sal_Unicode const * pEnd,
58  OUString const & rString)
59 {
60  sal_Unicode const * p = *pBegin;
61 
62  sal_Unicode const * q = rString.getStr();
63  sal_Unicode const * qEnd = q + rString.getLength();
64 
65  if (pEnd - p < qEnd - q)
66  return false;
67 
68  while (q != qEnd)
69  {
70  if (rtl::compareIgnoreAsciiCase(*p++, *q++) != 0)
71  return false;
72  }
73 
74  *pBegin = p;
75  return true;
76 }
77 
78 }
79 
80 bool Regexp::matches(OUString const & rString) const
81 {
82  sal_Unicode const * pBegin = rString.getStr();
83  sal_Unicode const * pEnd = pBegin + rString.getLength();
84 
85  bool bMatches = false;
86 
87  sal_Unicode const * p = pBegin;
88  if (matchStringIgnoreCase(&p, pEnd, m_aPrefix))
89  {
90  switch (m_eKind)
91  {
92  case KIND_PREFIX:
93  bMatches = true;
94  break;
95 
96  case KIND_AUTHORITY:
97  bMatches = p == pEnd || *p == '/' || *p == '?' || *p == '#';
98  break;
99 
100  case KIND_DOMAIN:
101  if (!m_bEmptyDomain)
102  {
103  if (p == pEnd || *p == '/' || *p == '?' || *p == '#')
104  break;
105  ++p;
106  }
107  for (;;)
108  {
109  sal_Unicode const * q = p;
110  if (matchStringIgnoreCase(&q, pEnd, m_aInfix)
111  && (q == pEnd || *q == '/' || *q == '?' || *q == '#'))
112  {
113  bMatches = true;
114  break;
115  }
116 
117  if (p == pEnd)
118  break;
119 
120  sal_Unicode c = *p++;
121  if (c == '/' || c == '?' || c == '#')
122  break;
123  }
124  break;
125  }
126  }
127 
128  return bMatches;
129 }
130 
131 
132 namespace {
133 
134 bool isScheme(OUString const & rString, bool bColon)
135 {
136  // Return true if rString matches <scheme> (plus a trailing ":" if bColon
137  // is true) from RFC 2396:
138  sal_Unicode const * p = rString.getStr();
139  sal_Unicode const * pEnd = p + rString.getLength();
140  if (p != pEnd && rtl::isAsciiAlpha(*p))
141  for (++p;;)
142  {
143  if (p == pEnd)
144  return !bColon;
145  sal_Unicode c = *p++;
146  if (!(rtl::isAsciiAlphanumeric(c)
147  || c == '+' || c == '-' || c == '.'))
148  return bColon && c == ':' && p == pEnd;
149  }
150  return false;
151 }
152 
153 void appendStringLiteral(OUStringBuffer * pBuffer,
154  OUString const & rString)
155 {
156  OSL_ASSERT(pBuffer);
157 
158  pBuffer->append('"');
159  sal_Unicode const * p = rString.getStr();
160  sal_Unicode const * pEnd = p + rString.getLength();
161  while (p != pEnd)
162  {
163  sal_Unicode c = *p++;
164  if (c == '"' || c == '\\')
165  pBuffer->append('\\');
166  pBuffer->append(c);
167  }
168  pBuffer->append('"');
169 }
170 
171 }
172 
173 OUString Regexp::getRegexp() const
174 {
175  if (m_bTranslation)
176  {
177  OUStringBuffer aBuffer;
178  if (!m_aPrefix.isEmpty())
179  appendStringLiteral(&aBuffer, m_aPrefix);
180  switch (m_eKind)
181  {
182  case KIND_PREFIX:
183  aBuffer.append("(.*)");
184  break;
185 
186  case KIND_AUTHORITY:
187  aBuffer.append("(([/?#].*)?)");
188  break;
189 
190  case KIND_DOMAIN:
191  aBuffer.append("([^/?#]");
192  aBuffer.append(sal_Unicode(m_bEmptyDomain ? '*' : '+'));
193  if (!m_aInfix.isEmpty())
194  appendStringLiteral(&aBuffer, m_aInfix);
195  aBuffer.append("([/?#].*)?)");
196  break;
197  }
198  aBuffer.append("->");
199  if (!m_aReversePrefix.isEmpty())
200  appendStringLiteral(&aBuffer, m_aReversePrefix);
201  aBuffer.append("\\1");
202  return aBuffer.makeStringAndClear();
203  }
204  else if (m_eKind == KIND_PREFIX && isScheme(m_aPrefix, true))
205  return m_aPrefix.copy(0, m_aPrefix.getLength() - 1);
206  else
207  {
208  OUStringBuffer aBuffer;
209  if (!m_aPrefix.isEmpty())
210  appendStringLiteral(&aBuffer, m_aPrefix);
211  switch (m_eKind)
212  {
213  case KIND_PREFIX:
214  aBuffer.append(".*");
215  break;
216 
217  case KIND_AUTHORITY:
218  aBuffer.append("([/?#].*)?");
219  break;
220 
221  case KIND_DOMAIN:
222  aBuffer.append("[^/?#]");
223  aBuffer.append( m_bEmptyDomain ? '*' : '+' );
224  if (!m_aInfix.isEmpty())
225  appendStringLiteral(&aBuffer, m_aInfix);
226  aBuffer.append("([/?#].*)?");
227  break;
228  }
229  return aBuffer.makeStringAndClear();
230  }
231 }
232 
233 
234 namespace {
235 
236 bool matchString(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
237  sal_Char const * pString, size_t nStringLength)
238 {
239  sal_Unicode const * p = *pBegin;
240 
241  unsigned char const * q = reinterpret_cast< unsigned char const * >(pString);
242  unsigned char const * qEnd = q + nStringLength;
243 
244  if (pEnd - p < qEnd - q)
245  return false;
246 
247  while (q != qEnd)
248  {
249  sal_Unicode c1 = *p++;
250  sal_Unicode c2 = *q++;
251  if (c1 != c2)
252  return false;
253  }
254 
255  *pBegin = p;
256  return true;
257 }
258 
259 bool scanStringLiteral(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
260  OUString * pString)
261 {
262  sal_Unicode const * p = *pBegin;
263 
264  if (p == pEnd || *p++ != '"')
265  return false;
266 
267  OUStringBuffer aBuffer;
268  for (;;)
269  {
270  if (p == pEnd)
271  return false;
272  sal_Unicode c = *p++;
273  if (c == '"')
274  break;
275  if (c == '\\')
276  {
277  if (p == pEnd)
278  return false;
279  c = *p++;
280  if (c != '"' && c != '\\')
281  return false;
282  }
283  aBuffer.append(c);
284  }
285 
286  *pBegin = p;
287  *pString = aBuffer.makeStringAndClear();
288  return true;
289 }
290 
291 }
292 
293 Regexp Regexp::parse(OUString const & rRegexp)
294 {
295  // Detect an input of '<scheme>' as an abbreviation of '"<scheme>:".*'
296  // where <scheme> is as defined in RFC 2396:
297  if (isScheme(rRegexp, false))
299  rRegexp + ":",
300  false,
301  OUString(),
302  false,
303  OUString());
304 
305  sal_Unicode const * p = rRegexp.getStr();
306  sal_Unicode const * pEnd = p + rRegexp.getLength();
307 
308  OUString aPrefix;
309  scanStringLiteral(&p, pEnd, &aPrefix);
310 
311  if (p == pEnd)
312  throw lang::IllegalArgumentException();
313 
314  // This and the matchString() calls below are some of the few places where
315  // RTL_CONSTASCII_STRINGPARAM() should NOT be removed.
316  // (c.f. https://gerrit.libreoffice.org/3117)
317  if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(".*")))
318  {
319  if (p != pEnd)
320  throw lang::IllegalArgumentException();
321 
322  return Regexp(Regexp::KIND_PREFIX, aPrefix, false, OUString(),
323  false, OUString());
324  }
325  else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("(.*)->")))
326  {
327  OUString aReversePrefix;
328  scanStringLiteral(&p, pEnd, &aReversePrefix);
329 
330  if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
331  || p != pEnd)
332  throw lang::IllegalArgumentException();
333 
334  return Regexp(Regexp::KIND_PREFIX, aPrefix, false, OUString(),
335  true, aReversePrefix);
336  }
337  else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
338  {
339  if (p != pEnd)
340  throw lang::IllegalArgumentException();
341 
342  return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, OUString(),
343  false, OUString());
344  }
345  else if (matchString(&p, pEnd,
346  RTL_CONSTASCII_STRINGPARAM("(([/?#].*)?)->")))
347  {
348  OUString aReversePrefix;
349  if (!(scanStringLiteral(&p, pEnd, &aReversePrefix)
350  && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
351  && p == pEnd))
352  throw lang::IllegalArgumentException();
353 
354  return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, OUString(),
355  true, aReversePrefix);
356  }
357  else
358  {
359  bool bOpen = false;
360  if (p != pEnd && *p == '(')
361  {
362  ++p;
363  bOpen = true;
364  }
365 
366  if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("[^/?#]")))
367  throw lang::IllegalArgumentException();
368 
369  if (p == pEnd || (*p != '*' && *p != '+'))
370  throw lang::IllegalArgumentException();
371  bool bEmptyDomain = *p++ == '*';
372 
373  OUString aInfix;
374  scanStringLiteral(&p, pEnd, &aInfix);
375 
376  if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
377  throw lang::IllegalArgumentException();
378 
379  OUString aReversePrefix;
380  if (bOpen
381  && !(matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(")->"))
382  && scanStringLiteral(&p, pEnd, &aReversePrefix)
383  && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))))
384  throw lang::IllegalArgumentException();
385 
386  if (p != pEnd)
387  throw lang::IllegalArgumentException();
388 
389  return Regexp(Regexp::KIND_DOMAIN, aPrefix, bEmptyDomain, aInfix,
390  bOpen, aReversePrefix);
391  }
392 }
393 
394 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
bool m_bEmptyDomain
Definition: regexp.hxx:56
OUString m_aPrefix
Definition: regexp.hxx:53
OUString m_aReversePrefix
Definition: regexp.hxx:55
sal_uInt16 sal_Unicode
char sal_Char
OUString m_aInfix
Definition: regexp.hxx:54
static Regexp parse(OUString const &rRegexp)
Definition: regexp.cxx:293
bool m_bTranslation
Definition: regexp.hxx:57
bool matches(OUString const &rString) const
Definition: regexp.cxx:80
Regexp(Kind eTheKind, OUString const &rThePrefix, bool bTheEmptyDomain, OUString const &rTheInfix, bool bTheTranslation, OUString const &rTheReversePrefix)
Definition: regexp.cxx:37
OUString getRegexp() const
Definition: regexp.cxx:173