LibreOffice Module comphelper (master)  1
string.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include <sal/config.h>
21 
22 #include <cstddef>
23 #include <string_view>
24 #include <vector>
25 #include <algorithm>
26 
27 #include <rtl/character.hxx>
28 #include <rtl/ustring.hxx>
29 #include <rtl/ustrbuf.hxx>
30 #include <rtl/string.hxx>
31 #include <rtl/strbuf.hxx>
32 #include <sal/types.h>
33 
34 #include <comphelper/string.hxx>
35 #include <comphelper/stl_types.hxx>
36 #include <comphelper/sequence.hxx>
37 
38 #include <com/sun/star/i18n/BreakIterator.hpp>
39 #include <com/sun/star/i18n/CharType.hpp>
40 #include <com/sun/star/i18n/Collator.hpp>
41 
42 
43 namespace comphelper { namespace string {
44 
45 namespace
46 {
47  template <typename T, typename C> T tmpl_stripStart(const T &rIn,
48  const C cRemove)
49  {
50  if (rIn.isEmpty())
51  return rIn;
52 
53  sal_Int32 i = 0;
54 
55  while (i < rIn.getLength())
56  {
57  if (rIn[i] != cRemove)
58  break;
59  ++i;
60  }
61 
62  return rIn.copy(i);
63  }
64 }
65 
66 OString stripStart(const OString &rIn, sal_Char c)
67 {
68  return tmpl_stripStart<OString, sal_Char>(rIn, c);
69 }
70 
71 OUString stripStart(const OUString &rIn, sal_Unicode c)
72 {
73  return tmpl_stripStart<OUString, sal_Unicode>(rIn, c);
74 }
75 
76 namespace
77 {
78  template <typename T, typename C> T tmpl_stripEnd(const T &rIn,
79  const C cRemove)
80  {
81  if (rIn.isEmpty())
82  return rIn;
83 
84  sal_Int32 i = rIn.getLength();
85 
86  while (i > 0)
87  {
88  if (rIn[i-1] != cRemove)
89  break;
90  --i;
91  }
92 
93  return rIn.copy(0, i);
94  }
95 }
96 
97 OString stripEnd(const OString &rIn, sal_Char c)
98 {
99  return tmpl_stripEnd<OString, sal_Char>(rIn, c);
100 }
101 
102 OUString stripEnd(const OUString &rIn, sal_Unicode c)
103 {
104  return tmpl_stripEnd<OUString, sal_Unicode>(rIn, c);
105 }
106 
107 OString strip(const OString &rIn, sal_Char c)
108 {
109  return stripEnd(stripStart(rIn, c), c);
110 }
111 
112 OUString strip(const OUString &rIn, sal_Unicode c)
113 {
114  return stripEnd(stripStart(rIn, c), c);
115 }
116 
117 namespace
118 {
119  template <typename T, typename C> sal_Int32 tmpl_getTokenCount(const T &rIn,
120  C cTok)
121  {
122  // Empty String: TokenCount by Definition is 0
123  if (rIn.isEmpty())
124  return 0;
125 
126  sal_Int32 nTokCount = 1;
127  for (sal_Int32 i = 0; i < rIn.getLength(); ++i)
128  {
129  if (rIn[i] == cTok)
130  ++nTokCount;
131  }
132  return nTokCount;
133  }
134 }
135 
136 sal_Int32 getTokenCount(const OString &rIn, sal_Char cTok)
137 {
138  return tmpl_getTokenCount<OString, sal_Char>(rIn, cTok);
139 }
140 
141 sal_Int32 getTokenCount(const OUString &rIn, sal_Unicode cTok)
142 {
143  return tmpl_getTokenCount<OUString, sal_Unicode>(rIn, cTok);
144 }
145 
146 static sal_uInt32 decimalStringToNumber(
147  OUString const & str, sal_Int32 nStart, sal_Int32 nLength )
148 {
149  sal_uInt32 result = 0;
150  for( sal_Int32 i = nStart; i < nStart + nLength; )
151  {
152  sal_uInt32 c = str.iterateCodePoints(&i);
153  sal_uInt32 value = 0;
154  if( c <= 0x0039) // ASCII decimal digits, most common
155  value = c - 0x0030;
156  else if( c >= 0x1D7F6 ) // mathematical monospace digits
157  value = c - 0x1D7F6;
158  else if( c >= 0x1D7EC ) // mathematical sans-serif bold digits
159  value = c - 0x1D7EC;
160  else if( c >= 0x1D7E2 ) // mathematical sans-serif digits
161  value = c - 0x1D7E2;
162  else if( c >= 0x1D7D8 ) // mathematical double-struck digits
163  value = c - 0x1D7D8;
164  else if( c >= 0x1D7CE ) // mathematical bold digits
165  value = c - 0x1D7CE;
166  else if( c >= 0x11066 ) // brahmi digits
167  value = c - 0x11066;
168  else if( c >= 0x104A0 ) // osmanya digits
169  value = c - 0x104A0;
170  else if( c >= 0xFF10 ) // fullwidth digits
171  value = c - 0xFF10;
172  else if( c >= 0xABF0 ) // meetei mayek digits
173  value = c - 0xABF0;
174  else if( c >= 0xAA50 ) // cham digits
175  value = c - 0xAA50;
176  else if( c >= 0xA9D0 ) // javanese digits
177  value = c - 0xA9D0;
178  else if( c >= 0xA900 ) // kayah li digits
179  value = c - 0xA900;
180  else if( c >= 0xA8D0 ) // saurashtra digits
181  value = c - 0xA8D0;
182  else if( c >= 0xA620 ) // vai digits
183  value = c - 0xA620;
184  else if( c >= 0x1C50 ) // ol chiki digits
185  value = c - 0x1C50;
186  else if( c >= 0x1C40 ) // lepcha digits
187  value = c - 0x1C40;
188  else if( c >= 0x1BB0 ) // sundanese digits
189  value = c - 0x1BB0;
190  else if( c >= 0x1B50 ) // balinese digits
191  value = c - 0x1B50;
192  else if( c >= 0x1A90 ) // tai tham tham digits
193  value = c - 0x1A90;
194  else if( c >= 0x1A80 ) // tai tham hora digits
195  value = c - 0x1A80;
196  else if( c >= 0x19D0 ) // new tai lue digits
197  value = c - 0x19D0;
198  else if( c >= 0x1946 ) // limbu digits
199  value = c - 0x1946;
200  else if( c >= 0x1810 ) // mongolian digits
201  value = c - 0x1810;
202  else if( c >= 0x17E0 ) // khmer digits
203  value = c - 0x17E0;
204  else if( c >= 0x1090 ) // myanmar shan digits
205  value = c - 0x1090;
206  else if( c >= 0x1040 ) // myanmar digits
207  value = c - 0x1040;
208  else if( c >= 0x0F20 ) // tibetan digits
209  value = c - 0x0F20;
210  else if( c >= 0x0ED0 ) // lao digits
211  value = c - 0x0ED0;
212  else if( c >= 0x0E50 ) // thai digits
213  value = c - 0x0E50;
214  else if( c >= 0x0D66 ) // malayalam digits
215  value = c - 0x0D66;
216  else if( c >= 0x0CE6 ) // kannada digits
217  value = c - 0x0CE6;
218  else if( c >= 0x0C66 ) // telugu digits
219  value = c - 0x0C66;
220  else if( c >= 0x0BE6 ) // tamil digits
221  value = c - 0x0BE6;
222  else if( c >= 0x0B66 ) // odia digits
223  value = c - 0x0B66;
224  else if( c >= 0x0AE6 ) // gujarati digits
225  value = c - 0x0AE6;
226  else if( c >= 0x0A66 ) // gurmukhi digits
227  value = c - 0x0A66;
228  else if( c >= 0x09E6 ) // bengali digits
229  value = c - 0x09E6;
230  else if( c >= 0x0966 ) // devanagari digit
231  value = c - 0x0966;
232  else if( c >= 0x07C0 ) // nko digits
233  value = c - 0x07C0;
234  else if( c >= 0x06F0 ) // extended arabic-indic digits
235  value = c - 0x06F0;
236  else if( c >= 0x0660 ) // arabic-indic digits
237  value = c - 0x0660;
238  result = result * 10 + value;
239  }
240  return result;
241 }
242 
244  OUString const & str )
245 {
246  return decimalStringToNumber(str, 0, str.getLength());
247 }
248 
249 using namespace ::com::sun::star;
250 
251 // convert between sequence of string and comma separated string
252 
254  uno::Sequence< OUString > const& i_rSeq)
255 {
256  OUStringBuffer buf;
258  i_rSeq.begin(), i_rSeq.end(), ::comphelper::OUStringBufferAppender(buf), OUString( ", " ));
259  return buf.makeStringAndClear();
260 }
261 
262 std::vector<OUString>
263  split(const OUString& rStr, sal_Unicode cSeparator)
264 {
265  std::vector< OUString > vec;
266  sal_Int32 idx = 0;
267  do
268  {
269  OUString kw =
270  rStr.getToken(0, cSeparator, idx);
271  kw = kw.trim();
272  if (!kw.isEmpty())
273  {
274  vec.push_back(kw);
275  }
276 
277  } while (idx >= 0);
278 
279  return vec;
280 }
281 
282 uno::Sequence< OUString >
283  convertCommaSeparated( OUString const& i_rString )
284 {
285  std::vector< OUString > vec = split(i_rString, ',');
287 }
288 
289 OString join(const OString& rSeparator, const std::vector<OString>& rSequence)
290 {
291  OStringBuffer aBuffer;
292  for (size_t i = 0; i < rSequence.size(); ++i)
293  {
294  if (i != 0)
295  aBuffer.append(rSeparator);
296  aBuffer.append(rSequence[i]);
297  }
298  return aBuffer.makeStringAndClear();
299 }
300 
301 sal_Int32 compareNatural( const OUString & rLHS, const OUString & rRHS,
302  const uno::Reference< i18n::XCollator > &rCollator,
303  const uno::Reference< i18n::XBreakIterator > &rBI,
304  const lang::Locale &rLocale )
305 {
306  sal_Int32 nRet = 0;
307 
308  sal_Int32 nLHSLastNonDigitPos = 0;
309  sal_Int32 nRHSLastNonDigitPos = 0;
310  sal_Int32 nLHSFirstDigitPos = 0;
311  sal_Int32 nRHSFirstDigitPos = 0;
312 
313  while (nLHSFirstDigitPos < rLHS.getLength() || nRHSFirstDigitPos < rRHS.getLength())
314  {
315  sal_Int32 nLHSChunkLen;
316  sal_Int32 nRHSChunkLen;
317 
318  //Compare non digit block as normal strings
319  nLHSFirstDigitPos = rBI->nextCharBlock(rLHS, nLHSLastNonDigitPos,
320  rLocale, i18n::CharType::DECIMAL_DIGIT_NUMBER);
321  nRHSFirstDigitPos = rBI->nextCharBlock(rRHS, nRHSLastNonDigitPos,
322  rLocale, i18n::CharType::DECIMAL_DIGIT_NUMBER);
323  if (nLHSFirstDigitPos == -1)
324  nLHSFirstDigitPos = rLHS.getLength();
325  if (nRHSFirstDigitPos == -1)
326  nRHSFirstDigitPos = rRHS.getLength();
327  nLHSChunkLen = nLHSFirstDigitPos - nLHSLastNonDigitPos;
328  nRHSChunkLen = nRHSFirstDigitPos - nRHSLastNonDigitPos;
329 
330  nRet = rCollator->compareSubstring(rLHS, nLHSLastNonDigitPos,
331  nLHSChunkLen, rRHS, nRHSLastNonDigitPos, nRHSChunkLen);
332  if (nRet != 0)
333  break;
334 
335  //Compare digit block as one number vs another
336  nLHSLastNonDigitPos = rBI->endOfCharBlock(rLHS, nLHSFirstDigitPos,
337  rLocale, i18n::CharType::DECIMAL_DIGIT_NUMBER);
338  nRHSLastNonDigitPos = rBI->endOfCharBlock(rRHS, nRHSFirstDigitPos,
339  rLocale, i18n::CharType::DECIMAL_DIGIT_NUMBER);
340  if (nLHSLastNonDigitPos == -1)
341  nLHSLastNonDigitPos = rLHS.getLength();
342  if (nRHSLastNonDigitPos == -1)
343  nRHSLastNonDigitPos = rRHS.getLength();
344  nLHSChunkLen = nLHSLastNonDigitPos - nLHSFirstDigitPos;
345  nRHSChunkLen = nRHSLastNonDigitPos - nRHSFirstDigitPos;
346 
347  //To-Do: Possibly scale down those unicode codepoints that relate to
348  //numbers outside of the normal 0-9 range, e.g. see GetLocalizedChar in
349  //vcl
350 
351  sal_uInt32 nLHS = comphelper::string::decimalStringToNumber(rLHS, nLHSFirstDigitPos, nLHSChunkLen);
352  sal_uInt32 nRHS = comphelper::string::decimalStringToNumber(rRHS, nRHSFirstDigitPos, nRHSChunkLen);
353 
354  if (nLHS != nRHS)
355  {
356  nRet = (nLHS < nRHS) ? -1 : 1;
357  break;
358  }
359  }
360 
361  return nRet;
362 }
363 
365  const uno::Reference< uno::XComponentContext > &rContext,
366  const lang::Locale &rLocale) : m_aLocale(rLocale)
367 {
368  m_xCollator = i18n::Collator::create( rContext );
369  m_xCollator->loadDefaultCollator(m_aLocale, 0);
370  m_xBI = i18n::BreakIterator::create( rContext );
371 }
372 
373 bool isdigitAsciiString(const OString &rString)
374 {
375  return std::all_of(
376  rString.getStr(), rString.getStr() + rString.getLength(),
377  [](unsigned char c){ return rtl::isAsciiDigit(c); });
378 }
379 
380 bool isdigitAsciiString(const OUString &rString)
381 {
382  return std::all_of(
383  rString.getStr(), rString.getStr() + rString.getLength(),
384  [](sal_Unicode c){ return rtl::isAsciiDigit(c); });
385 }
386 
387 namespace
388 {
389  template <typename T, typename O> T tmpl_reverseString(const T &rIn)
390  {
391  if (rIn.isEmpty())
392  return rIn;
393 
394  sal_Int32 i = rIn.getLength();
395  O sBuf(i);
396  while (i)
397  sBuf.append(rIn[--i]);
398  return sBuf.makeStringAndClear();
399  }
400 }
401 
402 OUString reverseString(const OUString &rStr)
403 {
404  return tmpl_reverseString<OUString, OUStringBuffer>(rStr);
405 }
406 
407 OString reverseString(const OString &rStr)
408 {
409  return tmpl_reverseString<OString, OStringBuffer>(rStr);
410 }
411 
412 sal_Int32 indexOfAny(OUString const& rIn,
413  sal_Unicode const*const pChars, sal_Int32 const nPos)
414 {
415  for (sal_Int32 i = nPos; i < rIn.getLength(); ++i)
416  {
417  sal_Unicode const c = rIn[i];
418  for (sal_Unicode const* pChar = pChars; *pChar; ++pChar)
419  {
420  if (c == *pChar)
421  {
422  return i;
423  }
424  }
425  }
426  return -1;
427 }
428 
429 OUString removeAny(OUString const& rIn,
430  sal_Unicode const*const pChars)
431 {
432  OUStringBuffer buf;
433  bool isFound(false);
434  for (sal_Int32 i = 0; i < rIn.getLength(); ++i)
435  {
436  sal_Unicode const c = rIn[i];
437  bool removeC(false);
438  for (sal_Unicode const* pChar = pChars; *pChar; ++pChar)
439  {
440  if (c == *pChar)
441  {
442  removeC = true;
443  break;
444  }
445  }
446  if (removeC)
447  {
448  if (!isFound)
449  {
450  if (i > 0)
451  {
452  buf.append(std::u16string_view(rIn).substr(0, i));
453  }
454  isFound = true;
455  }
456  }
457  else if (isFound)
458  {
459  buf.append(c);
460  }
461  }
462  return isFound ? buf.makeStringAndClear() : rIn;
463 }
464 
465 OUString setToken(const OUString& rIn, sal_Int32 nToken, sal_Unicode cTok,
466  const OUString& rNewToken)
467 {
468  sal_Int32 nLen = rIn.getLength();
469  sal_Int32 nTok = 0;
470  sal_Int32 nFirstChar = 0;
471  sal_Int32 i = 0;
472 
473  // Determine token position and length
474  while ( i < nLen )
475  {
476  // Increase token count if match
477  if (rIn[i] == cTok)
478  {
479  ++nTok;
480 
481  if (nTok == nToken)
482  nFirstChar = i+1;
483  else if (nTok > nToken)
484  break;
485  }
486 
487  ++i;
488  }
489 
490  if (nTok >= nToken)
491  return rIn.replaceAt(nFirstChar, i-nFirstChar, rNewToken);
492  return rIn;
493 }
494 
495 } }
496 
497 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
const sal_Char * pChar
OString stripEnd(const OString &rIn, sal_Char c)
Strips occurrences of a character from the end of the source string.
Definition: string.cxx:97
css::lang::Locale const m_aLocale
Definition: string.hxx:344
IJScriptValueObject VARIANT value
OutputIter intersperse(ForwardIter start, ForwardIter end, OutputIter out, T const &separator)
algorithm similar to std::copy, but inserts a separator between elements.
Definition: stl_types.hxx:146
OString join(const OString &rSeparator, const std::vector< OString > &rSequence)
Return a string which is the concatenation of the strings in the sequence.
Definition: string.cxx:289
std::vector< OUString > split(const OUString &rStr, sal_Unicode cSeparator)
Definition: string.cxx:263
sal_uInt16 sal_Unicode
char sal_Char
OString strip(const OString &rIn, sal_Char c)
Strips occurrences of a character from the start and end of the source string.
Definition: string.cxx:107
bool isdigitAsciiString(const OString &rString)
Determine if an OString contains solely ASCII numeric digits.
Definition: string.cxx:373
css::uno::Reference< css::i18n::XBreakIterator > m_xBI
Definition: string.hxx:346
MetadataImporterPluginType * result
int i
sal_Int32 compareNatural(const OUString &rLHS, const OUString &rRHS, const uno::Reference< i18n::XCollator > &rCollator, const uno::Reference< i18n::XBreakIterator > &rBI, const lang::Locale &rLocale)
Definition: string.cxx:301
NaturalStringSorter(const css::uno::Reference< css::uno::XComponentContext > &rContext, const css::lang::Locale &rLocale)
Definition: string.cxx:364
OString stripStart(const OString &rIn, sal_Char c)
Strips occurrences of a character from the start of the source string.
Definition: string.cxx:66
sal_Int32 indexOfAny(OUString const &rIn, sal_Unicode const *const pChars, sal_Int32 const nPos)
Find any of a list of code units in the string.
Definition: string.cxx:412
output iterator that appends OUStrings into an OUStringBuffer.
Definition: stl_types.hxx:119
OUString removeAny(OUString const &rIn, sal_Unicode const *const pChars)
Remove any of a list of code units in the string.
Definition: string.cxx:429
static sal_uInt32 decimalStringToNumber(OUString const &str, sal_Int32 nStart, sal_Int32 nLength)
Definition: string.cxx:146
css::uno::Sequence< DstElementType > containerToSequence(const SrcType &i_Container)
Copy from a container into a Sequence.
Definition: sequence.hxx:182
sal_Int32 getTokenCount(const OString &rIn, sal_Char cTok)
Returns number of tokens in an OUString.
Definition: string.cxx:136
OUString setToken(const OUString &rIn, sal_Int32 nToken, sal_Unicode cTok, const OUString &rNewToken)
Replace a token in a string.
Definition: string.cxx:465
OUString reverseString(const OUString &rStr)
Reverse an OUString.
Definition: string.cxx:402
OUString convertCommaSeparated(uno::Sequence< OUString > const &i_rSeq)
Definition: string.cxx:253
css::uno::Reference< css::i18n::XCollator > m_xCollator
Definition: string.hxx:345