LibreOffice Module comphelper (master)  1
string.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include <sal/config.h>
21 
22 #include <cstddef>
23 #include <string_view>
24 #include <vector>
25 #include <algorithm>
26 
27 #include <rtl/character.hxx>
28 #include <rtl/ustring.hxx>
29 #include <rtl/ustrbuf.hxx>
30 #include <rtl/string.hxx>
31 #include <rtl/strbuf.hxx>
32 #include <sal/types.h>
33 
34 #include <comphelper/string.hxx>
35 #include <comphelper/stl_types.hxx>
36 #include <comphelper/sequence.hxx>
37 
38 #include <com/sun/star/i18n/BreakIterator.hpp>
39 #include <com/sun/star/i18n/CharType.hpp>
40 #include <com/sun/star/i18n/Collator.hpp>
41 
42 
43 namespace comphelper::string {
44 
namespace
{
    /**
     * Return rIn without its leading run of cRemove characters.
     *
     * The callers in this file instantiate T with std::string_view and
     * std::u16string_view; for those the result is a view onto the same
     * characters as rIn, not a copy.
     *
     * @param rIn      the text to strip
     * @param cRemove  the character to drop from the start
     * @return the remaining tail of rIn (empty if rIn is empty or consists
     *         solely of cRemove)
     */
    template <typename T, typename C> T tmpl_stripStart(const T &rIn,
                                                        const C cRemove)
    {
        // Position of the first character that must be kept; npos means
        // there is no such character (empty input or nothing but cRemove).
        const typename T::size_type nFirstKept = rIn.find_first_not_of(cRemove);
        if (nFirstKept == T::npos)
            return rIn.substr(rIn.size());

        return rIn.substr(nFirstKept);
    }
}
65 
66 OString stripStart(std::string_view rIn, char c)
67 {
68  return OString(tmpl_stripStart<std::string_view, char>(rIn, c));
69 }
70 
71 OUString stripStart(std::u16string_view rIn, sal_Unicode c)
72 {
73  return OUString(tmpl_stripStart<std::u16string_view, sal_Unicode>(rIn, c));
74 }
75 
namespace
{
    /**
     * Return rIn without its trailing run of cRemove characters.
     *
     * The callers in this file instantiate T with std::string_view and
     * std::u16string_view; for those the result is a view onto the same
     * characters as rIn, not a copy.
     *
     * @param rIn      the text to strip
     * @param cRemove  the character to drop from the end
     * @return the remaining head of rIn (empty if rIn is empty or consists
     *         solely of cRemove)
     */
    template <typename T, typename C> T tmpl_stripEnd(const T &rIn,
                                                      const C cRemove)
    {
        // Position of the last character that must be kept; npos means
        // there is no such character, so nothing is kept at all.
        const typename T::size_type nLastKept = rIn.find_last_not_of(cRemove);
        const typename T::size_type nKeepCount
            = (nLastKept == T::npos) ? 0 : nLastKept + 1;

        return rIn.substr(0, nKeepCount);
    }
}
96 
97 OString stripEnd(std::string_view rIn, char c)
98 {
99  return OString(tmpl_stripEnd<std::string_view, char>(rIn, c));
100 }
101 
102 OUString stripEnd(std::u16string_view rIn, sal_Unicode c)
103 {
104  return OUString(tmpl_stripEnd<std::u16string_view, sal_Unicode>(rIn, c));
105 }
106 
107 OString strip(std::string_view rIn, char c)
108 {
109  auto x = tmpl_stripStart<std::string_view, char>(rIn, c);
110  return stripEnd(x, c);
111 }
112 
113 OUString strip(std::u16string_view rIn, sal_Unicode c)
114 {
115  auto x = tmpl_stripStart<std::u16string_view, sal_Unicode>(rIn, c);
116  return stripEnd(x, c);
117 }
118 
119 namespace
120 {
121  template <typename T, typename C> sal_Int32 tmpl_getTokenCount( T rIn,
122  C cTok)
123  {
124  // Empty String: TokenCount by Definition is 0
125  if (rIn.empty())
126  return 0;
127 
128  sal_Int32 nTokCount = 1;
129  for (typename T::size_type i = 0; i < rIn.size(); ++i)
130  {
131  if (rIn[i] == cTok)
132  ++nTokCount;
133  }
134  return nTokCount;
135  }
136 }
137 
138 sal_Int32 getTokenCount(std::string_view rIn, char cTok)
139 {
140  return tmpl_getTokenCount<std::string_view, char>(rIn, cTok);
141 }
142 
143 sal_Int32 getTokenCount(std::u16string_view rIn, sal_Unicode cTok)
144 {
145  return tmpl_getTokenCount<std::u16string_view, sal_Unicode>(rIn, cTok);
146 }
147 
148 static sal_uInt32 decimalStringToNumber(
149  OUString const & str, sal_Int32 nStart, sal_Int32 nLength )
150 {
151  sal_uInt32 result = 0;
152  for( sal_Int32 i = nStart; i < nStart + nLength; )
153  {
154  sal_uInt32 c = str.iterateCodePoints(&i);
155  sal_uInt32 value = 0;
156  if( c <= 0x0039) // ASCII decimal digits, most common
157  value = c - 0x0030;
158  else if( c >= 0x1D7F6 ) // mathematical monospace digits
159  value = c - 0x1D7F6;
160  else if( c >= 0x1D7EC ) // mathematical sans-serif bold digits
161  value = c - 0x1D7EC;
162  else if( c >= 0x1D7E2 ) // mathematical sans-serif digits
163  value = c - 0x1D7E2;
164  else if( c >= 0x1D7D8 ) // mathematical double-struck digits
165  value = c - 0x1D7D8;
166  else if( c >= 0x1D7CE ) // mathematical bold digits
167  value = c - 0x1D7CE;
168  else if( c >= 0x11066 ) // brahmi digits
169  value = c - 0x11066;
170  else if( c >= 0x104A0 ) // osmanya digits
171  value = c - 0x104A0;
172  else if( c >= 0xFF10 ) // fullwidth digits
173  value = c - 0xFF10;
174  else if( c >= 0xABF0 ) // meetei mayek digits
175  value = c - 0xABF0;
176  else if( c >= 0xAA50 ) // cham digits
177  value = c - 0xAA50;
178  else if( c >= 0xA9D0 ) // javanese digits
179  value = c - 0xA9D0;
180  else if( c >= 0xA900 ) // kayah li digits
181  value = c - 0xA900;
182  else if( c >= 0xA8D0 ) // saurashtra digits
183  value = c - 0xA8D0;
184  else if( c >= 0xA620 ) // vai digits
185  value = c - 0xA620;
186  else if( c >= 0x1C50 ) // ol chiki digits
187  value = c - 0x1C50;
188  else if( c >= 0x1C40 ) // lepcha digits
189  value = c - 0x1C40;
190  else if( c >= 0x1BB0 ) // sundanese digits
191  value = c - 0x1BB0;
192  else if( c >= 0x1B50 ) // balinese digits
193  value = c - 0x1B50;
194  else if( c >= 0x1A90 ) // tai tham tham digits
195  value = c - 0x1A90;
196  else if( c >= 0x1A80 ) // tai tham hora digits
197  value = c - 0x1A80;
198  else if( c >= 0x19D0 ) // new tai lue digits
199  value = c - 0x19D0;
200  else if( c >= 0x1946 ) // limbu digits
201  value = c - 0x1946;
202  else if( c >= 0x1810 ) // mongolian digits
203  value = c - 0x1810;
204  else if( c >= 0x17E0 ) // khmer digits
205  value = c - 0x17E0;
206  else if( c >= 0x1090 ) // myanmar shan digits
207  value = c - 0x1090;
208  else if( c >= 0x1040 ) // myanmar digits
209  value = c - 0x1040;
210  else if( c >= 0x0F20 ) // tibetan digits
211  value = c - 0x0F20;
212  else if( c >= 0x0ED0 ) // lao digits
213  value = c - 0x0ED0;
214  else if( c >= 0x0E50 ) // thai digits
215  value = c - 0x0E50;
216  else if( c >= 0x0D66 ) // malayalam digits
217  value = c - 0x0D66;
218  else if( c >= 0x0CE6 ) // kannada digits
219  value = c - 0x0CE6;
220  else if( c >= 0x0C66 ) // telugu digits
221  value = c - 0x0C66;
222  else if( c >= 0x0BE6 ) // tamil digits
223  value = c - 0x0BE6;
224  else if( c >= 0x0B66 ) // odia digits
225  value = c - 0x0B66;
226  else if( c >= 0x0AE6 ) // gujarati digits
227  value = c - 0x0AE6;
228  else if( c >= 0x0A66 ) // gurmukhi digits
229  value = c - 0x0A66;
230  else if( c >= 0x09E6 ) // bengali digits
231  value = c - 0x09E6;
232  else if( c >= 0x0966 ) // devanagari digit
233  value = c - 0x0966;
234  else if( c >= 0x07C0 ) // nko digits
235  value = c - 0x07C0;
236  else if( c >= 0x06F0 ) // extended arabic-indic digits
237  value = c - 0x06F0;
238  else if( c >= 0x0660 ) // arabic-indic digits
239  value = c - 0x0660;
240  result = result * 10 + value;
241  }
242  return result;
243 }
244 
246  OUString const & str )
247 {
248  return decimalStringToNumber(str, 0, str.getLength());
249 }
250 
251 using namespace ::com::sun::star;
252 
253 // convert between sequence of string and comma separated string
254 
256  uno::Sequence< OUString > const& i_rSeq)
257 {
258  OUStringBuffer buf;
260  i_rSeq.begin(), i_rSeq.end(), ::comphelper::OUStringBufferAppender(buf), OUString( ", " ));
261  return buf.makeStringAndClear();
262 }
263 
264 std::vector<OUString>
265  split(const OUString& rStr, sal_Unicode cSeparator)
266 {
267  std::vector< OUString > vec;
268  sal_Int32 idx = 0;
269  do
270  {
271  OUString kw =
272  rStr.getToken(0, cSeparator, idx);
273  kw = kw.trim();
274  if (!kw.isEmpty())
275  {
276  vec.push_back(kw);
277  }
278 
279  } while (idx >= 0);
280 
281  return vec;
282 }
283 
284 uno::Sequence< OUString >
285  convertCommaSeparated( OUString const& i_rString )
286 {
287  std::vector< OUString > vec = split(i_rString, ',');
289 }
290 
291 OString join(std::string_view rSeparator, const std::vector<OString>& rSequence)
292 {
293  OStringBuffer aBuffer;
294  for (size_t i = 0; i < rSequence.size(); ++i)
295  {
296  if (i != 0)
297  aBuffer.append(rSeparator);
298  aBuffer.append(rSequence[i]);
299  }
300  return aBuffer.makeStringAndClear();
301 }
302 
303 sal_Int32 compareNatural( const OUString & rLHS, const OUString & rRHS,
304  const uno::Reference< i18n::XCollator > &rCollator,
305  const uno::Reference< i18n::XBreakIterator > &rBI,
306  const lang::Locale &rLocale )
307 {
308  sal_Int32 nRet = 0;
309 
310  sal_Int32 nLHSLastNonDigitPos = 0;
311  sal_Int32 nRHSLastNonDigitPos = 0;
312  sal_Int32 nLHSFirstDigitPos = 0;
313  sal_Int32 nRHSFirstDigitPos = 0;
314 
315  while (nLHSFirstDigitPos < rLHS.getLength() || nRHSFirstDigitPos < rRHS.getLength())
316  {
317  sal_Int32 nLHSChunkLen;
318  sal_Int32 nRHSChunkLen;
319 
320  //Compare non digit block as normal strings
321  nLHSFirstDigitPos = rBI->nextCharBlock(rLHS, nLHSLastNonDigitPos,
322  rLocale, i18n::CharType::DECIMAL_DIGIT_NUMBER);
323  nRHSFirstDigitPos = rBI->nextCharBlock(rRHS, nRHSLastNonDigitPos,
324  rLocale, i18n::CharType::DECIMAL_DIGIT_NUMBER);
325  if (nLHSFirstDigitPos == -1)
326  nLHSFirstDigitPos = rLHS.getLength();
327  if (nRHSFirstDigitPos == -1)
328  nRHSFirstDigitPos = rRHS.getLength();
329  nLHSChunkLen = nLHSFirstDigitPos - nLHSLastNonDigitPos;
330  nRHSChunkLen = nRHSFirstDigitPos - nRHSLastNonDigitPos;
331 
332  nRet = rCollator->compareSubstring(rLHS, nLHSLastNonDigitPos,
333  nLHSChunkLen, rRHS, nRHSLastNonDigitPos, nRHSChunkLen);
334  if (nRet != 0)
335  break;
336 
337  //Compare digit block as one number vs another
338  nLHSLastNonDigitPos = rBI->endOfCharBlock(rLHS, nLHSFirstDigitPos,
339  rLocale, i18n::CharType::DECIMAL_DIGIT_NUMBER);
340  nRHSLastNonDigitPos = rBI->endOfCharBlock(rRHS, nRHSFirstDigitPos,
341  rLocale, i18n::CharType::DECIMAL_DIGIT_NUMBER);
342  if (nLHSLastNonDigitPos == -1)
343  nLHSLastNonDigitPos = rLHS.getLength();
344  if (nRHSLastNonDigitPos == -1)
345  nRHSLastNonDigitPos = rRHS.getLength();
346  nLHSChunkLen = nLHSLastNonDigitPos - nLHSFirstDigitPos;
347  nRHSChunkLen = nRHSLastNonDigitPos - nRHSFirstDigitPos;
348 
349  //To-Do: Possibly scale down those unicode codepoints that relate to
350  //numbers outside of the normal 0-9 range, e.g. see GetLocalizedChar in
351  //vcl
352 
353  sal_uInt32 nLHS = comphelper::string::decimalStringToNumber(rLHS, nLHSFirstDigitPos, nLHSChunkLen);
354  sal_uInt32 nRHS = comphelper::string::decimalStringToNumber(rRHS, nRHSFirstDigitPos, nRHSChunkLen);
355 
356  if (nLHS != nRHS)
357  {
358  nRet = (nLHS < nRHS) ? -1 : 1;
359  break;
360  }
361  }
362 
363  return nRet;
364 }
365 
367  const uno::Reference< uno::XComponentContext > &rContext,
368  const lang::Locale &rLocale) : m_aLocale(rLocale)
369 {
370  m_xCollator = i18n::Collator::create( rContext );
371  m_xCollator->loadDefaultCollator(m_aLocale, 0);
372  m_xBI = i18n::BreakIterator::create( rContext );
373 }
374 
375 bool isdigitAsciiString(std::string_view rString)
376 {
377  return std::all_of(
378  rString.data(), rString.data() + rString.size(),
379  [](unsigned char c){ return rtl::isAsciiDigit(c); });
380 }
381 
382 bool isdigitAsciiString(std::u16string_view rString)
383 {
384  return std::all_of(
385  rString.data(), rString.data() + rString.size(),
386  [](sal_Unicode c){ return rtl::isAsciiDigit(c); });
387 }
388 
389 namespace
390 {
391  template <typename T, typename I, typename O> T tmpl_reverseString(I rIn)
392  {
393  if (rIn.empty())
394  return T();
395 
396  typename I::size_type i = rIn.size();
397  O sBuf(static_cast<sal_Int32>(i));
398  while (i)
399  sBuf.append(rIn[--i]);
400  return sBuf.makeStringAndClear();
401  }
402 }
403 
404 OUString reverseString(std::u16string_view rStr)
405 {
406  return tmpl_reverseString<OUString, std::u16string_view, OUStringBuffer>(rStr);
407 }
408 
409 OString reverseString(std::string_view rStr)
410 {
411  return tmpl_reverseString<OString, std::string_view, OStringBuffer>(rStr);
412 }
413 
414 sal_Int32 indexOfAny(std::u16string_view rIn,
415  sal_Unicode const*const pChars, sal_Int32 const nPos)
416 {
417  for (std::u16string_view::size_type i = nPos; i < rIn.size(); ++i)
418  {
419  sal_Unicode const c = rIn[i];
420  for (sal_Unicode const* pChar = pChars; *pChar; ++pChar)
421  {
422  if (c == *pChar)
423  {
424  return i;
425  }
426  }
427  }
428  return -1;
429 }
430 
431 OUString removeAny(std::u16string_view rIn,
432  sal_Unicode const*const pChars)
433 {
434  OUStringBuffer buf;
435  bool isFound(false);
436  for (std::u16string_view::size_type i = 0; i < rIn.size(); ++i)
437  {
438  sal_Unicode const c = rIn[i];
439  bool removeC(false);
440  for (sal_Unicode const* pChar = pChars; *pChar; ++pChar)
441  {
442  if (c == *pChar)
443  {
444  removeC = true;
445  break;
446  }
447  }
448  if (removeC)
449  {
450  if (!isFound)
451  {
452  if (i > 0)
453  {
454  buf.append(rIn.substr(0, i));
455  }
456  isFound = true;
457  }
458  }
459  else if (isFound)
460  {
461  buf.append(c);
462  }
463  }
464  return isFound ? buf.makeStringAndClear() : OUString(rIn);
465 }
466 
467 OUString setToken(const OUString& rIn, sal_Int32 nToken, sal_Unicode cTok,
468  const OUString& rNewToken)
469 {
470  sal_Int32 nLen = rIn.getLength();
471  sal_Int32 nTok = 0;
472  sal_Int32 nFirstChar = 0;
473  sal_Int32 i = 0;
474 
475  // Determine token position and length
476  while ( i < nLen )
477  {
478  // Increase token count if match
479  if (rIn[i] == cTok)
480  {
481  ++nTok;
482 
483  if (nTok == nToken)
484  nFirstChar = i+1;
485  else if (nTok > nToken)
486  break;
487  }
488 
489  ++i;
490  }
491 
492  if (nTok >= nToken)
493  return rIn.replaceAt(nFirstChar, i-nFirstChar, rNewToken);
494  return rIn;
495 }
496 
497 }
498 
499 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
OString stripEnd(std::string_view rIn, char c)
Strips occurrences of a character from the end of the source string.
Definition: string.cxx:97
css::lang::Locale const m_aLocale
Definition: string.hxx:334
std::locale m_aLocale
OString strip(std::string_view rIn, char c)
Strips occurrences of a character from the start and end of the source string.
Definition: string.cxx:107
bool isdigitAsciiString(std::string_view rString)
Determine if an OString contains solely ASCII numeric digits.
Definition: string.cxx:375
OutputIter intersperse(ForwardIter start, ForwardIter end, OutputIter out, T const &separator)
algorithm similar to std::copy, but inserts a separator between elements.
Definition: stl_types.hxx:151
float x
std::vector< OUString > split(const OUString &rStr, sal_Unicode cSeparator)
Definition: string.cxx:265
sal_uInt16 sal_Unicode
OString join(std::string_view rSeparator, const std::vector< OString > &rSequence)
Return a string which is the concatenation of the strings in the sequence.
Definition: string.cxx:291
OUString removeAny(std::u16string_view rIn, sal_Unicode const *const pChars)
Remove any of a list of code units in the string.
Definition: string.cxx:431
sal_Int32 getTokenCount(std::string_view rIn, char cTok)
Returns the number of tokens in the string.
Definition: string.cxx:138
int i
css::uno::Reference< css::i18n::XBreakIterator > m_xBI
Definition: string.hxx:336
sal_Int32 compareNatural(const OUString &rLHS, const OUString &rRHS, const uno::Reference< i18n::XCollator > &rCollator, const uno::Reference< i18n::XBreakIterator > &rBI, const lang::Locale &rLocale)
Definition: string.cxx:303
NaturalStringSorter(const css::uno::Reference< css::uno::XComponentContext > &rContext, const css::lang::Locale &rLocale)
Definition: string.cxx:366
output iterator that appends OUStrings into an OUStringBuffer.
Definition: stl_types.hxx:124
sal_Int32 indexOfAny(std::u16string_view rIn, sal_Unicode const *const pChars, sal_Int32 const nPos)
Find any of a list of code units in the string.
Definition: string.cxx:414
static sal_uInt32 decimalStringToNumber(OUString const &str, sal_Int32 nStart, sal_Int32 nLength)
Definition: string.cxx:148
const sal_uInt16 idx[]
std::unique_ptr< char[]> aBuffer
OString stripStart(std::string_view rIn, char c)
Strips occurrences of a character from the start of the source string.
Definition: string.cxx:66
css::uno::Sequence< DstElementType > containerToSequence(const SrcType &i_Container)
Copy from a container into a Sequence.
Definition: sequence.hxx:182
const char * pChar
Any value
Any result
OUString reverseString(std::u16string_view rStr)
Reverse an OUString.
Definition: string.cxx:404
OUString setToken(const OUString &rIn, sal_Int32 nToken, sal_Unicode cTok, const OUString &rNewToken)
Replace a token in a string.
Definition: string.cxx:467
OUString convertCommaSeparated(uno::Sequence< OUString > const &i_rSeq)
Definition: string.cxx:255
css::uno::Reference< css::i18n::XCollator > m_xCollator
Definition: string.hxx:335