LibreOffice Module comphelper (master)  1
string.cxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  *
9  * This file incorporates work covered by the following license notice:
10  *
11  * Licensed to the Apache Software Foundation (ASF) under one or more
12  * contributor license agreements. See the NOTICE file distributed
13  * with this work for additional information regarding copyright
14  * ownership. The ASF licenses this file to you under the Apache
15  * License, Version 2.0 (the "License"); you may not use this file
16  * except in compliance with the License. You may obtain a copy of
17  * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18  */
19 
20 #include <sal/config.h>
21 
22 #include <cstddef>
23 #include <string_view>
24 #include <vector>
25 #include <algorithm>
26 
27 #include <rtl/character.hxx>
28 #include <rtl/ustring.hxx>
29 #include <rtl/ustrbuf.hxx>
30 #include <rtl/string.hxx>
31 #include <rtl/strbuf.hxx>
32 #include <sal/types.h>
33 
34 #include <comphelper/string.hxx>
35 #include <comphelper/stl_types.hxx>
36 #include <comphelper/sequence.hxx>
37 
38 #include <com/sun/star/i18n/BreakIterator.hpp>
39 #include <com/sun/star/i18n/CharType.hpp>
40 #include <com/sun/star/i18n/Collator.hpp>
41 
42 
43 namespace comphelper::string {
44 
45 namespace
46 {
// Drops every leading occurrence of cRemove from a string_view-like rIn.
// The result is a sub-view of rIn; it is empty when rIn consists solely of
// cRemove (or is empty to begin with).
template <typename T, typename C> T tmpl_stripStart(const T &rIn,
    const C cRemove)
{
    // Position of the first element that has to be kept, npos if none.
    const typename T::size_type nFirstKept = rIn.find_first_not_of(cRemove);

    // Nothing to keep: start the sub-view at the very end of the input.
    return rIn.substr(nFirstKept == T::npos ? rIn.size() : nFirstKept);
}
64  template <typename T, typename C> T tmpl_stripStartString(const T &rIn,
65  const C cRemove)
66  {
67  if (rIn.isEmpty())
68  return rIn;
69 
70  sal_Int32 i = 0;
71 
72  while (i < rIn.getLength())
73  {
74  if (rIn[i] != cRemove)
75  break;
76  ++i;
77  }
78 
79  return rIn.copy(i);
80  }
81 }
82 
83 OString stripStart(const OString& rIn, char c)
84 {
85  return tmpl_stripStartString<OString, char>(rIn, c);
86 }
87 
88 std::string_view stripStart(std::string_view rIn, char c)
89 {
90  return tmpl_stripStart<std::string_view, char>(rIn, c);
91 }
92 
93 OUString stripStart(const OUString& rIn, sal_Unicode c)
94 {
95  return tmpl_stripStartString<OUString, sal_Unicode>(rIn, c);
96 }
97 
98 std::u16string_view stripStart(std::u16string_view rIn, sal_Unicode c)
99 {
100  return tmpl_stripStart<std::u16string_view, sal_Unicode>(rIn, c);
101 }
102 
103 namespace
104 {
// Drops every trailing occurrence of cRemove from a string_view-like rIn.
// The result is a prefix sub-view of rIn; it is empty when rIn consists
// solely of cRemove (or is empty to begin with).
template <typename T, typename C> T tmpl_stripEnd(const T &rIn,
    const C cRemove)
{
    // Position of the last element that has to be kept, npos if none.
    const typename T::size_type nLastKept = rIn.find_last_not_of(cRemove);

    // Keep everything up to and including that element.
    return rIn.substr(0, nLastKept == T::npos ? 0 : nLastKept + 1);
}
122  template <typename T, typename C> T tmpl_stripEndString(const T &rIn,
123  const C cRemove)
124  {
125  if (rIn.isEmpty())
126  return rIn;
127 
128  sal_Int32 i = rIn.getLength();
129 
130  while (i > 0)
131  {
132  if (rIn[i-1] != cRemove)
133  break;
134  --i;
135  }
136 
137  return rIn.copy(0, i);
138  }
139 }
140 
141 OString stripEnd(const OString& rIn, char c)
142 {
143  return tmpl_stripEndString<OString, char>(rIn, c);
144 }
145 
146 std::string_view stripEnd(std::string_view rIn, char c)
147 {
148  return tmpl_stripEnd<std::string_view, char>(rIn, c);
149 }
150 
151 OUString stripEnd(const OUString& rIn, sal_Unicode c)
152 {
153  return tmpl_stripEndString<OUString, sal_Unicode>(rIn, c);
154 }
155 
156 std::u16string_view stripEnd(std::u16string_view rIn, sal_Unicode c)
157 {
158  return tmpl_stripEnd<std::u16string_view, sal_Unicode>(rIn, c);
159 }
160 
161 OString strip(const OString& rIn, char c)
162 {
163  auto x = tmpl_stripStartString<OString, char>(rIn, c);
164  return stripEnd(x, c);
165 }
166 
167 std::string_view strip(std::string_view rIn, char c)
168 {
169  auto x = tmpl_stripStart<std::string_view, char>(rIn, c);
170  return stripEnd(x, c);
171 }
172 
173 OUString strip(const OUString& rIn, sal_Unicode c)
174 {
175  auto x = tmpl_stripStartString<OUString, sal_Unicode>(rIn, c);
176  return stripEnd(x, c);
177 }
178 
179 std::u16string_view strip(std::u16string_view rIn, sal_Unicode c)
180 {
181  auto x = tmpl_stripStart<std::u16string_view, sal_Unicode>(rIn, c);
182  return stripEnd(x, c);
183 }
184 
185 namespace
186 {
187  template <typename T, typename C> sal_Int32 tmpl_getTokenCount( T rIn,
188  C cTok)
189  {
190  // Empty String: TokenCount by Definition is 0
191  if (rIn.empty())
192  return 0;
193 
194  sal_Int32 nTokCount = 1;
195  for (typename T::size_type i = 0; i < rIn.size(); ++i)
196  {
197  if (rIn[i] == cTok)
198  ++nTokCount;
199  }
200  return nTokCount;
201  }
202 }
203 
204 sal_Int32 getTokenCount(std::string_view rIn, char cTok)
205 {
206  return tmpl_getTokenCount<std::string_view, char>(rIn, cTok);
207 }
208 
209 sal_Int32 getTokenCount(std::u16string_view rIn, sal_Unicode cTok)
210 {
211  return tmpl_getTokenCount<std::u16string_view, sal_Unicode>(rIn, cTok);
212 }
213 
214 static sal_uInt32 decimalStringToNumber(
215  OUString const & str, sal_Int32 nStart, sal_Int32 nLength )
216 {
217  sal_uInt32 result = 0;
218  for( sal_Int32 i = nStart; i < nStart + nLength; )
219  {
220  sal_uInt32 c = str.iterateCodePoints(&i);
221  sal_uInt32 value = 0;
222  if( c <= 0x0039) // ASCII decimal digits, most common
223  value = c - 0x0030;
224  else if( c >= 0x1D7F6 ) // mathematical monospace digits
225  value = c - 0x1D7F6;
226  else if( c >= 0x1D7EC ) // mathematical sans-serif bold digits
227  value = c - 0x1D7EC;
228  else if( c >= 0x1D7E2 ) // mathematical sans-serif digits
229  value = c - 0x1D7E2;
230  else if( c >= 0x1D7D8 ) // mathematical double-struck digits
231  value = c - 0x1D7D8;
232  else if( c >= 0x1D7CE ) // mathematical bold digits
233  value = c - 0x1D7CE;
234  else if( c >= 0x11066 ) // brahmi digits
235  value = c - 0x11066;
236  else if( c >= 0x104A0 ) // osmanya digits
237  value = c - 0x104A0;
238  else if( c >= 0xFF10 ) // fullwidth digits
239  value = c - 0xFF10;
240  else if( c >= 0xABF0 ) // meetei mayek digits
241  value = c - 0xABF0;
242  else if( c >= 0xAA50 ) // cham digits
243  value = c - 0xAA50;
244  else if( c >= 0xA9D0 ) // javanese digits
245  value = c - 0xA9D0;
246  else if( c >= 0xA900 ) // kayah li digits
247  value = c - 0xA900;
248  else if( c >= 0xA8D0 ) // saurashtra digits
249  value = c - 0xA8D0;
250  else if( c >= 0xA620 ) // vai digits
251  value = c - 0xA620;
252  else if( c >= 0x1C50 ) // ol chiki digits
253  value = c - 0x1C50;
254  else if( c >= 0x1C40 ) // lepcha digits
255  value = c - 0x1C40;
256  else if( c >= 0x1BB0 ) // sundanese digits
257  value = c - 0x1BB0;
258  else if( c >= 0x1B50 ) // balinese digits
259  value = c - 0x1B50;
260  else if( c >= 0x1A90 ) // tai tham tham digits
261  value = c - 0x1A90;
262  else if( c >= 0x1A80 ) // tai tham hora digits
263  value = c - 0x1A80;
264  else if( c >= 0x19D0 ) // new tai lue digits
265  value = c - 0x19D0;
266  else if( c >= 0x1946 ) // limbu digits
267  value = c - 0x1946;
268  else if( c >= 0x1810 ) // mongolian digits
269  value = c - 0x1810;
270  else if( c >= 0x17E0 ) // khmer digits
271  value = c - 0x17E0;
272  else if( c >= 0x1090 ) // myanmar shan digits
273  value = c - 0x1090;
274  else if( c >= 0x1040 ) // myanmar digits
275  value = c - 0x1040;
276  else if( c >= 0x0F20 ) // tibetan digits
277  value = c - 0x0F20;
278  else if( c >= 0x0ED0 ) // lao digits
279  value = c - 0x0ED0;
280  else if( c >= 0x0E50 ) // thai digits
281  value = c - 0x0E50;
282  else if( c >= 0x0D66 ) // malayalam digits
283  value = c - 0x0D66;
284  else if( c >= 0x0CE6 ) // kannada digits
285  value = c - 0x0CE6;
286  else if( c >= 0x0C66 ) // telugu digits
287  value = c - 0x0C66;
288  else if( c >= 0x0BE6 ) // tamil digits
289  value = c - 0x0BE6;
290  else if( c >= 0x0B66 ) // odia digits
291  value = c - 0x0B66;
292  else if( c >= 0x0AE6 ) // gujarati digits
293  value = c - 0x0AE6;
294  else if( c >= 0x0A66 ) // gurmukhi digits
295  value = c - 0x0A66;
296  else if( c >= 0x09E6 ) // bengali digits
297  value = c - 0x09E6;
298  else if( c >= 0x0966 ) // devanagari digit
299  value = c - 0x0966;
300  else if( c >= 0x07C0 ) // nko digits
301  value = c - 0x07C0;
302  else if( c >= 0x06F0 ) // extended arabic-indic digits
303  value = c - 0x06F0;
304  else if( c >= 0x0660 ) // arabic-indic digits
305  value = c - 0x0660;
306  result = result * 10 + value;
307  }
308  return result;
309 }
310 
312  OUString const & str )
313 {
314  return decimalStringToNumber(str, 0, str.getLength());
315 }
316 
317 using namespace ::com::sun::star;
318 
319 // convert between sequence of string and comma separated string
320 
322  uno::Sequence< OUString > const& i_rSeq)
323 {
324  OUStringBuffer buf;
326  i_rSeq.begin(), i_rSeq.end(), ::comphelper::OUStringBufferAppender(buf), OUString( ", " ));
327  return buf.makeStringAndClear();
328 }
329 
330 /* copy of getToken from sal/, modified to take a string_view */
331 static sal_Int32 getToken( OUString& ppThis,
332  std::u16string_view pStr,
333  sal_Int32 nToken,
334  sal_Unicode cTok,
335  sal_Int32 nIndex )
336 {
337  assert(nIndex <= static_cast<sal_Int32>(pStr.size()));
338 
339  // Set ppThis to an empty string and return -1 if either nToken or nIndex is
340  // negative:
341  if (nIndex >= 0 && nToken >= 0)
342  {
343  const auto* pOrgCharStr = pStr.data();
344  const auto* pCharStr = pOrgCharStr + nIndex;
345  sal_Int32 nLen = pStr.size() - nIndex;
346  sal_Int32 nTokCount = 0;
347  const auto* pCharStrStart = pCharStr;
348  while (nLen > 0)
349  {
350  if (*pCharStr == cTok)
351  {
352  nTokCount++;
353 
354  if (nTokCount > nToken)
355  break;
356  if (nTokCount == nToken)
357  pCharStrStart = pCharStr + 1;
358  }
359 
360  pCharStr++;
361  nLen--;
362  }
363  if (nTokCount >= nToken)
364  {
365  ppThis = OUString(pCharStrStart, pCharStr - pCharStrStart);
366  if (nLen > 0)
367  return pCharStr - pOrgCharStr + 1;
368  else
369  return -1;
370  }
371  }
372 
373  ppThis.clear();
374  return -1;
375 }
376 
377 std::vector<OUString>
378  split(std::u16string_view rStr, sal_Unicode cSeparator)
379 {
380  std::vector< OUString > vec;
381  sal_Int32 idx = 0;
382  do
383  {
384  OUString kw;
385  idx = getToken(kw, rStr, 0, cSeparator, idx);
386  kw = kw.trim();
387  if (!kw.isEmpty())
388  {
389  vec.push_back(kw);
390  }
391 
392  } while (idx >= 0);
393 
394  return vec;
395 }
396 
397 uno::Sequence< OUString >
398  convertCommaSeparated( std::u16string_view i_rString )
399 {
400  std::vector< OUString > vec = split(i_rString, ',');
402 }
403 
404 OString join(std::string_view rSeparator, const std::vector<OString>& rSequence)
405 {
406  OStringBuffer aBuffer;
407  for (size_t i = 0; i < rSequence.size(); ++i)
408  {
409  if (i != 0)
410  aBuffer.append(rSeparator);
411  aBuffer.append(rSequence[i]);
412  }
413  return aBuffer.makeStringAndClear();
414 }
415 
416 sal_Int32 compareNatural( const OUString & rLHS, const OUString & rRHS,
417  const uno::Reference< i18n::XCollator > &rCollator,
418  const uno::Reference< i18n::XBreakIterator > &rBI,
419  const lang::Locale &rLocale )
420 {
421  sal_Int32 nRet = 0;
422 
423  sal_Int32 nLHSLastNonDigitPos = 0;
424  sal_Int32 nRHSLastNonDigitPos = 0;
425  sal_Int32 nLHSFirstDigitPos = 0;
426  sal_Int32 nRHSFirstDigitPos = 0;
427 
428  while (nLHSFirstDigitPos < rLHS.getLength() || nRHSFirstDigitPos < rRHS.getLength())
429  {
430  sal_Int32 nLHSChunkLen;
431  sal_Int32 nRHSChunkLen;
432 
433  //Compare non digit block as normal strings
434  nLHSFirstDigitPos = rBI->nextCharBlock(rLHS, nLHSLastNonDigitPos,
435  rLocale, i18n::CharType::DECIMAL_DIGIT_NUMBER);
436  nRHSFirstDigitPos = rBI->nextCharBlock(rRHS, nRHSLastNonDigitPos,
437  rLocale, i18n::CharType::DECIMAL_DIGIT_NUMBER);
438  if (nLHSFirstDigitPos == -1)
439  nLHSFirstDigitPos = rLHS.getLength();
440  if (nRHSFirstDigitPos == -1)
441  nRHSFirstDigitPos = rRHS.getLength();
442  nLHSChunkLen = nLHSFirstDigitPos - nLHSLastNonDigitPos;
443  nRHSChunkLen = nRHSFirstDigitPos - nRHSLastNonDigitPos;
444 
445  nRet = rCollator->compareSubstring(rLHS, nLHSLastNonDigitPos,
446  nLHSChunkLen, rRHS, nRHSLastNonDigitPos, nRHSChunkLen);
447  if (nRet != 0)
448  break;
449 
450  //Compare digit block as one number vs another
451  nLHSLastNonDigitPos = rBI->endOfCharBlock(rLHS, nLHSFirstDigitPos,
452  rLocale, i18n::CharType::DECIMAL_DIGIT_NUMBER);
453  nRHSLastNonDigitPos = rBI->endOfCharBlock(rRHS, nRHSFirstDigitPos,
454  rLocale, i18n::CharType::DECIMAL_DIGIT_NUMBER);
455  if (nLHSLastNonDigitPos == -1)
456  nLHSLastNonDigitPos = rLHS.getLength();
457  if (nRHSLastNonDigitPos == -1)
458  nRHSLastNonDigitPos = rRHS.getLength();
459  nLHSChunkLen = nLHSLastNonDigitPos - nLHSFirstDigitPos;
460  nRHSChunkLen = nRHSLastNonDigitPos - nRHSFirstDigitPos;
461 
462  //To-Do: Possibly scale down those unicode codepoints that relate to
463  //numbers outside of the normal 0-9 range, e.g. see GetLocalizedChar in
464  //vcl
465 
466  sal_uInt32 nLHS = comphelper::string::decimalStringToNumber(rLHS, nLHSFirstDigitPos, nLHSChunkLen);
467  sal_uInt32 nRHS = comphelper::string::decimalStringToNumber(rRHS, nRHSFirstDigitPos, nRHSChunkLen);
468 
469  if (nLHS != nRHS)
470  {
471  nRet = (nLHS < nRHS) ? -1 : 1;
472  break;
473  }
474  }
475 
476  return nRet;
477 }
478 
480  const uno::Reference< uno::XComponentContext > &rContext,
481  const lang::Locale &rLocale) : m_aLocale(rLocale)
482 {
483  m_xCollator = i18n::Collator::create( rContext );
484  m_xCollator->loadDefaultCollator(m_aLocale, 0);
485  m_xBI = i18n::BreakIterator::create( rContext );
486 }
487 
488 bool isdigitAsciiString(std::string_view rString)
489 {
490  return std::all_of(
491  rString.data(), rString.data() + rString.size(),
492  [](unsigned char c){ return rtl::isAsciiDigit(c); });
493 }
494 
495 bool isdigitAsciiString(std::u16string_view rString)
496 {
497  return std::all_of(
498  rString.data(), rString.data() + rString.size(),
499  [](sal_Unicode c){ return rtl::isAsciiDigit(c); });
500 }
501 
502 namespace
503 {
504  template <typename T, typename I, typename O> T tmpl_reverseString(I rIn)
505  {
506  if (rIn.empty())
507  return T();
508 
509  typename I::size_type i = rIn.size();
510  O sBuf(static_cast<sal_Int32>(i));
511  while (i)
512  sBuf.append(rIn[--i]);
513  return sBuf.makeStringAndClear();
514  }
515 }
516 
517 OUString reverseString(std::u16string_view rStr)
518 {
519  return tmpl_reverseString<OUString, std::u16string_view, OUStringBuffer>(rStr);
520 }
521 
522 OString reverseString(std::string_view rStr)
523 {
524  return tmpl_reverseString<OString, std::string_view, OStringBuffer>(rStr);
525 }
526 
527 sal_Int32 indexOfAny(std::u16string_view rIn,
528  sal_Unicode const*const pChars, sal_Int32 const nPos)
529 {
530  for (std::u16string_view::size_type i = nPos; i < rIn.size(); ++i)
531  {
532  sal_Unicode const c = rIn[i];
533  for (sal_Unicode const* pChar = pChars; *pChar; ++pChar)
534  {
535  if (c == *pChar)
536  {
537  return i;
538  }
539  }
540  }
541  return -1;
542 }
543 
544 OUString removeAny(std::u16string_view rIn,
545  sal_Unicode const*const pChars)
546 {
547  OUStringBuffer buf;
548  bool isFound(false);
549  for (std::u16string_view::size_type i = 0; i < rIn.size(); ++i)
550  {
551  sal_Unicode const c = rIn[i];
552  bool removeC(false);
553  for (sal_Unicode const* pChar = pChars; *pChar; ++pChar)
554  {
555  if (c == *pChar)
556  {
557  removeC = true;
558  break;
559  }
560  }
561  if (removeC)
562  {
563  if (!isFound)
564  {
565  if (i > 0)
566  {
567  buf.append(rIn.substr(0, i));
568  }
569  isFound = true;
570  }
571  }
572  else if (isFound)
573  {
574  buf.append(c);
575  }
576  }
577  return isFound ? buf.makeStringAndClear() : OUString(rIn);
578 }
579 
580 OUString setToken(const OUString& rIn, sal_Int32 nToken, sal_Unicode cTok,
581  std::u16string_view rNewToken)
582 {
583  sal_Int32 nLen = rIn.getLength();
584  sal_Int32 nTok = 0;
585  sal_Int32 nFirstChar = 0;
586  sal_Int32 i = 0;
587 
588  // Determine token position and length
589  while ( i < nLen )
590  {
591  // Increase token count if match
592  if (rIn[i] == cTok)
593  {
594  ++nTok;
595 
596  if (nTok == nToken)
597  nFirstChar = i+1;
598  else if (nTok > nToken)
599  break;
600  }
601 
602  ++i;
603  }
604 
605  if (nTok >= nToken)
606  return rIn.replaceAt(nFirstChar, i-nFirstChar, rNewToken);
607  return rIn;
608 }
609 
610 }
611 
612 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
OString stripEnd(const OString &rIn, char c)
Strips occurrences of a character from the end of the source string.
Definition: string.cxx:141
css::lang::Locale const m_aLocale
Definition: string.hxx:340
std::locale m_aLocale
std::vector< OUString > split(std::u16string_view rStr, sal_Unicode cSeparator)
Definition: string.cxx:378
bool isdigitAsciiString(std::string_view rString)
Determine if an OString contains solely ASCII numeric digits.
Definition: string.cxx:488
OutputIter intersperse(ForwardIter start, ForwardIter end, OutputIter out, T const &separator)
algorithm similar to std::copy, but inserts a separator between elements.
Definition: stl_types.hxx:152
float x
sal_uInt16 sal_Unicode
OString join(std::string_view rSeparator, const std::vector< OString > &rSequence)
Return a string which is the concatenation of the strings in the sequence.
Definition: string.cxx:404
OUString removeAny(std::u16string_view rIn, sal_Unicode const *const pChars)
Remove any of a list of code units in the string.
Definition: string.cxx:544
sal_Int32 getTokenCount(std::string_view rIn, char cTok)
Returns number of tokens in an OUString.
Definition: string.cxx:204
int i
css::uno::Reference< css::i18n::XBreakIterator > m_xBI
Definition: string.hxx:342
sal_Int32 compareNatural(const OUString &rLHS, const OUString &rRHS, const uno::Reference< i18n::XCollator > &rCollator, const uno::Reference< i18n::XBreakIterator > &rBI, const lang::Locale &rLocale)
Definition: string.cxx:416
NaturalStringSorter(const css::uno::Reference< css::uno::XComponentContext > &rContext, const css::lang::Locale &rLocale)
Definition: string.cxx:479
output iterator that appends OUStrings into an OUStringBuffer.
Definition: stl_types.hxx:125
sal_Int32 indexOfAny(std::u16string_view rIn, sal_Unicode const *const pChars, sal_Int32 const nPos)
Find any of a list of code units in the string.
Definition: string.cxx:527
static sal_uInt32 decimalStringToNumber(OUString const &str, sal_Int32 nStart, sal_Int32 nLength)
Definition: string.cxx:214
const sal_uInt16 idx[]
std::unique_ptr< char[]> aBuffer
css::uno::Sequence< DstElementType > containerToSequence(const SrcType &i_Container)
Copy from a container into a Sequence.
Definition: sequence.hxx:190
OString strip(const OString &rIn, char c)
Strips occurrences of a character from the start and end of the source string.
Definition: string.cxx:161
OUString setToken(const OUString &rIn, sal_Int32 nToken, sal_Unicode cTok, std::u16string_view rNewToken)
Replace a token in a string.
Definition: string.cxx:580
OString stripStart(const OString &rIn, char c)
Strips occurrences of a character from the start of the source string.
Definition: string.cxx:83
const char * pChar
Any value
Any result
static sal_Int32 getToken(OUString &ppThis, std::u16string_view pStr, sal_Int32 nToken, sal_Unicode cTok, sal_Int32 nIndex)
Definition: string.cxx:331
OUString reverseString(std::u16string_view rStr)
Reverse an OUString.
Definition: string.cxx:517
OUString convertCommaSeparated(uno::Sequence< OUString > const &i_rSeq)
Definition: string.cxx:321
css::uno::Reference< css::i18n::XCollator > m_xCollator
Definition: string.hxx:341