LibreOffice Module comphelper (master)  1
parallelsort.hxx
Go to the documentation of this file.
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  */
9 
10 #ifndef INCLUDED_COMPHELPER_PARALLELSORT_HXX
11 #define INCLUDED_COMPHELPER_PARALLELSORT_HXX
12 
14 #include <tools/cpuid.hxx>
15 
16 #include <memory>
17 #include <iterator>
18 #include <thread>
19 #include <algorithm>
20 #include <cmath>
21 #include <random>
22 #include <functional>
23 #include <iostream>
24 #include <chrono>
25 
26 namespace comphelper
27 {
28 const size_t nThreadCountGlobal = std::thread::hardware_concurrency();
31 
32 static thread_local std::mt19937 aGenerator{ std::random_device{}() };
33 
34 #define PARALLELSORT_ENABLEPZ 0
35 
36 namespace
37 {
38 class ProfileZone
39 {
40 public:
41 #if PARALLELSORT_ENABLEPZ
42  ProfileZone(const char* pTag)
43  : maTag(pTag)
44  , maStart(std::chrono::steady_clock::now())
45  , mbFinished(false)
46  {
47  }
48 
49  ~ProfileZone()
50  {
51  if (!mbFinished)
52  showTimeElapsed();
53  }
54 
55  void stop()
56  {
57  showTimeElapsed();
58  mbFinished = true;
59  }
60 #else
61  ProfileZone(const char* /*pTag*/)
62  : mbDummy(true)
63  {
64  }
65 
66  void stop()
67  {
68  // Avoid loplugin:staticmethods, loplugin:staticaccess errors
69  (void)mbDummy;
70  }
71 #endif
72 
73 private:
74 #if PARALLELSORT_ENABLEPZ
75 
76  void showTimeElapsed()
77  {
78  auto end = std::chrono::steady_clock::now();
79  size_t elapsed
80  = std::chrono::duration_cast<std::chrono::milliseconds>(end - maStart).count();
81  std::cout << maTag << " : " << elapsed << " ms" << std::endl << std::flush;
82  }
83 
84  std::string maTag;
85  std::chrono::steady_clock::time_point maStart;
86  bool mbFinished;
87 #else
88  bool mbDummy;
89 
90 #endif
91 };
92 
93 class ParallelRunner
94 {
95  class Executor final : public comphelper::ThreadTask
96  {
97  public:
98  Executor(const std::shared_ptr<comphelper::ThreadTaskTag>& rTag,
99  std::function<void()> aFunc)
100  : comphelper::ThreadTask(rTag)
101  , maFunc(std::move(aFunc))
102  {
103  }
104 
105  virtual void doWork() override { maFunc(); }
106 
107  private:
108  const std::function<void()> maFunc;
109  };
110 
111 public:
112  ParallelRunner() { maTag = comphelper::ThreadPool::createThreadTaskTag(); }
113 
114  void enqueue(std::function<void()> aFunc)
115  {
116  rTPool.pushTask(std::make_unique<Executor>(maTag, aFunc));
117  }
118 
119  void wait() { rTPool.waitUntilDone(maTag, false); }
120 
121 private:
122  std::shared_ptr<comphelper::ThreadTaskTag> maTag;
123 };
124 
125 constexpr size_t nMaxTreeArraySize = 64;
126 
127 size_t lcl_round_down_pow2(size_t nNum)
128 {
129  size_t nPow2;
130  for (nPow2 = 1; nPow2 <= nNum; nPow2 <<= 1)
131  ;
132  return std::min((nPow2 >> 1), nMaxTreeArraySize);
133 }
134 
135 template <class RandItr> struct Sampler
136 {
137  using ValueType = typename std::iterator_traits<RandItr>::value_type;
138 
139  static void sample(RandItr aBegin, RandItr aEnd, ValueType* pSamples, size_t nSamples,
140  size_t /*nParallelism*/)
141  {
142  ProfileZone aZone("\tsample()");
143  assert(aBegin <= aEnd);
144  size_t nLen = static_cast<std::size_t>(aEnd - aBegin);
145  assert(std::mt19937::max() >= nLen);
146 
147  for (size_t nIdx = 0; nIdx < nSamples; ++nIdx)
148  {
149  size_t nSel = aGenerator() % nLen--;
150  std::swap(*(aBegin + nSel), *(aBegin + nLen));
151  pSamples[nIdx] = *(aBegin + nLen);
152  }
153  }
154 };
155 
156 template <class RandItr, class Compare> class Binner
157 {
158  using ValueType = typename std::iterator_traits<RandItr>::value_type;
159 
160  const size_t mnTreeArraySize;
161  const size_t mnDividers;
162  constexpr static size_t mnMaxStaticSize = 1024 * 50;
164  ValueType maDividers[nMaxTreeArraySize];
165  std::unique_ptr<uint8_t[]> pLabels;
166  size_t maSepBinEnds[nMaxTreeArraySize * nMaxTreeArraySize];
168 
169 public:
170  size_t maBinEnds[nMaxTreeArraySize];
171 
172  Binner(const ValueType* pSamples, size_t nSamples, size_t nBins, bool bThreaded)
173  : mnTreeArraySize(lcl_round_down_pow2(nBins))
174  , mnDividers(mnTreeArraySize - 1)
175  , mbThreaded(bThreaded)
176  {
177  assert((nSamples % mnTreeArraySize) == 0);
178  assert(mnTreeArraySize <= nMaxTreeArraySize);
179  std::fill(maBinEnds, maBinEnds + mnTreeArraySize, 0);
180  std::fill(maSepBinEnds, maSepBinEnds + mnTreeArraySize * mnTreeArraySize, 0);
181  fillTreeArray(1, pSamples, pSamples + nSamples);
182  }
183 
184  void fillTreeArray(size_t nPos, const ValueType* pLow, const ValueType* pHigh)
185  {
186  assert(pLow <= pHigh);
187  const ValueType* pMid = pLow + (pHigh - pLow) / 2;
188  maDividers[nPos] = *pMid;
189 
190  if (2 * nPos < mnDividers) // So that 2*nPos < mnTreeArraySize
191  {
192  fillTreeArray(2 * nPos, pLow, pMid);
193  fillTreeArray(2 * nPos + 1, pMid + 1, pHigh);
194  }
195  }
196 
197  constexpr inline size_t findBin(const ValueType& rVal, Compare& aComp)
198  {
199  size_t nIdx = 1;
200  while (nIdx <= mnDividers)
201  nIdx = ((nIdx << 1) + aComp(maDividers[nIdx], rVal));
202  return (nIdx - mnTreeArraySize);
203  }
204 
205  void label(const RandItr aBegin, const RandItr aEnd, Compare& aComp)
206  {
207  ProfileZone aZoneSetup("\tlabel():setup");
208  size_t nLen = static_cast<std::size_t>(aEnd - aBegin);
209  if (nLen > mnMaxStaticSize)
210  pLabels = std::make_unique<uint8_t[]>(nLen);
211  uint8_t* pLabelsRaw = (nLen > mnMaxStaticSize) ? pLabels.get() : maLabels;
212  aZoneSetup.stop();
213  ProfileZone aZoneFindBins("\tFindBins()");
214  if (mbThreaded)
215  {
216  ParallelRunner aPRunner;
217  const size_t nBins = mnTreeArraySize;
218  for (size_t nTIdx = 0; nTIdx < nBins; ++nTIdx)
219  {
220  aPRunner.enqueue([this, nTIdx, nBins, nLen, aBegin, pLabelsRaw, &aComp] {
221  ProfileZone aZoneIn("\t\tFindBinsThreaded()");
222  size_t nBinEndsStartIdx = nTIdx * mnTreeArraySize;
223  size_t* pBinEnds = maSepBinEnds + nBinEndsStartIdx;
224  size_t aBinEndsF[nMaxTreeArraySize] = { 0 };
225  for (size_t nIdx = nTIdx; nIdx < nLen; nIdx += nBins)
226  {
227  size_t nBinIdx = findBin(*(aBegin + nIdx), aComp);
228  pLabelsRaw[nIdx] = static_cast<uint8_t>(nBinIdx);
229  ++aBinEndsF[nBinIdx];
230  }
231 
232  for (size_t nIdx = 0; nIdx < mnTreeArraySize; ++nIdx)
233  pBinEnds[nIdx] = aBinEndsF[nIdx];
234  });
235  }
236 
237  aPRunner.wait();
238 
239  // Populate maBinEnds from maSepBinEnds
240  for (size_t nTIdx = 0; nTIdx < mnTreeArraySize; ++nTIdx)
241  {
242  for (size_t nSepIdx = 0; nSepIdx < mnTreeArraySize; ++nSepIdx)
243  maBinEnds[nTIdx] += maSepBinEnds[nSepIdx * mnTreeArraySize + nTIdx];
244  }
245  }
246  else
247  {
248  uint8_t* pLabel = pLabelsRaw;
249  for (RandItr aItr = aBegin; aItr != aEnd; ++aItr)
250  {
251  size_t nBinIdx = findBin(*aItr, aComp);
252  *pLabel++ = nBinIdx;
253  ++maBinEnds[nBinIdx];
254  }
255  }
256 
257  aZoneFindBins.stop();
258 
259  size_t nSum = 0;
260  // Store each bin's starting position in maBinEnds array for now.
261  for (size_t nIdx = 0; nIdx < mnTreeArraySize; ++nIdx)
262  {
263  size_t nSize = maBinEnds[nIdx];
264  maBinEnds[nIdx] = nSum;
265  nSum += nSize;
266  }
267 
268  // Now maBinEnds has end positions of each bin.
269  }
270 
271  void bin(const RandItr aBegin, const RandItr aEnd, ValueType* pOut)
272  {
273  ProfileZone aZone("\tbin()");
274  const size_t nLen = static_cast<std::size_t>(aEnd - aBegin);
275  uint8_t* pLabelsRaw = (nLen > mnMaxStaticSize) ? pLabels.get() : maLabels;
276  size_t nIdx;
277  for (nIdx = 0; nIdx < nLen; ++nIdx)
278  {
279  pOut[maBinEnds[pLabelsRaw[nIdx]]++] = *(aBegin + nIdx);
280  }
281  }
282 };
283 
284 template <class RandItr, class Compare = std::less<>>
285 void s3sort(const RandItr aBegin, const RandItr aEnd, Compare aComp = Compare(),
286  bool bThreaded = true)
287 {
288  static size_t nThreadCount = nThreadCountGlobal;
289 
290  constexpr size_t nBaseCaseSize = 1024;
291  const std::size_t nLen = static_cast<std::size_t>(aEnd - aBegin);
292  if (nLen < nBaseCaseSize)
293  {
294  std::sort(aBegin, aEnd, aComp);
295  return;
296  }
297 
298  using ValueType = typename std::iterator_traits<RandItr>::value_type;
299  auto pOut = std::make_unique<ValueType[]>(nLen);
300 
301  const size_t nBins = lcl_round_down_pow2(nThreadCount);
302  const size_t nOverSamplingFactor = std::max(1.0, std::sqrt(static_cast<double>(nLen) / 64));
303  const size_t nSamples = nOverSamplingFactor * nBins;
304  auto aSamples = std::make_unique<ValueType[]>(nSamples);
305  ProfileZone aZoneSampleAnsSort("SampleAndSort");
306  // Select samples and sort them
307  Sampler<RandItr>::sample(aBegin, aEnd, aSamples.get(), nSamples, nBins);
308  std::sort(aSamples.get(), aSamples.get() + nSamples, aComp);
309  aZoneSampleAnsSort.stop();
310 
311  if (!aComp(aSamples[0], aSamples[nSamples - 1]))
312  {
313  // All samples are equal, fallback to standard sort.
314  std::sort(aBegin, aEnd, aComp);
315  return;
316  }
317 
318  ProfileZone aZoneBinner("Binner");
319  // Create and populate bins using pOut from input iterators.
320  Binner<RandItr, Compare> aBinner(aSamples.get(), nSamples, nBins, bThreaded);
321  aBinner.label(aBegin, aEnd, aComp);
322  aBinner.bin(aBegin, aEnd, pOut.get());
323  aZoneBinner.stop();
324 
325  ProfileZone aZoneSortBins("SortBins");
326  ValueType* pOutRaw = pOut.get();
327  if (bThreaded)
328  {
329  ParallelRunner aPRunner;
330  // Sort the bins separately.
331  for (size_t nBinIdx = 0, nBinStart = 0; nBinIdx < nBins; ++nBinIdx)
332  {
333  size_t nBinEnd = aBinner.maBinEnds[nBinIdx];
334  aPRunner.enqueue([pOutRaw, nBinStart, nBinEnd, &aComp] {
335  std::sort(pOutRaw + nBinStart, pOutRaw + nBinEnd, aComp);
336  });
337 
338  nBinStart = nBinEnd;
339  }
340 
341  aPRunner.wait();
342  }
343  else
344  {
345  for (size_t nBinIdx = 0, nBinStart = 0; nBinIdx < nBins; ++nBinIdx)
346  {
347  auto nBinEnd = aBinner.maBinEnds[nBinIdx];
348  std::sort(pOutRaw + nBinStart, pOutRaw + nBinEnd, aComp);
349  nBinStart = nBinEnd;
350  }
351  }
352 
353  aZoneSortBins.stop();
354 
355  // Move the sorted array to the array specified by input iterators.
356  std::move(pOutRaw, pOutRaw + nLen, aBegin);
357 }
358 
359 } // anonymous namespace
360 
361 template <class RandItr, class Compare = std::less<>>
362 void parallelSort(const RandItr aBegin, const RandItr aEnd, Compare aComp = Compare())
363 {
364  assert(aBegin <= aEnd);
365  s3sort(aBegin, aEnd, aComp);
366 }
367 
368 } // namespace comphelper
369 
370 #endif // INCLUDED_COMPHELPER_PARALLELSORT_HXX
371 
372 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
const size_t count(pCandidateA->getBorderLines().size())
const size_t mnTreeArraySize
A very basic thread-safe thread pool implementation.
Definition: threadpool.hxx:43
const size_t mnDividers
bool mbThreaded
static constexpr size_t mnMaxStaticSize
size_t maBinEnds[nMaxTreeArraySize]
ValueType maDividers[nMaxTreeArraySize]
const std::function< void()> maFunc
std::shared_ptr< comphelper::ThreadTaskTag > maTag
void waitUntilDone(const std::shared_ptr< ThreadTaskTag > &, bool bJoin=true)
Wait until all queued tasks associated with the tag are completed.
Definition: threadpool.cxx:259
const BorderLinePrimitive2D *pCandidateB assert(pCandidateA)
bool mbDummy
oslFileHandle & pOut
const size_t nThreadCountGlobal
uint8_t maLabels[mnMaxStaticSize]
size_t maSepBinEnds[nMaxTreeArraySize *nMaxTreeArraySize]
static ThreadPool & getSharedOptimalPool()
returns a pointer to a shared pool with optimal thread count for the CPU
Definition: threadpool.cxx:125
void pushTask(std::unique_ptr< ThreadTask > pTask)
push a new task onto the work queue
Definition: threadpool.cxx:209
bool hasHyperThreading()
def stop
enumrange< T >::Iterator end(enumrange< T >)
std::unique_ptr< uint8_t[]> pLabels
static comphelper::ThreadPool & rTPool(comphelper::ThreadPool::getSharedOptimalPool())
ValueType
Point maStart
static std::shared_ptr< ThreadTaskTag > createThreadTaskTag()
Definition: threadpool.cxx:296
def label(st)
void parallelSort(const RandItr aBegin, const RandItr aEnd, Compare aComp=Compare())
const bool bHyperThreadingActive
typedef void(CALLTYPE *GetFuncDataPtr)(sal_uInt16 &nNo