10 #ifndef INCLUDED_COMPHELPER_PARALLELSORT_HXX
11 #define INCLUDED_COMPHELPER_PARALLELSORT_HXX
32 static thread_local std::mt19937 aGenerator{ std::random_device{}() };
34 #define PARALLELSORT_ENABLEPZ 0
41 #if PARALLELSORT_ENABLEPZ
42 ProfileZone(
const char* pTag)
44 ,
maStart(std::chrono::steady_clock::now())
61 ProfileZone(
const char* )
74 #if PARALLELSORT_ENABLEPZ
76 void showTimeElapsed()
78 auto end = std::chrono::steady_clock::now();
80 = std::chrono::duration_cast<std::chrono::milliseconds>(
end -
maStart).
count();
81 std::cout <<
maTag <<
" : " << elapsed <<
" ms" << std::endl << std::flush;
85 std::chrono::steady_clock::time_point
maStart;
98 Executor(
const std::shared_ptr<comphelper::ThreadTaskTag>& rTag,
99 std::function<
void()> aFunc)
101 ,
maFunc(std::move(aFunc))
105 virtual void doWork()
override {
maFunc(); }
114 void enqueue(std::function<
void()> aFunc)
122 std::shared_ptr<comphelper::ThreadTaskTag>
maTag;
125 constexpr
size_t nMaxTreeArraySize = 64;
127 size_t lcl_round_down_pow2(
size_t nNum)
130 for (nPow2 = 1; nPow2 <= nNum; nPow2 <<= 1)
132 return std::min((nPow2 >> 1), nMaxTreeArraySize);
135 template <
class RandItr>
struct Sampler
137 using ValueType =
typename std::iterator_traits<RandItr>::value_type;
139 static void sample(RandItr aBegin, RandItr aEnd,
ValueType* pSamples,
size_t nSamples,
142 ProfileZone aZone(
"\tsample()");
144 size_t nLen =
static_cast<std::size_t
>(aEnd - aBegin);
145 assert(std::mt19937::max() >= nLen);
147 for (
size_t nIdx = 0; nIdx < nSamples; ++nIdx)
149 size_t nSel = aGenerator() % nLen--;
150 std::swap(*(aBegin + nSel), *(aBegin + nLen));
151 pSamples[nIdx] = *(aBegin + nLen);
156 template <
class RandItr,
class Compare>
class Binner
158 using ValueType =
typename std::iterator_traits<RandItr>::value_type;
172 Binner(
const ValueType* pSamples,
size_t nSamples,
size_t nBins,
bool bThreaded)
173 : mnTreeArraySize(lcl_round_down_pow2(nBins))
174 , mnDividers(mnTreeArraySize - 1)
175 , mbThreaded(bThreaded)
177 assert((nSamples % mnTreeArraySize) == 0);
178 assert(mnTreeArraySize <= nMaxTreeArraySize);
179 std::fill(maBinEnds, maBinEnds + mnTreeArraySize, 0);
180 std::fill(maSepBinEnds, maSepBinEnds + mnTreeArraySize * mnTreeArraySize, 0);
181 fillTreeArray(1, pSamples, pSamples + nSamples);
187 const ValueType* pMid = pLow + (pHigh - pLow) / 2;
188 maDividers[nPos] = *pMid;
190 if (2 * nPos < mnDividers)
192 fillTreeArray(2 * nPos, pLow, pMid);
193 fillTreeArray(2 * nPos + 1, pMid + 1, pHigh);
197 constexpr
inline size_t findBin(
const ValueType& rVal, Compare& aComp)
200 while (nIdx <= mnDividers)
201 nIdx = ((nIdx << 1) + aComp(maDividers[nIdx], rVal));
202 return (nIdx - mnTreeArraySize);
205 void label(
const RandItr aBegin,
const RandItr aEnd, Compare& aComp)
207 ProfileZone aZoneSetup(
"\tlabel():setup");
208 size_t nLen =
static_cast<std::size_t
>(aEnd - aBegin);
209 if (nLen > mnMaxStaticSize)
210 pLabels = std::make_unique<uint8_t[]>(nLen);
213 ProfileZone aZoneFindBins(
"\tFindBins()");
216 ParallelRunner aPRunner;
218 for (
size_t nTIdx = 0; nTIdx < nBins; ++nTIdx)
220 aPRunner.enqueue([
this, nTIdx, nBins, nLen, aBegin, pLabelsRaw, &aComp] {
221 ProfileZone aZoneIn(
"\t\tFindBinsThreaded()");
223 size_t* pBinEnds = maSepBinEnds + nBinEndsStartIdx;
224 size_t aBinEndsF[nMaxTreeArraySize] = { 0 };
225 for (
size_t nIdx = nTIdx; nIdx < nLen; nIdx += nBins)
227 size_t nBinIdx = findBin(*(aBegin + nIdx), aComp);
228 pLabelsRaw[nIdx] =
static_cast<uint8_t
>(nBinIdx);
229 ++aBinEndsF[nBinIdx];
233 pBinEnds[nIdx] = aBinEndsF[nIdx];
243 maBinEnds[nTIdx] += maSepBinEnds[nSepIdx * mnTreeArraySize + nTIdx];
248 uint8_t* pLabel = pLabelsRaw;
249 for (RandItr aItr = aBegin; aItr != aEnd; ++aItr)
251 size_t nBinIdx = findBin(*aItr, aComp);
253 ++maBinEnds[nBinIdx];
257 aZoneFindBins.stop();
263 size_t nSize = maBinEnds[nIdx];
264 maBinEnds[nIdx] = nSum;
271 void bin(
const RandItr aBegin,
const RandItr aEnd,
ValueType* pOut)
273 ProfileZone aZone(
"\tbin()");
274 const size_t nLen =
static_cast<std::size_t
>(aEnd - aBegin);
277 for (nIdx = 0; nIdx < nLen; ++nIdx)
279 pOut[maBinEnds[pLabelsRaw[nIdx]]++] = *(aBegin + nIdx);
284 template <
class RandItr,
class Compare = std::less<>>
285 void s3sort(
const RandItr aBegin,
const RandItr aEnd, Compare aComp = Compare(),
286 bool bThreaded =
true)
290 constexpr
size_t nBaseCaseSize = 1024;
291 const std::size_t nLen =
static_cast<std::size_t
>(aEnd - aBegin);
292 if (nLen < nBaseCaseSize)
294 std::sort(aBegin, aEnd, aComp);
298 using ValueType =
typename std::iterator_traits<RandItr>::value_type;
299 auto pOut = std::make_unique<ValueType[]>(nLen);
301 const size_t nBins = lcl_round_down_pow2(nThreadCount);
302 const size_t nOverSamplingFactor = std::max(1.0, std::sqrt(static_cast<double>(nLen) / 64));
303 const size_t nSamples = nOverSamplingFactor * nBins;
304 auto aSamples = std::make_unique<ValueType[]>(nSamples);
305 ProfileZone aZoneSampleAnsSort(
"SampleAndSort");
307 Sampler<RandItr>::sample(aBegin, aEnd, aSamples.get(), nSamples, nBins);
308 std::sort(aSamples.get(), aSamples.get() + nSamples, aComp);
309 aZoneSampleAnsSort.stop();
311 if (!aComp(aSamples[0], aSamples[nSamples - 1]))
314 std::sort(aBegin, aEnd, aComp);
318 ProfileZone aZoneBinner(
"Binner");
320 Binner<RandItr, Compare> aBinner(aSamples.get(), nSamples, nBins, bThreaded);
321 aBinner.label(aBegin, aEnd, aComp);
322 aBinner.bin(aBegin, aEnd,
pOut.get());
325 ProfileZone aZoneSortBins(
"SortBins");
329 ParallelRunner aPRunner;
331 for (
size_t nBinIdx = 0, nBinStart = 0; nBinIdx < nBins; ++nBinIdx)
333 size_t nBinEnd = aBinner.maBinEnds[nBinIdx];
334 aPRunner.enqueue([pOutRaw, nBinStart, nBinEnd, &aComp] {
335 std::sort(pOutRaw + nBinStart, pOutRaw + nBinEnd, aComp);
345 for (
size_t nBinIdx = 0, nBinStart = 0; nBinIdx < nBins; ++nBinIdx)
347 auto nBinEnd = aBinner.maBinEnds[nBinIdx];
348 std::sort(pOutRaw + nBinStart, pOutRaw + nBinEnd, aComp);
353 aZoneSortBins.stop();
356 std::move(pOutRaw, pOutRaw + nLen, aBegin);
361 template <
class RandItr,
class Compare = std::less<>>
362 void parallelSort(
const RandItr aBegin,
const RandItr aEnd, Compare aComp = Compare())
365 s3sort(aBegin, aEnd, aComp);
370 #endif // INCLUDED_COMPHELPER_PARALLELSORT_HXX
const size_t count(pCandidateA->getBorderLines().size())
const size_t mnTreeArraySize
A very basic thread-safe thread pool implementation.
static constexpr size_t mnMaxStaticSize
size_t maBinEnds[nMaxTreeArraySize]
ValueType maDividers[nMaxTreeArraySize]
const std::function< void()> maFunc
std::shared_ptr< comphelper::ThreadTaskTag > maTag
void waitUntilDone(const std::shared_ptr< ThreadTaskTag > &, bool bJoin=true)
Wait until all queued tasks associated with the tag are completed.
const BorderLinePrimitive2D *pCandidateB assert(pCandidateA)
const size_t nThreadCountGlobal
uint8_t maLabels[mnMaxStaticSize]
size_t maSepBinEnds[nMaxTreeArraySize *nMaxTreeArraySize]
static ThreadPool & getSharedOptimalPool()
returns a pointer to a shared pool with optimal thread count for the CPU
void pushTask(std::unique_ptr< ThreadTask > pTask)
push a new task onto the work queue
enumrange< T >::Iterator end(enumrange< T >)
std::unique_ptr< uint8_t[]> pLabels
static comphelper::ThreadPool & rTPool(comphelper::ThreadPool::getSharedOptimalPool())
static std::shared_ptr< ThreadTaskTag > createThreadTaskTag()
void parallelSort(const RandItr aBegin, const RandItr aEnd, Compare aComp=Compare())
const bool bHyperThreadingActive