UDocumentation UE5.7 10.02.2026 (Source)
API documentation for Unreal Engine 5.7
ParallelSort.h
Go to the documentation of this file.
1// Copyright Epic Games, Inc. All Rights Reserved.
2
3#pragma once
4
6#include "HAL/PlatformMath.h"
8#include "Tasks/Task.h"
9#include "Templates/Less.h"
11
12namespace Algo
13{
14
26template <typename RangeType, typename PredicateType>
27void ParallelSort(RangeType&& Range, PredicateType Predicate);
28
30template <typename RangeType, typename PredicateType>
31void ParallelSortForceSingleThreaded(RangeType&& Range, PredicateType Predicate);
32
33} // namespace Algo
34
35
37// Implementation forward declares
39
41{
42template <typename T>
52template <typename T, typename IndexType, typename PredicateType>
53void ParallelMergeSortPointerAndNum(T* First, IndexType Num, PredicateType Predicate);
54template <typename T, typename IndexType, typename PredicateType>
55void MergeSortPointerAndNum(T* First, IndexType Num, PredicateType Predicate);
56template <typename T, typename IndexType, typename PredicateType>
58template <typename T, typename IndexType, typename PredicateType>
59void Merge(T* First, IndexType Num, IndexType FloorMidIndex, PredicateType Predicate, TMergeBuffer<T>& MergeBuffer);
60template <typename T, typename IndexType, typename PredicateType>
61void SmallSort(T* First, IndexType Num, PredicateType Predicate);
62
63} // namespace Algo::MergeSort
64
65
67// Template implementations
69
70
71namespace Algo
72{
73
74template <typename RangeType, typename PredicateType>
75void ParallelSort(RangeType&& Range, PredicateType Predicate)
76{
78}
79
80template <typename RangeType>
81void ParallelSort(RangeType&& Range)
82{
84}
85
86template <typename RangeType, typename PredicateType>
87void ParallelSortForceSingleThreaded(RangeType&& Range, PredicateType Predicate)
88{
90}
91
92template <typename RangeType, typename PredicateType>
97
98} // namespace Algo
99
100namespace Algo::MergeSort
101{
102
103template <typename T, typename IndexType, typename PredicateType>
105{
106 using namespace UE::Tasks;
107
108 // In Round 0 we divide the Array up initially into TaskCount segments, and create an initial set of tasks that
109 // recursively mergesorts all segments in parallel.
110
111 // Rounds 1 through NumRounds-1 merge each pair of recursively sorted segments from the previous round into a
112 // combined segment, ending with NumRound -1 merging up into a single segment for the entire range.
113
114 // We require TaskCount is a Power of 2
115 // Set MaxAsymptoticTaskCount somewhat arbitrarily at 16 subtasks. There are diminishing returns to further division;
116 // total time is ~ Sum(i=1,log(threadCount))(N/i) + N/(log(threadCount+1)) log (N/(log(threadCount+1)
117 // e.g. for 8 threads and N = 2^20, it is N + N/2 + N/4 + N/8 + N/16 log(2^20/2^4) == 2*N - 2*N/16 + (N/16)*16 ~ 2.875*N
118 // for 16 threads and N = 2^20, it is N + N/2 + N/4 + N/8 + N/16 + N/32 log(2^20/2^5) == 2*N - 2*N/32 + (N/32)*15 ~ 2.40*N
119 // Approaches time == 2*N in the limit of Number of threads == N
120 //
121 // Depending on N, MaxTaskCount might be less than MaxAsymptoticTaskCount because we want to avoid the overhead of
122 // launching threads to sort a small number of elements, so we set a MinBatchSize.
123 // TODO: Find some objective way to set MinBatchSize.
124 constexpr int32 MinBatchSize = 16;
125 constexpr int32 MaxAsymptoticTaskCount = 16;
126
127 // MinBatchSize = DivAndRoundUp(Num, MaxTaskCount) == (Num + MaxTaskCount - 1) / MaxTaskCount;
128 // ==>
129 // MaxTaskCount == DivAndRoundDown(Num - 1, MinBatchSize - 1)
130 int32 MaxTaskCount = (Num - 1) / (MinBatchSize - 1);
131 if (MaxTaskCount < 2)
132 {
134 }
136 int32 TaskCount = 1 << FPlatformMath::FloorLog2(MaxTaskCount);
137 IndexType BatchSize = (Num + TaskCount - 1) / TaskCount;
138
139 // Round 0 is the initial recursive sort of segments, Round 1 is the first merge round, ...
140 // Round TaskCount is the merge into the final full Range.
141 const int32 NumRounds = FPlatformMath::FloorLog2(TaskCount) + 1;
142
143 // We define a recursive merge function for round N that launches a task for the Upper in round NumRounds - 1,
144 // then works itself on the Lower for round N - 1, then merges. We end up with TaskCount threads (including the
145 // main thread) doing the single-threaded recursive call to mergesort in round 0, then half that many threads
146 // doing the merge in round 1, ..., and only the main thread doing the merge in round NumRounds - 1.
147
148 // Each thread working on a round needs scratch space for the merge, we give it a mergebuffer. It uses the same
149 // mergebuffer for each of the recursive calls that it works on itself; the tasks it kicks off create their
150 // own mergebuffers and do the same.
151 // Merging two segments, each of size <= BatchSize into a merged 2*BatchSize length segment requires scratch
152 // space equal to BatchSize, (plus 1 more because we push before we pop).
153 auto GetMergeBufferForRound = [BatchSize, Num](int32 Round, int32 ConstructorSite)
154 {
155 // BatchSize << Round is the size of the segment produced for the round, we only need half that (plus one).
156 const IndexType BufferSize = ((BatchSize << Round) + 1) / 2 + 1;
158 MergeBuffer.Buffer.Reserve(BufferSize);
159 MergeBuffer.AllocatedSize = BufferSize;
160 MergeBuffer.ConstructorSite = ConstructorSite;
161 MergeBuffer.InputNum = Num;
162 MergeBuffer.Round = Round;
163 MergeBuffer.BatchSize = BatchSize;
164 return MergeBuffer;
165 };
166
167 // Does just the merge job with no recursion.
168 auto MergeRoundAndIndex = [TaskCount, Num, BatchSize, &Predicate, First](int32 Round, int32 IndexInRound, TMergeBuffer<T>& MergeBuffer)
169 {
170 const int32 NumMergesForRound = TaskCount >> Round;
171 const IndexType CurrentLevelBatchSize = BatchSize << Round;
172 const IndexType PreviousRoundBatchSize = BatchSize << (Round - 1);
177 };
178
179 // The base case of recursion; call single-threaded mergesort, which does its own recursion but without any
180 // threading.
181 auto RecursiveMergeRound0 = [TaskCount, Num, BatchSize, &Predicate, First](int32 IndexInRound, TMergeBuffer<T>& MergeBuffer)
182 {
183 const IndexType StartIndex = IndexInRound * BatchSize;
184 const IndexType MergeNum = IndexInRound < TaskCount - 1 ? BatchSize : Num - StartIndex;
185 MergeSortRecursive(First + StartIndex, MergeNum, Predicate, MergeBuffer);
186 };
187
188 // Our recursive function that forks itself to handle the upper recursive segment, works itself recursively on the
189 // lower recursive segment, and then merges the two segments. The arguments are the round number, and the index
190 // within the round of the segment it is creating.
195 {
196 if (Round > 0)
197 {
198 FTask Upper = Launch(TEXT("ParallelMergeSort"),
200 {
203 });
204
206 Upper.Wait();
208 }
209 else
210 {
212 }
213 };
215
216 // Kick off RecursiveMergeRoundAndIndexPtr for the last round, at its only segment index: index 0.
219}
220
221template <typename T, typename IndexType, typename PredicateType>
222void MergeSortPointerAndNum(T* First, IndexType Num, PredicateType Predicate)
223{
225 IndexType BufferSize = (Num + 1) / 2 + 1;
226 MergeBuffer.Buffer.Reserve(BufferSize);
227 MergeBuffer.AllocatedSize = BufferSize;
228 MergeBuffer.ConstructorSite = 300;
229 MergeBuffer.InputNum = Num;
230 MergeBuffer.Round = -1;
231 MergeBuffer.BatchSize = -1;
233}
234
235template <typename T, typename IndexType, typename PredicateType>
237{
238 if (Num <= 3)
239 {
240 SmallSort(First, Num, Predicate);
241 return;
242 }
243
244 const IndexType FloorMidIndex = (Num + 1) / 2;
247
248 Merge(First, Num, FloorMidIndex, Predicate, MergeBuffer);
249}
250
251template <typename T, typename IndexType, typename PredicateType>
252void Merge(T* First, IndexType Num, IndexType FloorMidIndex, PredicateType Predicate, TMergeBuffer<T>& InMergeBuffer)
253{
254 IndexType WriteHead = 0;
255 IndexType UpperReadHead = FloorMidIndex;
256
257 // Skip past however many leading elements of Lower come before the first element of Upper
258 while (WriteHead < FloorMidIndex)
259 {
260 if (!Predicate(First[UpperReadHead], First[WriteHead]))
261 {
262 ++WriteHead;
263 }
264 else
265 {
266 break;
267 }
268 }
269
270 if (WriteHead == FloorMidIndex)
271 {
272 // All elements of Lower came before Upper, the range is already sorted
273 return;
274 }
275
278 // Our caller is supposed to reserve the MergeBuffer for us. The maximum size of MergeBuffer that we use is
279 // the length of the Upper half, plus 1. Assert that we have that size so that we don't hit the performance
280 // cost of growing the buffer here.
281 checkf(MergeBuffer.Max() >= Num - FloorMidIndex + 1,
282 TEXT("ParallelSort bug: MergeBuffer not preallocated large enough. %d < %d. ")
283 TEXT("AllocatedSize=%d, ConstructorSite=%d, InputNum=%d, Round=%d, BatchSize=%d."),
284 MergeBuffer.Max(), Num - FloorMidIndex + 1,
285 InMergeBuffer.AllocatedSize, InMergeBuffer.ConstructorSite, InMergeBuffer.InputNum, InMergeBuffer.Round, InMergeBuffer.BatchSize);
286
287 // Move the UpperReadHead into WriteHead, we already compared it above and know it is lower
288 MergeBuffer.Add(MoveTemp(First[WriteHead]));
289 First[WriteHead] = MoveTemp(First[UpperReadHead]);
291 ++WriteHead;
292
293 while (UpperReadHead < Num && WriteHead < FloorMidIndex)
294 {
295 check(WriteHead < UpperReadHead);
296 MergeBuffer.Add(MoveTemp(First[WriteHead]));
297 if (!Predicate(First[UpperReadHead], MergeBuffer[0]))
298 {
299 First[WriteHead] = MergeBuffer.PopFrontValue();
300 }
301 else
302 {
303 First[WriteHead] = MoveTemp(First[UpperReadHead]);
305 }
306 ++WriteHead;
307 }
308 while (UpperReadHead < Num && !MergeBuffer.IsEmpty())
309 {
310 check(WriteHead < UpperReadHead);
311 if (!Predicate(First[UpperReadHead], MergeBuffer[0]))
312 {
313 First[WriteHead] = MergeBuffer.PopFrontValue();
314 }
315 else
316 {
317 First[WriteHead] = MoveTemp(First[UpperReadHead]);
319 }
320 ++WriteHead;
321 }
322 while (WriteHead < FloorMidIndex)
323 {
324 MergeBuffer.Add(MoveTemp(First[WriteHead]));
325 First[WriteHead] = MergeBuffer.PopFrontValue();
326 ++WriteHead;
327 }
328 while (!MergeBuffer.IsEmpty())
329 {
330 check(WriteHead < UpperReadHead);
331 First[WriteHead] = MergeBuffer.PopFrontValue();
332 ++WriteHead;
333 }
334
335 // We have finished all lower, and so the remaining elements to write (if any) all come from the
336 // remainder of UpperReadHead. And we don't need to move them because they're already in their destination
337 // position. So the list of remaining writes should be identical to the list of remaining uppers. If we're
338 // done with Uppers as well, then WriteHead == UpperReadHead == Num.
339 check(WriteHead == UpperReadHead);
340}
341
342template <typename T, typename IndexType, typename PredicateType>
343void SmallSort(T* First, IndexType Num, PredicateType Predicate)
344{
345 switch (Num)
346 {
347 case 0:
348 return;
349 case 1:
350 return;
351 case 2:
352 if (Predicate(First[1], First[0]))
353 {
354 Swap(First[1], First[0]);
355 }
356 return;
357 case 3:
358 if (!Predicate(First[1], First[0]))
359 {
360 // 0 < 1
361 if (Predicate(First[2], First[1]))
362 {
363 // 0 < 1 and 2 < 1
364 if (Predicate(First[2], First[0]))
365 {
366 // 2 < 0 < 1
367 T Temp = MoveTemp(First[0]);
368 First[0] = MoveTemp(First[2]);
369 First[2] = MoveTemp(First[1]);
370 First[1] = MoveTemp(Temp);
371 }
372 else
373 {
374 // 0 < 2 < 1
375 Swap(First[2], First[1]);
376 }
377 }
378 // else Already sorted 0 < 1 < 2
379 }
380 else
381 {
382 // 1 < 0
383 if (Predicate(First[2], First[0]))
384 {
385 // 1 < 0 and 2 < 0
386 if (Predicate(First[2], First[1]))
387 {
388 // 2 < 1 < 0
389 Swap(First[2], First[0]);
390 }
391 else
392 {
393 // 1 < 2 < 0
394 T Temp = MoveTemp(First[0]);
395 First[0] = MoveTemp(First[1]);
396 First[1] = MoveTemp(First[2]);
397 First[2] = MoveTemp(Temp);
398 }
399 }
400 else
401 {
402 // 1 < 0 < 2
403 Swap(First[1], First[0]);
404 }
405 }
406 return;
407 default:
408 checkf(false, TEXT("ParallelSort bug: SmallSort was passed Num > 3"));
409 return;
410 }
411}
412
413} // namespace Algo
OODEFFUNC typedef void(OODLE_CALLBACK t_fp_OodleCore_Plugin_Free)(void *ptr)
#define check(expr)
Definition AssertionMacros.h:314
#define checkf(expr, format,...)
Definition AssertionMacros.h:315
#define TEXT(x)
Definition Platform.h:1272
FPlatformTypes::int32 int32
A 32-bit signed integer.
Definition Platform.h:1125
UE_FORCEINLINE_HINT TSharedRef< CastToType, Mode > StaticCastSharedRef(TSharedRef< CastFromType, Mode > const &InSharedRef)
Definition SharedPointer.h:127
@ Num
Definition MetalRHIPrivate.h:234
auto GetNum(const TStringConversion< Converter, DefaultConversionSize > &Conversion) -> decltype(Conversion.Length())
Definition StringConv.h:808
auto GetData(const TStringConversion< Converter, DefaultConversionSize > &Conversion) -> decltype(Conversion.Get())
Definition StringConv.h:802
UE_INTRINSIC_CAST UE_REWRITE constexpr std::remove_reference_t< T > && MoveTemp(T &&Obj) noexcept
Definition UnrealTemplate.h:520
Definition Launch.Build.cs:10
Definition AndroidPlatformMisc.h:14
Definition RingBuffer.h:135
void Reset()
Definition RingBuffer.h:266
Definition ParallelSort.h:41
void ParallelMergeSortPointerAndNum(T *First, IndexType Num, PredicateType Predicate)
Definition ParallelSort.h:104
void SmallSort(T *First, IndexType Num, PredicateType Predicate)
Definition ParallelSort.h:343
void MergeSortPointerAndNum(T *First, IndexType Num, PredicateType Predicate)
Definition ParallelSort.h:222
void MergeSortRecursive(T *First, IndexType Num, PredicateType Predicate, TMergeBuffer< T > &MergeBuffer)
Definition ParallelSort.h:236
Definition ParallelSort.h:13
void ParallelSortForceSingleThreaded(RangeType &&Range, PredicateType Predicate)
Definition ParallelSort.h:87
void ParallelSort(RangeType &&Range, PredicateType Predicate)
Definition ParallelSort.h:75
Definition AnalyticsProviderLog.h:8
Definition ParallelSort.h:44
int32 InputNum
Definition ParallelSort.h:48
int32 ConstructorSite
Definition ParallelSort.h:47
int32 Round
Definition ParallelSort.h:50
int32 AllocatedSize
Definition ParallelSort.h:46
TRingBuffer< T > Buffer
Definition ParallelSort.h:45
int32 BatchSize
Definition ParallelSort.h:49
Definition Less.h:19