UDocumentation UE5.7 10.02.2026 (Source)
API documentation for Unreal Engine 5.7
radaudio_mdct_internal.inl
Go to the documentation of this file.
1// Copyright Epic Games Tools, LLC. All Rights Reserved.
2#include "rrbits.h"
3
4namespace radaudio_fft_impl {
5
6template<typename T>
7static RADFORCEINLINE void swap(T& a, T& b)
8{
9 T t { a };
10 a = b;
11 b = t;
12}
13
14// Pure radix2 butterfly
15// a' = a + b
16// b' = a - b
17template<typename T>
18static RADFORCEINLINE void radix2_bfly(T& ar, T& ai, T& br, T& bi)
19{
20 T in_ar = ar;
21 T in_ai = ai;
22
23 ar = in_ar + br;
24 ai = in_ai + bi;
25 br = in_ar - br;
26 bi = in_ai - bi;
27}
28
29// Twiddle bt = b * w
30// a' = a + bt
31// b' = a - bt
32template<typename T>
33static RADFORCEINLINE void radix2_twiddle_unfused(T& ar, T& ai, T& br, T& bi, T wr, T wi)
34{
35 T btr = br*wr - bi*wi;
36 T bti = bi*wr + br*wi;
37 T in_ar = ar;
38 T in_ai = ai;
39
40 ar = in_ar + btr;
41 ai = in_ai + bti;
42 br = in_ar - btr;
43 bi = in_ai - bti;
44}
45
46// Permuted DFT4 butterfly for use in radix-2 decompositions
47template<typename T>
48static RADFORCEINLINE void dft4_bfly_permuted(T& ar, T& ai, T& br, T& bi, T& cr, T& ci, T& dr, T& di)
49{
50 // First pass of r2 butterflies
51 radix2_bfly(ar, ai, br, bi);
52 radix2_bfly(cr, ci, dr, di);
53
54 // Second pass / output stage
55 radix2_bfly(ar, ai, cr, ci);
56
57 // need to work with twiddled dt = -i * d = -i * (dr + i*di) = di - i*dr
58 // so write final bfly "by hand"
59 T in_br = br;
60 T in_bi = bi;
61 T in_dr = dr;
62 T in_di = di;
63 br = in_br + in_di;
64 bi = in_bi - in_dr;
65 dr = in_br - in_di;
66 di = in_bi + in_dr;
67}
68
69// Default bit reverse alg uses our LUT
71{
72 size_t shift_amt;
73
75 : shift_amt(kMaxFFTLog2 - fft_nbits)
76 {
77 }
78
79 size_t operator()(size_t i) const
80 {
81 return s_bit_reverse[i] >> shift_amt;
82 }
83};
84
85template<typename T, typename TBitReverse=DefaultBitReverse>
86static size_t bitrev_initial_radix4_core_vec4(float *out, float const *in, size_t N, FftSign sign)
87{
88 static_assert(kBurstSize >= 4, "This code requires a burst size of at least 4.");
89
90 size_t Nbits = rrCtz64(N);
92
93 size_t step = N / 4;
94 float * outA = out;
95 float * outB = out + burst_swizzle(2 * step); // note: 2 not 1 because it's bit-reversed
96 float * outC = out + burst_swizzle(1 * step); // note: 1 not 2 because it's bit-reversed
97 float * outD = out + burst_swizzle(3 * step);
98
99 float const * inA = in;
100 float const * inB = in + burst_swizzle(2 * step); // note: 2 not 1 because it's bit-reversed
101 float const * inC = in + burst_swizzle(1 * step); // note: 1 not 2 because it's bit-reversed
102 float const * inD = in + burst_swizzle(3 * step);
103
104 // This was originally written for the negative sign variant, but all we need to do
105 // to toggle the sign is to swap inC and inD pointers
106 if (sign == FftSign_Positive)
107 swap(inC, inD);
108
109 // Apply the initial permutation along with the initial radix-4 butterflies
110 // (which are special because the twiddles are with +-1 and +-i only, i.e. trivial)
111 for (size_t i = 0; i < step; i += 4)
112 {
113 size_t is = burst_swizzle(i); // dest index
114 size_t j = bit_reverse(i);
115 size_t js = burst_swizzle(j); // source index
116
117 T ar = T::load(&inA[js + 0*kBurstSize]);
118 T ai = T::load(&inA[js + 1*kBurstSize]);
119 T br = T::load(&inB[js + 0*kBurstSize]);
120 T bi = T::load(&inB[js + 1*kBurstSize]);
121 T cr = T::load(&inC[js + 0*kBurstSize]);
122 T ci = T::load(&inC[js + 1*kBurstSize]);
123 T dr = T::load(&inD[js + 0*kBurstSize]);
124 T di = T::load(&inD[js + 1*kBurstSize]);
125
126 // Radix-4 pass
127 dft4_bfly_permuted(ar, ai, br, bi, cr, ci, dr, di);
128
129 // Transposes
130 T::transpose4x4(ar, br, cr, dr);
131 T::transpose4x4(ai, bi, ci, di);
132
133 // Output
134 ar.store(&outA[is + 0*kBurstSize]);
135 ai.store(&outA[is + 1*kBurstSize]);
136 br.store(&outB[is + 0*kBurstSize]);
137 bi.store(&outB[is + 1*kBurstSize]);
138 cr.store(&outC[is + 0*kBurstSize]);
139 ci.store(&outC[is + 1*kBurstSize]);
140 dr.store(&outD[is + 0*kBurstSize]);
141 di.store(&outD[is + 1*kBurstSize]);
142 }
143
144 return Nbits;
145}
146
147template<typename T>
148static void initial_radix2_core_vec4(float * out, size_t N, FftSign sign)
149{
150 size_t const swiz_N = burst_swizzle(N);
151 T wr = T::load(&s_fft_twiddles[8 + 2]);
152 T wi = T::load(&s_fft_twiddles[((sign == FftSign_Positive) ? 8 : 8 + 4)]);
153
154 float * outA = out;
155 float * outB = out + burst_swizzle(4);
156 size_t swiz_dec = burst_swizzle(~size_t(7));
157
158 for (size_t j = 0; j < swiz_N; j = (j - swiz_dec) & swiz_dec)
159 {
160 T ar = T::load(&outA[j + 0*kBurstSize]);
161 T ai = T::load(&outA[j + 1*kBurstSize]);
162 T br = T::load(&outB[j + 0*kBurstSize]);
163 T bi = T::load(&outB[j + 1*kBurstSize]);
164
165 T::radix2_twiddle(ar, ai, br, bi, wr, wi);
166
167 ar.store(&outA[j + 0*kBurstSize]);
168 ai.store(&outA[j + 1*kBurstSize]);
169 br.store(&outB[j + 0*kBurstSize]);
170 bi.store(&outB[j + 1*kBurstSize]);
171 }
172}
173
174// This is initial_radix4 + initial_radix2 in one fused pass, for archs
175// with plenty of regs.
176template<typename T, typename TBitReverse=DefaultBitReverse>
177static size_t bitrev_initial_radix8_core_vec4(float *out, float const *in, size_t N, FftSign sign)
178{
179 static_assert(kBurstSize >= 8, "This code requires a burst size of at least 8.");
180
181 size_t Nbits = rrCtz64(N);
183
184 // Twiddles for final radix2
185 T wr = T::load(&s_fft_twiddles[8 + 2]);
186 T wi = T::load(&s_fft_twiddles[((sign == FftSign_Positive) ? 8 : 8 + 4)]);
187
188 size_t step = N / 4;
189 float * outA = out;
190 float * outB = out + burst_swizzle(2 * step); // note: 2 not 1 because it's bit-reversed
191 float * outC = out + burst_swizzle(1 * step); // note: 1 not 2 because it's bit-reversed
192 float * outD = out + burst_swizzle(3 * step);
193
194 size_t group1_offs = burst_swizzle(step / 2); // offset into group 1 (=where the second vec of 4 ends up)
195
196 float const * inA = in;
197 float const * inB = in + burst_swizzle(2 * step); // note: 2 not 1 because it's bit-reversed
198 float const * inC = in + burst_swizzle(1 * step); // note: 1 not 2 because it's bit-reversed
199 float const * inD = in + burst_swizzle(3 * step);
200
201 // This was originally written for the negative sign variant, but all we need to do
202 // to toggle the sign is to swap inC and inD pointers
203 if (sign == FftSign_Positive)
204 swap(inC, inD);
205
206 // Apply the initial permutation along with the initial radix-4 butterflies
207 // (which are special because the twiddles are with +-1 and +-i only, i.e. trivial)
208 for (size_t i = 0; i < step; i += 8)
209 {
210 // First group of 4
211 size_t j0s = burst_swizzle(bit_reverse(i)); // source index 0
212 T a0r = T::load(&inA[j0s + 0*kBurstSize]);
213 T a0i = T::load(&inA[j0s + 1*kBurstSize]);
214 T b0r = T::load(&inB[j0s + 0*kBurstSize]);
215 T b0i = T::load(&inB[j0s + 1*kBurstSize]);
216 T c0r = T::load(&inC[j0s + 0*kBurstSize]);
217 T c0i = T::load(&inC[j0s + 1*kBurstSize]);
218 T d0r = T::load(&inD[j0s + 0*kBurstSize]);
219 T d0i = T::load(&inD[j0s + 1*kBurstSize]);
220
221 // Initial radix-4 pass
222 dft4_bfly_permuted(a0r, a0i, b0r, b0i, c0r, c0i, d0r, d0i);
223
224 // Transposes
225 T::transpose4x4(a0r, b0r, c0r, d0r);
226 T::transpose4x4(a0i, b0i, c0i, d0i);
227
228 // Second group of 4
229 size_t j1s = j0s + group1_offs; // == burst_swizzle(bit_reverse(i + 4))
230 T a1r = T::load(&inA[j1s + 0*kBurstSize]);
231 T a1i = T::load(&inA[j1s + 1*kBurstSize]);
232 T b1r = T::load(&inB[j1s + 0*kBurstSize]);
233 T b1i = T::load(&inB[j1s + 1*kBurstSize]);
234 T c1r = T::load(&inC[j1s + 0*kBurstSize]);
235 T c1i = T::load(&inC[j1s + 1*kBurstSize]);
236 T d1r = T::load(&inD[j1s + 0*kBurstSize]);
237 T d1i = T::load(&inD[j1s + 1*kBurstSize]);
238
239 // Initial radix-4 pass
240 dft4_bfly_permuted(a1r, a1i, b1r, b1i, c1r, c1i, d1r, d1i);
241
242 // Transposes
243 T::transpose4x4(a1r, b1r, c1r, d1r);
244 T::transpose4x4(a1i, b1i, c1i, d1i);
245
246 // Final radix-2 pass between groups and output
247 size_t is = burst_swizzle(i); // dest index
248
249 T::radix2_twiddle(a0r, a0i, a1r, a1i, wr, wi);
250 a0r.store(&outA[is + 0*kBurstSize]);
251 a1r.store(&outA[is + 0*kBurstSize + 4]);
252 a0i.store(&outA[is + 1*kBurstSize]);
253 a1i.store(&outA[is + 1*kBurstSize + 4]);
254
255 T::radix2_twiddle(b0r, b0i, b1r, b1i, wr, wi);
256 b0r.store(&outB[is + 0*kBurstSize]);
257 b1r.store(&outB[is + 0*kBurstSize + 4]);
258 b0i.store(&outB[is + 1*kBurstSize]);
259 b1i.store(&outB[is + 1*kBurstSize + 4]);
260
261 T::radix2_twiddle(c0r, c0i, c1r, c1i, wr, wi);
262 c0r.store(&outC[is + 0*kBurstSize]);
263 c1r.store(&outC[is + 0*kBurstSize + 4]);
264 c0i.store(&outC[is + 1*kBurstSize]);
265 c1i.store(&outC[is + 1*kBurstSize + 4]);
266
267 T::radix2_twiddle(d0r, d0i, d1r, d1i, wr, wi);
268 d0r.store(&outD[is + 0*kBurstSize]);
269 d1r.store(&outD[is + 0*kBurstSize + 4]);
270 d0i.store(&outD[is + 1*kBurstSize]);
271 d1i.store(&outD[is + 1*kBurstSize + 4]);
272 }
273
274 return Nbits;
275}
276
277template<typename T>
278static void burst_r4_fft_single_pass(float * out, size_t step, size_t swiz_N, FftSign sign)
279{
280 float const *twiddle1_i = s_fft_twiddles + step*2;
281 float const *twiddle1_r = twiddle1_i + step/2;
282
283 float const *twiddle2_i = s_fft_twiddles + step*4;
284 float const *twiddle2_r = twiddle2_i + step;
285
286 // NOTE: this doesn't work unless step >= T::kCount
287 // i.e. our initial "regular" level is determined by vector width
288 const size_t twiddle_mask = step - 1;
289
290 size_t swiz_dec = burst_swizzle(~((3 * step) | (T::kCount - 1)));
291 float * outA = out;
292 float * outB = out + burst_swizzle(1 * step);
293 float * outC = out + burst_swizzle(2 * step);
294 float * outD = out + burst_swizzle(3 * step);
295
296 // Defaults to B/D swapped; see below
297 float * outWrB = outD;
298 float * outWrD = outB;
299
300 // Advance in sine table by half the phase if negative sign requested
301 // Also swap B/D outputs which effectively swaps our twiddle from +i to -i, see below.
302 if (sign == FftSign_Negative)
303 {
304 twiddle1_i += step;
305 twiddle2_i += step*2;
306 swap(outWrB, outWrD);
307 }
308
309 // This is actually radix 2^2, not radix 4.
310 for (size_t j = 0, k = 0; j < swiz_N; )
311 {
312 T ar = T::load(&outA[j + 0*kBurstSize]);
313 T ai = T::load(&outA[j + 1*kBurstSize]);
314 T br = T::load(&outB[j + 0*kBurstSize]);
315 T bi = T::load(&outB[j + 1*kBurstSize]);
316 T cr = T::load(&outC[j + 0*kBurstSize]);
317 T ci = T::load(&outC[j + 1*kBurstSize]);
318 T dr = T::load(&outD[j + 0*kBurstSize]);
319 T di = T::load(&outD[j + 1*kBurstSize]);
320
321 // First stage twiddle b * w0, d * w0
322 T w0r = T::load(&twiddle1_r[k]);
323 T w0i = T::load(&twiddle1_i[k]);
324
325 // First stage butterflies
326 T::radix2_twiddle(ar, ai, br, bi, w0r, w0i);
327 T::radix2_twiddle(cr, ci, dr, di, w0r, w0i);
328
329 // Second stage twiddle
330 T w1r = T::load(&twiddle2_r[k]);
331 T w1i = T::load(&twiddle2_i[k]);
332
333 // Second stage butterfly and output
334 T::radix2_twiddle(ar, ai, cr, ci, w1r, w1i);
335 ar.store(&outA[j + 0*kBurstSize]);
336 ai.store(&outA[j + 1*kBurstSize]);
337 cr.store(&outC[j + 0*kBurstSize]);
338 ci.store(&outC[j + 1*kBurstSize]);
339
340 // w2 is exactly a quarter-circle away
341 // i.e. multiply by i. w2 = i*(w1r + i*w1i) = i*w1r - w1i = (-w1i) + i*w1r
342 //
343 // Note "swap identity" above. We want
344 // b', d' = b +- w2 * d
345 // = b +- i (w1 * d)
346 // = b -+ -i (w1 * d)
347 // = b -+ s(s(w1) s(d))
348 // <=>
349 // d', b' = b +- s(s(w1) s(d))
350 // = s(s(b) +- s(w1) s(d))
351 //
352 // (note d', b' swapped places).
353 // Therefore, we compute a regular twiddled r2 butterfly with real/imag parts
354 // of b, w1 and d swapped (on both the inputs and outputs). That takes care of
355 // the s() applications. Finally for the positive FFT sign we also swap the
356 // output B and D pointer (this happens outside). For the negative FFT sign
357 // we need to multiply by -i not i to begin with, so we get another swap of
358 // output B and D which cancels out the first one.
359 T::radix2_twiddle(bi, br, di, dr, w1i, w1r);
360 br.store(&outWrB[j + 0*kBurstSize]);
361 bi.store(&outWrB[j + 1*kBurstSize]);
362 dr.store(&outWrD[j + 0*kBurstSize]);
363 di.store(&outWrD[j + 1*kBurstSize]);
364
365 j = (j - swiz_dec) & swiz_dec;
366 k = (k + T::kCount) & twiddle_mask;
367 }
368}
369
370template<typename T>
371static void burst_imdct_prefft(float * dest, float const * coeffs, float const * tw_re, float const * tw_im, size_t N)
372{
373 size_t M1 = N >> 2;
374 size_t M2 = N >> 1;
375
376 // Reference version:
377 //
378 // for (size_t i = 0; i < M2; ++i)
379 // {
380 // size_t is = burst_swizzle(i);
381 //
382 // float wre = tw_re[i];
383 // float wim = tw_im[i];
384 // float re = coeffs[i*2];
385 // float im = coeffs[N-1-i*2];
386 //
387 // dest[is + 0*kBurstSize] = wre*re - wim*im;
388 // dest[is + 1*kBurstSize] = wre*im + wim*re;
389 // }
390 //
391 // Note "re" only reads even elements and "im" only reads odd elements (from the end of the array).
392 // This suggests interleaving a forward and backward pass simultaneously so we use all the values
393 // in the vectors we load:
394 //
395 // for (size_t i = 0; i < M1; ++i)
396 // {
397 // size_t j = M2 - 1 - i;
398 // size_t is = burst_swizzle(i);
399 // size_t js = burst_swizzle(j);
400 //
401 // // Twiddles
402 // float wr0 = tw_re[i];
403 // float wi0 = tw_im[i];
404 // float wr1 = tw_re[j];
405 // float wi1 = tw_im[j];
406 //
407 // // Pairs of coefficients
408 // float re0 = coeffs[i*2];
409 // float im1 = coeffs[i*2 + 1];
410 //
411 // float re1 = coeffs[j*2];
412 // float im0 = coeffs[j*2 + 1];
413 //
414 // // Do the twiddle
415 // dest[is + 0*kBurstSize] = wr0*re0 - wi0*im0;
416 // dest[is + 1*kBurstSize] = wr0*im0 + wi0*re0;
417 // dest[js + 0*kBurstSize] = wr1*re1 - wi1*im1;
418 // dest[js + 1*kBurstSize] = wr1*im1 + wi1*re1;
419 // }
420 //
421 // This now vectorizes very cleanly with the primitives we have:
422
423 for (size_t i = 0; i < M1; i += T::kCount)
424 {
425 size_t j = M2 - T::kCount - i;
426 size_t is = burst_swizzle(i);
427 size_t js = burst_swizzle(j);
428
429 // Twiddles
430 T wr0 = T::load(&tw_re[i]);
431 T wi0 = T::load(&tw_im[i]);
432 T wr1 = T::load(&tw_re[j]); // note: should be reversed, but we defer it
433 T wi1 = T::load(&tw_im[j]); // note: should be reversed, but we defer it
434
435 // Pairs of coefficients
436 T re0, im0, re1, im1;
437 T::load_deinterleave(re0, im1, &coeffs[i*2]);
438 T::load_deinterleave(re1, im0, &coeffs[j*2]); // note: re1/im0 should be reversed, but we defer it
439
440 // We have several reversals to take care of here.
441 // What we actually want is:
442 //
443 // oir = wr0*re0 - wi0*rev(im0);
444 // oii = wr0*rev(im0) + wi0*re0;
445 // ojr = rev(wr1)*rev(re1) - rev(wi1)*im1;
446 // oji = rev(wr1)*im1 + rev(wi1)*rev(re1);
447 //
448 // and then ojr and oji get stored reversed as well. If we factor that final reversal in, we get:
449 //
450 // ojr' = rev(rev(wr1)*rev(re1) - rev(wi1)*im1)
451 // = rev(rev(wr1 * re1) - rev(wi1)*im1)
452 // = wr1 * re1 - wi1*rev(im1)
453 // oji' = rev(rev(wr1)*im1 + rev(wi1)*rev(re1))
454 // = rev(rev(wr1*rev(im1)) + rev(wi1*re1))
455 // = wr1*rev(im1) + wi1*re1
456 //
457 // in other words, almost all the reversals cancel each other and we end up with mostly in-order
458 // operation with just reversals on the two imaginary inputs:
459 im0 = im0.reverse(); // NOTE optimize (should be able to fold into the load shuffles)
460 im1 = im1.reverse();
461
462 // Do the twiddle
463 T oir = wr0*re0 - wi0*im0;
464 T oii = wr0*im0 + wi0*re0;
465 T ojr = wr1*re1 - wi1*im1;
466 T oji = wr1*im1 + wi1*re1;
467
468 oir.store(&dest[is + 0*kBurstSize]);
469 oii.store(&dest[is + 1*kBurstSize]);
470 ojr.store(&dest[js + 0*kBurstSize]);
471 oji.store(&dest[js + 1*kBurstSize]);
472 }
473}
474
475template<typename T>
476static void burst_imdct_postfft(float * signal0, float * signal1, float const * dft, float const * tw_re, float const * tw_im, size_t N)
477{
478 size_t M1 = N >> 2;
479 size_t M2 = N >> 1;
480
481 for (size_t i = 0; i < M1; i += T::kCount)
482 {
483 size_t j = M2 - T::kCount - i; // reversed index
484 size_t is = burst_swizzle(i);
485 size_t js = burst_swizzle(j);
486
487 // Twiddles
488 T w0re = T::load(&tw_re[i]);
489 T w0im = T::load(&tw_im[i]);
490 T w1re = T::load(&tw_re[j]); // note: should be reversed, but we defer it
491 T w1im = T::load(&tw_im[j]); // note: should be reversed, but we defer it
492
493 // FFT outputs
494 T re0 = T::load(&dft[is + 0*kBurstSize]);
495 T im0 = T::load(&dft[is + 1*kBurstSize]);
496 T re1 = T::load(&dft[js + 0*kBurstSize]); // note: should be reversed, but we defer it
497 T im1 = T::load(&dft[js + 1*kBurstSize]); // note: should be reversed, but we defer it
498
499 // We have several reversals to take care of here.
500 // w1re, w1im, re1, im1 should all get reversed on use,
501 // and the outputs out1e and out1o also get stored in reversed
502 // form.
503 //
504 // We start out with
505 //
506 // out0e = w0re*im0 + w0im*re0
507 // out0o = rev(w1im)*rev(im1) - rev(w1re)*rev(re1)
508 // out1e = rev(w1re)*rev(im1) + rev(w1im)*rev(re1)
509 // out1o = w0im*im0 - w0re*re0
510 //
511 // and we finally want out1e' = rev(out1e), out1o' = rev(out1o).
512 // Simplifying that, we get:
513 //
514 // out0e = w0re*im0 + w0im*re0
515 // out0o = rev(w1im*im1 - w1re*re1)
516 // out1e' = w1re*im1 + w1im*re1
517 // out1o' = rev(w0im*im0 - w0re*re0)
518
519 // Simplified twiddles (see above)
520 T out0e = w0re*im0 + w0im*re0;
521 T out0o = (w1im*im1 - w1re*re1).reverse();
522 T out1e = w1re*im1 + w1im*re1;
523 T out1o = (w0im*im0 - w0re*re0).reverse();
524
525 // Write outputs
526 T::store_interleaved(&signal0[i*2], out0e, out0o);
527 T::store_interleaved(&signal1[M2 - T::kCount*2 - i*2], out1e, out1o);
528 }
529}
530
531} // namespace radaudio_fft_impl
532
#define RADFORCEINLINE
Definition rrCore.h:159
UE_FORCEINLINE_HINT TSharedRef< CastToType, Mode > StaticCastSharedRef(TSharedRef< CastFromType, Mode > const &InSharedRef)
Definition SharedPointer.h:127
char * dest
Definition lz4.h:709
Definition radaudio_mdct.cpp:49
FFTIndex s_bit_reverse[kMaxFFTN]
FftSign
Definition radaudio_mdct_internal.h:10
@ FftSign_Positive
Definition radaudio_mdct_internal.h:12
@ FftSign_Negative
Definition radaudio_mdct_internal.h:11
Definition radaudio_mdct_internal.inl:71
size_t shift_amt
Definition radaudio_mdct_internal.inl:72
DefaultBitReverse(size_t fft_nbits)
Definition radaudio_mdct_internal.inl:74
size_t operator()(size_t i) const
Definition radaudio_mdct_internal.inl:79