UDocumentation UE5.7 10.02.2026 (Source)
API documentation for Unreal Engine 5.7
UnrealMathSSE.h
Go to the documentation of this file.
1// Copyright Epic Games, Inc. All Rights Reserved.
2
3#pragma once
4
5#include "HAL/Platform.h"
6#include "HAL/PlatformMath.h"
7#include "Math/MathFwd.h"
9
10// IWYU pragma: begin_exports
11
12#if !PLATFORM_ENABLE_VECTORINTRINSICS_NEON && !(defined(__cplusplus_cli)) && PLATFORM_ENABLE_VECTORINTRINSICS
13
15
16// We require SSE2
17#include <emmintrin.h>
18
// Feature toggles for the SSE math implementation.
// Each macro is only defined here when it has not already been defined, so a
// definition made earlier in the translation unit takes precedence over these defaults.

// SSE4.1 code paths: default follows PLATFORM_ALWAYS_HAS_SSE4_1.
19#ifndef UE_PLATFORM_MATH_USE_SSE4_1
20#define UE_PLATFORM_MATH_USE_SSE4_1 PLATFORM_ALWAYS_HAS_SSE4_1
21#endif
22
// AVX code paths: default follows PLATFORM_ALWAYS_HAS_AVX.
23#ifndef UE_PLATFORM_MATH_USE_AVX
24#define UE_PLATFORM_MATH_USE_AVX PLATFORM_ALWAYS_HAS_AVX
25#endif
26
// AVX2 code paths: by default enabled only when AVX is enabled as well.
27#ifndef UE_PLATFORM_MATH_USE_AVX_2
28#define UE_PLATFORM_MATH_USE_AVX_2 (PLATFORM_ALWAYS_HAS_AVX_2 && UE_PLATFORM_MATH_USE_AVX)
29#endif
30
// FMA3 (fused multiply-add) code paths: default follows PLATFORM_ALWAYS_HAS_FMA3.
31#ifndef UE_PLATFORM_MATH_USE_FMA3
32#define UE_PLATFORM_MATH_USE_FMA3 PLATFORM_ALWAYS_HAS_FMA3
33#endif
34
// SVML: by default enabled only for MSVC (not clang) with _MSC_VER >= 1920, otherwise 0.
35#ifndef UE_PLATFORM_MATH_USE_SVML
36 #if defined(_MSC_VER) && !defined(__clang__)
37 #define UE_PLATFORM_MATH_USE_SVML (_MSC_VER >= 1920) // Support added to MSVC 2019 16.0+
38 #else
39 #define UE_PLATFORM_MATH_USE_SVML 0
40 #endif // defined(_MSC_VER)
41#endif
42
// SVML with AVX: by default requires both SVML and AVX to be enabled.
43#ifndef UE_PLATFORM_MATH_USE_SVML_AVX
44#define UE_PLATFORM_MATH_USE_SVML_AVX (UE_PLATFORM_MATH_USE_SVML && UE_PLATFORM_MATH_USE_AVX)
45#endif
46
47// If SSE4.1 is enabled, need additional defines.
48#if UE_PLATFORM_MATH_USE_SSE4_1
49#include <smmintrin.h>
50#endif
51
52// If AVX is enabled, need additional defines.
53#if UE_PLATFORM_MATH_USE_AVX || UE_PLATFORM_MATH_USE_SVML
54#include <immintrin.h>
55#endif
56
// Alignment constants for float and double vector data.
// The double alignment is raised from 16 to 32 when AVX is enabled, because the
// 4-double register is then backed by __m256d.
57#define UE_SSE_FLOAT_ALIGNMENT 16
58
59#if UE_PLATFORM_MATH_USE_AVX
60#define UE_SSE_DOUBLE_ALIGNMENT 32 // required for __m256d
61#else
62#define UE_SSE_DOUBLE_ALIGNMENT 16
63#endif
64
65// We suppress static analysis warnings for the cast from (double*) to (float*) in VectorLoadFloat2
66// and VectorLoadTwoPairsFloat below:
67// -V:VectorLoadFloat2:615
68// -V:VectorLoadTwoPairsFloat:615
69
70/*=============================================================================
71 * Helpers:
72 *============================================================================*/
73
78// 4 floats
80
81// 4 int32s
83
84// 2 int64s
86
87// 2 doubles
89
90typedef struct
91{
92 //TODO: alias for AVX2!
95
96
97namespace SSE
98{
99 //wrapper for sse_mathfun.h and sse_mathfun_extension.h
109};
110
111// 4 doubles
113{
114#if !UE_PLATFORM_MATH_USE_AVX
117
// Non-AVX build: returns the XY member (first two doubles) by value.
118 FORCEINLINE VectorRegister2Double GetXY() const { return XY; }
// Non-AVX build: returns the ZW member (last two doubles) by value.
119 FORCEINLINE VectorRegister2Double GetZW() const { return ZW; }
120#else
121 union
122 {
123 struct
124 {
127 };
128 __m256d XYZW;
129 };
130
131 // Use in preference when reading XY or ZW to extract values, it's better on MSVC than the generated memory reads.
// AVX build: extracts 128-bit half 0 of XYZW (the X and Y elements).
132 FORCEINLINE VectorRegister2Double GetXY() const { return _mm256_extractf128_pd(XYZW, 0); } // { return _mm256_castpd256_pd128(XYZW); } // Possible MSVC compiler bug in optimized builds when using this cast, but can be more efficient.
134#endif
135
137
139 {
140#if UE_PLATFORM_MATH_USE_AVX
141 XYZW = _mm256_setr_m128d(InXY, InZW);
142#else
143 XY = InXY;
144 ZW = InZW;
145#endif
146 }
147
149 : XY(InXY)
150 , ZW(InZW)
151 {}
152
153 // Construct from a vector of 4 floats
155 {
156#if !UE_PLATFORM_MATH_USE_AVX
157 XY = _mm_cvtps_pd(FloatVector);
158 ZW = _mm_cvtps_pd(_mm_movehl_ps(FloatVector, FloatVector));
159#else
160 XYZW = _mm256_cvtps_pd(FloatVector);
161#endif
162 }
163
164 // Assign from a vector of 4 floats
166 {
167#if !UE_PLATFORM_MATH_USE_AVX
168 XY = _mm_cvtps_pd(FloatVector);
169 ZW = _mm_cvtps_pd(_mm_movehl_ps(FloatVector, FloatVector));
170#else
171 XYZW = _mm256_cvtps_pd(FloatVector);
172#endif
173 return *this;
174 }
175
176#if UE_PLATFORM_MATH_USE_AVX
177 // Convenience for things like 'Result = _mm256_add_pd(...)'
179 {
180 XYZW = Register;
181 }
182
183 // Convenience for things like 'Result = _mm256_add_pd(...)'
185 {
186 XYZW = Register;
187 return *this;
188 }
189
190 // Implicit conversion to the raw __m256d: lets a VectorRegister4Double be passed to _mm256_* functions without needing '.XYZW'
191 FORCEINLINE operator __m256d() const
192 {
// Returns a copy of the wrapped 256-bit register.
193 return XYZW;
194 }
195#endif
196
197};
198
199
200// Aliases
// The generic "VectorRegister" spellings expand to the double-precision functions.
205#define VectorZeroVectorRegister() VectorZeroDouble()
206#define VectorOneVectorRegister() VectorOneDouble()
207
208// Backwards compatibility
212
213
214// Forward declarations
219
220
221// Helper for conveniently aligning a float array for extraction from VectorRegister4Float
223{
224 float V[4];
225
227 {
229 }
230
// Read-only element access; returns V[Index] by value. Index is not range-checked.
231 FORCEINLINE float operator[](int32 Index) const { return V[Index]; }
// Mutable element access; returns a reference to V[Index]. Index is not range-checked.
232 FORCEINLINE float& operator[](int32 Index) { return V[Index]; }
233
// Converts the stored float array back into a VectorRegister4Float.
234 FORCEINLINE VectorRegister4Float ToVectorRegister() const
235 {
// Uses the aligned load on the member array V.
236 return VectorLoadAligned(V);
237 }
238};
239
240
241// Helper for conveniently aligning a double array for extraction from VectorRegister4Double
// The struct takes its alignment from VectorRegister4Double (alignas(alignof(...))),
// and ToVectorRegister() reads V back with VectorLoadAligned.
242struct alignas(alignof(VectorRegister4Double)) AlignedDouble4
243{
// Storage for the four double components.
244 double V[4];
245
// NOTE(review): the signature and body of this member are not visible in this listing
// (presumably a constructor taking a VectorRegister4Double) - confirm against the original header.
247 {
249 }
250
// Element access by index; Index is not range-checked.
251 FORCEINLINE double operator[](int32 Index) const { return V[Index]; }
252 FORCEINLINE double& operator[](int32 Index) { return V[Index]; }
253
// Converts the stored array back into a VectorRegister4Double via VectorLoadAligned.
254 FORCEINLINE VectorRegister4Double ToVectorRegister() const
255 {
256 return VectorLoadAligned(V);
257 }
258};
259
261
// Expands to a MakeVectorRegister call with the same four arguments.
262#define DECLARE_VECTOR_REGISTER(X, Y, Z, W) MakeVectorRegister(X, Y, Z, W)
263
// Packs four element selectors into one immediate:
// A0 -> bits 0-1, A1 -> bits 2-3, B2 -> bits 4-5, B3 -> bits 6-7.
// The arguments are not masked, so each one must already be in the range 0..3.
// In this file it is passed to _mm_shuffle_ps and _mm256_permute4x64_pd.
270#define SHUFFLEMASK(A0,A1,B2,B3) ( (A0) | ((A1)<<2) | ((B2)<<4) | ((B3)<<6) )
271
// Two-element variant: A0 -> bit 0, A1 -> bit 1. Each argument must be 0 or 1.
// In this file it is passed to _mm_shuffle_pd.
272#define SHUFFLEMASK2(A0,A1) ((A0) | ((A1)<<1))
273
274
276{
277 return _mm_setr_pd(X, Y);
278}
279
280// Bitwise equivalent from two 64-bit values.
282{
284 // Note: this instruction only exists on 64-bit
285 Result.Vi = _mm_set_epi64x(Y, X); // intentionally (Y,X), there is no 'setr' version.
286 return Result.Vd;
287}
288
299{
300 union { VectorRegister4Float v; VectorRegister4Int i; } Tmp;
301 Tmp.i = _mm_setr_epi32( X, Y, Z, W );
302 return Tmp.v;
303}
304
306{
308}
309
311{
312 return MakeVectorRegisterFloat(X, Y, Z, W);
313}
314
315// Nicer aliases
317{
318 return MakeVectorRegisterFloat(X, Y, Z, W);
319}
320
322{
323 return MakeVectorRegisterDouble(X, Y, Z, W);
324}
325
// Builds a 4-float register from four scalars.
// _mm_setr_ps takes its arguments lowest element first, so X is element 0 and W is element 3.
335FORCEINLINE VectorRegister4Float MakeVectorRegisterFloat(float X, float Y, float Z, float W)
336{
337 return _mm_setr_ps( X, Y, Z, W );
338}
339
// Builds a 4-double register from four scalars.
// Without AVX the value is assembled as two 128-bit halves (XY and ZW);
// with AVX it is a single 256-bit register.
// NOTE(review): the declaration of Result is not visible in this listing - confirm against the original header.
340FORCEINLINE VectorRegister4Double MakeVectorRegisterDouble(double X, double Y, double Z, double W)
341{
343#if !UE_PLATFORM_MATH_USE_AVX
// The setr forms take the lowest element first: XY = (X, Y), ZW = (Z, W).
344 Result.XY = _mm_setr_pd(X, Y);
345 Result.ZW = _mm_setr_pd(Z, W);
346#else
347 Result = _mm256_setr_pd(X, Y, Z, W);
348#endif
349 return Result;
350}
351
// Float overload of MakeVectorRegister: forwards to MakeVectorRegisterFloat.
352FORCEINLINE VectorRegister4Float MakeVectorRegister(float X, float Y, float Z, float W)
353{
354 return MakeVectorRegisterFloat(X, Y, Z, W);
355}
356
// Double overload of MakeVectorRegister: forwards to MakeVectorRegisterDouble.
357FORCEINLINE VectorRegister4Double MakeVectorRegister(double X, double Y, double Z, double W)
358{
359 return MakeVectorRegisterDouble(X, Y, Z, W);
360}
361
363{
364 return VectorRegister4Double(XY, ZW);
365}
366
367// Make double register from float register
369{
370 return VectorRegister4Double(From);
371}
372
373// Lossy conversion: double->float vector
375{
376#if !UE_PLATFORM_MATH_USE_AVX
378#else
379 return _mm256_cvtpd_ps(Vec4d);
380#endif
381}
382
393{
394 return _mm_setr_epi32(X, Y, Z, W);
395}
396
398{
399 return _mm_set_epi64x(Y, X);
400}
401
403#if defined(PRAGMA_DISABLE_MISSING_BRACES_WARNINGS)
405#endif
406
414{
415#if !PLATFORM_LITTLE_ENDIAN
416#error Big-endian unimplemented
417#elif defined(_MSC_VER) && !defined(__clang__)
418 return {static_cast<char>(X >> 0), static_cast<char>(X >> 8), static_cast<char>(X >> 16), static_cast<char>(X >> 24),
419 static_cast<char>(Y >> 0), static_cast<char>(Y >> 8), static_cast<char>(Y >> 16), static_cast<char>(Y >> 24),
420 static_cast<char>(Z >> 0), static_cast<char>(Z >> 8), static_cast<char>(Z >> 16), static_cast<char>(Z >> 24),
421 static_cast<char>(W >> 0), static_cast<char>(W >> 8), static_cast<char>(W >> 16), static_cast<char>(W >> 24)};
422#else
423 uint64 XY = uint64(uint32(X)) | (uint64(uint32(Y)) << 32);
424 uint64 ZW = uint64(uint32(Z)) | (uint64(uint32(W)) << 32);
425 return VectorRegister4Int { (long long)XY, (long long)ZW };
426#endif
427}
428
// constexpr counterpart of MakeVectorRegisterFloat.
// Uses brace initialization of the register type instead of an intrinsic call,
// listing the elements in X, Y, Z, W order.
429FORCEINLINE constexpr VectorRegister4Float MakeVectorRegisterFloatConstant(float X, float Y, float Z, float W)
430{
431 return VectorRegister4Float { X, Y, Z, W };
432}
433
434#if defined(PRAGMA_ENABLE_MISSING_BRACES_WARNINGS)
436#endif
438
440{
441 return VectorRegister2Double { X, Y };
442}
443
444/*=============================================================================
445 * Constants:
446 *============================================================================*/
447
449
450/*=============================================================================
451 * Intrinsics:
452 *============================================================================*/
453
460{
461 return _mm_setzero_ps();
462}
463
465{
467#if !UE_PLATFORM_MATH_USE_AVX
468 Result.XY = _mm_setzero_pd();
469 Result.ZW = _mm_setzero_pd();
470#else
472#endif
473 return Result;
474}
475
482{
484}
485
487{
489}
490
499template <uint32 ComponentIndex>
501{
502 return (((float*)&(Vec))[ComponentIndex]);
503}
504
505// Specializations
507{
508 return _mm_cvtss_f32(Vec);
509}
510
511template <uint32 ComponentIndex>
513{
514#if !UE_PLATFORM_MATH_USE_AVX
515 return (((double*)&(Vec.XY))[ComponentIndex]);
516#else
517 return (((double*)&(Vec.XYZW))[ComponentIndex]);
518#endif
519}
520
521// Specializations
522#if UE_PLATFORM_MATH_USE_AVX
523// Lower latency than `vmovsd`, required since MSVC doesn't optimize the above impl well compared to clang/gcc. The latter basically generates something like this below (checked in godbolt).
528#endif
529
// Forwards to VectorGetComponentImpl with ComponentIndex as the template argument,
// so ComponentIndex must be a compile-time constant.
530#define VectorGetComponent(Vec, ComponentIndex) VectorGetComponentImpl<ComponentIndex>(Vec)
531
533{
534 return (((float*)&(Vec))[ComponentIndex]);
535}
536
538{
539#if !UE_PLATFORM_MATH_USE_AVX
540 return (((double*)&(Vec.XY))[ComponentIndex]);
541#else
542 return (((double*)&(Vec.XYZW))[ComponentIndex]);
543#endif
544}
545
554{
555 return _mm_loadu_ps((float*)(Ptr));
556}
557
559{
561#if !UE_PLATFORM_MATH_USE_AVX
562 Result.XY = _mm_loadu_pd((double*)(Ptr));
563 Result.ZW = _mm_loadu_pd((double*)(Ptr + 2));
564#else
565 Result = _mm256_loadu_pd((double*)Ptr);
566#endif
567 return Result;
568}
569
577{
579 Result.val[0] = VectorLoad(Ptr);
580 Result.val[1] = VectorLoad(Ptr + 4);
581 Result.val[2] = VectorLoad(Ptr + 8);
582 Result.val[3] = VectorLoad(Ptr + 12);
583 return Result;
584}
585
593{
594#if !UE_PLATFORM_MATH_USE_AVX_2
596 Result.XY = _mm_loadu_pd((double*)(Ptr));
597 Result.ZW = _mm_load_sd((double*)(Ptr+2));
598 return Result;
599#else
601#endif
602}
603
611{
612#if !UE_PLATFORM_MATH_USE_AVX_2
614 Result.XY = _mm_loadu_pd((double*)(Ptr));
615 Result.ZW = MakeVectorRegister2Double(Ptr[2], 1.0);
616 return Result;
617#else
618 //return MakeVectorRegisterDouble(Ptr[0], Ptr[1], Ptr[2], 1.0);
621 Result = _mm256_blend_pd(Result, VectorOneDouble(), 0b1000);
622 return Result;
623#endif
624}
625
633{
634 return _mm_load_ps((const float*)(Ptr));
635}
636
638{
640#if !UE_PLATFORM_MATH_USE_AVX
641 Result.XY = _mm_load_pd((const double*)(Ptr));
642 Result.ZW = _mm_load_pd((const double*)(Ptr + 2));
643#else
644 // AVX using unaligned here, since we don't ensure 32-byte alignment (not significant on most modern processors)
645 Result = _mm256_loadu_pd(Ptr);
646#endif
647 return Result;
648}
649
657{
658#if !UE_PLATFORM_MATH_USE_AVX
659 return _mm_load1_ps(Ptr);
660#else
661 return _mm_broadcast_ss(Ptr);
662#endif
663}
664
666{
668#if !UE_PLATFORM_MATH_USE_AVX
669 Result.XY = _mm_load1_pd(Ptr);
670 Result.ZW = Result.XY;
671#else
673#endif
674 return Result;
675}
676
678{
679 return _mm_loadu_si64((__m128i *)Ptr);
680}
681
689{
690 // Switched from _mm_load1_pd and a cast to avoid a compiler bug in VC. This has the benefit of
691 // being very clear about not needing any alignment, and the optimizer will still result in
692 // movsd and movlhps in both clang and vc.
693 return _mm_setr_ps(Ptr[0], Ptr[1], Ptr[0], Ptr[1]);
694}
695
697{
699#if !UE_PLATFORM_MATH_USE_AVX
700 Result.XY = _mm_loadu_pd(Ptr);
701 Result.ZW = Result.XY;
702#else
703 const __m128d Temp = _mm_loadu_pd(Ptr);
704 Result = _mm256_set_m128d(Temp, Temp);
705#endif
706 return Result;
707}
708
718{
719 // This intentionally casts to a double* to be able to load 64 bits of data using the "load 1 double" instruction to fill in the two 32-bit floats.
720 __m128 Ret = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(Ptr1))); // -V615
721 Ret = _mm_loadh_pi(Ret, (__m64 const*)(Ptr2));
722 return Ret;
723}
724
726{
728#if !UE_PLATFORM_MATH_USE_AVX
731#else
732 Result = _mm256_loadu2_m128d(Ptr2, Ptr1); // Note: arguments are (hi, lo)
733#endif
734 return Result;
735}
736
744{
745 return _mm_set1_ps(F);
746}
747
749{
751#if !UE_PLATFORM_MATH_USE_AVX
752 Result.XY = _mm_set1_pd(D);
753 Result.ZW = Result.XY;
754#else
756#endif
757 return Result;
758}
759
767{
768 _mm_storeu_ps(Ptr, Vec);
769}
770
772{
773#if !UE_PLATFORM_MATH_USE_AVX
774 _mm_storeu_pd(Dst, Vec.XY);
775 _mm_storeu_pd(Dst + 2, Vec.ZW);
776#else
777 _mm256_storeu_pd(Dst, Vec);
778#endif
779}
780
788{
789 VectorStore(Vec.val[0], Ptr);
790 VectorStore(Vec.val[1], Ptr + 4);
791 VectorStore(Vec.val[2], Ptr + 8);
792 VectorStore(Vec.val[3], Ptr + 12);
793}
794
802{
803 _mm_store_ps(Dst, Vec);
804}
805
807{
808#if !UE_PLATFORM_MATH_USE_AVX
809 _mm_store_pd(Dst, Vec.XY);
810 _mm_store_pd(Dst + 2, Vec.ZW);
811#else
812 // AVX using unaligned here, since we don't ensure 32-byte alignment (not significant on most modern processors)
813 _mm256_storeu_pd(Dst, Vec);
814#endif
815}
816
817
825{
826 _mm_stream_ps(Dst, Vec);
827}
828
830{
831#if !UE_PLATFORM_MATH_USE_AVX
832 _mm_stream_pd(Dst, Vec.XY);
833 _mm_stream_pd(Dst + 2, Vec.ZW);
834#else
835 // AVX using two 128-bit stores since we don't require 32-byte alignment for our data (so don't use _mm256_stream_pd)
836 _mm_stream_pd(Dst, Vec.XY);
837 _mm_stream_pd(Dst + 2, Vec.ZW);
838#endif
839}
840
848{
850 _mm_storel_pi((__m64*)(Ptr), Tmp);
851 _mm_store_ss(&Ptr[2], _mm_movehl_ps(Tmp, Tmp));
852}
853
855{
856#if !UE_PLATFORM_MATH_USE_AVX
857 _mm_storeu_pd(Dst, Vec.XY);
858 _mm_store_sd(Dst + 2, Vec.ZW);
859#else
860 _mm_storeu_pd(Dst, Vec.XY);
861 _mm_store_sd(Dst + 2, Vec.ZW);
862#endif
863}
864
872{
873 _mm_store_ss(Ptr, Vec);
874}
875
877{
878 _mm_store_sd(Dst, Vec.XY);
879}
880
881namespace SSEPermuteHelpers
882{
// Helpers that classify a pair of element indices (0..3) by the half they fall in:
// indices 0-1 count as lane 0, indices 2-3 as lane 1.
// These macros are local to this namespace and are #undef'd again before it closes.
// True when both indices are in lane 0.
883#define InLane0(Index0, Index1) ((Index0) <= 1 && (Index1) <= 1)
// True when both indices are in lane 1.
884#define InLane1(Index0, Index1) ((Index0) >= 2 && (Index1) >= 2)
// True when both indices are in the same lane (either one).
885#define InSameLane(Index0, Index1) (InLane0(Index0, Index1) || InLane1(Index0, Index1))
// True when the two indices are in different lanes.
886#define OutOfLane(Index0, Index1) (!InSameLane(Index0, Index1))
887
889 // Double swizzle
891
892 // Double Swizzle helpers
893 // Templated swizzles required for double shuffles when using __m128d, since we have to break it down into two separate operations.
894
895 template <int Index0, int Index1>
897 {
898 if constexpr (Index0 <= 1)
899 {
900 if constexpr (Index1 <= 1)
901 {
902 // [0,1]:[0,1]
903 return _mm_shuffle_pd(Vec.GetXY(), Vec.GetXY(), SHUFFLEMASK2(Index0, Index1));
904 }
905 else
906 {
907 // [0,1]:[2,3]
908 return _mm_shuffle_pd(Vec.GetXY(), Vec.GetZW(), SHUFFLEMASK2(Index0, Index1 - 2));
909 }
910 }
911 else
912 {
913 if constexpr (Index1 <= 1)
914 {
915 // [2,3]:[0,1]
916 return _mm_shuffle_pd(Vec.GetZW(), Vec.GetXY(), SHUFFLEMASK2(Index0 - 2, Index1));
917 }
918 else
919 {
920 // [2,3]:[2,3]
921 return _mm_shuffle_pd(Vec.GetZW(), Vec.GetZW(), SHUFFLEMASK2(Index0 - 2, Index1 - 2));
922 }
923 }
924 }
925
928
929#if UE_PLATFORM_MATH_USE_SSE4_1
930 // blend can run on more ports than shuffle, so are preferable even if latency is claimed to be the same.
933#endif // UE_PLATFORM_MATH_USE_SSE4_1
934
935
936#if UE_PLATFORM_MATH_USE_AVX
937
938 // Helper to swap lanes (128-bit pairs)
// Builds an 8-bit lane-selection immediate from two lane indices:
// low nibble = 0x0 when A == 0, otherwise 0x1;
// high nibble = 0x2 when B == 0, otherwise 0x3.
// NOTE(review): the call site is not visible in this listing; presumably this is the
// immediate for a 128-bit lane permute intrinsic - confirm against the original header.
939 constexpr int PERMUTE_LANE_MASK(int A, int B) { return (A == 0 ? 0x00 : 0x01) | (B == 0 ? (0x02 << 4) : (0x03 << 4)); }
940
941 template<int Lane0, int Lane1>
943 {
944 static_assert(Lane0 >= 0 && Lane0 <= 1 && Lane1 >= 0 && Lane1 <= 1, "Invalid Index");
946 }
947
948 // Identity
950#if !UE_PLATFORM_MATH_USE_AVX_2
951 // On AVX1, permute2f128 can be quite slow, so look for alternatives (extract + insert). On AVX2, permute2f128 is more efficient and should equal or beat (extract + insert).
952 // Sources: https://www.agner.org/optimize/instruction_tables.pdf, https://uops.info/table.html
// AVX1-only specializations built from 128-bit extract/insert operations.
// <Lane0, Lane1> names which source half (0 = XY, 1 = ZW) ends up in the low and high half of the result:
// <0,0> -> (XY, XY), <1,0> -> (ZW, XY), <1,1> -> (ZW, ZW).
953 template<> FORCEINLINE VectorRegister4Double PermuteLanes<0, 0>(VectorRegister4Double Vec) { return _mm256_insertf128_pd(Vec, Vec.GetXY(), 1); } // copy XY to lane 1
954 template<> FORCEINLINE VectorRegister4Double PermuteLanes<1, 0>(VectorRegister4Double Vec) { return _mm256_setr_m128d(Vec.GetZW(), Vec.GetXY()); } // swap XY and ZW
955 template<> FORCEINLINE VectorRegister4Double PermuteLanes<1, 1>(VectorRegister4Double Vec) { return _mm256_insertf128_pd(Vec, Vec.GetZW(), 0); } // copy ZW to lane 0
956#endif // !AVX2
957
958 //
959 // AVX2 _mm256_permute4x64_pd has a latency of 3-6, but there are some specializations using instructions which have a latency of 1 but are restricted to in-lane (128 bit) permutes.
960 // AVX1 benefits from lower latency instructions here than the toggling between 128-bit and 256-bit operations of the generic implementation.
961
// Builds the 4-bit immediate passed to _mm256_permute_pd for an in-lane swizzle.
// Bit i is set when output element i selects the upper element of its own lane:
// A == 1 -> bit 0, B == 1 -> bit 1, C == 3 -> bit 2, D == 3 -> bit 3.
// Any other index value leaves the bit clear, so the result is only meaningful
// when A and B are in 0..1 and C and D are in 2..3.
962 constexpr int PERMUTE_MASK(int A, int B, int C, int D) { return ((A == 1 ? (1 << 0) : 0) | (B == 1 ? (1 << 1) : 0) | (C == 3 ? (1 << 2) : 0) | (D == 3 ? (1 << 3) : 0)); }
963
964 template <int Index0, int Index1, int Index2, int Index3>
966 {
967 if constexpr (InLane0(Index0, Index1) && InLane1(Index2, Index3))
968 {
969 // [0..1][0..1][2..3][2..3]
970 return _mm256_permute_pd(Vec, PERMUTE_MASK(Index0, Index1, Index2, Index3));
971 }
972 else if constexpr (InLane1(Index0, Index1) && InLane0(Index2, Index3))
973 {
974 // [2..3][2..3][0..1][0..1]
975 // Permute lanes then use [lane0][lane1] swizzle
976 return SelectVectorSwizzle<Index0 - 2, Index1 - 2, Index2 + 2, Index3 + 2>(PermuteLanes<1, 0>(Vec));
977 }
978 else if constexpr (InLane0(Index0, Index1) && InLane0(Index2, Index3))
979 {
980 // [0..1][0..1][0..1][0..1]
981 // Permute lanes then use [lane0][lane1] swizzle
983 }
984 else if constexpr (InLane1(Index0, Index1) && InLane1(Index2, Index3))
985 {
986 // [2..3][2..3][2..3][2..3]
987 // Permute lanes then use [lane0][lane1] swizzle
988 return SelectVectorSwizzle<Index0 - 2, Index1 - 2, Index2, Index3>(PermuteLanes<1, 1>(Vec));
989 }
990 else
991 {
992 // Anything with out-of-lane pairs
993#if UE_PLATFORM_MATH_USE_AVX_2
994 return _mm256_permute4x64_pd(Vec, SHUFFLEMASK(Index0, Index1, Index2, Index3));
995#else
999 );
1000#endif
1001 }
1002 }
1003
1004 //
1005 // Specializations
1006 //
// Swizzle <0, 0, 2, 2>: duplicates the lower element of each 128-bit half, giving (X, X, Z, Z).
1007 template<> FORCEINLINE VectorRegister4Double SelectVectorSwizzle<0, 0, 2, 2>(VectorRegister4Double Vec) { return _mm256_movedup_pd(Vec); } // special instruction exists for this.
1012
1013#endif // AVX
1014
1015 // Double swizzle wrapper
1016 template<int Index0, int Index1, int Index2, int Index3>
1018 {
1019 static_assert(Index0 >= 0 && Index0 <= 3 && Index1 >= 0 && Index1 <= 3 && Index2 >= 0 && Index2 <= 3 && Index3 >= 0 && Index3 <= 3, "Invalid Index");
1020
1021#if UE_PLATFORM_MATH_USE_AVX
1023#else
1024 return VectorRegister4Double(
1027 );
1028#endif
1029 }
1030
1031 // Specializations
1033
1035 // Double replicate
1037
1038 template <int Index>
1040 {
1041 // Note: 2 doubles (VectorRegister2Double / m128d)
1043 }
1044
1045 // Double replicate (4 doubles)
1046 template <int Index>
1048 {
1049 if constexpr (Index <= 1)
1050 {
1052 return VectorRegister4Double(Temp, Temp);
1053 }
1054 else
1055 {
1057 return VectorRegister4Double(Temp, Temp);
1058 }
1059 }
1060
1061 //
1062 // Double replicate wrapper
1063 //
1064 template<int Index>
1066 {
1067 static_assert(Index >= 0 && Index <= 3, "Invalid Index");
1068
1069#if UE_PLATFORM_MATH_USE_AVX_2
1071#else
1073#endif
1074 }
1075
1077 // Double shuffle
1079
1080#if UE_PLATFORM_MATH_USE_AVX
1081
1082 //
1083 // Lane shuffle helper
1084 //
1085 template<int Lane0, int Lane1>
1087 {
1088 static_assert(Lane0 >= 0 && Lane0 <= 1 && Lane1 >= 0 && Lane1 <= 1, "Invalid Index");
1090 }
1091
1092 // Lane shuffle helper specialization
1094#if !UE_PLATFORM_MATH_USE_AVX_2
1095 // On AVX1, permute2f128 can be quite slow, so look for alternatives (extract + insert). On AVX2, permute2f128 is more efficient and should equal or beat (extract + insert).
1096 // Sources: https://www.agner.org/optimize/instruction_tables.pdf, https://uops.info/table.html
1100#endif // !AVX2
1101
1102 //
1103 // Double shuffle helpers
1104 //
1105
1106 // When index pairs are within the same lane, SelectVectorShuffle first efficiently blends elements from the two vectors,
1107 // then efficiently swizzles within 128-bit lanes using specializations for indices [0..1][0..1][2..3][2..3]
1108 //
1109 template <int Index0, int Index1, int Index2, int Index3>
1111 {
1112 if constexpr (InLane0(Index0, Index1) && InLane1(Index2, Index3))
1113 {
1114 // [0..1][0..1][2..3][2..3]
1117 }
1118 else if constexpr (InLane1(Index0, Index1) && InLane0(Index2, Index3))
1119 {
1120 // [2..3][2..3][0..1][0..1]
1122 return VectorSwizzleTemplate<Index0 - 2, Index1 - 2, Index2 + 2, Index3 + 2>(Blended);
1123 }
1124 else if constexpr (InLane0(Index0, Index1) && InLane0(Index2, Index3))
1125 {
1126 // [0..1][0..1][0..1][0..1]
1129 }
1130 else if constexpr (InLane1(Index0, Index1) && InLane1(Index2, Index3))
1131 {
1132 // [2..3][2..3][2..3][2..3]
1134 return VectorSwizzleTemplate<Index0 - 2, Index1 - 2, Index2, Index3>(Blended);
1135 }
1136 else if constexpr (InSameLane(Index0, Index1) && OutOfLane(Index2, Index3))
1137 {
1140 return _mm256_insertf128_pd(Vec1_XY, Vec2_ZW, 0x1);
1141 }
1142 else if constexpr (OutOfLane(Index0, Index1) && InSameLane(Index2, Index3))
1143 {
1146 return _mm256_insertf128_pd(Vec2_ZW, Vec1_XY, 0x0);
1147 }
1148 else
1149 {
1150 return VectorRegister4Double(
1153 );
1154 }
1155 }
1156
1157 // AVX Double Shuffle specializations
1158 // Shuffles of 128-bit pairs, ie combinations of [0,1][2,3].
1163
1164#else
1165
1166 // Non-AVX implementation
1167 template<int Index0, int Index1, int Index2, int Index3>
1169 {
1170 return VectorRegister4Double(
1173 );
1174 }
1175
1176#endif // AVX
1177
1178 //
1179 // Double shuffle wrapper
1180 //
1181 template<int Index0, int Index1, int Index2, int Index3>
1183 {
1184 static_assert(Index0 >= 0 && Index0 <= 3 && Index1 >= 0 && Index1 <= 3 && Index2 >= 0 && Index2 <= 3 && Index3 >= 0 && Index3 <= 3, "Invalid Index");
1186 }
1187
1189 // Float swizzle
1191
1192 template<int Index0, int Index1, int Index2, int Index3>
1194 {
1196 return Result;
1197 }
1198
1199 // Float Swizzle specializations.
1200 // These can result in no-ops or simpler ops than shuffle which don't compete with the shuffle unit, or which can copy directly to the destination and avoid an intermediate mov.
1201 // See: https://stackoverflow.com/questions/56238197/what-is-the-difference-between-mm-movehdup-ps-and-mm-shuffle-ps-in-this-case
1207
1208#if UE_PLATFORM_MATH_USE_SSE4_1
1211#endif
1212
1213#if UE_PLATFORM_MATH_USE_AVX_2
1215#endif
1216
1218 // Float replicate
1219 template<int Index>
1221 {
1222 static_assert(Index >= 0 && Index <= 3, "Invalid Index");
1224 }
1225
1227 // Float shuffle
1228 template<int Index0, int Index1, int Index2, int Index3>
1230 {
1231 static_assert(Index0 >= 0 && Index0 <= 3 && Index1 >= 0 && Index1 <= 3 && Index2 >= 0 && Index2 <= 3 && Index3 >= 0 && Index3 <= 3, "Invalid Index");
1232 return _mm_shuffle_ps(Vec1, Vec2, SHUFFLEMASK(Index0, Index1, Index2, Index3));
1233 }
1234
1235 // Float Shuffle specializations
// Shuffle <2, 3, 2, 3>: result is (Vec1[2], Vec1[3], Vec2[2], Vec2[3]).
// The arguments to _mm_movehl_ps are deliberately swapped because its low half comes from its second operand.
1237 template<> FORCEINLINE VectorRegister4Float VectorShuffleTemplate<2, 3, 2, 3>(VectorRegister4Float Vec1, VectorRegister4Float Vec2) { return _mm_movehl_ps(Vec2, Vec1); } // Note: movehl copies first from the 2nd argument
1238
1239#undef OutOfLane
1240#undef InSameLane
1241#undef InLane1
1242#undef InLane0
1243} // namespace SSEPermuteHelpers
1244
// Forwards to SSEPermuteHelpers::VectorReplicateTemplate with ElementIndex as the
// template argument, so ElementIndex must be a compile-time constant.
1253#define VectorReplicate(Vec, ElementIndex) SSEPermuteHelpers::VectorReplicateTemplate<ElementIndex>(Vec)
1254
// Forwards to SSEPermuteHelpers::VectorSwizzleTemplate with X, Y, Z, W as the
// template arguments, so all four indices must be compile-time constants.
1265#define VectorSwizzle(Vec, X, Y, Z, W) SSEPermuteHelpers::VectorSwizzleTemplate<X,Y,Z,W>(Vec)
1266
// Forwards to SSEPermuteHelpers::VectorShuffleTemplate with X, Y, Z, W as the
// template arguments, so all four indices must be compile-time constants.
// In the float implementation visible in this file, X and Y select from Vec1 and Z and W select from Vec2.
1278#define VectorShuffle(Vec1, Vec2, X, Y, Z, W) SSEPermuteHelpers::VectorShuffleTemplate<X,Y,Z,W>(Vec1, Vec2)
1279
1280
1288{
1290}
1291
1293{
1295#if !UE_PLATFORM_MATH_USE_AVX
1299#else
1301#endif
1302 return Result;
1303}
1304
1312{
1313 return _mm_sub_ps(_mm_setzero_ps(), Vec);
1314}
1315
1317{
1319#if !UE_PLATFORM_MATH_USE_AVX
1320 Result.XY = _mm_sub_pd(_mm_setzero_pd(), Vec.XY);
1321 Result.ZW = _mm_sub_pd(_mm_setzero_pd(), Vec.ZW);
1322#else
1324#endif
1325 return Result;
1326}
1327
1337{
1338 return _mm_add_ps(Vec1, Vec2);
1339}
1340
1342{
1344#if !UE_PLATFORM_MATH_USE_AVX
1345 Result.XY = _mm_add_pd(Vec1.XY, Vec2.XY);
1346 Result.ZW = _mm_add_pd(Vec1.ZW, Vec2.ZW);
1347#else
1349#endif
1350 return Result;
1351}
1352
1361{
1362 return _mm_sub_ps(Vec1, Vec2);
1363}
1364
1366{
1368#if !UE_PLATFORM_MATH_USE_AVX
1369 Result.XY = _mm_sub_pd(Vec1.XY, Vec2.XY);
1370 Result.ZW = _mm_sub_pd(Vec1.ZW, Vec2.ZW);
1371#else
1373#endif
1374 return Result;
1375}
1376
1385{
1386 return _mm_mul_ps(Vec1, Vec2);
1387}
1388
1390{
1392#if !UE_PLATFORM_MATH_USE_AVX
1393 Result.XY = _mm_mul_pd(Vec1.XY, Vec2.XY);
1394 Result.ZW = _mm_mul_pd(Vec1.ZW, Vec2.ZW);
1395#else
1397#endif
1398 return Result;
1399}
1400
1401
1402
1412{
1413#if UE_PLATFORM_MATH_USE_FMA3
1414 return _mm_fmadd_ps(A, B, C);
1415#else
1416 return VectorAdd(VectorMultiply(A, B), C);
1417#endif
1418}
1419
1421{
1422#if UE_PLATFORM_MATH_USE_FMA3 && UE_PLATFORM_MATH_USE_AVX
1423 return _mm256_fmadd_pd(A, B, C);
1424#elif UE_PLATFORM_MATH_USE_FMA3
1426 Result.XY = _mm_fmadd_pd(A.XY, B.XY, C.XY);
1427 Result.ZW = _mm_fmadd_pd(A.ZW, B.ZW, C.ZW);
1428 return Result;
1429#else
1430 return VectorAdd(VectorMultiply(A, B), C);
1431#endif
1432}
1433
1443{
1444#if UE_PLATFORM_MATH_USE_FMA3
1445 return _mm_fnmadd_ps(A, B, C);
1446#else
1447 return VectorSubtract(C, VectorMultiply(A, B));
1448#endif
1449}
1450
1452{
1453#if UE_PLATFORM_MATH_USE_FMA3 && UE_PLATFORM_MATH_USE_AVX
1454 return _mm256_fnmadd_pd(A, B, C);
1455#elif UE_PLATFORM_MATH_USE_FMA3
1457 Result.XY = _mm_fnmadd_pd(A.XY, B.XY, C.XY);
1458 Result.ZW = _mm_fnmadd_pd(A.ZW, B.ZW, C.ZW);
1459 return Result;
1460#else
1461 return VectorSubtract(C, VectorMultiply(A, B));
1462#endif
1463}
1464
1465
1474{
1475 return _mm_div_ps(Vec1, Vec2);
1476}
1477
1479{
1481#if !UE_PLATFORM_MATH_USE_AVX
1482 Result.XY = _mm_div_pd(Vec1.XY, Vec2.XY);
1483 Result.ZW = _mm_div_pd(Vec1.ZW, Vec2.ZW);
1484#else
1486#endif
1487 return Result;
1488}
1489
1490namespace SSEVectorHelperFuncs
1491{
1492 // Computes VectorDot3 but only with the result in the first (X) element of a VectorRegister4Float
1494 {
1495 // (X, Y, Z, W)
1497 // (Y, Y, W, W)
1498 VectorRegister4Float Shuf = VectorSwizzle(Prod, 1, 1, 3, 3); // _mm_movehdup_ps on SSE4.1, shuffle otherwise
1499 // (X+Y, ???, ???, ???)
1501 // (Z, W, Z, W)
1502 Shuf = VectorSwizzle(Prod, 2, 3, 2, 3); // _mm_movehl_ps
1503 // (X+Y+Z, ???, ???, ???)
1504 Sum = VectorAdd(Sum, Shuf);
1505 return Sum;
1506 }
1507
1508 // Computes VectorDot3 but only with the result in the first (X) element of a VectorRegister4Double
1510 {
1511 // (X, Y, Z, W)
1513 // (Y, Y, W, W)
1514 VectorRegister4Double Shuf = VectorSwizzle(Prod, 1, 1, 3, 3); // fast in-lane permute on AVX (_mm256_permute_pd)
1515 // (X+Y, ???, ???, ???)
1517 // (Z, W, Z, W)
1518 Shuf = VectorSwizzle(Prod, 2, 3, 2, 3); // various specializations exist for this depending on platform
1519 // (X+Y+Z, ???, ???, ???)
1520 Sum = VectorAdd(Sum, Shuf);
1521
1522 return Sum;
1523 }
1524
1525 // Computes VectorDot3 but only with the result in the first (X) element of a VectorRegister2Double (half of VectorRegister4Double)
1527 {
1529
1530 // (X, Y)
1531 T = _mm_mul_pd(Vec1.XY, Vec2.XY);
1532
1533 // (X + Z, Y + W)
1534#if UE_PLATFORM_MATH_USE_FMA3
1535 A = _mm_fmadd_pd(Vec1.ZW, Vec2.ZW, T);
1536#else
1537 A = _mm_add_pd(_mm_mul_pd(Vec1.ZW, Vec2.ZW), T);
1538#endif // UE_PLATFORM_MATH_USE_FMA3
1539
1540 // (Y, X) // Reverse of T
1541 T = _mm_shuffle_pd(T, T, SHUFFLEMASK2(1, 0));
1542
1543 // (X + Z + Y, Y + W + X)
1544 T = _mm_add_pd(A, T);
1545
1546 return T;
1547 }
1548
1549} // namespace SSEVectorHelperFuncs
1550
1551
1560{
1561 return VectorGetComponent(SSEVectorHelperFuncs::InternalVectorDot3X(Vec1, Vec2), 0);
1562}
1563
1565{
1566#if UE_PLATFORM_MATH_USE_AVX
1567 return VectorGetComponent(SSEVectorHelperFuncs::InternalVectorDot3X_Full(Vec1, Vec2), 0);
1568#else
1569 VectorRegister2Double T = SSEVectorHelperFuncs::InternalVectorDot3X_Half(Vec1, Vec2);
1570 // Extract first component
1571 return _mm_cvtsd_f64(T);
1572#endif
1573}
1574
1583{
1584 return VectorReplicate(SSEVectorHelperFuncs::InternalVectorDot3X(Vec1, Vec2), 0);
1585}
1586
1588{
1589#if UE_PLATFORM_MATH_USE_AVX
1590 return VectorReplicate(SSEVectorHelperFuncs::InternalVectorDot3X_Full(Vec1, Vec2), 0);
1591#else
1592 VectorRegister2Double T = SSEVectorHelperFuncs::InternalVectorDot3X_Half(Vec1, Vec2);
1593 // Replicate in half (X,X)
1594 T = _mm_shuffle_pd(T, T, SHUFFLEMASK2(0, 0));
1595 // Replicate in full (X,X,X,X)
1596 return VectorRegister4Double(T, T);
1597#endif
1598}
1599
1608{
1610 R = VectorMultiply(Vec1, Vec2); // (XX, YY, ZZ, WW)
1611 T = VectorSwizzle(R, 1, 0, 3, 2); // (YY, XX, WW, ZZ)
1612 R = VectorAdd(R, T); // (XX + YY, YY + XX, ZZ + WW, WW + ZZ)
1613 T = VectorSwizzle(R, 2, 3, 0, 1); // (ZZ + WW, WW + ZZ, XX + YY, YY + XX)
1614 return VectorAdd(R, T); // (XX + YY + ZZ + WW, YY + XX + WW + ZZ, ZZ + WW + XX + YY, WW + ZZ + YY + XX)
1615}
1616
1618{
1619#if UE_PLATFORM_MATH_USE_AVX
1620 // AVX implementation uses fast permutes
1622 R = VectorMultiply(Vec1, Vec2); // (XX, YY, ZZ, WW)
1623 T = VectorSwizzle(R, 1, 0, 3, 2); // (YY, XX, WW, ZZ) // fast in-lane permute
1624 R = VectorAdd(R, T); // (XX + YY, YY + XX, ZZ + WW, WW + ZZ)
1625 T = VectorSwizzle(R, 2, 3, 0, 1); // (ZZ + WW, WW + ZZ, XX + YY, YY + XX) // lane-swap permute
1626 return VectorAdd(R, T); // (XX + YY + ZZ + WW, YY + XX + WW + ZZ, ZZ + WW + XX + YY, WW + ZZ + YY + XX)
1627#else
1629
1630 // (X, Y)
1631 T = _mm_mul_pd(Vec1.XY, Vec2.XY);
1632
1633 // (X + Z, Y + W)
1634#if UE_PLATFORM_MATH_USE_FMA3
1635 A = _mm_fmadd_pd(Vec1.ZW, Vec2.ZW, T);
1636#else
1637 A = _mm_add_pd(_mm_mul_pd(Vec1.ZW, Vec2.ZW), T);
1638#endif // UE_PLATFORM_MATH_USE_FMA3
1639
1640 // (Y + W, X + Z) // Reverse of A
1641 T = _mm_shuffle_pd(A, A, SHUFFLEMASK2(1, 0));
1642
1643 // (X + Z + Y + W, Y + W + X + Z)
1644 T = _mm_add_pd(A, T);
1645 return VectorRegister4Double(T, T);
1646#endif
1647}
1648
1657{
1658 return _mm_cmpeq_ps(Vec1, Vec2);
1659}
1660
1662{
1664#if !UE_PLATFORM_MATH_USE_AVX
1665 Result.XY = _mm_cmpeq_pd(Vec1.XY, Vec2.XY);
1666 Result.ZW = _mm_cmpeq_pd(Vec1.ZW, Vec2.ZW);
1667#else
1669#endif
1670 return Result;
1671}
1672
1681{
1682 return _mm_cmpneq_ps(Vec1, Vec2);
1683}
1684
1686{
1688#if !UE_PLATFORM_MATH_USE_AVX
1689 Result.XY = _mm_cmpneq_pd(Vec1.XY, Vec2.XY);
1690 Result.ZW = _mm_cmpneq_pd(Vec1.ZW, Vec2.ZW);
1691#else
1692 // For X != Y, if either is NaN it should return true (this matches the normal C behavior).
1693 // We use the *unordered* comparison operation that is true if either value is NaN.
1695#endif
1696 return Result;
1697}
1698
1707{
1708 return _mm_cmpgt_ps(Vec1, Vec2);
1709}
1710
1712{
1714#if !UE_PLATFORM_MATH_USE_AVX
1715 Result.XY = _mm_cmpgt_pd(Vec1.XY, Vec2.XY);
1716 Result.ZW = _mm_cmpgt_pd(Vec1.ZW, Vec2.ZW);
1717#else
1719#endif
1720 return Result;
1721}
1722
1731{
1732 return _mm_cmpge_ps(Vec1, Vec2);
1733}
1734
1736{
1738#if !UE_PLATFORM_MATH_USE_AVX
1739 Result.XY = _mm_cmpge_pd(Vec1.XY, Vec2.XY);
1740 Result.ZW = _mm_cmpge_pd(Vec1.ZW, Vec2.ZW);
1741#else
1743#endif
1744 return Result;
1745}
1746
1755{
1756 return _mm_cmplt_ps(Vec1, Vec2);
1757}
1758
1760{
1762#if !UE_PLATFORM_MATH_USE_AVX
1763 Result.XY = _mm_cmplt_pd(Vec1.XY, Vec2.XY);
1764 Result.ZW = _mm_cmplt_pd(Vec1.ZW, Vec2.ZW);
1765#else
1767#endif
1768 return Result;
1769}
1770
1779{
1780 return _mm_cmple_ps(Vec1, Vec2);
1781}
1782
1784{
1786#if !UE_PLATFORM_MATH_USE_AVX
1787 Result.XY = _mm_cmple_pd(Vec1.XY, Vec2.XY);
1788 Result.ZW = _mm_cmple_pd(Vec1.ZW, Vec2.ZW);
1789#else
1791#endif
1792 return Result;
1793}
1794
1806{
1807 // Can't (in general) use BLENDVPS despite our SSE4.1 minimum requirement since
1808 // this is defined to be bitwise, not element-wise with MSB as toggle
1810}
1811
1813{
1815}
1816
1818{
1820#if !UE_PLATFORM_MATH_USE_AVX
1821 Result.XY = VectorSelect(Mask.XY, Vec1.XY, Vec2.XY);
1822 Result.ZW = VectorSelect(Mask.ZW, Vec1.ZW, Vec2.ZW);
1823#else
1825#endif
1826 return Result;
1827}
1828
1837{
1838 return _mm_or_ps(Vec1, Vec2);
1839}
1840
1842{
1844#if !UE_PLATFORM_MATH_USE_AVX
1845 Result.XY = _mm_or_pd(Vec1.XY, Vec2.XY);
1846 Result.ZW = _mm_or_pd(Vec1.ZW, Vec2.ZW);
1847#else
1849#endif
1850 return Result;
1851}
1852
1861{
1862 return _mm_and_ps(Vec1, Vec2);
1863}
1864
1866{
1868#if !UE_PLATFORM_MATH_USE_AVX
1869 Result.XY = _mm_and_pd(Vec1.XY, Vec2.XY);
1870 Result.ZW = _mm_and_pd(Vec1.ZW, Vec2.ZW);
1871#else
1873#endif
1874 return Result;
1875}
1876
1885{
1886 return _mm_xor_ps(Vec1, Vec2);
1887}
1888
1890{
1892#if !UE_PLATFORM_MATH_USE_AVX
1893 Result.XY = _mm_xor_pd(Vec1.XY, Vec2.XY);
1894 Result.ZW = _mm_xor_pd(Vec1.ZW, Vec2.ZW);
1895#else
1897#endif
1898 return Result;
1899}
1900
1909{
1910 // YZX
1913 // XY, YZ, ZX
1914 A = VectorMultiply(A, Vec1);
1915 // XY-YX, YZ-ZY, ZX-XZ
1917 // YZ-ZY, ZX-XZ, XY-YX
1918 return VectorSwizzle(A, 1, 2, 0, 3);
1919}
1920
1922{
1923 // YZX
1926 // XY, YZ, ZX
1927 A = VectorMultiply(A, Vec1);
1928 // XY-YX, YZ-ZY, ZX-XZ
1930 // YZ-ZY, ZX-XZ, XY-YX
1931 return VectorSwizzle(A, 1, 2, 0, 3);
1932}
1933
1942{
1943#if UE_PLATFORM_MATH_USE_SVML
1944 return _mm_pow_ps(Base, Exponent);
1945#else
1946 // using SseMath library
1947 return SSE::exp_ps(_mm_mul_ps(SSE::log_ps(Base), Exponent));
1948#endif
1949/*
1950 // old version, keeping for reference in case something breaks and we need to debug it.
1951 union { VectorRegister4Float v; float f[4]; } B, E;
1952 B.v = Base;
1953 E.v = Exponent;
1954 return _mm_setr_ps( powf(B.f[0], E.f[0]), powf(B.f[1], E.f[1]), powf(B.f[2], E.f[2]), powf(B.f[3], E.f[3]) );
1955*/
1956}
1957
1959{
1960#if UE_PLATFORM_MATH_USE_SVML_AVX
1961 return _mm256_pow_pd(Base, Exponent);
1962#elif UE_PLATFORM_MATH_USE_SVML
1963 return VectorRegister4Double(_mm_pow_pd(Base.XY, Exponent.XY), _mm_pow_pd(Base.ZW, Exponent.ZW));
1964#else
1965 AlignedDouble4 Values(Base);
1966 AlignedDouble4 Exponents(Exponent);
1967
1968 Values[0] = FMath::Pow(Values[0], Exponents[0]);
1969 Values[1] = FMath::Pow(Values[1], Exponents[1]);
1970 Values[2] = FMath::Pow(Values[2], Exponents[2]);
1971 Values[3] = FMath::Pow(Values[3], Exponents[3]);
1972 return Values.ToVectorRegister();
1973#endif
1974}
1975
1983{
1984 return _mm_sqrt_ps(Vec);
1985}
1986
1988{
1989#if UE_PLATFORM_MATH_USE_AVX
1990 return _mm256_sqrt_pd(Vec);
1991#else
1993#endif
1994}
1995
2003{
2004 // Warning: Discrepancies between Intel and AMD hardware estimates make this diverge between platforms.
2005 return _mm_rsqrt_ps(Vec);
2006}
2007
2015{
2016#if UE_PLATFORM_MATH_USE_SVML && 0 // NOTE: DISABLED
2017 // TODO: this appears to deliver slightly different results on Intel vs AMD hardware,
2018 // similar to prior issues with our use of rsqrt refinements in UnrealPlatformMathSSE.
2019 return _mm_invsqrt_ps(Vec);
2020#else
2022#endif
2023
2024 /*
2025 // Legacy implementation based on refinements of estimate, left for reference.
2026 // Discrepancies between Intel and AMD hardware estimates make this diverge between platforms,
2027 // similar to prior issues with our use of rsqrt refinements in UnrealPlatformMathSSE.
2028 //
2029 // Perform two passes of Newton-Raphson iteration on the hardware estimate
2030 // v^-0.5 = x
2031 // => x^2 = v^-1
2032 // => 1/(x^2) = v
2033 // => F(x) = x^-2 - v
2034 // F'(x) = -2x^-3
2035
2036 // x1 = x0 - F(x0)/F'(x0)
2037 // => x1 = x0 + 0.5 * (x0^-2 - Vec) * x0^3
2038 // => x1 = x0 + 0.5 * (x0 - Vec * x0^3)
2039 // => x1 = x0 + x0 * (0.5 - 0.5 * Vec * x0^2)
2040
2041 const VectorRegister4Float OneHalf = GlobalVectorConstants::FloatOneHalf;
2042 const VectorRegister4Float VecDivBy2 = VectorMultiply(Vec, OneHalf);
2043
2044 // Initial estimate
2045 const VectorRegister4Float x0 = VectorReciprocalSqrtEstimate(Vec);
2046
2047 // First iteration
2048 VectorRegister4Float x1 = VectorMultiply(x0, x0);
2049 x1 = VectorSubtract(OneHalf, VectorMultiply(VecDivBy2, x1));
2050 x1 = VectorMultiplyAdd(x0, x1, x0);
2051
2052 // Second iteration
2053 VectorRegister4Float x2 = VectorMultiply(x1, x1);
2054 x2 = VectorSubtract(OneHalf, VectorMultiply(VecDivBy2, x2));
2055 x2 = VectorMultiplyAdd(x1, x2, x1);
2056
2057 return x2;
2058 */
2059}
2060
2062{
2063#if UE_PLATFORM_MATH_USE_AVX
2065#else
2067#endif
2068}
2069
2071{
2072#if UE_PLATFORM_MATH_USE_SVML_AVX
2073 return _mm256_invsqrt_pd(Vec);
2074#elif UE_PLATFORM_MATH_USE_SVML
2076#else
2077 return VectorReciprocalSqrt(Vec);
2078#endif
2079}
2080
2081
2089{
2091}
2092
2094{
2096}
2097
2105{
2107}
2108
2110{
2112}
2113
2122{
2123 // Warning: Discrepancies between Intel and AMD hardware estimates make this diverge between platforms.
2124 return _mm_rcp_ps(Vec);
2125}
2126
2134{
2136 /*
2137 // Legacy implementation based on refinements of estimate, left for reference.
2138 // Discrepancies between Intel and AMD hardware estimates make this diverge between platforms.
2139 //
2140 // Perform two passes of Newton-Raphson iteration on the hardware estimate
2141 // x1 = x0 - f(x0) / f'(x0)
2142 //
2143 // 1 / Vec = x
2144 // => x * Vec = 1
2145 // => F(x) = x * Vec - 1
2146 // F'(x) = Vec
2147 // => x1 = x0 - (x0 * Vec - 1) / Vec
2148 //
2149 // Since 1/Vec is what we're trying to solve, use an estimate for it, x0
2150 // => x1 = x0 - (x0 * Vec - 1) * x0 = 2 * x0 - Vec * x0^2
2151
2152 // Initial estimate
2153 const VectorRegister4Float x0 = VectorReciprocalEstimate(Vec);
2154
2155 // First iteration
2156 const VectorRegister4Float x0Squared = VectorMultiply(x0, x0);
2157 const VectorRegister4Float x0Times2 = VectorAdd(x0, x0);
2158 const VectorRegister4Float x1 = VectorNegateMultiplyAdd(Vec, x0Squared, x0Times2);
2159
2160 // Second iteration
2161 const VectorRegister4Float x1Squared = VectorMultiply(x1, x1);
2162 const VectorRegister4Float x1Times2 = VectorAdd(x1, x1);
2163 const VectorRegister4Float x2 = VectorNegateMultiplyAdd(Vec, x1Squared, x1Times2);
2164
2165 return x2;
2166 */
2167}
2168
2170{
2172}
2173
2175{
2176 // Not an estimate.
2177 return VectorReciprocal(Vec);
2178}
2179
2188{
2189 return _mm_blend_ps(VecXYZ, VecW, 0b1000);
2190}
2191
2193{
2195}
2196
2204{
2205 return _mm_insert_ps(Vec, Vec, 0x08); // Copies lane 0 to lane 0 and zero-masks lane 3
2206}
2207
2209{
2211#if !UE_PLATFORM_MATH_USE_AVX
2212 Result.XY = Vec.XY;
2214#else
2216#endif
2217 return Result;
2218}
2219
2227{
2229}
2230
2232{
2234#if !UE_PLATFORM_MATH_USE_AVX
2235 Result.XY = Vec.XY;
2237#else
2239#endif
2240 return Result;
2241}
2242
2252
2262{
2264}
2266{
2268}
2269
2278{
2281
2282 // Splat x,y,z and w
2287 // Mul by the matrix
2288 VTempX = VectorMultiply(VTempX, M[0]);
2292
2293 return VTempX;
2294}
2295
2297{
2298 // Warning: FMatrix44d alignment may not match VectorRegister4Double, so you can't just cast to VectorRegister4Double*.
2299 typedef double Double4x4[4][4];
2300 const Double4x4& MRows = *((const Double4x4*)MatrixM);
2301
2303 M[0] = VectorLoad(MRows[0]);
2304 M[1] = VectorLoad(MRows[1]);
2305 M[2] = VectorLoad(MRows[2]);
2306 M[3] = VectorLoad(MRows[3]);
2307
2310
2311 // Splat x,y,z and w
2316 // Mul by the matrix
2317 VTempX = VectorMultiply(VTempX, M[0]);
2321
2322 // LWC_TODO: this will be a lossy conversion.
2324}
2325
2327{
2328 // Warning: FMatrix44d alignment may not match VectorRegister4Double, so you can't just cast to VectorRegister4Double*.
2329 typedef double Double4x4[4][4];
2330 const Double4x4& MRows = *((const Double4x4*)MatrixM);
2331
2333 M[0] = VectorLoad(MRows[0]);
2334 M[1] = VectorLoad(MRows[1]);
2335 M[2] = VectorLoad(MRows[2]);
2336 M[3] = VectorLoad(MRows[3]);
2337
2339
2340 // Splat x,y,z and w
2345 // Mul by the matrix
2346 VTempX = VectorMultiply(VTempX, M[0]);
2350
2351 return VTempX;
2352}
2353
2355{
2358
2359 // Splat x,y,z and w
2364 // Mul by the matrix
2369
2370 return VTempX;
2371}
2372
2381{
2382 return _mm_min_ps(Vec1, Vec2);
2383}
2384
2386{
2388#if !UE_PLATFORM_MATH_USE_AVX
2389 Result.XY = _mm_min_pd(Vec1.XY, Vec2.XY);
2390 Result.ZW = _mm_min_pd(Vec1.ZW, Vec2.ZW);
2391#else
2393#endif
2394 return Result;
2395}
2396
2405{
2406 return _mm_max_ps(Vec1, Vec2);
2407}
2408
2410{
2412#if !UE_PLATFORM_MATH_USE_AVX
2413 Result.XY = _mm_max_pd(Vec1.XY, Vec2.XY);
2414 Result.ZW = _mm_max_pd(Vec1.ZW, Vec2.ZW);
2415#else
2417#endif
2418 return Result;
2419}
2420
2429{
2430 return VectorShuffle(Vec1, Vec2, 2, 3, 2, 3);
2431}
2432
2434{
2435 return VectorShuffle(Vec1, Vec2, 2, 3, 2, 3);
2436}
2437
2446{
2447 return VectorShuffle(Vec1, Vec2, 0, 1, 0, 1);
2448}
2449
2451{
2452 return VectorShuffle(Vec1, Vec2, 0, 1, 0, 1);
2453}
2454
2465{
2466 OutEvens = VectorShuffle(Lo, Hi, 0, 2, 0, 2);
2467 OutOdds = VectorShuffle(Lo, Hi, 1, 3, 1, 3);
2468}
2469
2471{
2472 OutEvens = VectorShuffle(Lo, Hi, 0, 2, 0, 2);
2473 OutOdds = VectorShuffle(Lo, Hi, 1, 3, 1, 3);
2474}
2475
2476
2484{
2485 return _mm_movemask_ps(VecMask);
2486}
2487
2489{
2490#if !UE_PLATFORM_MATH_USE_AVX
2491 const int MaskXY = _mm_movemask_pd(VecMask.XY);
2492 const int MaskZW = _mm_movemask_pd(VecMask.ZW);
2493 return (MaskZW << 2) | (MaskXY);
2494#else
2496#endif
2497}
2498
2505// Looks complex but is really quite straightforward:
2506// Load as 32-bit value, zero-extend the 4x unsigned 8-bit values into 4x 32-bit ints, then convert to 4x floats.
// Ptr is parenthesized inside the cast so that an expression argument (e.g. Base + Offset)
// is evaluated as a whole before being reinterpreted as an int32 pointer.
2507#define VectorLoadByte4(Ptr) _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32*)(Ptr))))
2508
2515// Looks complex but is really quite straightforward:
2516// Load as 32-bit value, unpack 4x signed 8-bit ints to 4x 32-bit ints, then convert to 4x floats
2518{
2520 return _mm_cvtepi32_ps(Temp);
2521}
2522
2530{
2532 return VectorSwizzle( Temp, 3, 2, 1, 0 );
2533}
2534
2542{
2543 // Looks complex but is really quite straightforward:
2544 // Convert 4x floats to 4x 32-bit ints, then pack into 4x 16-bit ints, then into 4x 8-bit unsigned ints, then store as a 32-bit value
2549}
2550
2558{
2559 // Looks complex but is really quite straightforward:
2560 // Convert 4x floats to 4x 32-bit ints, then pack into 4x 16-bit ints, then into 4x 8-bit unsigned ints, then store as a 32-bit value
2565}
2566
2567
2575{
2577
2578 Tmp = _mm_and_ps(_mm_load_ps1((const float *)Ptr), MakeVectorRegisterFloat(0x3FFu, 0x3FFu << 10, 0x3FFu << 20, 0x3u << 30));
2579 Tmp = _mm_xor_ps(Tmp, MakeVectorRegister(0, 0, 0, 0x80000000));
2580 Tmp = _mm_cvtepi32_ps(*(const VectorRegister4Int*)&Tmp);
2581 Tmp = _mm_add_ps(Tmp, MakeVectorRegister(0, 0, 0, 32768.0f*65536.0f));
2582 Tmp = _mm_mul_ps(Tmp, MakeVectorRegister(1.0f / 1023.0f, 1.0f / (1023.0f*1024.0f), 1.0f / (1023.0f*1024.0f*1024.0f), 1.0f / (3.0f*1024.0f*1024.0f*1024.0f)));
2583
2584 return Tmp;
2585}
2586
2594{
2596 Tmp = _mm_max_ps(Vec, MakeVectorRegisterFloat(0.0f, 0.0f, 0.0f, 0.0f));
2597 Tmp = _mm_min_ps(Tmp, MakeVectorRegisterFloat(1.0f, 1.0f, 1.0f, 1.0f));
2598 Tmp = _mm_mul_ps(Tmp, MakeVectorRegisterFloat(1023.0f, 1023.0f*1024.0f*0.5f, 1023.0f*1024.0f*1024.0f, 3.0f*1024.0f*1024.0f*1024.0f*0.5f));
2599
2601 TmpI = _mm_cvttps_epi32(Tmp);
2602 TmpI = _mm_and_si128(TmpI, MakeVectorRegisterInt(0x3FFu, 0x3FFu << (10 - 1), 0x3FFu << 20, 0x3u << (30 - 1)));
2603
2605 TmpI2 = _mm_shuffle_epi32(TmpI, _MM_SHUFFLE(3, 2, 3, 2));
2607
2608 TmpI2 = _mm_shuffle_epi32(TmpI, _MM_SHUFFLE(1, 1, 1, 1));
2611
2612 _mm_store_ss((float *)Ptr, *(const VectorRegister4Float*)&TmpI);
2613}
2614
// Loads 64 bits from Ptr (_mm_loadl_epi64), zero-extends the 4x unsigned 16-bit values to
// 4x 32-bit ints (_mm_cvtepu16_epi32), then converts them to 4x floats. No normalization is applied here.
// Ptr is parenthesized inside the cast so that an expression argument (e.g. Base + Offset)
// is evaluated as a whole before being reinterpreted as a __m128i pointer.
2621#define VectorLoadURGBA16N(Ptr) _mm_cvtepi32_ps(_mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i*)(Ptr))))
2622
2630{
2632 return _mm_cvtepi32_ps(Temp);
2633}
2634
2642{
2644 Tmp = _mm_max_ps(Vec, MakeVectorRegisterFloat(0.0f, 0.0f, 0.0f, 0.0f));
2645 Tmp = _mm_min_ps(Tmp, MakeVectorRegisterFloat(1.0f, 1.0f, 1.0f, 1.0f));
2646 Tmp = _mm_mul_ps(Tmp, MakeVectorRegisterFloat(65535.0f, 65535.0f, 65535.0f, 65535.0f));
2647
2650}
2651
2660{
2662}
2663
2665{
2667}
2668
2673// This is no longer necessary now that we don't use MMX instructions
// The macro expands to nothing, so existing call sites generate no code.
2674#define VectorResetFloatRegisters()
2675// TODO: LWC: remove?
2676
// Reads the SSE control/status register (MXCSR) through _mm_getcsr().
// Note this getter is defined unconditionally, unlike the setter below.
2682#define VectorGetControlRegister() _mm_getcsr()
2683
2684#if PLATFORM_SUPPORTS_VECTOR_CONTROL_REGISTERS
// Writes ControlStatus to the SSE control/status register (MXCSR) through _mm_setcsr().
2690#define VectorSetControlRegister(ControlStatus) _mm_setcsr( ControlStatus )
2691
// Control-register value selecting the round-toward-zero rounding mode.
2695#define VECTOR_ROUND_TOWARD_ZERO _MM_ROUND_TOWARD_ZERO
2696
// Control-register flag turning on flush-to-zero handling of denormal results.
2700#define VECTOR_DENORMALS_FLUSH_TO_ZERO _MM_FLUSH_ZERO_ON
2701
2702#else
// Platforms without control-register support: the setter discards its arguments and the
// flag macros expand to nothing. Because the flags are empty here, they are only safe to use
// inside the (discarded) VectorSetControlRegister(...) argument list; an expression such as
// (Value | VECTOR_ROUND_TOWARD_ZERO) anywhere else would not compile in this configuration.
2703#define VectorSetControlRegister(...)
2704#define VECTOR_ROUND_TOWARD_ZERO
2705#define VECTOR_DENORMALS_FLUSH_TO_ZERO
2706#endif
2707
2719{
2724
2725 return Result;
2726}
2727
2729{
2734
2735 return Result;
2736}
2737
2749{
2751}
2752
2754{
2756}
2757
2758// Returns true if the vector contains a component that is either NAN or +/-infinite.
2760{
2761 // https://en.wikipedia.org/wiki/IEEE_754-1985
2762 // Infinity is represented with all exponent bits set, with the correct sign bit.
2763 // NaN is represented with all exponent bits set, plus at least one fraction/significand bit set.
2764 // This means finite values will not have all exponent bits set, so check against those bits.
2765
2766 union { float F; uint32 U; } InfUnion;
2767 InfUnion.U = 0x7F800000;
2768 const float Inf = InfUnion.F;
2770
2771 // Mask off Exponent
2773 // Compare to full exponent. If any are full exponent (not finite), the signs copied to the mask are non-zero, otherwise it's zero and finite.
2774 bool IsFinite = VectorMaskBits(VectorCompareEQ(ExpTest, FloatInfinity)) == 0;
2775 return !IsFinite;
2776}
2777
2779{
2780 // https://en.wikipedia.org/wiki/IEEE_754-1985
2781 // Infinity is represented with all exponent bits set, with the correct sign bit.
2782 // NaN is represented with all exponent bits set, plus at least one fraction/significand bit set.
2783 // This means finite values will not have all exponent bits set, so check against those bits.
2784
2785 union { double D; uint64 U; } InfUnion;
2786 InfUnion.U = 0x7FF0000000000000;
2787 const double Inf = InfUnion.D;
2789
2790 // Mask off Exponent
2792 // Compare to full exponent. If any are full exponent (not finite), the signs copied to the mask are non-zero, otherwise it's zero and finite.
2793 bool IsFinite = VectorMaskBits(VectorCompareEQ(ExpTest, DoubleInfinity)) == 0;
2794 return !IsFinite;
2795}
2796
2798{
2800}
2801
2803{
2804 return _mm_round_pd(V, _MM_FROUND_TRUNC);
2805}
2806
2808{
2810#if !UE_PLATFORM_MATH_USE_AVX
2813#else
2815#endif
2816 return Result;
2817}
2818
2820{
2822}
2823
2825{
2826 return _mm_cvtps_epi32(Vec);
2827}
2828
2830{
2831 return _mm_ceil_ps(V);
2832}
2833
2835{
2836#if UE_PLATFORM_MATH_USE_AVX
2839 return Result;
2840#else
2842 Result.XY = _mm_ceil_pd(V.XY);
2843 Result.ZW = _mm_ceil_pd(V.ZW);
2844 return Result;
2845#endif
2846}
2847
2849{
2850 return _mm_floor_ps(V);
2851}
2852
2854{
2855#if UE_PLATFORM_MATH_USE_AVX
2858 return Result;
2859#else
2861 Result.XY = _mm_floor_pd(V.XY);
2862 Result.ZW = _mm_floor_pd(V.ZW);
2863 return Result;
2864#endif
2865}
2866
2868{
2869 // Check against invalid divisor
2871
2872#if UE_PLATFORM_MATH_USE_SVML
2874#else
2876 XFloats[0] = fmodf(XFloats[0], YFloats[0]);
2877 XFloats[1] = fmodf(XFloats[1], YFloats[1]);
2878 XFloats[2] = fmodf(XFloats[2], YFloats[2]);
2879 XFloats[3] = fmodf(XFloats[3], YFloats[3]);
2880 VectorRegister4Float Result = XFloats.ToVectorRegister();
2881#endif
2882
2883 // Return 0 where divisor Y was too small
2885 return Result;
2886}
2887
2889{
2890 // Check against invalid divisor
2892
2893#if UE_PLATFORM_MATH_USE_SVML_AVX
2894 VectorRegister4Double DoubleResult = _mm256_fmod_pd(X, Y);
2895#elif UE_PLATFORM_MATH_USE_SVML
2896 VectorRegister4Double DoubleResult = VectorRegister4Double(_mm_fmod_pd(X.XY, Y.XY), _mm_fmod_pd(X.ZW, Y.ZW));
2897#else
2899 XDoubles[0] = fmod(XDoubles[0], YDoubles[0]);
2900 XDoubles[1] = fmod(XDoubles[1], YDoubles[1]);
2901 XDoubles[2] = fmod(XDoubles[2], YDoubles[2]);
2902 XDoubles[3] = fmod(XDoubles[3], YDoubles[3]);
2903 VectorRegister4Double DoubleResult = XDoubles.ToVectorRegister();
2904#endif
2905
2906 // Return 0 where divisor Y was too small
2908 return DoubleResult;
2909}
2910
2912{
2915}
2916
2918{
2921}
2922
2924{
2927}
2928
2930{
2933}
2934
2936{
2937#if UE_PLATFORM_MATH_USE_SVML
2938 return _mm_exp_ps(X);
2939#else
2940 return SSE::exp_ps(X);
2941#endif
2942}
2943
2945{
2946#if UE_PLATFORM_MATH_USE_SVML_AVX
2947 return _mm256_exp_pd(X);
2948#elif UE_PLATFORM_MATH_USE_SVML
2950#else
2952 Doubles[0] = FMath::Exp(Doubles[0]);
2953 Doubles[1] = FMath::Exp(Doubles[1]);
2954 Doubles[2] = FMath::Exp(Doubles[2]);
2955 Doubles[3] = FMath::Exp(Doubles[3]);
2956 return Doubles.ToVectorRegister();
2957#endif
2958}
2959
2961{
2962#if UE_PLATFORM_MATH_USE_SVML
2963 return _mm_exp2_ps(X);
2964#else
2965 AlignedFloat4 Floats(X);
2966 Floats[0] = FMath::Exp2(Floats[0]);
2967 Floats[1] = FMath::Exp2(Floats[1]);
2968 Floats[2] = FMath::Exp2(Floats[2]);
2969 Floats[3] = FMath::Exp2(Floats[3]);
2970 return Floats.ToVectorRegister();
2971#endif
2972}
2973
2975{
2976#if UE_PLATFORM_MATH_USE_SVML_AVX
2977 return _mm256_exp2_pd(X);
2978#elif UE_PLATFORM_MATH_USE_SVML
2980#else
2982 Doubles[0] = FMath::Exp2(Doubles[0]);
2983 Doubles[1] = FMath::Exp2(Doubles[1]);
2984 Doubles[2] = FMath::Exp2(Doubles[2]);
2985 Doubles[3] = FMath::Exp2(Doubles[3]);
2986 return Doubles.ToVectorRegister();
2987#endif
2988}
2989
2991{
2992#if UE_PLATFORM_MATH_USE_SVML
2993 return _mm_log_ps(X);
2994#else
2995 return SSE::log_ps(X);
2996#endif
2997}
2998
3000{
3001#if UE_PLATFORM_MATH_USE_SVML_AVX
3002 return _mm256_log_pd(X);
3003#elif UE_PLATFORM_MATH_USE_SVML
3005#else
3007 Doubles[0] = FMath::Loge(Doubles[0]);
3008 Doubles[1] = FMath::Loge(Doubles[1]);
3009 Doubles[2] = FMath::Loge(Doubles[2]);
3010 Doubles[3] = FMath::Loge(Doubles[3]);
3011 return Doubles.ToVectorRegister();
3012#endif
3013}
3014
3016{
3017#if UE_PLATFORM_MATH_USE_SVML
3018 return _mm_log2_ps(X);
3019#else
3020 AlignedFloat4 Floats(X);
3021 Floats[0] = FMath::Log2(Floats[0]);
3022 Floats[1] = FMath::Log2(Floats[1]);
3023 Floats[2] = FMath::Log2(Floats[2]);
3024 Floats[3] = FMath::Log2(Floats[3]);
3025 return Floats.ToVectorRegister();
3026#endif
3027}
3028
3030{
3031#if UE_PLATFORM_MATH_USE_SVML_AVX
3032 return _mm256_log2_pd(X);
3033#elif UE_PLATFORM_MATH_USE_SVML
3035#else
3037 Doubles[0] = FMath::Log2(Doubles[0]);
3038 Doubles[1] = FMath::Log2(Doubles[1]);
3039 Doubles[2] = FMath::Log2(Doubles[2]);
3040 Doubles[3] = FMath::Log2(Doubles[3]);
3041 return Doubles.ToVectorRegister();
3042#endif
3043}
3044
3045
// Scalar and broadcast constants derived from p = 0.225 (see the per-constant formulas below).
// NOTE(review): the VectorSin/VectorCos bodies visible in this part of the file call SVML or
// SSE::sin_ps / SSE::cos_ps and do not reference these values — confirm whether they are still used.
// The members are `static` at namespace scope in a header, so each translation unit that
// includes this file gets its own copy.
3052namespace VectorSinConstantsSSE
3053{
3054 static const float p = 0.225f;
3055 static const float a = 7.58946609f; // 16 * sqrtf(p)
3056 static const float b = 1.63384342f; // (1 - p) / sqrtf(p)
 // 'a' replicated into all four lanes.
3057 static const VectorRegister4Float A = MakeVectorRegisterFloatConstant(a, a, a, a);
 // 'b' replicated into all four lanes.
3058 static const VectorRegister4Float B = MakeVectorRegisterFloatConstant(b, b, b, b);
3059}
3060
3062{
3063#if UE_PLATFORM_MATH_USE_SVML
3064 return _mm_sin_ps(V);
3065#else
3066 return SSE::sin_ps(V);
3067#endif
3068}
3069
3071{
3072#if UE_PLATFORM_MATH_USE_SVML_AVX
3073 return _mm256_sin_pd(V);
3074#elif UE_PLATFORM_MATH_USE_SVML
3076#else
3078 Doubles[0] = FMath::Sin(Doubles[0]);
3079 Doubles[1] = FMath::Sin(Doubles[1]);
3080 Doubles[2] = FMath::Sin(Doubles[2]);
3081 Doubles[3] = FMath::Sin(Doubles[3]);
3082 return Doubles.ToVectorRegister();
3083#endif
3084}
3085
3087{
3088#if UE_PLATFORM_MATH_USE_SVML
3089 return _mm_cos_ps(V);
3090#else
3091 return SSE::cos_ps(V);
3092#endif
3093}
3094
3096{
3097#if UE_PLATFORM_MATH_USE_SVML_AVX
3098 return _mm256_cos_pd(V);
3099#elif UE_PLATFORM_MATH_USE_SVML
3101#else
3103 Doubles[0] = FMath::Cos(Doubles[0]);
3104 Doubles[1] = FMath::Cos(Doubles[1]);
3105 Doubles[2] = FMath::Cos(Doubles[2]);
3106 Doubles[3] = FMath::Cos(Doubles[3]);
3107 return Doubles.ToVectorRegister();
3108#endif
3109}
3110
3119{
3120#if UE_PLATFORM_MATH_USE_SVML
3121 *VSinAngles = _mm_sincos_ps(VCosAngles, *VAngles);
3122#else
3123 // Map to [-pi, pi]
3124 // X = A - 2pi * round(A/2pi)
3125 // Note the round(), not truncate(). In this case round() can round halfway cases using round-to-nearest-even OR round-to-nearest.
3126
3127 // Quotient = round(A/2pi)
3129 Quotient = _mm_cvtepi32_ps(_mm_cvtps_epi32(Quotient)); // round to nearest even is the default rounding mode but that's fine here.
3130 // X = A - 2pi * Quotient
3132
3133 // Map in [-pi/2,pi/2]
3135 VectorRegister4Float c = VectorBitwiseOr(GlobalVectorConstants::Pi, sign); // pi when x >= 0, -pi when x < 0
3139 X = VectorSelect(comp, rflx, X);
3141
3143
3144 // 11-degree minimax approximation
3145 //*ScalarSin = (((((-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f) * y2 + 0.0083333310f) * y2 - 0.16666667f) * y2 + 1.0f) * y;
3146 const VectorRegister4Float SinCoeff0 = MakeVectorRegisterFloat(1.0f, -0.16666667f, 0.0083333310f, -0.00019840874f);
3147 const VectorRegister4Float SinCoeff1 = MakeVectorRegisterFloat(2.7525562e-06f, -2.3889859e-08f, /*unused*/ 0.f, /*unused*/ 0.f);
3148
3157
3158 // 10-degree minimax approximation
3159 //*ScalarCos = sign * (((((-2.6051615e-07f * y2 + 2.4760495e-05f) * y2 - 0.0013888378f) * y2 + 0.041666638f) * y2 - 0.5f) * y2 + 1.0f);
3160 const VectorRegister4Float CosCoeff0 = MakeVectorRegisterFloat(1.0f, -0.5f, 0.041666638f, -0.0013888378f);
3161 const VectorRegister4Float CosCoeff1 = MakeVectorRegisterFloat(2.4760495e-05f, -2.6051615e-07f, /*unused*/ 0.f, /*unused*/ 0.f);
3162
3171#endif
3172}
3173
3175{
3176#if UE_PLATFORM_MATH_USE_SVML_AVX
3177 VSinAngles->XYZW = _mm256_sincos_pd(&(VCosAngles->XYZW), VAngles->XYZW);
3178#elif UE_PLATFORM_MATH_USE_SVML
3179 VSinAngles->XY = _mm_sincos_pd(&(VCosAngles->XY), VAngles->XY);
3180 VSinAngles->ZW = _mm_sincos_pd(&(VCosAngles->ZW), VAngles->ZW);
3181#else
3182 *VSinAngles = VectorSin(*VAngles);
3183 *VCosAngles = VectorCos(*VAngles);
3184#endif
3185}
3186
3187
3189{
3190#if UE_PLATFORM_MATH_USE_SVML
3191 return _mm_tan_ps(X);
3192#else
3193 //return SSE::tan_ps(X);
3194 AlignedFloat4 Floats(X);
3195 Floats[0] = FMath::Tan(Floats[0]);
3196 Floats[1] = FMath::Tan(Floats[1]);
3197 Floats[2] = FMath::Tan(Floats[2]);
3198 Floats[3] = FMath::Tan(Floats[3]);
3199 return Floats.ToVectorRegister();
3200#endif
3201}
3202
3204{
3205#if UE_PLATFORM_MATH_USE_SVML_AVX
3206 return _mm256_tan_pd(X);
3207#elif UE_PLATFORM_MATH_USE_SVML
3209#else
3211 Doubles[0] = FMath::Tan(Doubles[0]);
3212 Doubles[1] = FMath::Tan(Doubles[1]);
3213 Doubles[2] = FMath::Tan(Doubles[2]);
3214 Doubles[3] = FMath::Tan(Doubles[3]);
3215 return Doubles.ToVectorRegister();
3216#endif
3217}
3218
3220{
3221#if UE_PLATFORM_MATH_USE_SVML
3222 return _mm_asin_ps(X);
3223#else
3224 AlignedFloat4 Floats(X);
3225 Floats[0] = FMath::Asin(Floats[0]);
3226 Floats[1] = FMath::Asin(Floats[1]);
3227 Floats[2] = FMath::Asin(Floats[2]);
3228 Floats[3] = FMath::Asin(Floats[3]);
3229 return Floats.ToVectorRegister();
3230#endif
3231}
3232
3234{
3235#if UE_PLATFORM_MATH_USE_SVML_AVX
3236 return _mm256_asin_pd(X);
3237#elif UE_PLATFORM_MATH_USE_SVML
3239#else
3241 Doubles[0] = FMath::Asin(Doubles[0]);
3242 Doubles[1] = FMath::Asin(Doubles[1]);
3243 Doubles[2] = FMath::Asin(Doubles[2]);
3244 Doubles[3] = FMath::Asin(Doubles[3]);
3245 return Doubles.ToVectorRegister();
3246#endif
3247}
3248
3250{
3251#if UE_PLATFORM_MATH_USE_SVML
3252 return _mm_acos_ps(X);
3253#else
3254 AlignedFloat4 Floats(X);
3255 Floats[0] = FMath::Acos(Floats[0]);
3256 Floats[1] = FMath::Acos(Floats[1]);
3257 Floats[2] = FMath::Acos(Floats[2]);
3258 Floats[3] = FMath::Acos(Floats[3]);
3259 return Floats.ToVectorRegister();
3260#endif
3261}
3262
3264{
3265#if UE_PLATFORM_MATH_USE_SVML_AVX
3266 return _mm256_acos_pd(X);
3267#elif UE_PLATFORM_MATH_USE_SVML
3269#else
3271 Doubles[0] = FMath::Acos(Doubles[0]);
3272 Doubles[1] = FMath::Acos(Doubles[1]);
3273 Doubles[2] = FMath::Acos(Doubles[2]);
3274 Doubles[3] = FMath::Acos(Doubles[3]);
3275 return Doubles.ToVectorRegister();
3276#endif
3277}
3278
3280{
3281#if UE_PLATFORM_MATH_USE_SVML
3282 return _mm_atan_ps(X);
3283#else
3284 //return SSE::atan_ps(X);
3285 AlignedFloat4 Floats(X);
3286 Floats[0] = FMath::Atan(Floats[0]);
3287 Floats[1] = FMath::Atan(Floats[1]);
3288 Floats[2] = FMath::Atan(Floats[2]);
3289 Floats[3] = FMath::Atan(Floats[3]);
3290 return Floats.ToVectorRegister();
3291#endif
3292}
3293
3295{
3296#if UE_PLATFORM_MATH_USE_SVML_AVX
3297 return _mm256_atan_pd(X);
3298#elif UE_PLATFORM_MATH_USE_SVML
3300#else
3302 Doubles[0] = FMath::Atan(Doubles[0]);
3303 Doubles[1] = FMath::Atan(Doubles[1]);
3304 Doubles[2] = FMath::Atan(Doubles[2]);
3305 Doubles[3] = FMath::Atan(Doubles[3]);
3306 return Doubles.ToVectorRegister();
3307#endif
3308}
3309
3311{
3312#if UE_PLATFORM_MATH_USE_SVML
3313 return _mm_atan2_ps(Y, X);
3314#else
3315 //return SSE::atan2_ps(Y, X);
3318 FloatsY[0] = FMath::Atan2(FloatsY[0], FloatsX[0]);
3319 FloatsY[1] = FMath::Atan2(FloatsY[1], FloatsX[1]);
3320 FloatsY[2] = FMath::Atan2(FloatsY[2], FloatsX[2]);
3321 FloatsY[3] = FMath::Atan2(FloatsY[3], FloatsX[3]);
3322 return FloatsY.ToVectorRegister();
3323#endif
3324}
3325
3327{
3328#if UE_PLATFORM_MATH_USE_SVML_AVX
3329 return _mm256_atan2_pd(Y, X);
3330#elif UE_PLATFORM_MATH_USE_SVML
3331 return VectorRegister4Double(_mm_atan2_pd(Y.XY, X.XY), _mm_atan2_pd(Y.ZW, X.ZW));
3332#else
3335 DoublesY[0] = FMath::Atan2(DoublesY[0], DoublesX[0]);
3336 DoublesY[1] = FMath::Atan2(DoublesY[1], DoublesX[1]);
3337 DoublesY[2] = FMath::Atan2(DoublesY[2], DoublesX[2]);
3338 DoublesY[3] = FMath::Atan2(DoublesY[3], DoublesX[3]);
3339 return DoublesY.ToVectorRegister();
3340#endif
3341}
3342
3343
3345//Integer ops
3346
3347//Bitwise
// Bitwise AND of two 128-bit integer registers.
3349#define VectorIntAnd(A, B) _mm_and_si128(A, B)
// Bitwise OR of two 128-bit integer registers.
3351#define VectorIntOr(A, B) _mm_or_si128(A, B)
// Bitwise XOR of two 128-bit integer registers.
3353#define VectorIntXor(A, B) _mm_xor_si128(A, B)
// Mind the operand order: _mm_andnot_si128 inverts its FIRST operand, so this yields (~A) & B.
3355#define VectorIntAndNot(A, B) _mm_andnot_si128(A, B)
// Bitwise NOT, implemented as XOR with GlobalVectorConstants::IntAllMask.
// NOTE(review): IntAllMask is defined outside this section; presumably every bit is set — confirm.
3357#define VectorIntNot(A) _mm_xor_si128(A, GlobalVectorConstants::IntAllMask)
3358
3359//Comparison
// Lane-wise comparisons of four signed 32-bit integers. The underlying intrinsics set a result
// lane to all ones when the comparison holds and to zero otherwise.
3360#define VectorIntCompareEQ(A, B) _mm_cmpeq_epi32(A,B)
// There is no "not equal" intrinsic used here: the EQ mask is inverted with VectorIntNot.
3361#define VectorIntCompareNEQ(A, B) VectorIntNot(_mm_cmpeq_epi32(A,B))
3362#define VectorIntCompareGT(A, B) _mm_cmpgt_epi32(A,B)
3363#define VectorIntCompareLT(A, B) _mm_cmplt_epi32(A,B)
// A >= B is computed as NOT (A < B).
3364#define VectorIntCompareGE(A, B) VectorIntNot(VectorIntCompareLT(A,B))
// A <= B is computed as NOT (A > B).
3365#define VectorIntCompareLE(A, B) VectorIntNot(VectorIntCompareGT(A,B))
3366
3367
3369{
3370 // Can't use PBLENDVB in general because this is a bitwise select, not byte-lane-wise
3372}
3373
3374//Arithmetic
// Lane-wise addition of four 32-bit integers (_mm_add_epi32).
3375#define VectorIntAdd(A, B) _mm_add_epi32(A, B)
// Lane-wise subtraction A - B of four 32-bit integers (_mm_sub_epi32).
3376#define VectorIntSubtract(A, B) _mm_sub_epi32(A, B)
3377
3379{
3380 return _mm_mullo_epi32(A, B);
3381}
3382
// Lane-wise negation, computed as GlobalVectorConstants::IntZero - A via VectorIntSubtract.
// NOTE(review): IntZero is defined outside this section; presumably all lanes are 0 — confirm.
3383#define VectorIntNegate(A) VectorIntSubtract(GlobalVectorConstants::IntZero, A)
3384
3386{
3387 return _mm_min_epi32(A, B);
3388}
3389
3391{
3392 return _mm_max_epi32(A, B);
3393}
3394
3396{
3397 return _mm_abs_epi32(A);
3398}
3399
3401{
3403}
3404
// Lane-wise sign: builds an (A >= IntZero) mask and passes it to VectorIntSelect together with
// IntOne and IntMinusOne. Because the test is >=, lanes equal to zero fall on the IntOne side of the mask.
// NOTE(review): assumes VectorIntSelect returns its second argument where the mask is set — confirm
// against its definition.
3405#define VectorIntSign(A) VectorIntSelect(VectorIntCompareGE(A, GlobalVectorConstants::IntZero), GlobalVectorConstants::IntOne, GlobalVectorConstants::IntMinusOne)
3406
// Converts four signed 32-bit integers to four floats (_mm_cvtepi32_ps).
3407#define VectorIntToFloat(A) _mm_cvtepi32_ps(A)
3408
3410{
3411 return _mm_cvttps_epi32(A);
3412}
3413
3414// TODO: LWC: potential loss of data
3416{
3418}
3419
3421{
3424 return _mm_unpacklo_epi64(A, B);
3425}
3426
3428{
3429 return _mm_shuffle_epi8(Vec, Mask);
3430}
3431
3432
3433//Loads and stores
3434
// Stores all 128 bits of Vec to Ptr; no alignment requirement (_mm_storeu_si128).
3441#define VectorIntStore( Vec, Ptr ) _mm_storeu_si128( (VectorRegister4Int*)(Ptr), Vec )
// Stores only the low 64 bits of Vec to Ptr; no alignment requirement (_mm_storeu_si64).
// NOTE(review): the "_16" suffix presumably refers to four 16-bit elements (64 bits) — confirm.
3442#define VectorIntStore_16( Vec, Ptr ) _mm_storeu_si64( (VectorRegister4Int*)(Ptr), Vec )
3443
// Loads 128 bits from Ptr; no alignment requirement (_mm_loadu_si128).
3450#define VectorIntLoad( Ptr ) _mm_loadu_si128( (VectorRegister4Int*)(Ptr) )
// Loads 64 bits from Ptr into the low half of the register and zeroes the upper half (_mm_loadu_si64).
3451#define VectorIntLoad_16( Ptr ) _mm_loadu_si64 ( (VectorRegister4Int*)(Ptr) )
// Stores all 128 bits of Vec to Ptr; Ptr must be 16-byte aligned (_mm_store_si128).
3458#define VectorIntStoreAligned( Vec, Ptr ) _mm_store_si128( (VectorRegister4Int*)(Ptr), Vec )
3459
// Loads 128 bits from Ptr; Ptr must be 16-byte aligned (_mm_load_si128).
3466#define VectorIntLoadAligned( Ptr ) _mm_load_si128( (VectorRegister4Int*)(Ptr) )
3467
// Reads one 32-bit integer from Ptr and replicates it into all four lanes.
3474#define VectorIntLoad1(Ptr) _mm_set1_epi32(*(Ptr))
// Reads one 16-bit integer from Ptr and replicates it into all eight 16-bit lanes.
3475#define VectorIntLoad1_16(Ptr) _mm_set1_epi16(*(Ptr))
// Despite the generic name, this produces an all-zero INTEGER register (_mm_setzero_si128).
3476#define VectorSetZero() _mm_setzero_si128()
// Replicates the float F into all four lanes of a FLOAT register (_mm_set1_ps).
3477#define VectorSet1(F) _mm_set1_ps(F)
// Replicates the 32-bit integer F into all four lanes of an integer register.
3478#define VectorIntSet1(F) _mm_set1_epi32(F)
// Per-32-bit-lane shifts by the count ImmAmt: left (zero fill), right arithmetic (sign fill),
// right logical (zero fill).
3479#define VectorShiftLeftImm(Vec, ImmAmt) _mm_slli_epi32(Vec, ImmAmt)
3480#define VectorShiftRightImmArithmetic(Vec, ImmAmt) _mm_srai_epi32(Vec, ImmAmt)
3481#define VectorShiftRightImmLogical(Vec, ImmAmt) _mm_srli_epi32(Vec, ImmAmt)
// The casts below reinterpret the register bits as another vector type; no value conversion takes place.
3482#define VectorCastIntToFloat(Vec) _mm_castsi128_ps(Vec)
3483#define VectorCastFloatToInt(Vec) _mm_castps_si128(Vec)
// The double casts operate on 128-bit (two-double) registers, as required by _mm_castpd_si128 / _mm_castsi128_pd.
3484#define VectorCastDoubleToInt(Vec) _mm_castpd_si128(Vec)
3485#define VectorCastIntToDouble(Vec) _mm_castsi128_pd(Vec)
// Permutes the four 32-bit lanes of Vec. The indices are forwarded to _MM_SHUFFLE in the order given,
// and _MM_SHUFFLE takes its arguments from the highest lane down: I0 selects the source for result
// lane 3 and I3 selects the source for result lane 0. The indices must be compile-time constants.
3486#define VectorShuffleImmediate(Vec, I0, I1, I2, I3) _mm_shuffle_epi32(Vec, _MM_SHUFFLE(I0, I1, I2, I3))
// Interleaves the low four 16-bit elements of V0 with zeros, i.e. zero-extends them to four 32-bit lanes.
3487#define VectorIntExpandLow16To32(V0) _mm_unpacklo_epi16(V0, _mm_setzero_si128())
3488
3489#endif
3490
3491// IWYU pragma: end_exports
#define FORCEINLINE
Definition AndroidPlatform.h:140
FPlatformTypes::int64 int64
A 64-bit signed integer.
Definition Platform.h:1127
FPlatformTypes::int32 int32
A 32-bit signed integer.
Definition Platform.h:1125
#define RESTRICT
Definition Platform.h:706
FPlatformTypes::uint64 uint64
A 64-bit unsigned integer.
Definition Platform.h:1117
UE_FORCEINLINE_HINT TSharedRef< CastToType, Mode > StaticCastSharedRef(TSharedRef< CastFromType, Mode > const &InSharedRef)
Definition SharedPointer.h:127
#define X(Name, Desc)
Definition FormatStringSan.h:47
#define PRAGMA_DISABLE_UNSAFE_TYPECAST_WARNINGS
Definition MSVCPlatformCompilerPreSetup.h:81
#define PRAGMA_RESTORE_UNSAFE_TYPECAST_WARNINGS
Definition MSVCPlatformCompilerPreSetup.h:100
FORCEINLINE VectorRegister4Int MakeVectorRegisterInt(int32 X, int32 Y, int32 Z, int32 W)
Definition UnrealMathFPU.h:282
FORCEINLINE VectorRegister4Float VectorTan(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2039
FORCEINLINE VectorRegister4Float VectorSubtract(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:731
FORCEINLINE void VectorDeinterleave(VectorRegister4Float &RESTRICT OutEvens, VectorRegister4Float &RESTRICT OutOdds, const VectorRegister4Float &RESTRICT Lo, const VectorRegister4Float &RESTRICT Hi)
Definition UnrealMathFPU.h:1777
FORCEINLINE VectorRegister4Double VectorLoadFloat3(const double *Ptr)
Definition UnrealMathFPU.h:427
FORCEINLINE VectorRegister4Float VectorATan2(const VectorRegister4Float &Y, const VectorRegister4Float &X)
Definition UnrealMathFPU.h:2083
FORCEINLINE uint32 VectorAnyGreaterThan(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1917
FORCEINLINE VectorRegister4Float VectorSqrt(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1263
FORCEINLINE VectorRegister4Int VectorIntAbs(const VectorRegister4Int &A)
Definition UnrealMathFPU.h:2471
FORCEINLINE VectorRegister4Float VectorReciprocalSqrt(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1279
FORCEINLINE bool VectorMatrixInverse(FMatrix44d *DstMatrix, const FMatrix44d *SrcMatrix)
Definition UnrealMathFPU.h:1603
FORCEINLINE VectorRegister4Float VectorLoadSRGBA16N(void *Ptr)
Definition UnrealMathFPU.h:2268
FORCEINLINE VectorRegister4Int VectorIntMin(const VectorRegister4Int &A, const VectorRegister4Int &B)
Definition UnrealMathFPU.h:2453
FORCEINLINE VectorRegister4Float VectorDot3(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:880
FORCEINLINE VectorRegister4Float VectorMin(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1686
FORCEINLINE float VectorGetComponentImpl(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:364
FORCEINLINE VectorRegister4x4Float VectorLoad16(const float *Ptr)
Definition UnrealMathFPU.h:410
FORCEINLINE VectorRegister4Float VectorDot4(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:901
FORCEINLINE VectorRegister4Float MakeVectorRegister(uint32 X, uint32 Y, uint32 Z, uint32 W)
Definition UnrealMathFPU.h:195
FORCEINLINE void VectorSinCos(VectorRegister4Float *RESTRICT VSinAngles, VectorRegister4Float *RESTRICT VCosAngles, const VectorRegister4Float *RESTRICT VAngles)
Definition UnrealMathFPU.h:2109
FORCEINLINE VectorRegister4Float VectorLoadURGB10A2N(void *Ptr)
Definition UnrealMathFPU.h:1875
FORCEINLINE void VectorStoreSignedByte4(const VectorRegister4Float &Vec, void *Ptr)
Definition UnrealMathFPU.h:1858
FORCEINLINE VectorRegister4Float VectorSet_W1(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1407
FORCEINLINE VectorRegister4Float VectorSetFloat1(float F)
Definition UnrealMathFPU.h:518
FORCEINLINE VectorRegister4Float VectorLog2(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2004
FORCEINLINE void VectorQuaternionMultiply(VectorRegister4Float *RESTRICT Result, const VectorRegister4Float *RESTRICT Quat1, const VectorRegister4Float *RESTRICT Quat2)
Definition UnrealMathFPU.h:1431
#define VectorShuffle(Vec1, Vec2, X, Y, Z, W)
Definition UnrealMathFPU.h:652
VectorRegister4Double VectorRegister4d
Definition UnrealMathFPU.h:90
FORCEINLINE VectorRegister4Float VectorTruncate(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2153
FORCEINLINE VectorRegister4Double VectorZeroDouble(void)
Definition UnrealMathFPU.h:336
FORCEINLINE VectorRegister4Float VectorDivide(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:834
FORCEINLINE VectorRegister4Float VectorMultiply(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:758
AlignedDouble4 AlignedRegister4
Definition UnrealMathFPU.h:150
FORCEINLINE VectorRegister4Float VectorMax(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1713
FORCEINLINE VectorRegister4Float VectorBitwiseAnd(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1165
FORCEINLINE VectorRegister4Float VectorLoadFloat1(const float *Ptr)
Definition UnrealMathFPU.h:468
FORCEINLINE VectorRegister4Float VectorReciprocalLen(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1343
FORCEINLINE constexpr VectorRegister4Float MakeVectorRegisterFloatConstant(float X, float Y, float Z, float W)
Definition UnrealMathFPU.h:297
FORCEINLINE VectorRegister4Float VectorCos(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2027
FORCEINLINE VectorRegister4Float VectorLoadFloat2(const float *Ptr)
Definition UnrealMathFPU.h:485
FORCEINLINE VectorRegister4Int VectorIntSelect(const VectorRegister4Int &Mask, const VectorRegister4Int &Vec1, const VectorRegister4Int &Vec2)
Definition UnrealMathFPU.h:2411
FORCEINLINE VectorRegister4Float VectorRound(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2164
FORCEINLINE void VectorStoreByte4(const VectorRegister4Float &Vec, void *Ptr)
Definition UnrealMathFPU.h:1842
FORCEINLINE VectorRegister4Float VectorCombineLow(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1757
#define VectorGetComponent(Vec, ComponentIndex)
Definition UnrealMathFPU.h:385
VectorRegister4Double VectorRegister4
Definition UnrealMathFPU.h:94
FORCEINLINE void VectorStore16(const VectorRegister4x4Float &Vec, float *Dst)
Definition UnrealMathFPU.h:582
FORCEINLINE VectorRegister4Float VectorReciprocalSqrtEstimate(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1295
FORCEINLINE VectorRegister4Double VectorLoadDouble1(const double *Ptr)
Definition UnrealMathFPU.h:473
FORCEINLINE void VectorMatrixMultiply(FMatrix44d *Result, const FMatrix44d *Matrix1, const FMatrix44d *Matrix2)
Definition UnrealMathFPU.h:1538
VectorRegister4Float VectorLoadAligned(const float *Ptr)
Definition UnrealMathFPU.h:451
FORCEINLINE VectorRegister4Float VectorMultiplyAdd(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2, const VectorRegister4Float &Vec3)
Definition UnrealMathFPU.h:786
VectorRegister4Int VectorRegister4i
Definition UnrealMathFPU.h:88
FORCEINLINE VectorRegister4Float VectorSelect(const VectorRegister4Float &Mask, const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1105
FORCEINLINE VectorRegister4Float VectorExp(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1971
FORCEINLINE VectorRegister4Float VectorCompareGT(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:974
FORCEINLINE VectorRegister4Int VectorRoundToIntHalfToEven(const VectorRegister4Float &A)
Definition UnrealMathFPU.h:2175
FORCEINLINE VectorRegister4Double MakeVectorRegisterDoubleMask(uint64 X, uint64 Y, uint64 Z, uint64 W)
Definition UnrealMathFPU.h:206
FORCEINLINE VectorRegister4Float VectorExp2(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1982
FORCEINLINE VectorRegister4Float VectorASin(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2050
FORCEINLINE VectorRegister4Float VectorLoadTwoPairsFloat(const float *Ptr1, const float *Ptr2)
Definition UnrealMathFPU.h:503
FORCEINLINE VectorRegister4Double VectorOneDouble(void)
Definition UnrealMathFPU.h:351
FORCEINLINE VectorRegister4Float VectorReciprocalLenEstimate(const VectorRegister4Float &Vector)
Definition UnrealMathFPU.h:1375
FORCEINLINE void VectorStore(const VectorRegister4Float &Vec, float *Dst)
Definition UnrealMathFPU.h:566
FORCEINLINE VectorRegister4Float VectorTransformVector(const VectorRegister4Float &VecP, const FMatrix44f *MatrixM)
Definition UnrealMathFPU.h:1619
FORCEINLINE VectorRegister4Float VectorCompareGE(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1000
FORCEINLINE VectorRegister4Float VectorMod(const VectorRegister4Float &X, const VectorRegister4Float &Y)
Definition UnrealMathFPU.h:2185
FORCEINLINE VectorRegister4Int MakeVectorRegisterInt64(int64 X, int64 Y)
Definition UnrealMathFPU.h:307
FORCEINLINE VectorRegister4Float VectorCombineHigh(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1740
FORCEINLINE VectorRegister4Float VectorCompareLT(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1025
FORCEINLINE VectorRegister4Double MakeVectorRegisterDouble(uint64 X, uint64 Y, uint64 Z, uint64 W)
Definition UnrealMathFPU.h:185
VectorRegister4 VectorRegister
Definition UnrealMathFPU.h:95
FORCEINLINE void VectorStoreAlignedStreamed(const VectorRegister4Float &Vec, float *Dst)
Definition UnrealMathFPU.h:550
FORCEINLINE float VectorGetComponentDynamic(const VectorRegister4Float &Vec, uint32 ComponentIndex)
Definition UnrealMathFPU.h:369
FORCEINLINE VectorRegister4Float VectorLog(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1993
FORCEINLINE VectorRegister4Float VectorSet_W0(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1391
#define VectorLoadSignedByte4(Ptr)
Definition UnrealMathFPU.h:1823
FORCEINLINE constexpr VectorRegister4Int MakeVectorRegisterIntConstant(int32 X, int32 Y, int32 Z, int32 W)
Definition UnrealMathFPU.h:292
FORCEINLINE int32 VectorMaskBits(const VectorRegister4Float &Vec1)
Definition UnrealMathFPU.h:1075
FORCEINLINE VectorRegister4Float VectorNegate(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:687
FORCEINLINE VectorRegister4Float VectorNegateMultiplyAdd(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2, const VectorRegister4Float &Vec3)
Definition UnrealMathFPU.h:815
FORCEINLINE VectorRegister4Float VectorReciprocal(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1311
FORCEINLINE VectorRegister4Float VectorSin(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2015
FORCEINLINE constexpr VectorRegister2Double MakeVectorRegister2DoubleConstant(double X, double Y)
Definition UnrealMathFPU.h:302
FORCEINLINE void VectorStoreURGBA16N(const VectorRegister4Float &Vec, void *Ptr)
Definition UnrealMathFPU.h:2288
FORCEINLINE VectorRegister4Int VectorShuffleByte4(const VectorRegister4Int &Vec, const VectorRegister4Int &Mask)
Definition UnrealMathFPU.h:2515
FORCEINLINE VectorRegister4Float VectorAbs(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:661
FORCEINLINE VectorRegister4Float VectorACos(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2061
FORCEINLINE VectorRegister4Float VectorAdd(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:704
FORCEINLINE VectorRegister4Int VectorDoubleToInt(const VectorRegister4Double &A)
Definition UnrealMathFPU.h:2510
FORCEINLINE VectorRegister4Float VectorFloor(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2142
FORCEINLINE float VectorDot3Scalar(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:861
VectorRegister4Int VectorRegisterInt
Definition UnrealMathFPU.h:96
void VectorStoreAligned(const VectorRegister4Float &Vec, float *Ptr)
Definition UnrealMathFPU.h:534
FORCEINLINE VectorRegister4Float MakeVectorRegisterFloatMask(uint32 X, uint32 Y, uint32 Z, uint32 W)
Definition UnrealMathFPU.h:201
FORCEINLINE VectorRegister4Float VectorBitwiseXor(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1190
FORCEINLINE VectorRegister4Float VectorCeil(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2131
FORCEINLINE VectorRegister4Double VectorLoadFloat3_W1(const double *Ptr)
Definition UnrealMathFPU.h:439
#define VectorSwizzle(Vec, X, Y, Z, W)
Definition UnrealMathFPU.h:639
FORCEINLINE VectorRegister4Float VectorQuaternionMultiply2(const VectorRegister4Float &Quat1, const VectorRegister4Float &Quat2)
Definition UnrealMathFPU.h:1517
FORCEINLINE VectorRegister4Int VectorIntMax(const VectorRegister4Int &A, const VectorRegister4Int &B)
Definition UnrealMathFPU.h:2462
FORCEINLINE VectorRegister4Float VectorOneFloat(void)
Definition UnrealMathFPU.h:346
#define VectorLoadByte4(Ptr)
Definition UnrealMathFPU.h:1814
FORCEINLINE VectorRegister4Float VectorZeroFloat(void)
Definition UnrealMathFPU.h:331
FORCEINLINE VectorRegister4Float VectorATan(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2072
FORCEINLINE VectorRegister4Float VectorLoad(const float *Ptr)
Definition UnrealMathFPU.h:394
FORCEINLINE VectorRegister4Float VectorCross(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1216
FORCEINLINE VectorRegister4Float VectorBitwiseOr(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1140
FORCEINLINE VectorRegister4Int VectorFloatToInt(const VectorRegister4Float &A)
Definition UnrealMathFPU.h:2491
bool VectorContainsNaNOrInfinite(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1960
FORCEINLINE VectorRegister4Float VectorStep(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2223
VectorRegister4Float VectorRegister4f
Definition UnrealMathFPU.h:89
FORCEINLINE VectorRegister4Float VectorPow(const VectorRegister4Float &Base, const VectorRegister4Float &Exponent)
Definition UnrealMathFPU.h:1243
FORCEINLINE VectorRegister4Int VectorIntMultiply(const VectorRegister4Int &A, const VectorRegister4Int &B)
Definition UnrealMathFPU.h:2435
FORCEINLINE void VectorStoreFloat3(const VectorRegister4Float &Vec, float *Dst)
Definition UnrealMathFPU.h:594
FORCEINLINE VectorRegister4Float VectorReciprocalEstimate(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1327
FORCEINLINE VectorRegister4Float VectorCompareLE(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1050
FORCEINLINE VectorRegister4Float VectorCompareEQ(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:923
FORCEINLINE void VectorStoreFloat1(const VectorRegister4Float &Vec, float *Dst)
Definition UnrealMathFPU.h:610
FORCEINLINE VectorRegister4Float MakeVectorRegisterFloat(uint32 X, uint32 Y, uint32 Z, uint32 W)
Definition UnrealMathFPU.h:175
FORCEINLINE VectorRegister4Float MakeVectorRegisterFloatFromDouble(const VectorRegister4Double &Vec4d)
Definition UnrealMathFPU.h:262
FORCEINLINE VectorRegister2Double MakeVectorRegister2Double(double X, double Y)
Definition UnrealMathFPU.h:158
#define VectorReplicate(Vec, ElementIndex)
Definition UnrealMathFPU.h:627
VectorRegister2Double VectorRegister2d
Definition UnrealMathFPU.h:91
FORCEINLINE VectorRegister4Float VectorSign(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2204
#define VectorLoadByte4Reverse(Ptr)
Definition UnrealMathFPU.h:1833
FORCEINLINE VectorRegister4Float VectorCompareNE(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:948
FORCEINLINE VectorRegister4Float VectorMergeVecXYZ_VecW(const VectorRegister4Float &VecXYZ, const VectorRegister4Float &VecW)
Definition UnrealMathFPU.h:1797
FORCEINLINE void VectorStoreURGB10A2N(const VectorRegister4Float &Vec, void *Ptr)
Definition UnrealMathFPU.h:1895
uint32_t uint32
Definition binka_ue_file_header.h:6
@ XY
Definition Axis.h:31
VectorRegister4Float FloatInfinity()
Definition UnrealMathVectorConstants.h.inl:118
VectorRegister4Double DoubleSignMask()
Definition UnrealMathVectorConstants.h.inl:115
constexpr VectorRegister4Float FloatZero
Definition UnrealMathVectorConstants.h.inl:41
constexpr VectorRegister4Float FloatOne
Definition UnrealMathVectorConstants.h.inl:40
constexpr VectorRegister4Float FloatMinusOne
Definition UnrealMathVectorConstants.h.inl:42
constexpr VectorRegister4Float OneOverTwoPi
Definition UnrealMathVectorConstants.h.inl:126
constexpr VectorRegister4Float SmallNumber
Definition UnrealMathVectorConstants.h.inl:53
constexpr VectorRegister4Double DoubleMinusOne
Definition UnrealMathVectorConstants.h.inl:60
constexpr VectorRegister4Double DOUBLE_QMULTI_SIGN_MASK2
Definition UnrealMathVectorConstants.h.inl:91
constexpr VectorRegister4Float QMULTI_SIGN_MASK0
Definition UnrealMathVectorConstants.h.inl:86
constexpr VectorRegister4Float TwoPi
Definition UnrealMathVectorConstants.h.inl:122
constexpr VectorRegister4Float QMULTI_SIGN_MASK1
Definition UnrealMathVectorConstants.h.inl:87
constexpr VectorRegister4Double DoubleSmallNumber
Definition UnrealMathVectorConstants.h.inl:71
constexpr VectorRegister4Float PiByTwo
Definition UnrealMathVectorConstants.h.inl:123
constexpr VectorRegister4Double DOUBLE_QMULTI_SIGN_MASK0
Definition UnrealMathVectorConstants.h.inl:89
constexpr VectorRegister4Float Pi
Definition UnrealMathVectorConstants.h.inl:121
VectorRegister4Double DoubleXYZMask()
Definition UnrealMathVectorConstants.h.inl:102
constexpr VectorRegister4Float QMULTI_SIGN_MASK2
Definition UnrealMathVectorConstants.h.inl:88
VectorRegister4Float SignMask()
Definition UnrealMathVectorConstants.h.inl:106
constexpr VectorRegister2Double DoubleOne2d
Definition UnrealMathVectorConstants.h.inl:57
VectorRegister4Double DoubleInfinity()
Definition UnrealMathVectorConstants.h.inl:119
constexpr VectorRegister4Double DOUBLE_QMULTI_SIGN_MASK1
Definition UnrealMathVectorConstants.h.inl:90
VectorRegister4Float SignBit()
Definition UnrealMathVectorConstants.h.inl:105
constexpr VectorRegister4Double DoubleOne
Definition UnrealMathVectorConstants.h.inl:58
constexpr VectorRegister4Double DoubleZero
Definition UnrealMathVectorConstants.h.inl:59
constexpr UE::Math::TVector2< T > GetXY(const UE::Math::TVector< T > &V)
Definition VectorTypes.h:262
UE_STRING_CLASS Result(Forward< LhsType >(Lhs), RhsLen)
Definition String.cpp.inl:732
float v
Definition radaudio_mdct.cpp:62
U16 Index
Definition radfft.cpp:71
Definition UnrealMathFPU.h:133
Definition UnrealMathFPU.h:113
static CORE_API bool MatrixInverse(FMatrix44f *DstMatrix, const FMatrix44f *SrcMatrix)
Definition UnrealMath.cpp:928
static float Log2(float Value)
Definition UnrealMathUtility.h:722
Definition UnrealMathFPU.h:34
Definition UnrealMathFPU.h:42
VectorRegister2Double XY
Definition UnrealMathFPU.h:47
VectorRegister2Double ZW
Definition UnrealMathFPU.h:48
Definition UnrealMathFPU.h:20
Definition UnrealMathFPU.h:28
Definition UnrealMathFPU.h:99
Definition UnrealMathFPU.h:14