UDocumentation UE5.7 10.02.2026 (Source)
API documentation for Unreal Engine 5.7
UnrealMathNeon.h
Go to the documentation of this file.
1// Copyright Epic Games, Inc. All Rights Reserved.
2
3#pragma once
4
5#include "HAL/Platform.h"
6
7// HEADER_UNIT_SKIP - Not included directly
8
10
11#if PLATFORM_ENABLE_VECTORINTRINSICS_NEON
12
14
15// Include the intrinsic functions header
16#if (PLATFORM_WINDOWS && PLATFORM_64BITS && !PLATFORM_COMPILER_CLANG)
17#include <arm64_neon.h>
18#else
19#include <arm_neon.h>
20#endif
21
22#include "Math/Float16.h"
23
24/*=============================================================================
25 * Helpers:
26 *============================================================================*/
27
28#if PLATFORM_COMPILER_MSVC
29
30// MSVC NEON headers typedef float32x4_t and int32x4_t both to __n128
31// This wrapper type allows VectorRegister4Float and VectorRegister4Int to be
32// discriminated for template specialization (e.g. FConstantHandler)
33//
34// This comes at the cost of having to define constructors for some
35// anonymous unions, because VectorRegister4Float/VectorRegister4Int are no
36// longer trivially constructible. The optimizer should eliminate the
37// redundant zero initialization in these cases for non-MSVC (e.g. V()
38// is called now where it wasn't before)
39template<typename T, typename BASE_TYPE>
40struct alignas(alignof(T)) VectorRegisterWrapper
41{
44
45 FORCEINLINE operator T&() { return m_vec; }
46 FORCEINLINE operator const T&() const { return m_vec; }
47
48 FORCEINLINE BASE_TYPE operator[](int Index) const;
49
50 T m_vec;
51};
52
53template<>
54FORCEINLINE float VectorRegisterWrapper<float32x4_t, float>::operator[](int Index) const
55{
56 return m_vec.n128_f32[Index];
57}
58
59template<>
60FORCEINLINE double VectorRegisterWrapper<float64x2_t, double>::operator[](int Index) const
61{
62 return m_vec.n128_f64[Index];
63}
64
65template<>
66FORCEINLINE int VectorRegisterWrapper<int32x4_t, int>::operator[](int Index) const
67{
68 return m_vec.n128_i32[Index];
69}
70
71template<>
72FORCEINLINE int64 VectorRegisterWrapper<int64x2_t, int64>::operator[](int Index) const
73{
74 return m_vec.n128_i64[Index];
75}
76
82
84
86{
87 int32x4_t Out = {};
88 Out.n128_i32[0] = X;
89 Out.n128_i32[1] = Y;
90 Out.n128_i32[2] = Z;
91 Out.n128_i32[3] = W;
92 return Out;
93}
94
95FORCEINLINE constexpr VectorRegister4Float MakeVectorRegisterFloatConstant(float X, float Y, float Z, float W)
96{
97 float32x4_t Out = {};
98 Out.n128_f32[0] = X;
99 Out.n128_f32[1] = Y;
100 Out.n128_f32[2] = Z;
101 Out.n128_f32[3] = W;
102 return Out;
103}
104
106{
107 float64x2_t Out = {};
108 Out.n128_f64[0] = X;
109 Out.n128_f64[1] = Y;
110 return Out;
111}
112
113#else
114
121
123{
124 return VectorRegister4Int { X, Y, Z, W };
125}
126
127FORCEINLINE constexpr VectorRegister4Float MakeVectorRegisterFloatConstant(float X, float Y, float Z, float W)
128{
129 return VectorRegister4Float { X, Y, Z, W };
130}
131
133{
134 return VectorRegister2Double { X, Y };
135}
136
137#endif
138
139#define DECLARE_VECTOR_REGISTER(X, Y, Z, W) MakeVectorRegister( X, Y, Z, W )
140
141struct alignas(16) VectorRegister4Double
142{
143 struct
144 {
147 };
148
152
154 {
155 XY = vcvt_f64_f32(*(float32x2_t*)&From);
156 ZW = vcvt_high_f64_f32(From);
157 }
158
160
162 {
163 *this = VectorRegister4Double(From);
164 return *this;
165 }
166};
167
169#define VectorZeroVectorRegister() VectorZeroDouble()
170#define VectorOneVectorRegister() VectorOneDouble()
171
172// Forward declarations
177
178
179// Helper for conveniently aligning a float array for extraction from VectorRegister4Float
180struct alignas(alignof(VectorRegister4Float)) AlignedFloat4
181{
182 float V[4];
183
185 {
187 }
188
189 FORCEINLINE float operator[](int32 Index) const { return V[Index]; }
190 FORCEINLINE float& operator[](int32 Index) { return V[Index]; }
191
193 {
194 return VectorLoadAligned(V);
195 }
196};
197
198
199// Helper for conveniently aligning a double array for extraction from VectorRegister4Double
200struct alignas(alignof(VectorRegister4Double)) AlignedDouble4
201{
202 double V[4];
203
205 {
207 }
208
209 FORCEINLINE double operator[](int32 Index) const { return V[Index]; }
210 FORCEINLINE double& operator[](int32 Index) { return V[Index]; }
211
213 {
214 return VectorLoadAligned(V);
215 }
216};
217
219
220// Aliases
225
226// Backwards compatibility
230
241{
242 union U {
244 FORCEINLINE U() : V() {}
245 } Tmp;
246 Tmp.F[0] = X;
247 Tmp.F[1] = Y;
248 Tmp.F[2] = Z;
249 Tmp.F[3] = W;
250 return Tmp.V;
251}
252
254{
255 return MakeVectorRegister(X, Y, Z, W);
256}
257
258// Nicer alias
260{
261 return MakeVectorRegisterFloat(X, Y, Z, W);
262}
263
264
274FORCEINLINE VectorRegister4Float MakeVectorRegister(float X, float Y, float Z, float W)
275{
276 union U {
277 VectorRegister4Float V; float F[4];
278 FORCEINLINE U() : V() {}
279 } Tmp;
280 Tmp.F[0] = X;
281 Tmp.F[1] = Y;
282 Tmp.F[2] = Z;
283 Tmp.F[3] = W;
284 return Tmp.V;
285}
286
287FORCEINLINE VectorRegister4Float MakeVectorRegisterFloat(float X, float Y, float Z, float W)
288{
289 return MakeVectorRegister(X, Y, Z, W);
290}
291
301FORCEINLINE VectorRegister4Double MakeVectorRegister(double X, double Y, double Z, double W)
302{
303 union U
304 {
305 VectorRegister4Double V; double D[4];
306 FORCEINLINE U() : V() {}
307 } Tmp;
308 Tmp.D[0] = X;
309 Tmp.D[1] = Y;
310 Tmp.D[2] = Z;
311 Tmp.D[3] = W;
312 return Tmp.V;
313}
314
315FORCEINLINE VectorRegister4Double MakeVectorRegisterDouble(double X, double Y, double Z, double W)
316{
317 return MakeVectorRegister(X, Y, Z, W);
318}
319
321{
322 return VectorRegister4Double(XY, ZW);
323}
324
326{
327 union U
328 {
330 FORCEINLINE U() : V() {}
331 } Tmp;
332 Tmp.D[0] = X;
333 Tmp.D[1] = Y;
334 Tmp.D[2] = Z;
335 Tmp.D[3] = W;
336 return Tmp.V;
337}
338
339// Nicer alias
341{
342 return MakeVectorRegisterDouble(X, Y, Z, W);
343}
344
346{
347 union U
348 {
349 VectorRegister2Double V; double D[2];
350 FORCEINLINE U() : V() {}
351 } Tmp;
352 Tmp.D[0] = X;
353 Tmp.D[1] = Y;
354 return Tmp.V;
355}
356
358{
359 union U
360 {
362 FORCEINLINE U() : V() {}
363 } Tmp;
364 Tmp.D[0] = X;
365 Tmp.D[1] = Y;
366 return Tmp.V;
367}
368
379{
380 union U {
382 FORCEINLINE U() : V() {}
383 } Tmp;
384 Tmp.I[0] = X;
385 Tmp.I[1] = Y;
386 Tmp.I[2] = Z;
387 Tmp.I[3] = W;
388 return Tmp.V;
389}
390
392{
393 union U
394 {
396 FORCEINLINE U() : V() {}
397 } Tmp;
398 Tmp.I[0] = X;
399 Tmp.I[1] = Y;
400 return Tmp.V;
401}
402
403// Make double register from float register
405{
406 return VectorRegister4Double(From);
407}
408
409// Lossy conversion: double->float vector
411{
413}
414
415/*
416#define VectorPermute(Vec1, Vec2, Mask) my_perm(Vec1, Vec2, Mask)
417
418/ ** Reads NumBytesMinusOne+1 bytes from the address pointed to by Ptr, always reading the aligned 16 bytes containing the start of Ptr, but only reading the next 16 bytes if the data straddles the boundary * /
419FORCEINLINE VectorRegister4Float VectorLoadNPlusOneUnalignedBytes(const void* Ptr, int NumBytesMinusOne)
420{
421 return VectorPermute( my_ld (0, (float*)Ptr), my_ld(NumBytesMinusOne, (float*)Ptr), my_lvsl(0, (float*)Ptr) );
422}
423*/
424
425
426/*=============================================================================
427 * Constants:
428 *============================================================================*/
429
431
432
433/*=============================================================================
434 * Intrinsics:
435 *============================================================================*/
436
443{
444 return vdupq_n_f32( 0.0f );
445}
446
448{
451}
452
453
460{
461 return vdupq_n_f32( 1.0f );
462}
463
465{
467 Result.XY = vdupq_n_f64(1.0f);
468 Result.ZW = Result.XY;
469 return Result;
470}
471
479{
480 return vld1q_f32( (float32_t*)Ptr );
481}
482
484{
487 return Result;
488}
489
497{
498 return vld1q_f32_x4(Ptr);
499}
500
508{
509 return MakeVectorRegister(Ptr[0], Ptr[1], Ptr[0], Ptr[1]);
510}
511
519{
521 Result.XY = vld1q_f64(Ptr);
522 Result.ZW = vcombine_f64(vld1_f64(&Ptr[2]), vdup_n_f64(0.0));
523 return Result;
524}
525
533{
535 Result.XY = vld1q_f64(Ptr);
536 Result.ZW = vcombine_f64(vld1_f64(&Ptr[2]), vdup_n_f64(1.0));
537 return Result;
538}
539
543template <int ElementIndex>
545{
546 return vsetq_lane_f32(Scalar, Vec, ElementIndex);
547}
548
549template <int ElementIndex>
551{
552 return vsetq_lane_f64(Scalar, Vec, ElementIndex);
553}
554
555template <int ElementIndex>
557{
559 if constexpr (ElementIndex > 1)
560 {
561 Result.XY = Vec.XY;
562 Result.ZW = VectorSetComponentImpl<ElementIndex - 2>(Vec.ZW, Scalar);
563 }
564 else
565 {
567 Result.ZW = Vec.ZW;
568 }
569 return Result;
570}
571
572#define VectorSetComponent( Vec, ElementIndex, Scalar ) VectorSetComponentImpl<ElementIndex>(Vec, Scalar)
573
574
582{
583 return VectorLoad(Ptr);
584}
585
587{
588 return VectorLoad(Ptr);
589}
590
598{
599 return vdupq_n_f32(Ptr[0]);
600}
601
603{
605 Result.XY = vdupq_n_f64(Ptr[0]);
606 Result.ZW = Result.XY;
607 return Result;
608}
609
611{
612 return vcombine_s64(vld1_s64((const int64_t *)Ptr), vdup_n_s64(0));
613}
614
624{
627 return vcombine_f32(Lo, Hi);
628}
629
631{
633 Res.XY = vld1q_f64(Ptr1);
634 Res.ZW = vld1q_f64(Ptr2);
635 return Res;
636}
637
645{
646 return vdupq_n_f32(X);
647}
648
650{
652 Result.XY = vdupq_n_f64(X);
653 Result.ZW = Result.XY;
654 return Result;
655}
656
664{
665 vst1q_f32(Ptr, Vec);
666}
667
669{
671}
672
673// TODO: LWC - VectorVM.cpp calls this on line 3294 (case EVectorVMOp::outputdata_half: Context.WriteExecFunction(CopyConstantToOutput<float, FFloat16, 2>); break;)
675{
676 AlignedFloat4 Floats(Vec);
677 for (int i = 0; i < 4; ++i)
678 {
679 Ptr[i] = Floats[i];
680 }
681}
682
689#define VectorStoreAlignedStreamed(Vec, Ptr) VectorStoreAligned(Vec, Ptr)
690
698{
699 vst1q_f32(Ptr, Vec);
700}
701
703{
705}
706
714{
715 vst1q_f32_x4(Ptr, Vec);
716}
717
725{
726 vst1_f32(Ptr, *(float32x2_t*)&Vec);
727 vst1q_lane_f32(((float32_t*)Ptr) + 2, Vec, 2);
728}
729
737{
738 vst1q_f64(Ptr, Vec.XY);
739 vst1q_lane_f64(((float64_t*)Ptr) + 2, Vec.ZW, 0);
740}
741
742
750{
751 vst1q_lane_f32( Ptr, Vec, 0 );
752}
753
755{
756 vst1q_lane_f64(Ptr, Vec.XY, 0);
757}
758
767template <int ElementIndex>
769{
770 return vdupq_n_f32(vgetq_lane_f32(Vec, ElementIndex));
771}
772
773template <int ElementIndex>
775{
776 return vdupq_n_f64(vgetq_lane_f64(Vec, ElementIndex));
777}
778
779template <int ElementIndex>
781{
783 if constexpr (ElementIndex <= 1)
784 {
786 Result.ZW = Result.XY;
787 }
788 else
789 {
790 Result.ZW = VectorReplicateImpl<ElementIndex - 2>(Vec.ZW);
791 Result.XY = Result.ZW;
792 }
793 return Result;
794}
795
796#define VectorReplicate( Vec, ElementIndex ) VectorReplicateImpl<ElementIndex>(Vec)
797
798
806{
807 return vabsq_f32( Vec );
808}
809
811{
813 Result.XY = vabsq_f64(Vec.XY);
814 Result.ZW = vabsq_f64(Vec.ZW);
815 return Result;
816}
817
825{
826 return vnegq_f32( Vec );
827}
828
830{
832 Result.XY = vnegq_f64(Vec.XY);
833 Result.ZW = vnegq_f64(Vec.ZW);
834 return Result;
835}
836
845{
846 return vaddq_f32( Vec1, Vec2 );
847}
848
850{
852 Result.XY = vaddq_f64(Vec1.XY, Vec2.XY);
853 Result.ZW = vaddq_f64(Vec1.ZW, Vec2.ZW);
854 return Result;
855}
856
857
866{
867 return vsubq_f32( Vec1, Vec2 );
868}
869
871{
873 Res.XY = vsubq_f64(Vec1.XY, Vec2.XY);
874 Res.ZW = vsubq_f64(Vec1.ZW, Vec2.ZW);
875 return Res;
876}
877
878
887{
888 return vmulq_f32( Vec1, Vec2 );
889}
890
892{
893 return vmulq_f64(Vec1, Vec2);
894}
895
897{
899 Result.XY = vmulq_f64(Vec1.XY, Vec2.XY);
900 Result.ZW = vmulq_f64(Vec1.ZW, Vec2.ZW);
901 return Result;
902}
903
904
913{
914 return vdivq_f32(Vec1, Vec2);
915}
916
918{
920 Res.XY = vdivq_f64(Vec1.XY, Vec2.XY);
921 Res.ZW = vdivq_f64(Vec1.ZW, Vec2.ZW);
922 return Res;
923}
924
925
935{
936 return vfmaq_f32(Acc, Vec1, Vec2 );
937}
938
940{
942 Result.XY = vfmaq_f64(Acc.XY, Vec1.XY, Vec2.XY);
943 Result.ZW = vfmaq_f64(Acc.ZW, Vec1.ZW, Vec2.ZW);
944 return Result;
945}
946
956{
957 return vfmsq_f32(Sub, Vec1, Vec2);
958}
959
961{
963 Result.XY = vfmsq_f64(Sub.XY, Vec1.XY, Vec2.XY);
964 Result.ZW = vfmsq_f64(Sub.ZW, Vec1.ZW, Vec2.ZW);
965 return Result;
966}
967
968
978{
980 Temp = vsetq_lane_f32( 0.0f, Temp, 3 );
981 float32x2_t sum = vpadd_f32( vget_low_f32( Temp ), vget_high_f32( Temp ) );
982 sum = vpadd_f32( sum, sum );
983 return vdupq_lane_f32( sum, 0 );
984}
985
987{
989 A = vmulq_f64(Vec1.XY, Vec2.XY);
990 B = vfmaq_f64(A, Vec1.ZW, Vec2.ZW);
993 Temp.XY = vdupq_lane_f64(Sum, 0);
994 Temp.ZW = Temp.XY;
995 return Temp;
996}
997
999{
1000 return vgetq_lane_f32(VectorDot3(Vec1, Vec2), 0);
1001}
1002
1004{
1006 A = vmulq_f64(Vec1.XY, Vec2.XY);
1007 B = vfmaq_f64(A, Vec1.ZW, Vec2.ZW);
1009 return *(double*)&Sum;
1010}
1011
1012
1013
1023{
1026 sum = vpadd_f32(sum, sum);
1027 return vdupq_lane_f32(sum, 0);
1028}
1029
1031{
1033 A = vmulq_f64(Vec1.XY, Vec2.XY);
1034 B = vfmaq_f64(A, Vec1.ZW, Vec2.ZW);
1035 A = vextq_f64(B, B, 1);
1037 Temp.XY = vaddq_f64(A, B);
1038 Temp.ZW = Temp.XY;
1039 return Temp;
1040}
1041
1051{
1053}
1054
1056{
1060 return Result;
1061}
1062
1063
1064
1074{
1076}
1077
1079{
1083 return Result;
1084}
1085
1095{
1097}
1098
1100{
1104 return Result;
1105}
1106
1116{
1118}
1119
1121{
1125 return Result;
1126}
1127
1136{
1138}
1139
1141{
1145 return Res;
1146}
1147
1156{
1158}
1159
1161{
1165 return Res;
1166}
1167
1179{
1181}
1182
1184{
1188 return Result;
1189}
1190
1199{
1201}
1202
1204{
1208 return Result;
1209}
1210
1219{
1221}
1222
1224{
1228 return Result;
1229}
1230
1239{
1241}
1242
1244{
1248 return Result;
1249}
1250
1251
1262#ifndef __clang__
1264(
1266 uint32 E0,
1267 uint32 E1,
1268 uint32 E2,
1269 uint32 E3
1270)
1271{
1272 check((E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4));
1273 static constexpr uint32_t ControlElement[4] =
1274 {
1275 0x03020100, // XM_SWIZZLE_X
1276 0x07060504, // XM_SWIZZLE_Y
1277 0x0B0A0908, // XM_SWIZZLE_Z
1278 0x0F0E0D0C, // XM_SWIZZLE_W
1279 };
1280
1281 uint8x8x2_t tbl;
1282 tbl.val[0] = vget_low_f32(V);
1283 tbl.val[1] = vget_high_f32(V);
1284
1285 uint32x2_t idx = vcreate_u32(static_cast<uint64>(ControlElement[E0]) | (static_cast<uint64>(ControlElement[E1]) << 32));
1286 const uint8x8_t rL = vtbl2_u8(tbl, idx);
1287
1288 idx = vcreate_u32(static_cast<uint64>(ControlElement[E2]) | (static_cast<uint64>(ControlElement[E3]) << 32));
1289 const uint8x8_t rH = vtbl2_u8(tbl, idx);
1290
1291 return vcombine_f32(rL, rH);
1292}
1293
1295(
1297 uint32 E0,
1298 uint32 E1,
1299 uint32 E2,
1300 uint32 E3
1301)
1302{
1303 check((E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4));
1304 static constexpr uint64_t ControlElement[4] =
1305 {
1306 0x0706050403020100ULL, // XM_SWIZZLE_X
1307 0x0F0E0D0C0B0A0908ULL, // XM_SWIZZLE_Y
1308 0x1716151413121110ULL, // XM_SWIZZLE_Z
1309 0x1F1E1D1C1B1A1918ULL, // XM_SWIZZLE_W
1310 };
1311
1312 uint8x16x2_t tbl;
1313 tbl.val[0] = V.XY;
1314 tbl.val[1] = V.ZW;
1315
1318 Result.XY = vqtbl2q_u8(tbl, idx);
1319
1321 Result.ZW = vqtbl2q_u8(tbl, idx);
1322
1323 return Result;
1324}
1325#else
1326template <int X, int Y, int Z, int W>
1328{
1329 return __builtin_shufflevector(Vec, Vec, X, Y, Z, W);
1330}
1331
1332template <int X, int Y>
1334{
1335 if constexpr (X <= 1)
1336 {
1337 if constexpr (Y <= 1)
1338 {
1339 return __builtin_shufflevector(Vec.XY, Vec.XY, X, Y);
1340 }
1341 else
1342 {
1343 return __builtin_shufflevector(Vec.XY, Vec.ZW, X, Y);
1344 }
1345 }
1346 else
1347 {
1348 if constexpr (Y <= 1)
1349 {
1350 return __builtin_shufflevector(Vec.ZW, Vec.XY, X - 2, Y + 2);
1351 }
1352 else
1353 {
1354 return __builtin_shufflevector(Vec.ZW, Vec.ZW, X - 2, Y);
1355 }
1356 }
1357}
1358
1359template <int X, int Y, int Z, int W>
1361{
1365 return Result;
1366}
1367
1368#define VectorSwizzle( Vec, X, Y, Z, W ) VectorSwizzleImpl<X, Y, Z, W>(Vec)
1369#endif // __clang__
1370
1371
1383#ifndef __clang__
1385(
1392)
1393{
1394 check(PermuteX <= 3 && PermuteY <= 3 && PermuteZ <= 3 && PermuteW <= 3);
1395
1396 static constexpr uint32 ControlElement[8] =
1397 {
1398 0x03020100, // XM_PERMUTE_0X
1399 0x07060504, // XM_PERMUTE_0Y
1400 0x0B0A0908, // XM_PERMUTE_0Z
1401 0x0F0E0D0C, // XM_PERMUTE_0W
1402 0x13121110, // XM_PERMUTE_1X
1403 0x17161514, // XM_PERMUTE_1Y
1404 0x1B1A1918, // XM_PERMUTE_1Z
1405 0x1F1E1D1C, // XM_PERMUTE_1W
1406 };
1407
1408 uint8x8x4_t tbl;
1409 tbl.val[0] = vget_low_f32(V1);
1410 tbl.val[1] = vget_high_f32(V1);
1411 tbl.val[2] = vget_low_f32(V2);
1412 tbl.val[3] = vget_high_f32(V2);
1413
1414 uint32x2_t idx = vcreate_u32(static_cast<uint64>(ControlElement[PermuteX]) | (static_cast<uint64>(ControlElement[PermuteY]) << 32));
1415 const uint8x8_t rL = vtbl4_u8(tbl, idx);
1416
1417 idx = vcreate_u32(static_cast<uint64>(ControlElement[PermuteZ + 4]) | (static_cast<uint64>(ControlElement[PermuteW + 4]) << 32));
1418 const uint8x8_t rH = vtbl4_u8(tbl, idx);
1419
1420 return vcombine_f32(rL, rH);
1421}
1422
1424(
1431)
1432{
1433 check(PermuteX <= 3 && PermuteY <= 3 && PermuteZ <= 3 && PermuteW <= 3);
1434
1435 static constexpr uint64 ControlElement[8] =
1436 {
1437 0x0706050403020100ULL, // XM_PERMUTE_0X
1438 0x0F0E0D0C0B0A0908ULL, // XM_PERMUTE_0Y
1439 0x1716151413121110ULL, // XM_PERMUTE_0Z
1440 0x1F1E1D1C1B1A1918ULL, // XM_PERMUTE_0W
1441
1442 0x2726252423222120ULL, // XM_PERMUTE_1X
1443 0x2F2E2D2C2B2A2928ULL, // XM_PERMUTE_1Y
1444 0x3736353433323130ULL, // XM_PERMUTE_1Z
1445 0x3F3E3D3C3B3A3938ULL, // XM_PERMUTE_1W
1446 };
1447
1448 uint8x16x4_t tbl;
1449 tbl.val[0] = V1.XY;
1450 tbl.val[1] = V1.ZW;
1451 tbl.val[2] = V2.XY;
1452 tbl.val[3] = V2.ZW;
1453
1456 Result.XY = vqtbl4q_u8(tbl, idx);
1457
1459 Result.ZW = vqtbl4q_u8(tbl, idx);
1460
1461 return Result;
1462}
1463#else
1464
1465template <int X, int Y, int Z, int W>
1467{
1468 return __builtin_shufflevector(Vec1, Vec2, X, Y, Z + 4, W + 4);
1469}
1470
1471template <int X, int Y, int Z, int W>
1473{
1477 return Result;
1478}
1479
1480#define VectorShuffle( Vec1, Vec2, X, Y, Z, W ) VectorShuffleImpl<X, Y, Z, W>(Vec1, Vec2)
1481#endif // __clang__
1482
1490{
1491 int32x4_t Signs = vshrq_n_s32(vreinterpretq_s32_f32(VecMask), 31); // sign bit of each lane replicated 32x
1492 int32x4_t Masked = vandq_s32(Signs, MakeVectorRegisterInt(0x1, 0x2, 0x4, 0x8)); // pick bit for lane position
1493 return uint32(vaddvq_s32(Masked)); // reduce via add
1494}
1495
1497{
1498 int64x2_t Signs0 = vshrq_n_s64(vreinterpretq_s64_f32(VecMask.XY), 63); // sign bit of each lane replicated 64x
1499 int64x2_t Signs1 = vshrq_n_s64(vreinterpretq_s64_f32(VecMask.ZW), 63); // sign bit of each lane replicated 64x
1500 int32x4_t Signs = vuzp1q_s32(Signs0, Signs1); // 32-bit masks
1501 int32x4_t Masked = vandq_s32(Signs, MakeVectorRegisterInt(0x1, 0x2, 0x4, 0x8)); // pick bit for lane position
1502 return uint32(vaddvq_s32(Masked)); // reduce via add
1503}
1504
1513{
1515}
1516
1518{
1520 Result.XY = Vec1.ZW;
1521 Result.ZW = Vec2.ZW;
1522 return Result;
1523}
1524
1533{
1535}
1536
1538{
1540 Result.XY = Vec1.XY;
1541 Result.ZW = Vec2.XY;
1542 return Result;
1543}
1544
1555{
1557 OutEvens = deinterleaved.val[0];
1558 OutOdds = deinterleaved.val[1];
1559}
1560
1562{
1563 OutEvens = VectorShuffle(Lo, Hi, 0, 2, 0, 2);
1564 OutOdds = VectorShuffle(Lo, Hi, 1, 3, 1, 3);
1565}
1566
1575{
1578 C = VectorSwizzle(C, 1, 2, 0, 3);
1579 return C;
1580}
1581
1583{
1586 C = VectorSwizzle(C, 1, 2, 0, 3);
1587 return C;
1588}
1589
1598{
1599 //@TODO: Optimize this
1600 union U {
1601 VectorRegister4Float V; float F[4];
1602 FORCEINLINE U() : V() {}
1603 } B, E;
1604 B.V = Base;
1605 E.V = Exponent;
1606 return MakeVectorRegister( powf(B.F[0], E.F[0]), powf(B.F[1], E.F[1]), powf(B.F[2], E.F[2]), powf(B.F[3], E.F[3]) );
1607}
1608
1610{
1611 //@TODO: Optimize this
1612 AlignedDouble4 Values(Base);
1613 AlignedDouble4 Exponents(Exponent);
1614
1615 Values[0] = FMath::Pow(Values[0], Exponents[0]);
1616 Values[1] = FMath::Pow(Values[1], Exponents[1]);
1617 Values[2] = FMath::Pow(Values[2], Exponents[2]);
1618 Values[3] = FMath::Pow(Values[3], Exponents[3]);
1619 return Values.ToVectorRegister();
1620}
1621
1629{
1630 return vrecpeq_f32(Vec);
1631}
1632
1634{
1636 Result.XY = vrecpeq_f64(Vec.XY);
1637 Result.ZW = vrecpeq_f64(Vec.ZW);
1638 return Result;
1639}
1640
1641
1649{
1650 // Perform two passes of Newton-Raphson iteration on the hardware estimate
1651 // The built-in instruction (VRECPS) is not as accurate
1652
1653 // Initial estimate
1655
1656 // First iteration
1660
1661 // Second iteration
1665}
1666
1668{
1670}
1671
1672
1680{
1681 return vsqrtq_f32(Vec);
1682}
1683
1685{
1687 Result.XY = vsqrtq_f64(Vec.XY);
1688 Result.ZW = vsqrtq_f64(Vec.ZW);
1689 return Result;
1690}
1691
1699{
1700 return vrsqrteq_f32(Vec);
1701}
1702
1704{
1706 Result.XY = vrsqrteq_f64(Vec.XY);
1707 Result.ZW = vrsqrteq_f64(Vec.ZW);
1708 return Result;
1709}
1710
1718{
1719 // Initial estimate
1721
1722 // Two refinement iterations
1725}
1726
1728{
1729 // Initial estimate
1731
1732 // Two refinement iterations
1737
1740 return VectorMultiply(Tmp, RecipSqrt);
1741}
1742
1750{
1752}
1753
1755{
1757}
1758
1766{
1768}
1769
1771{
1773}
1774
1775
1783{
1784 return VectorSetComponent(Vec, 3, 0.0f);
1785}
1786
1788{
1789 return VectorSetComponent(Vec, 3, 0.0);
1790}
1791
1792
1800{
1801 return VectorSetComponent(Vec, 3, 1.0f);
1802}
1803
1805{
1806 return VectorSetComponent(Vec, 3, 1.0);
1807}
1808
1809
1810
1818template <uint32 ElementIndex>
1820{
1821 return vgetq_lane_f32(Vec, ElementIndex);
1822}
1823
1824template <int ElementIndex>
1826{
1827 return vgetq_lane_f64(Vec, ElementIndex);
1828}
1829
1830template <int ElementIndex>
1832{
1833 if constexpr (ElementIndex > 1)
1834 {
1835 return VectorGetComponentImpl<ElementIndex - 2>(Vec.ZW);
1836 }
1837 else
1838 {
1840 }
1841}
1842
1843#define VectorGetComponent(Vec, ElementIndex) VectorGetComponentImpl<ElementIndex>(Vec)
1844
1846{
1847 AlignedFloat4 Floats(Vec);
1848 return Floats[ElementIndex];
1849}
1850
1852{
1854 return Doubles[ElementIndex];
1855}
1856
1865{
1866 float32x4x4_t A = vld1q_f32_x4((const float*)Matrix1);
1867 float32x4x4_t B = vld1q_f32_x4((const float*)Matrix2);
1869
1870 // First row of result (Matrix1[0] * Matrix2).
1871 R.val[0] = vmulq_lane_f32(B.val[0], vget_low_f32(A.val[0]), 0);
1872 R.val[0] = vfmaq_lane_f32(R.val[0], B.val[1], vget_low_f32(A.val[0]), 1);
1873 R.val[0] = vfmaq_lane_f32(R.val[0], B.val[2], vget_high_f32(A.val[0]), 0);
1874 R.val[0] = vfmaq_lane_f32(R.val[0], B.val[3], vget_high_f32(A.val[0]), 1);
1875
1876 // Second row of result (Matrix1[1] * Matrix2).
1877 R.val[1] = vmulq_lane_f32(B.val[0], vget_low_f32(A.val[1]), 0);
1878 R.val[1] = vfmaq_lane_f32(R.val[1], B.val[1], vget_low_f32(A.val[1]), 1);
1879 R.val[1] = vfmaq_lane_f32(R.val[1], B.val[2], vget_high_f32(A.val[1]), 0);
1880 R.val[1] = vfmaq_lane_f32(R.val[1], B.val[3], vget_high_f32(A.val[1]), 1);
1881
1882 // Third row of result (Matrix1[2] * Matrix2).
1883 R.val[2] = vmulq_lane_f32(B.val[0], vget_low_f32(A.val[2]), 0);
1884 R.val[2] = vfmaq_lane_f32(R.val[2], B.val[1], vget_low_f32(A.val[2]), 1);
1885 R.val[2] = vfmaq_lane_f32(R.val[2], B.val[2], vget_high_f32(A.val[2]), 0);
1886 R.val[2] = vfmaq_lane_f32(R.val[2], B.val[3], vget_high_f32(A.val[2]), 1);
1887
1888 // Fourth row of result (Matrix1[3] * Matrix2).
1889 R.val[3] = vmulq_lane_f32(B.val[0], vget_low_f32(A.val[3]), 0);
1890 R.val[3] = vfmaq_lane_f32(R.val[3], B.val[1], vget_low_f32(A.val[3]), 1);
1891 R.val[3] = vfmaq_lane_f32(R.val[3], B.val[2], vget_high_f32(A.val[3]), 0);
1892 R.val[3] = vfmaq_lane_f32(R.val[3], B.val[3], vget_high_f32(A.val[3]), 1);
1893
1894 vst1q_f32_x4((float*)Result, R);
1895}
1896
1898{
1899 float64x2x4_t A = vld1q_f64_x4((const double*)Matrix1);
1900 float64x2x4_t B1 = vld1q_f64_x4((const double*)Matrix2);
1901 float64x2x4_t B2 = vld1q_f64_x4((const double*)Matrix2 + 8);
1902 float64_t* V = (float64_t*)&A;
1904
1905 // First row of result (Matrix1[0] * Matrix2).
1906 R.val[0] = vmulq_n_f64(B1.val[0], V[0]);
1907 R.val[0] = vfmaq_n_f64(R.val[0], B1.val[2], V[1]);
1908 R.val[0] = vfmaq_n_f64(R.val[0], B2.val[0], V[2]);
1909 R.val[0] = vfmaq_n_f64(R.val[0], B2.val[2], V[3]);
1910
1911 R.val[1] = vmulq_n_f64(B1.val[1], V[0]);
1912 R.val[1] = vfmaq_n_f64(R.val[1], B1.val[3], V[1]);
1913 R.val[1] = vfmaq_n_f64(R.val[1], B2.val[1], V[2]);
1914 R.val[1] = vfmaq_n_f64(R.val[1], B2.val[3], V[3]);
1915
1916 // Second row of result (Matrix1[1] * Matrix2).
1917 R.val[2] = vmulq_n_f64(B1.val[0], V[4]);
1918 R.val[2] = vfmaq_n_f64(R.val[2], B1.val[2], V[5]);
1919 R.val[2] = vfmaq_n_f64(R.val[2], B2.val[0], V[6]);
1920 R.val[2] = vfmaq_n_f64(R.val[2], B2.val[2], V[7]);
1921
1922 R.val[3] = vmulq_n_f64(B1.val[1], V[4]);
1923 R.val[3] = vfmaq_n_f64(R.val[3], B1.val[3], V[5]);
1924 R.val[3] = vfmaq_n_f64(R.val[3], B2.val[1], V[6]);
1925 R.val[3] = vfmaq_n_f64(R.val[3], B2.val[3], V[7]);
1926
1927 vst1q_f64_x4((double*)Result, R);
1928 A = vld1q_f64_x4((const double*)Matrix1 + 8);
1929 V = (float64_t*)&A;
1930
1931 // Third row of result (Matrix1[2] * Matrix2).
1932 R.val[0] = vmulq_n_f64(B1.val[0], V[0]);
1933 R.val[0] = vfmaq_n_f64(R.val[0], B1.val[2], V[1]);
1934 R.val[0] = vfmaq_n_f64(R.val[0], B2.val[0], V[2]);
1935 R.val[0] = vfmaq_n_f64(R.val[0], B2.val[2], V[3]);
1936
1937 R.val[1] = vmulq_n_f64(B1.val[1], V[0]);
1938 R.val[1] = vfmaq_n_f64(R.val[1], B1.val[3], V[1]);
1939 R.val[1] = vfmaq_n_f64(R.val[1], B2.val[1], V[2]);
1940 R.val[1] = vfmaq_n_f64(R.val[1], B2.val[3], V[3]);
1941
1942 // Fourth row of result (Matrix1[3] * Matrix2).
1943 R.val[2] = vmulq_n_f64(B1.val[0], V[4]);
1944 R.val[2] = vfmaq_n_f64(R.val[2], B1.val[2], V[5]);
1945 R.val[2] = vfmaq_n_f64(R.val[2], B2.val[0], V[6]);
1946 R.val[2] = vfmaq_n_f64(R.val[2], B2.val[2], V[7]);
1947
1948 R.val[3] = vmulq_n_f64(B1.val[1], V[4]);
1949 R.val[3] = vfmaq_n_f64(R.val[3], B1.val[3], V[5]);
1950 R.val[3] = vfmaq_n_f64(R.val[3], B2.val[1], V[6]);
1951 R.val[3] = vfmaq_n_f64(R.val[3], B2.val[3], V[7]);
1952
1953 vst1q_f64_x4((double*)Result + 8, R);
1954}
1955
1965{
1967}
1969{
1971}
1972
1981{
1982 float32x4x4_t M = vld1q_f32_x4((const float*)MatrixM);
1984
1985 Result = vmulq_n_f32(M.val[0], VecP[0]);
1986 Result = vfmaq_n_f32(Result, M.val[1], VecP[1]);
1987 Result = vfmaq_n_f32(Result, M.val[2], VecP[2]);
1988 Result = vfmaq_n_f32(Result, M.val[3], VecP[3]);
1989
1990 return Result;
1991}
1992
1994{
1995 float64x2x4_t M1 = vld1q_f64_x4((const double*)MatrixM);
1996 float64x2x4_t M2 = vld1q_f64_x4(((const double*)MatrixM) + 8);
1999
2000 Result.XY = vmulq_n_f64(M1.val[0], Vec.XY[0]);
2001 Result.XY = vfmaq_n_f64(Result.XY, M1.val[2], Vec.XY[1]);
2002 Result.XY = vfmaq_n_f64(Result.XY, M2.val[0], Vec.ZW[0]);
2003 Result.XY = vfmaq_n_f64(Result.XY, M2.val[2], Vec.ZW[1]);
2004
2005 Result.ZW = vmulq_n_f64(M1.val[1], Vec.XY[0]);
2006 Result.ZW = vfmaq_n_f64(Result.ZW, M1.val[3], Vec.XY[1]);
2007 Result.ZW = vfmaq_n_f64(Result.ZW, M2.val[1], Vec.ZW[0]);
2008 Result.ZW = vfmaq_n_f64(Result.ZW, M2.val[3], Vec.ZW[1]);
2009
2010 return MakeVectorRegisterFloatFromDouble(Result);
2011}
2012
2014{
2015 float64x2x4_t M1 = vld1q_f64_x4((const double*)MatrixM);
2016 float64x2x4_t M2 = vld1q_f64_x4(((const double*)MatrixM) + 8);
2018
2019 //TODO: this can be rewritten to avoid using M2 var, saves some registers
2020 Result.XY = vmulq_n_f64(M1.val[0], VecP.XY[0]);
2021 Result.XY = vfmaq_n_f64(Result.XY, M1.val[2], VecP.XY[1]);
2022 Result.XY = vfmaq_n_f64(Result.XY, M2.val[0], VecP.ZW[0]);
2023 Result.XY = vfmaq_n_f64(Result.XY, M2.val[2], VecP.ZW[1]);
2024
2025 Result.ZW = vmulq_n_f64(M1.val[1], VecP.XY[0]);
2026 Result.ZW = vfmaq_n_f64(Result.ZW, M1.val[3], VecP.XY[1]);
2027 Result.ZW = vfmaq_n_f64(Result.ZW, M2.val[1], VecP.ZW[0]);
2028 Result.ZW = vfmaq_n_f64(Result.ZW, M2.val[3], VecP.ZW[1]);
2029
2030 return Result;
2031}
2032
2041{
2042 return vminq_f32( Vec1, Vec2 );
2043}
2044
2046{
2048 Result.XY = vminq_f64(Vec1.XY, Vec2.XY);
2049 Result.ZW = vminq_f64(Vec1.ZW, Vec2.ZW);
2050 return Result;
2051}
2052
2061{
2062 return vmaxq_f32( Vec1, Vec2 );
2063}
2064
2066{
2068 Result.XY = vmaxq_f64(Vec1.XY, Vec2.XY);
2069 Result.ZW = vmaxq_f64(Vec1.ZW, Vec2.ZW);
2070 return Result;
2071}
2072
2081{
2082 return vsetq_lane_f32(vgetq_lane_f32(VecW, 3), VecXYZ, 3);
2083}
2084
2086{
2088 Res.XY = VecXYZ.XY;
2089 Res.ZW = vsetq_lane_f64(vgetq_lane_f64(VecW.ZW, 1), VecXYZ.ZW, 1);
2090 return Res;
2091}
2092
2100{
2101 uint8x8_t AsUInt8 = vreinterpret_u8_u32(vld1_dup_u32((const uint32*)Ptr));
2102 uint16x8_t AsUInt16 = vmovl_u8(AsUInt8);
2103 uint32x4_t AsUInt32 = vmovl_u16(vget_low_u16(AsUInt16));
2104 return vcvtq_f32_u32(AsUInt32);
2105}
2106
2114{
2115 int8x8_t AsInt8 = vreinterpret_s8_u32(vld1_dup_u32((const uint32*)Ptr));
2116 int16x8_t AsInt16 = vmovl_s8(AsInt8);
2117 int32x4_t AsInt32 = vmovl_s16(vget_low_u16(AsInt16));
2118 return vcvtq_f32_s32(AsInt32);
2119}
2120
2128{
2130 uint16x8_t AsUInt16 = vmovl_u8(AsUInt8);
2131 uint32x4_t AsUInt32 = vmovl_u16(vget_low_u16(AsUInt16));
2132 return vcvtq_f32_u32(AsUInt32);
2133}
2134
2142{
2143 uint32x4_t AsUInt32 = vcvtq_u32_f32(Vec); // Saturates (clamps) to [0,2^32 - 1]
2144 uint16x4_t AsUInt16 = vqmovn_u32(AsUInt32); // Saturates further to [0,2^16 - 1]
2145 uint8x8_t AsUInt8 = vqmovn_u16(vcombine_u16(AsUInt16, vdup_n_u16(0))); // Saturates to [0,255]
2146 vst1_lane_u32((uint32_t*)Ptr, AsUInt8, 0);
2147}
2148
2156{
2157 int32x4_t AsInt32 = vcvtq_s32_f32(Vec); // Saturates (clamps) to [-2^31,2^31 - 1]
2158 int16x4_t AsInt16 = vqmovn_s32(AsInt32); // Saturates further to [-32768,32767]
2159 int8x8_t AsInt8 = vqmovn_s16(vcombine_s16(AsInt16, vdup_n_s16(0))); // Saturates to [-128,127]
2160 vst1_lane_u32((uint32_t*)Ptr, AsInt8, 0);
2161}
2162
2169template <bool bAligned>
2171{
2173 vst1_u8((uint8_t*)Ptr, f16x4);
2174}
2175
2183{
2184 alignas(16) float V[4];
2185 const uint32 E = *(uint32*)Ptr;
2186 V[0] = float((E >> 00) & 0x3FF);
2187 V[1] = float((E >> 10) & 0x3FF);
2188 V[2] = float((E >> 20) & 0x3FF);
2189 V[3] = float((E >> 30) & 0x3);
2190
2191 VectorRegister4Float Div = MakeVectorRegister(1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f);
2192 return VectorMultiply(MakeVectorRegister(V[0], V[1], V[2], V[3]), Div);
2193}
2194
2202{
2203 union U {
2204 VectorRegister4Float V; float F[4];
2205 FORCEINLINE U() : V() {}
2206 } Tmp;
2207 Tmp.V = VectorMax(Vec, VectorZeroFloat());
2208 Tmp.V = VectorMin(Tmp.V, VectorOneFloat());
2209 Tmp.V = VectorMultiply(Tmp.V, MakeVectorRegister(1023.0f, 1023.0f, 1023.0f, 3.0f));
2210
2211 uint32* Out = (uint32*)Ptr;
2212 *Out = (uint32(Tmp.F[0]) & 0x3FF) << 00 |
2213 (uint32(Tmp.F[1]) & 0x3FF) << 10 |
2214 (uint32(Tmp.F[2]) & 0x3FF) << 20 |
2215 (uint32(Tmp.F[3]) & 0x003) << 30;
2216}
2217
2226{
2228 return vmaxvq_u32(Mask);
2229}
2230
2232{
2236}
2237
2241#define VectorResetFloatRegisters()
2242
2243
2250#if PLATFORM_WINDOWS_ARM64EC
2251 #pragma warning(push)
2252 #pragma warning(disable:5076) // warning C5076: read from FPCR
2253 #pragma warning(disable:5077) // warning C5077: write to FPCR
2254#endif
2255
2256
2263{
2264#if PLATFORM_WINDOWS && !PLATFORM_COMPILER_CLANG
2266#else
2268 // The system register read/write instructions use 64-bit registers, so the value is read at 64 bits and narrowed to 32 bits on return.
2269 __asm__ volatile("mrs %0, fpcr" : "=r"(Value));
2270 return (uint32_t)Value;
2271#endif
2272}
2273
2280{
2281#if PLATFORM_WINDOWS && !PLATFORM_COMPILER_CLANG
2283#else
2284 uint64_t State64 = ControlStatus; // instruction needs a 64b reg, but all control bits fit in the lower 32b
2285 __asm__ volatile("msr fpcr, %0" : : "r"(State64));
2286#endif
2287}
2288
2292#if PLATFORM_WINDOWS_ARM64EC
2293 #pragma warning(pop)
2294#endif
2295
2296
2300#define VECTOR_ROUND_TOWARD_ZERO (3 << 22)
2301
2305#define VECTOR_DENORMALS_FLUSH_TO_ZERO (1 << 24)
2306
2307
2319{
2324
2325 return Result;
2326}
2327
2329{
2334
2335 return Result;
2336}
2337
2349{
2351}
2352
2354{
2356}
2357
2366{
2367 // Map to [-pi, pi]
2368 // X = A - 2pi * round(A/2pi)
2369 // Note the round(), not truncate(). In this case round() can round halfway cases using round-to-nearest-even OR round-to-nearest.
2370
2371 // Quotient = round(A/2pi)
2373 Quotient = vrndnq_f32(Quotient); // round to nearest even is the default rounding mode but that's fine here.
2374
2375 // X = A - 2pi * Quotient
2377
2378 // Map in [-pi/2,pi/2]
2380 VectorRegister4Float c = VectorBitwiseOr(GlobalVectorConstants::Pi, sign); // pi when x >= 0, -pi when x < 0
2384 X = VectorSelect(comp, rflx, X);
2386
2388
2389 // 11-degree minimax approximation
2390 //*ScalarSin = (((((-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f) * y2 + 0.0083333310f) * y2 - 0.16666667f) * y2 + 1.0f) * y;
2391 const VectorRegister4Float SinCoeff0 = MakeVectorRegister(1.0f, -0.16666667f, 0.0083333310f, -0.00019840874f);
2392 const VectorRegister4Float SinCoeff1 = MakeVectorRegister(2.7525562e-06f, -2.3889859e-08f, /*unused*/ 0.f, /*unused*/ 0.f);
2393
2402
2403 // 10-degree minimax approximation
2404 //*ScalarCos = sign * (((((-2.6051615e-07f * y2 + 2.4760495e-05f) * y2 - 0.0013888378f) * y2 + 0.041666638f) * y2 - 0.5f) * y2 + 1.0f);
2405 const VectorRegister4Float CosCoeff0 = MakeVectorRegister(1.0f, -0.5f, 0.041666638f, -0.0013888378f);
2406 const VectorRegister4Float CosCoeff1 = MakeVectorRegister(2.4760495e-05f, -2.6051615e-07f, /*unused*/ 0.f, /*unused*/ 0.f);
2407
2416}
2417
2418// Returns true if the vector contains a component that is either NAN or +/-infinite.
2420{
2421 // https://en.wikipedia.org/wiki/IEEE_754-1985
2422 // Infinity is represented with all exponent bits set, with the correct sign bit.
2423 // NaN is represented with all exponent bits set, plus at least one fraction/significant bit set.
2424 // This means finite values will not have all exponent bits set, so check against those bits.
2425
2426 union { float F; uint32 U; } InfUnion;
2427 InfUnion.U = 0x7F800000;
2428 const float Inf = InfUnion.F;
2430
2431 // Mask off Exponent
2433
2434 // Compare to full exponent & combine resulting flags into lane 0
2435 const int32x4_t Table = MakeVectorRegisterIntConstant(0x0C080400, 0, 0, 0);
2436
2437 uint8x16_t res = (uint8x16_t)VectorCompareEQ(ExpTest, FloatInfinity);
2438 // If we have all zeros, all elements are finite
2439 return vgetq_lane_u32((uint32x4_t)vqtbx1q_u8(res, res, Table), 0) != 0;
2440}
2441
2443{
2444 // https://en.wikipedia.org/wiki/IEEE_754-1985
2445 // Infinity is represented with all exponent bits set, with the correct sign bit.
2446 // NaN is represented with all exponent bits set, plus at least one fraction/significant bit set.
2447 // This means finite values will not have all exponent bits set, so check against those bits.
2448
2449 union { double F; uint64 U; } InfUnion;
2450 InfUnion.U = 0x7FF0000000000000ULL;
2451 const double Inf = InfUnion.F;
2453
2454 // Mask off Exponent
2456
2457 // Compare to full exponent & combine resulting flags into lane 0
2458 const int32x4_t Table = MakeVectorRegisterIntConstant(0x18100800, 0, 0, 0);
2459
2461
2462 // If we have all zeros, all elements are finite
2463 uint8x16_t ZeroVec = vdupq_n_u8(0);
2464 //TODO: there must be a better instruction to just get the top bits or smth
2465 return vgetq_lane_u32((uint32x4_t)vqtbx2q_u8(ZeroVec, *(uint8x16x2_t*)&InfTestRes, Table), 0) != 0;
2466}
2467
2468//TODO: Vectorize
2470{
2472 return MakeVectorRegister(FMath::Exp(Val[0]), FMath::Exp(Val[1]), FMath::Exp(Val[2]), FMath::Exp(Val[3]));
2473}
2474
2476{
2478 return MakeVectorRegister(FMath::Exp(Val[0]), FMath::Exp(Val[1]), FMath::Exp(Val[2]), FMath::Exp(Val[3]));
2479}
2480
2481//TODO: Vectorize
2483{
2485 return MakeVectorRegister(FMath::Exp2(Val[0]), FMath::Exp2(Val[1]), FMath::Exp2(Val[2]), FMath::Exp2(Val[3]));
2486}
2487
2489{
2491 return MakeVectorRegister(FMath::Exp2(Val[0]), FMath::Exp2(Val[1]), FMath::Exp2(Val[2]), FMath::Exp2(Val[3]));
2492}
2493
2494//TODO: Vectorize
2496{
2498 return MakeVectorRegister(FMath::Loge(Val[0]), FMath::Loge(Val[1]), FMath::Loge(Val[2]), FMath::Loge(Val[3]));
2499}
2500
2502{
2504 return MakeVectorRegister(FMath::Loge(Val[0]), FMath::Loge(Val[1]), FMath::Loge(Val[2]), FMath::Loge(Val[3]));
2505}
2506
2507//TODO: Vectorize
2509{
2512}
2513
2515{
2518}
2519
2520//TODO: Vectorize
2522{
2524 return MakeVectorRegister(FMath::Tan(Val[0]), FMath::Tan(Val[1]), FMath::Tan(Val[2]), FMath::Tan(Val[3]));
2525}
2526
2528{
2530 return MakeVectorRegister(FMath::Tan(Val[0]), FMath::Tan(Val[1]), FMath::Tan(Val[2]), FMath::Tan(Val[3]));
2531}
2532
2533//TODO: Vectorize
2535{
2537 return MakeVectorRegister(FMath::Asin(Val[0]), FMath::Asin(Val[1]), FMath::Asin(Val[2]), FMath::Asin(Val[3]));
2538}
2539
2541{
2543 return MakeVectorRegister(FMath::Asin(Val[0]), FMath::Asin(Val[1]), FMath::Asin(Val[2]), FMath::Asin(Val[3]));
2544}
2545
2546//TODO: Vectorize
2548{
2550 return MakeVectorRegister(FMath::Acos(Val[0]), FMath::Acos(Val[1]), FMath::Acos(Val[2]), FMath::Acos(Val[3]));
2551}
2552
2554{
2556 return MakeVectorRegister(FMath::Acos(Val[0]), FMath::Acos(Val[1]), FMath::Acos(Val[2]), FMath::Acos(Val[3]));
2557}
2558
2559//TODO: Vectorize
2561{
2563 return MakeVectorRegister(FMath::Atan(Val[0]), FMath::Atan(Val[1]), FMath::Atan(Val[2]), FMath::Atan(Val[3]));
2564}
2565
2567{
2569 return MakeVectorRegister(FMath::Atan(Val[0]), FMath::Atan(Val[1]), FMath::Atan(Val[2]), FMath::Atan(Val[3]));
2570}
2571
2572//TODO: Vectorize
2574{
2577
2578 return MakeVectorRegister(FMath::Atan2(ValX[0], ValY[0]),
2579 FMath::Atan2(ValX[1], ValY[1]),
2580 FMath::Atan2(ValX[2], ValY[2]),
2581 FMath::Atan2(ValX[3], ValY[3]));
2582}
2583
2585{
2588
2589 return MakeVectorRegister(FMath::Atan2(ValX[0], ValY[0]),
2590 FMath::Atan2(ValX[1], ValY[1]),
2591 FMath::Atan2(ValX[2], ValY[2]),
2592 FMath::Atan2(ValX[3], ValY[3]));
2593}
2594
2596{
	// NOTE(review): the declaration line is missing from this listing; presumably the float overload of VectorCeil.
	// vrndpq_f32 rounds each of the four float lanes of X toward +infinity and returns the result as floats.
2597 return vrndpq_f32(X);
2598}
2599
2601{
2603 Result.XY = vrndpq_f64(X.XY);
2604 Result.ZW = vrndpq_f64(X.ZW);
2605 return Result;
2606}
2607
2609{
	// NOTE(review): the declaration line is missing from this listing; presumably the float overload of VectorFloor.
	// vrndmq_f32 rounds each of the four float lanes of X toward -infinity and returns the result as floats.
2610 return vrndmq_f32(X);
2611}
2612
2614{
2616 Result.XY = vrndmq_f64(X.XY);
2617 Result.ZW = vrndmq_f64(X.ZW);
2618 return Result;
2619}
2620
2622{
	// NOTE(review): the declaration line is missing from this listing; presumably the float overload of VectorTruncate.
	// vrndq_f32 rounds each of the four float lanes of X toward zero (drops the fractional part) and returns floats.
2623 return vrndq_f32(X);
2624}
2625
2627{
2629 Result.XY = vrndq_f64(X.XY);
2630 Result.ZW = vrndq_f64(X.ZW);
2631 return Result;
2632}
2633
2635{
2636 // Check against invalid divisor
2638
2640 XFloats[0] = fmodf(XFloats[0], YFloats[0]);
2641 XFloats[1] = fmodf(XFloats[1], YFloats[1]);
2642 XFloats[2] = fmodf(XFloats[2], YFloats[2]);
2643 XFloats[3] = fmodf(XFloats[3], YFloats[3]);
2644 VectorRegister4Float Result = XFloats.ToVectorRegister();
2645
2646 // Return 0 where divisor Y was too small
2648 return Result;
2649}
2650
2652{
2653 // Check against invalid divisor
2655
2657 XDoubles[0] = fmod(XDoubles[0], YDoubles[0]);
2658 XDoubles[1] = fmod(XDoubles[1], YDoubles[1]);
2659 XDoubles[2] = fmod(XDoubles[2], YDoubles[2]);
2660 XDoubles[3] = fmod(XDoubles[3], YDoubles[3]);
2661 VectorRegister4Double DoubleResult = XDoubles.ToVectorRegister();
2662
2663 // Return 0 where divisor Y was too small
2665 return DoubleResult;
2666}
2667
2669{
2672}
2673
2675{
2678}
2679
2681{
2684}
2685
2687{
2690}
2691
// Constants for the parabola-based sine approximation defined after this namespace.
// `a` and `b` are precomputed from `p` (formulas in the trailing comments); `p` itself is not
// referenced by the other initialisers here. A and B hold `a` and `b` replicated across all four lanes.
2692namespace VectorSinConstantsNEON
2693{
2694 static const float p = 0.225f;
2695 static const float a = 7.58946609f; // 16 * sqrtf(p)
2696 static const float b = 1.63384342f; // (1 - p) / sqrtf(p)
	// Four-lane splats of the scalars above, so they can be used directly in vector arithmetic.
2697 static const VectorRegister4Float A = MakeVectorRegisterConstant(a, a, a, a);
2698 static const VectorRegister4Float B = MakeVectorRegisterConstant(b, b, b, b);
2699}
2700
2702{
2703 //Sine approximation using a squared parabola restrained to f(0) = 0, f(PI) = 0, f(PI/2) = 1.
2704 //based on a good discussion here http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648
2705 //After approx 2.5 million tests comparing to sin():
2706 //Average error of 0.000128
2707 //Max error of 0.001091
2708 //
2709 // Error clarification - the *relative* error rises above 1.2% near
2710 // 0 and PI (as the result nears 0). This is enough to introduce
2711 // harmonic distortion when used as an oscillator - VectorSinCos
2712 // doesn't cost that much more and is significantly more accurate.
2713 // (though don't use either for an oscillator if you care about perf)
2714
2718 return VectorMultiply(Y, VectorAdd(VectorSinConstantsNEON::B, VectorAbs(Y)));
2719}
2720
2722{
2724 Doubles[0] = FMath::Sin(Doubles[0]);
2725 Doubles[1] = FMath::Sin(Doubles[1]);
2726 Doubles[2] = FMath::Sin(Doubles[2]);
2727 Doubles[3] = FMath::Sin(Doubles[3]);
2728 return Doubles.ToVectorRegister();
2729}
2730
2732{
2734}
2735
2737{
2739 Doubles[0] = FMath::Cos(Doubles[0]);
2740 Doubles[1] = FMath::Cos(Doubles[1]);
2741 Doubles[2] = FMath::Cos(Doubles[2]);
2742 Doubles[3] = FMath::Cos(Doubles[3]);
2743 return Doubles.ToVectorRegister();
2744}
2745
2747{
	// NOTE(review): the declaration line is missing from this listing; presumably the double-precision
	// VectorSinCos overload, since it falls back to separate VectorSin / VectorCos calls.
	// Writes the sine of *VAngles to *VSinAngles, then the cosine of *VAngles to *VCosAngles.
	// *VAngles is read a second time after the sine has been stored, so VSinAngles must not alias VAngles.
2748 *VSinAngles = VectorSin(*VAngles);
2749 *VCosAngles = VectorCos(*VAngles);
2750}
2751
2759{
2762 return vcvtq_f32_u32(UInt32s);
2763}
2764
2772{
2773 int16x4_t Int16s = vld1_s16((const int16 *)Ptr);
2775 return vcvtq_f32_s32(Int32s);
2776}
2777
2785{
2787 Tmp = VectorMax(Vec, VectorZeroFloat());
2788 Tmp = VectorMin(Tmp, VectorOneFloat());
2789 Tmp = VectorMultiply(Tmp, vdupq_n_f32(65535.0f));
2790
2792 vst1_u16(Out, vmovn_u32(TmpUInt));
2793}
2794
2796//Integer ops
2797
2798//Bitwise
// Bitwise A & B on four 32-bit integer lanes.
2800#define VectorIntAnd(A, B) vandq_s32(A, B)
// Bitwise A | B on four 32-bit integer lanes.
2802#define VectorIntOr(A, B) vorrq_s32(A, B)
// Bitwise A ^ B on four 32-bit integer lanes.
2804#define VectorIntXor(A, B) veorq_s32(A, B)
// Bitwise (~A) & B. The arguments are swapped on purpose: vbicq_s32(x, y) computes x & ~y.
2806#define VectorIntAndNot(A, B) vbicq_s32(B, A)
// Bitwise ~A on four 32-bit integer lanes.
2808#define VectorIntNot(A) vmvnq_s32(A)
2809
2810//Comparison
// Lane-wise signed 32-bit comparisons. Each NEON compare yields a per-lane mask:
// all bits set where the comparison holds, all bits clear where it does not.
2811#define VectorIntCompareEQ(A, B) vceqq_s32(A,B)
// There is no direct "not equal" intrinsic used here: the EQ mask is bitwise-inverted instead.
2812#define VectorIntCompareNEQ(A, B) VectorIntNot(VectorIntCompareEQ(A,B))
2813#define VectorIntCompareGT(A, B) vcgtq_s32(A,B)
2814#define VectorIntCompareLT(A, B) vcltq_s32(A,B)
2815#define VectorIntCompareGE(A, B) vcgeq_s32(A,B)
2816#define VectorIntCompareLE(A, B) vcleq_s32(A,B)
2817
2818
2820{
	// NOTE(review): the declaration line is missing from this listing; presumably VectorIntSelect(Mask, Vec1, Vec2).
	// vbslq_s32 is a bitwise select: for every bit, the result takes the bit from Vec1 where the
	// corresponding Mask bit is 1 and from Vec2 where it is 0.
2821 return vbslq_s32(Mask, Vec1, Vec2);
2822}
2823
2824//Arithmetic
// Lane-wise signed 32-bit integer arithmetic; each macro maps directly onto one NEON intrinsic.
2825#define VectorIntAdd(A, B) vaddq_s32(A, B)
2826#define VectorIntSubtract(A, B) vsubq_s32(A, B)
2827#define VectorIntMultiply(A, B) vmulq_s32(A, B)
2828#define VectorIntNegate(A) vnegq_s32(A)
2829#define VectorIntMin(A, B) vminq_s32(A,B)
2830#define VectorIntMax(A, B) vmaxq_s32(A,B)
// Clamps A to [B, C]: first raise to at least B, then cap at C.
2831#define VectorIntClamp(A, B, C) VectorIntMin(VectorIntMax(A, B), C)
2832#define VectorIntAbs(A) vabsq_s32(A)
2833
// Per lane: selects IntOne where A >= IntZero, otherwise IntMinusOne. Because the test is >=,
// a zero lane takes the IntOne branch rather than producing 0.
2834#define VectorIntSign(A) VectorIntSelect( VectorIntCompareGE(A, GlobalVectorConstants::IntZero), GlobalVectorConstants::IntOne, GlobalVectorConstants::IntMinusOne )
2835
// Converts four signed 32-bit integers to four floats.
2836#define VectorIntToFloat(A) vcvtq_f32_s32(A)
2837
2839{
	// NOTE(review): the declaration line is missing from this listing; presumably the float overload of VectorFloatToInt.
	// vcvtq_s32_f32 converts each float lane of A to a signed 32-bit integer, rounding toward zero.
2840 return vcvtq_s32_f32(A);
2841}
2842
2844{
2846}
2847
2849{
2852
2854}
2855
2857{
	// NOTE(review): the declaration line is missing from this listing; presumably VectorShuffleByte4(Vec, Mask).
	// vqtbl1q_u8 is a byte table lookup: output byte i is the byte of Vec selected by index byte i of Mask.
	// Index values outside 0..15 produce a zero byte.
2858 return vqtbl1q_u8(Vec, Mask);
2859}
2860
2861//Loads and stores
2862
// Stores a full 128-bit integer vector to Ptr, viewed as 4 x int32 or (_16) as 8 x int16.
2869#define VectorIntStore( Vec, Ptr ) vst1q_s32( (int32*)(Ptr), Vec )
2870#define VectorIntStore_16( Vec, Ptr ) vst1q_s16( (int16*)(Ptr), Vec )
2871
// Loads a full 128-bit integer vector from Ptr. The intermediate (void*) cast lets any pointer
// type be passed, including pointers to const.
2878#define VectorIntLoad( Ptr ) vld1q_s32( (int32*)((void*)(Ptr)) )
2879#define VectorIntLoad_16( Ptr ) vld1q_s16( (int16*)((void*)(Ptr)) )
2880
// Expands to exactly the same intrinsic as VectorIntStore; no separate aligned-store instruction is used.
2887#define VectorIntStoreAligned( Vec, Ptr ) vst1q_s32( (int32*)(Ptr), Vec )
2888
// Expands to exactly the same intrinsic as VectorIntLoad; no separate aligned-load instruction is used.
2895#define VectorIntLoadAligned( Ptr ) vld1q_s32( (int32*)((void*)(Ptr)) )
2896
// Loads a single int32 (or int16 for _16) from Ptr and replicates it into every lane.
2903#define VectorIntLoad1(Ptr) vld1q_dup_s32((int32*)(Ptr))
2904#define VectorIntLoad1_16(Ptr) vld1q_dup_s16((int16*)(Ptr))
2905
// Replicates the scalar F into all four int32 lanes.
// NOTE(review): the expansion is a cast expression without outer parentheses.
2906#define VectorIntSet1(F) (VectorRegister4Int)vdupq_n_s32(F)
// All-zero vector. Note that the expansion is an integer vector (vdupq_n_s32), despite the generic name.
2907#define VectorSetZero() vdupq_n_s32(0)
// Replicates the scalar F into all four float lanes (same unparenthesised-cast caveat as VectorIntSet1).
2908#define VectorSet1(F) (VectorRegister4Float)vdupq_n_f32(F)
// Bit-pattern reinterpretations between vector types: the 128 bits are unchanged, no numeric conversion happens.
2909#define VectorCastIntToFloat(Vec) ((VectorRegister4f)vreinterpretq_f32_s32(Vec))
2910#define VectorCastFloatToInt(Vec) ((VectorRegister4i)vreinterpretq_s32_f32(Vec))
// The double variants operate on one 2 x double register and go through the 64-bit integer view (s64).
2911#define VectorCastDoubleToInt(Vec) ((VectorRegister4i)vreinterpretq_s64_f64(Vec))
2912#define VectorCastIntToDouble(Vec) ((VectorRegister2Double)vreinterpretq_f64_s64(Vec))
// Per-lane shifts by an immediate. The _n_ intrinsics require ImmAmt to be a compile-time constant.
2913#define VectorShiftLeftImm(Vec, ImmAmt) vshlq_n_s32(Vec, ImmAmt)
// Arithmetic right shift: signed lanes, the sign bit is replicated into the vacated bits.
2914#define VectorShiftRightImmArithmetic(Vec, ImmAmt) vshrq_n_s32(Vec, ImmAmt)
// Logical right shift: unsigned lanes, zeros are shifted in.
2915#define VectorShiftRightImmLogical(Vec, ImmAmt) vshrq_n_u32(Vec, ImmAmt)
// Rounds each float lane to the nearest integral value, ties to even; the result stays in float format.
2916#define VectorRound(Vec) vrndnq_f32(Vec)
2917
2919{
	// NOTE(review): the declaration line is missing from this listing; presumably VectorRoundToIntHalfToEven.
	// vcvtnq_s32_f32 converts each float lane of Vec to a signed 32-bit integer, rounding to nearest
	// with ties going to the even integer.
2920 return vcvtnq_s32_f32(Vec);
2921}
2922
2924{
	// NOTE(review): the declaration line is missing from this listing; presumably VectorIntExpandLow16To32.
	// Takes the low four 16-bit lanes of V (vget_low_u16) and zero-extends each to 32 bits (vmovl_u16).
	// The upper four 16-bit lanes of V are ignored.
2925 return vmovl_u16(vget_low_u16(V));
2926}
2927
2928// To be continued...
2929
2930#endif // #if PLATFORM_ENABLE_VECTORINTRINSICS_NEON
2931
2933
2934#if UE_ENABLE_INCLUDE_ORDER_DEPRECATED_IN_5_4
2935#include <type_traits>
2936#endif
2937
#define FORCEINLINE
Definition AndroidPlatform.h:140
#define GCC_ALIGN(n)
Definition AndroidPlatform.h:163
#define check(expr)
Definition AssertionMacros.h:314
FPlatformTypes::int16 int16
A 16-bit signed integer.
Definition Platform.h:1123
FPlatformTypes::int64 int64
A 64-bit signed integer.
Definition Platform.h:1127
FPlatformTypes::int32 int32
A 32-bit signed integer.
Definition Platform.h:1125
#define RESTRICT
Definition Platform.h:706
FPlatformTypes::uint64 uint64
A 64-bit unsigned integer.
Definition Platform.h:1117
UE_FORCEINLINE_HINT TSharedRef< CastToType, Mode > StaticCastSharedRef(TSharedRef< CastFromType, Mode > const &InSharedRef)
Definition SharedPointer.h:127
#define X(Name, Desc)
Definition FormatStringSan.h:47
#define PRAGMA_ENABLE_SHADOW_VARIABLE_WARNINGS
Definition MSVCPlatformCompilerPreSetup.h:65
#define PRAGMA_DISABLE_SHADOW_VARIABLE_WARNINGS
Definition MSVCPlatformCompilerPreSetup.h:55
USkinnedMeshComponent float
Definition SkinnedMeshComponent.h:60
FORCEINLINE VectorRegister4Int MakeVectorRegisterInt(int32 X, int32 Y, int32 Z, int32 W)
Definition UnrealMathFPU.h:282
FORCEINLINE VectorRegister4Float VectorTan(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2039
FORCEINLINE VectorRegister4Float VectorSubtract(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:731
FORCEINLINE void VectorDeinterleave(VectorRegister4Float &RESTRICT OutEvens, VectorRegister4Float &RESTRICT OutOdds, const VectorRegister4Float &RESTRICT Lo, const VectorRegister4Float &RESTRICT Hi)
Definition UnrealMathFPU.h:1777
FORCEINLINE VectorRegister4Double VectorLoadFloat3(const double *Ptr)
Definition UnrealMathFPU.h:427
FORCEINLINE VectorRegister4Float VectorATan2(const VectorRegister4Float &Y, const VectorRegister4Float &X)
Definition UnrealMathFPU.h:2083
FORCEINLINE uint32 VectorAnyGreaterThan(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1917
FORCEINLINE VectorRegister4Float VectorSqrt(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1263
FORCEINLINE VectorRegister4Float VectorReciprocalSqrt(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1279
FORCEINLINE bool VectorMatrixInverse(FMatrix44d *DstMatrix, const FMatrix44d *SrcMatrix)
Definition UnrealMathFPU.h:1603
FORCEINLINE VectorRegister4Float VectorLoadSRGBA16N(void *Ptr)
Definition UnrealMathFPU.h:2268
FORCEINLINE VectorRegister4Float VectorDot3(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:880
FORCEINLINE VectorRegister4Float VectorMin(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1686
FORCEINLINE float VectorGetComponentImpl(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:364
FORCEINLINE VectorRegister4x4Float VectorLoad16(const float *Ptr)
Definition UnrealMathFPU.h:410
FORCEINLINE VectorRegister4Float VectorDot4(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:901
FORCEINLINE VectorRegister4Float MakeVectorRegister(uint32 X, uint32 Y, uint32 Z, uint32 W)
Definition UnrealMathFPU.h:195
FORCEINLINE void VectorSinCos(VectorRegister4Float *RESTRICT VSinAngles, VectorRegister4Float *RESTRICT VCosAngles, const VectorRegister4Float *RESTRICT VAngles)
Definition UnrealMathFPU.h:2109
FORCEINLINE VectorRegister4Float VectorLoadURGB10A2N(void *Ptr)
Definition UnrealMathFPU.h:1875
FORCEINLINE void VectorStoreSignedByte4(const VectorRegister4Float &Vec, void *Ptr)
Definition UnrealMathFPU.h:1858
FORCEINLINE VectorRegister4Float VectorSet_W1(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1407
FORCEINLINE VectorRegister4Float VectorSetFloat1(float F)
Definition UnrealMathFPU.h:518
FORCEINLINE VectorRegister4Float VectorLog2(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2004
FORCEINLINE void VectorQuaternionMultiply(VectorRegister4Float *RESTRICT Result, const VectorRegister4Float *RESTRICT Quat1, const VectorRegister4Float *RESTRICT Quat2)
Definition UnrealMathFPU.h:1431
FORCEINLINE VectorRegister4Float VectorLoadURGBA16N(void *Ptr)
Definition UnrealMathFPU.h:2248
#define VectorShuffle(Vec1, Vec2, X, Y, Z, W)
Definition UnrealMathFPU.h:652
VectorRegister4Double VectorRegister4d
Definition UnrealMathFPU.h:90
FORCEINLINE VectorRegister4Float VectorTruncate(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2153
FORCEINLINE VectorRegister4Double VectorZeroDouble(void)
Definition UnrealMathFPU.h:336
FORCEINLINE VectorRegister4Float VectorDivide(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:834
FORCEINLINE VectorRegister4Float VectorMultiply(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:758
AlignedDouble4 AlignedRegister4
Definition UnrealMathFPU.h:150
FORCEINLINE VectorRegister4Float VectorMax(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1713
FORCEINLINE VectorRegister4Float VectorBitwiseAnd(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1165
FORCEINLINE VectorRegister4Float VectorLoadFloat1(const float *Ptr)
Definition UnrealMathFPU.h:468
FORCEINLINE VectorRegister4Float VectorReciprocalLen(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1343
FORCEINLINE constexpr VectorRegister4Float MakeVectorRegisterFloatConstant(float X, float Y, float Z, float W)
Definition UnrealMathFPU.h:297
FORCEINLINE VectorRegister4Float VectorCos(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2027
FORCEINLINE VectorRegister4Float VectorLoadFloat2(const float *Ptr)
Definition UnrealMathFPU.h:485
#define VectorIntExpandLow16To32(V0)
Definition UnrealMathFPU.h:2661
FORCEINLINE VectorRegister4Int VectorIntSelect(const VectorRegister4Int &Mask, const VectorRegister4Int &Vec1, const VectorRegister4Int &Vec2)
Definition UnrealMathFPU.h:2411
FORCEINLINE void VectorStoreByte4(const VectorRegister4Float &Vec, void *Ptr)
Definition UnrealMathFPU.h:1842
FORCEINLINE VectorRegister4Float VectorCombineLow(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1757
VectorRegister4Double VectorRegister4
Definition UnrealMathFPU.h:94
FORCEINLINE void VectorStore16(const VectorRegister4x4Float &Vec, float *Dst)
Definition UnrealMathFPU.h:582
FORCEINLINE VectorRegister4Float VectorReciprocalSqrtEstimate(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1295
FORCEINLINE VectorRegister4Double VectorLoadDouble1(const double *Ptr)
Definition UnrealMathFPU.h:473
FORCEINLINE void VectorMatrixMultiply(FMatrix44d *Result, const FMatrix44d *Matrix1, const FMatrix44d *Matrix2)
Definition UnrealMathFPU.h:1538
VectorRegister4Float VectorLoadAligned(const float *Ptr)
Definition UnrealMathFPU.h:451
FORCEINLINE VectorRegister4Float VectorMultiplyAdd(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2, const VectorRegister4Float &Vec3)
Definition UnrealMathFPU.h:786
VectorRegister4Int VectorRegister4i
Definition UnrealMathFPU.h:88
FORCEINLINE VectorRegister4Float VectorSelect(const VectorRegister4Float &Mask, const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1105
FORCEINLINE VectorRegister4Float VectorExp(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1971
FORCEINLINE VectorRegister4Float VectorCompareGT(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:974
FORCEINLINE VectorRegister4Int VectorRoundToIntHalfToEven(const VectorRegister4Float &A)
Definition UnrealMathFPU.h:2175
FORCEINLINE VectorRegister4Double MakeVectorRegisterDoubleMask(uint64 X, uint64 Y, uint64 Z, uint64 W)
Definition UnrealMathFPU.h:206
FORCEINLINE VectorRegister4Float VectorExp2(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1982
FORCEINLINE VectorRegister4Float VectorASin(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2050
FORCEINLINE VectorRegister4Float VectorLoadTwoPairsFloat(const float *Ptr1, const float *Ptr2)
Definition UnrealMathFPU.h:503
FORCEINLINE VectorRegister4Double VectorOneDouble(void)
Definition UnrealMathFPU.h:351
FORCEINLINE VectorRegister4Float VectorReciprocalLenEstimate(const VectorRegister4Float &Vector)
Definition UnrealMathFPU.h:1375
FORCEINLINE void VectorStore(const VectorRegister4Float &Vec, float *Dst)
Definition UnrealMathFPU.h:566
#define VectorSetControlRegister(ControlStatus)
Definition UnrealMathFPU.h:1947
FORCEINLINE VectorRegister4Float VectorTransformVector(const VectorRegister4Float &VecP, const FMatrix44f *MatrixM)
Definition UnrealMathFPU.h:1619
FORCEINLINE VectorRegister4Float VectorCompareGE(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1000
FORCEINLINE VectorRegister4Float VectorMod(const VectorRegister4Float &X, const VectorRegister4Float &Y)
Definition UnrealMathFPU.h:2185
FORCEINLINE VectorRegister4Int MakeVectorRegisterInt64(int64 X, int64 Y)
Definition UnrealMathFPU.h:307
FORCEINLINE VectorRegister4Float VectorCombineHigh(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1740
FORCEINLINE VectorRegister4Float VectorCompareLT(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1025
FORCEINLINE VectorRegister4Double MakeVectorRegisterDouble(uint64 X, uint64 Y, uint64 Z, uint64 W)
Definition UnrealMathFPU.h:185
VectorRegister4 VectorRegister
Definition UnrealMathFPU.h:95
FORCEINLINE float VectorGetComponentDynamic(const VectorRegister4Float &Vec, uint32 ComponentIndex)
Definition UnrealMathFPU.h:369
FORCEINLINE VectorRegister4Float VectorLog(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1993
FORCEINLINE VectorRegister4Float VectorSet_W0(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1391
#define VectorLoadSignedByte4(Ptr)
Definition UnrealMathFPU.h:1823
FORCEINLINE constexpr VectorRegister4Int MakeVectorRegisterIntConstant(int32 X, int32 Y, int32 Z, int32 W)
Definition UnrealMathFPU.h:292
FORCEINLINE int32 VectorMaskBits(const VectorRegister4Float &Vec1)
Definition UnrealMathFPU.h:1075
FORCEINLINE VectorRegister4Float VectorNegate(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:687
FORCEINLINE VectorRegister4Float VectorNegateMultiplyAdd(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2, const VectorRegister4Float &Vec3)
Definition UnrealMathFPU.h:815
FORCEINLINE VectorRegister4Float VectorReciprocal(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1311
FORCEINLINE VectorRegister4Float VectorSin(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2015
FORCEINLINE constexpr VectorRegister2Double MakeVectorRegister2DoubleConstant(double X, double Y)
Definition UnrealMathFPU.h:302
FORCEINLINE void VectorStoreURGBA16N(const VectorRegister4Float &Vec, void *Ptr)
Definition UnrealMathFPU.h:2288
FORCEINLINE VectorRegister4Int VectorShuffleByte4(const VectorRegister4Int &Vec, const VectorRegister4Int &Mask)
Definition UnrealMathFPU.h:2515
FORCEINLINE VectorRegister4Float VectorAbs(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:661
FORCEINLINE VectorRegister4Float VectorACos(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2061
FORCEINLINE VectorRegister4Float VectorAdd(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:704
FORCEINLINE VectorRegister4Int VectorDoubleToInt(const VectorRegister4Double &A)
Definition UnrealMathFPU.h:2510
FORCEINLINE VectorRegister4Float VectorFloor(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2142
FORCEINLINE float VectorDot3Scalar(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:861
VectorRegister4Int VectorRegisterInt
Definition UnrealMathFPU.h:96
void VectorStoreAligned(const VectorRegister4Float &Vec, float *Ptr)
Definition UnrealMathFPU.h:534
FORCEINLINE VectorRegister4Float MakeVectorRegisterFloatMask(uint32 X, uint32 Y, uint32 Z, uint32 W)
Definition UnrealMathFPU.h:201
FORCEINLINE VectorRegister4Float VectorBitwiseXor(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1190
FORCEINLINE VectorRegister4Float VectorCeil(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2131
FORCEINLINE VectorRegister4Double VectorLoadFloat3_W1(const double *Ptr)
Definition UnrealMathFPU.h:439
#define VectorSwizzle(Vec, X, Y, Z, W)
Definition UnrealMathFPU.h:639
FORCEINLINE VectorRegister4Float VectorQuaternionMultiply2(const VectorRegister4Float &Quat1, const VectorRegister4Float &Quat2)
Definition UnrealMathFPU.h:1517
FORCEINLINE VectorRegister4Float VectorOneFloat(void)
Definition UnrealMathFPU.h:346
#define VectorLoadByte4(Ptr)
Definition UnrealMathFPU.h:1814
FORCEINLINE VectorRegister4Float VectorZeroFloat(void)
Definition UnrealMathFPU.h:331
#define VectorGetControlRegister()
Definition UnrealMathFPU.h:1940
FORCEINLINE VectorRegister4Float VectorATan(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2072
FORCEINLINE VectorRegister4Float VectorLoad(const float *Ptr)
Definition UnrealMathFPU.h:394
FORCEINLINE VectorRegister4Float VectorCross(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1216
FORCEINLINE VectorRegister4Float VectorBitwiseOr(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1140
FORCEINLINE VectorRegister4Int VectorFloatToInt(const VectorRegister4Float &A)
Definition UnrealMathFPU.h:2491
bool VectorContainsNaNOrInfinite(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1960
FORCEINLINE VectorRegister4Float VectorStep(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2223
VectorRegister4Float VectorRegister4f
Definition UnrealMathFPU.h:89
FORCEINLINE VectorRegister4Float VectorPow(const VectorRegister4Float &Base, const VectorRegister4Float &Exponent)
Definition UnrealMathFPU.h:1243
FORCEINLINE void VectorStoreFloat3(const VectorRegister4Float &Vec, float *Dst)
Definition UnrealMathFPU.h:594
FORCEINLINE VectorRegister4Float VectorReciprocalEstimate(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:1327
FORCEINLINE VectorRegister4Float VectorCompareLE(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:1050
FORCEINLINE VectorRegister4Float VectorCompareEQ(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:923
FORCEINLINE void VectorStoreFloat1(const VectorRegister4Float &Vec, float *Dst)
Definition UnrealMathFPU.h:610
FORCEINLINE VectorRegister4Float MakeVectorRegisterFloat(uint32 X, uint32 Y, uint32 Z, uint32 W)
Definition UnrealMathFPU.h:175
FORCEINLINE VectorRegister4Float MakeVectorRegisterFloatFromDouble(const VectorRegister4Double &Vec4d)
Definition UnrealMathFPU.h:262
FORCEINLINE VectorRegister2Double MakeVectorRegister2Double(double X, double Y)
Definition UnrealMathFPU.h:158
#define VectorReplicate(Vec, ElementIndex)
Definition UnrealMathFPU.h:627
VectorRegister2Double VectorRegister2d
Definition UnrealMathFPU.h:91
FORCEINLINE VectorRegister4Float VectorSign(const VectorRegister4Float &Vec)
Definition UnrealMathFPU.h:2204
#define VectorLoadByte4Reverse(Ptr)
Definition UnrealMathFPU.h:1833
FORCEINLINE VectorRegister4Float VectorCompareNE(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
Definition UnrealMathFPU.h:948
FORCEINLINE VectorRegister4Float VectorMergeVecXYZ_VecW(const VectorRegister4Float &VecXYZ, const VectorRegister4Float &VecW)
Definition UnrealMathFPU.h:1797
FORCEINLINE void VectorStoreURGB10A2N(const VectorRegister4Float &Vec, void *Ptr)
Definition UnrealMathFPU.h:1895
FORCEINLINE constexpr VectorRegister4Float MakeVectorRegisterConstant(float X, float Y, float Z, float W)
Definition UnrealMathVectorConstants.h.inl:28
float Val(const FString &Value)
Definition UnrealMath.cpp:3163
uint8_t uint8
Definition binka_ue_file_header.h:8
uint16_t uint16
Definition binka_ue_file_header.h:7
uint32_t uint32
Definition binka_ue_file_header.h:6
Definition Float16.h:34
VectorRegister4Float FloatInfinity()
Definition UnrealMathVectorConstants.h.inl:118
constexpr VectorRegister4Float FloatZero
Definition UnrealMathVectorConstants.h.inl:41
constexpr VectorRegister4Float FloatOne
Definition UnrealMathVectorConstants.h.inl:40
constexpr VectorRegister4Float FloatMinusOne
Definition UnrealMathVectorConstants.h.inl:42
constexpr VectorRegister4Float OneOverTwoPi
Definition UnrealMathVectorConstants.h.inl:126
constexpr VectorRegister4Float SmallNumber
Definition UnrealMathVectorConstants.h.inl:53
constexpr VectorRegister4Double DoubleMinusOne
Definition UnrealMathVectorConstants.h.inl:60
constexpr VectorRegister4Double DOUBLE_QMULTI_SIGN_MASK2
Definition UnrealMathVectorConstants.h.inl:91
constexpr VectorRegister4Float QMULTI_SIGN_MASK0
Definition UnrealMathVectorConstants.h.inl:86
constexpr VectorRegister4Float TwoPi
Definition UnrealMathVectorConstants.h.inl:122
constexpr VectorRegister4Float QMULTI_SIGN_MASK1
Definition UnrealMathVectorConstants.h.inl:87
constexpr VectorRegister4Double DoubleSmallNumber
Definition UnrealMathVectorConstants.h.inl:71
constexpr VectorRegister4Float PiByTwo
Definition UnrealMathVectorConstants.h.inl:123
constexpr VectorRegister4Double DOUBLE_QMULTI_SIGN_MASK0
Definition UnrealMathVectorConstants.h.inl:89
constexpr VectorRegister4Float Pi
Definition UnrealMathVectorConstants.h.inl:121
constexpr VectorRegister4Float QMULTI_SIGN_MASK2
Definition UnrealMathVectorConstants.h.inl:88
constexpr VectorRegister4Float FloatOneHalf
Definition UnrealMathVectorConstants.h.inl:50
VectorRegister4Double DoubleInfinity()
Definition UnrealMathVectorConstants.h.inl:119
constexpr VectorRegister4Double DOUBLE_QMULTI_SIGN_MASK1
Definition UnrealMathVectorConstants.h.inl:90
VectorRegister4Float SignBit()
Definition UnrealMathVectorConstants.h.inl:105
constexpr VectorRegister4Double DoubleOne
Definition UnrealMathVectorConstants.h.inl:58
constexpr VectorRegister4Double DoubleZero
Definition UnrealMathVectorConstants.h.inl:59
@ V2
Definition NNEModelData.cpp:18
@ V1
Definition NNEModelData.cpp:17
FValue Div(const FValue &Lhs, const FValue &Rhs)
Definition ShaderValue.cpp:1519
UE_STRING_CLASS Result(Forward< LhsType >(Lhs), RhsLen)
Definition String.cpp.inl:732
U16 Index
Definition radfft.cpp:71
Definition UnrealMathFPU.h:133
FORCEINLINE double operator[](int32 Index) const
Definition UnrealMathFPU.h:141
double V[4]
Definition UnrealMathFPU.h:134
FORCEINLINE VectorRegister4Double ToVectorRegister() const
Definition UnrealMathFPU.h:144
Definition UnrealMathFPU.h:113
float V[4]
Definition UnrealMathFPU.h:114
FORCEINLINE float operator[](int32 Index) const
Definition UnrealMathFPU.h:121
FORCEINLINE VectorRegister4Float ToVectorRegister() const
Definition UnrealMathFPU.h:124
static CORE_API bool MatrixInverse(FMatrix44f *DstMatrix, const FMatrix44f *SrcMatrix)
Definition UnrealMath.cpp:928
static float Log2(float Value)
Definition UnrealMathUtility.h:722
Definition UnrealMathFPU.h:34
double V[2]
Definition UnrealMathFPU.h:35
Definition UnrealMathFPU.h:42
VectorRegister4Double()=default
VectorRegister2Double XY
Definition UnrealMathFPU.h:47
FORCEINLINE VectorRegister4Double & operator=(const VectorRegister4Float &From)
Definition UnrealMathFPU.h:77
VectorRegister2Double ZW
Definition UnrealMathFPU.h:48
double V[4]
Definition UnrealMathFPU.h:50
Definition UnrealMathFPU.h:20
float V[4]
Definition UnrealMathFPU.h:21
Definition UnrealMathFPU.h:28
int32 V[4]
Definition UnrealMathFPU.h:29
Definition UnrealMathFPU.h:99
Definition UnrealMathFPU.h:14